Skip to content

Commit

Permalink
single word heuristic for phrase extraction,
Browse files Browse the repository at this point in the history
and minor modification of SentenceAlignmentWithSyntax constructor
  • Loading branch information
Matthias Huck committed Feb 3, 2016
1 parent 16a49d0 commit 5de88ec
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 19 deletions.
8 changes: 8 additions & 0 deletions phrase-extract/PhraseExtractionOptions.h
Expand Up @@ -51,6 +51,7 @@ class PhraseExtractionOptions
bool gzOutput;
std::string instanceWeightsFile; //weights for each sentence
bool flexScoreFlag;
bool singleWordHeuristicFlag;

public:
std::vector<std::string> placeholders;
Expand All @@ -73,6 +74,7 @@ class PhraseExtractionOptions
onlyOutputSpanInfo(false),
gzOutput(false),
flexScoreFlag(false),
singleWordHeuristicFlag(false),
debug(false) {
}

Expand Down Expand Up @@ -119,6 +121,9 @@ class PhraseExtractionOptions
void initFlexScoreFlag(const bool initflexScoreFlag) {
flexScoreFlag=initflexScoreFlag;
}
void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) {
singleWordHeuristicFlag = initSingleWordHeuristicFlag;
}

// functions for getting values
bool isAllModelsOutputFlag() const {
Expand Down Expand Up @@ -163,6 +168,9 @@ class PhraseExtractionOptions
bool isFlexScoreFlag() const {
return flexScoreFlag;
}
bool isSingleWordHeuristicFlag() const {
return singleWordHeuristicFlag;
}
};

}
Expand Down
3 changes: 0 additions & 3 deletions phrase-extract/RuleExtractionOptions.h
Expand Up @@ -18,8 +18,6 @@
***********************************************************************/

#pragma once
#ifndef RULEEXTRACTIONOPTIONS_H_INCLUDED_
#define RULEEXTRACTIONOPTIONS_H_INCLUDED_

namespace MosesTraining
{
Expand Down Expand Up @@ -95,4 +93,3 @@ struct RuleExtractionOptions {

}

#endif
4 changes: 2 additions & 2 deletions phrase-extract/SentenceAlignmentWithSyntax.cpp
Expand Up @@ -35,7 +35,7 @@ namespace MosesTraining

bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
{
if (!m_options.targetSyntax) {
if (!m_targetSyntax) {
return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
}

Expand All @@ -56,7 +56,7 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin

bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
{
if (!m_options.sourceSyntax) {
if (!m_sourceSyntax) {
return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
}

Expand Down
11 changes: 5 additions & 6 deletions phrase-extract/SentenceAlignmentWithSyntax.h
Expand Up @@ -18,8 +18,6 @@
***********************************************************************/

#pragma once
#ifndef SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_
#define SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_

#include <map>
#include <set>
Expand All @@ -42,18 +40,20 @@ class SentenceAlignmentWithSyntax : public SentenceAlignment
std::set<std::string> & m_sourceLabelCollection;
std::map<std::string, int> & m_targetTopLabelCollection;
std::map<std::string, int> & m_sourceTopLabelCollection;
const RuleExtractionOptions & m_options;
const bool m_targetSyntax, m_sourceSyntax;

SentenceAlignmentWithSyntax(std::set<std::string> & tgtLabelColl,
std::set<std::string> & srcLabelColl,
std::map<std::string,int> & tgtTopLabelColl,
std::map<std::string,int> & srcTopLabelColl,
const RuleExtractionOptions & options)
bool targetSyntax,
bool sourceSyntax)
: m_targetLabelCollection(tgtLabelColl)
, m_sourceLabelCollection(srcLabelColl)
, m_targetTopLabelCollection(tgtTopLabelColl)
, m_sourceTopLabelCollection(srcTopLabelColl)
, m_options(options) {
, m_targetSyntax(targetSyntax)
, m_sourceSyntax(sourceSyntax) {
}

virtual ~SentenceAlignmentWithSyntax() {}
Expand All @@ -67,4 +67,3 @@ class SentenceAlignmentWithSyntax : public SentenceAlignment

}

#endif
20 changes: 13 additions & 7 deletions phrase-extract/extract-main.cpp
Expand Up @@ -155,6 +155,8 @@ int main(int argc, char* argv[])
options.initOrientationFlag(true);
} else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
options.initFlexScoreFlag(true);
} else if (strcmp(argv[i],"--SingleWordHeuristic") == 0) {
options.initSingleWordHeuristicFlag(true);
} else if (strcmp(argv[i],"--NoTTable") == 0) {
options.initTranslationFlag(false);
} else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
Expand Down Expand Up @@ -413,18 +415,22 @@ void ExtractTask::extract(SentenceAlignment &sentence)
}

// cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
if (!out_of_bounds) {
if (!out_of_bounds ||
( m_options.isSingleWordHeuristicFlag() && (endE==startE) && (minF==maxF) )) // extraction of single word phrases even if inconsistent wrt. word alignment
{
// start point of source phrase may retreat over unaligned
for(int startF=minF;
(startF>=0 &&
(relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
(startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
((startF>=0 &&
(relaxLimit || startF>maxF-m_options.maxPhraseLength) && // within length limit
(startF==minF || sentence.alignedCountS[startF]==0)) && // unaligned
(!out_of_bounds || (startF==minF))); // if out of bounds, but single word heuristic: don't retreat over unaligned
startF--)
// end point of source phrase may advance over unaligned
for(int endF=maxF;
(endF<countF &&
(relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
(endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
((endF<countF &&
(relaxLimit || endF<startF+m_options.maxPhraseLength) && // within length limit
(endF==maxF || sentence.alignedCountS[endF]==0)) && // unaligned
(!out_of_bounds || (endF==maxF))); // if out of bounds, but single word heuristic: don't advance over unaligned
endF++) { // at this point we have extracted a phrase
if(buildExtraStructure) { // phrase || hier
if(endE-startE < m_options.maxPhraseLength && endF-startF < m_options.maxPhraseLength) { // within limit
Expand Down
3 changes: 2 additions & 1 deletion phrase-extract/extract-rules-main.cpp
Expand Up @@ -347,7 +347,8 @@ int main(int argc, char* argv[])

SentenceAlignmentWithSyntax sentence
(targetLabelCollection, sourceLabelCollection,
targetTopLabelCollection, sourceTopLabelCollection, options);
targetTopLabelCollection, sourceTopLabelCollection,
options.targetSyntax, options.sourceSyntax);
//az: output src, tgt, and alignment line
if (options.onlyOutputSpanInfo) {
cout << "LOG: SRC: " << sourceString << endl;
Expand Down

0 comments on commit 5de88ec

Please sign in to comment.