diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp
index 525808f989..1e8316cadb 100644
--- a/moses/FF/Factory.cpp
+++ b/moses/FF/Factory.cpp
@@ -21,6 +21,7 @@
 #include "moses/FF/SourceWordDeletionFeature.h"
 #include "moses/FF/GlobalLexicalModel.h"
 #include "moses/FF/GlobalLexicalModelUnlimited.h"
+#include "moses/FF/Model1Feature.h"
 #include "moses/FF/UnknownWordPenaltyProducer.h"
 #include "moses/FF/WordTranslationFeature.h"
 #include "moses/FF/TargetBigramFeature.h"
@@ -202,6 +203,7 @@ FeatureRegistry::FeatureRegistry()
 
   MOSES_FNAME(GlobalLexicalModel);
   //MOSES_FNAME(GlobalLexicalModelUnlimited); This was commented out in the original
+  MOSES_FNAME(Model1Feature);
   MOSES_FNAME(SourceWordDeletionFeature);
   MOSES_FNAME(TargetWordInsertionFeature);
   MOSES_FNAME(PhraseBoundaryFeature);
diff --git a/moses/FF/Model1Feature.cpp b/moses/FF/Model1Feature.cpp
new file mode 100644
--- /dev/null
+++ b/moses/FF/Model1Feature.cpp
@@ -0,0 +1,240 @@
+#include <assert.h> // NOTE(review): include target was unreadable in the patch text; <assert.h> presumed -- confirm
+#include <string>
+#include <vector>
+#include "util/exception.hh"
+#include "Model1Feature.h"
+#include "moses/StaticData.h"
+#include "moses/InputFileStream.h"
+#include "moses/ScoreComponentCollection.h"
+#include "moses/Hypothesis.h"
+#include "moses/ChartHypothesis.h"
+#include "moses/ChartManager.h"
+#include "moses/FactorCollection.h"
+
+
+using namespace std;
+
+namespace Moses
+{
+
+// Surface string of the factor that stands for the empty (NULL) word.
+const std::string Model1Vocabulary::GIZANULL = "GIZANULL";
+
+// Registers the GIZANULL factor under ID 0.
+Model1Vocabulary::Model1Vocabulary()
+{
+  FactorCollection &factorCollection = FactorCollection::Instance();
+  m_NULL = factorCollection.AddFactor(GIZANULL,false);
+  Store(m_NULL,0);
+}
+
+// Maps word to id. Returns false and changes nothing if word is already
+// known; otherwise grows the ID table as needed and returns true.
+// NOTE(review): an id already held by a different word is overwritten
+// without complaint -- confirm whether that should be rejected too.
+bool Model1Vocabulary::Store(const Factor* word, const unsigned id)
+{
+  boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
+  if ( iter != m_lookup.end() ) {
+    return false;
+  }
+  m_lookup[ word ] = id;
+  if ( m_vocab.size() <= id ) {
+    m_vocab.resize(id+1);
+  }
+  m_vocab[id] = word;
+  return true;
+}
+
+// Returns the ID of word, assigning the next free ID if it is not yet known.
+unsigned Model1Vocabulary::StoreIfNew(const Factor* word)
+{
+  boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
+
+  if ( iter != m_lookup.end() ) {
+    return iter->second;
+  }
+
+  unsigned id = m_vocab.size();
+  m_vocab.push_back( word );
+  m_lookup[ word ] = id;
+  return id;
+}
+
+// Returns the ID of word, or INVALID_ID if the word is unknown.
+unsigned Model1Vocabulary::GetWordID(const Factor* word) const
+{
+  boost::unordered_map<const Factor*, unsigned>::const_iterator iter = m_lookup.find( word );
+  if ( iter == m_lookup.end() ) {
+    return INVALID_ID;
+  }
+  return iter->second;
+}
+
+// Returns the word stored under id, or NULL if id is out of range.
+const Factor* Model1Vocabulary::GetWord(unsigned id) const
+{
+  if (id >= m_vocab.size()) {
+    return NULL;
+  }
+  return m_vocab[ id ];
+}
+
+// Reads a vocabulary file with three tokens per line: the first is the word
+// ID, the second the word; the third is not used here. Throws if a line does
+// not have exactly three tokens or if its word is already in the vocabulary.
+void Model1Vocabulary::Load(const std::string& fileName)
+{
+  InputFileStream inFile(fileName);
+  FactorCollection &factorCollection = FactorCollection::Instance();
+  std::string line;
+
+  unsigned i = 0;
+  while ( getline(inFile, line) )
+  {
+    ++i;
+    std::vector<std::string> tokens = Tokenize(line);
+    UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
+    unsigned id = Scan<unsigned>(tokens[0]);
+    const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is known and filter the model on loading?
+    bool stored = Store(factor, id);
+    UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
+  }
+  inFile.Close();
+}
+
+
+// Reads the lexical table: three tokens per line (source ID, target ID,
+// probability). Throws if a line does not have exactly three tokens or if an
+// ID is missing from the corresponding vocabulary.
+void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT)
+{
+  InputFileStream inFile(fileName);
+  std::string line;
+
+  unsigned i = 0;
+  while ( getline(inFile, line) )
+  {
+    ++i;
+    std::vector<std::string> tokens = Tokenize(line);
+    UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
+    unsigned idS = Scan<unsigned>(tokens[0]);
+    unsigned idT = Scan<unsigned>(tokens[1]);
+    const Factor* wordS = vcbS.GetWord(idS);
+    const Factor* wordT = vcbT.GetWord(idT);
+    float prob = Scan<float>(tokens[2]);
+    if ( (wordS != NULL) && (wordT != NULL) ) {
+      m_ltable[ wordS ][ wordT ] = prob;
+    }
+    UTIL_THROW_IF2((wordS == NULL) || (wordT == NULL), "Line " << i << " in " << fileName << " has unknown vocabulary."); // TODO: can we assume that the vocabulary is known and filter the model on loading? Then remove this line.
+  }
+  inFile.Close();
+}
+
+// p( wordT | wordS )
+// Returns the stored probability, raised to m_floor if it is smaller; a pair
+// that is not in the table gets m_floor.
+float Model1LexicalTable::GetProbability(const Factor* wordS, const Factor* wordT) const
+{
+  float prob = m_floor;
+
+  boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > >::const_iterator iter1 = m_ltable.find( wordS );
+
+  if ( iter1 != m_ltable.end() ) {
+    boost::unordered_map< const Factor*, float >::const_iterator iter2 = iter1->second.find( wordT );
+    if ( iter2 != iter1->second.end() ) {
+      prob = iter2->second;
+      if ( prob < m_floor ) {
+        prob = m_floor;
+      }
+    }
+  }
+  return prob;
+}
+
+
+// Constructs the feature from its configuration line and reads its parameters.
+Model1Feature::Model1Feature(const std::string &line)
+  : StatelessFeatureFunction(1, line)
+  , m_emptyWord(NULL) // assigned in Load(); never left indeterminate
+{
+  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
+  ReadParameters();
+  VERBOSE(1, " Done.");
+}
+
+// Accepts "path", "sourceVocabulary" and "targetVocabulary"; every other key
+// is passed on to StatelessFeatureFunction.
+void Model1Feature::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key == "path") {
+    m_fileNameModel1 = value;
+  } else if (key == "sourceVocabulary") {
+    m_fileNameVcbS = value;
+  } else if (key == "targetVocabulary") {
+    m_fileNameVcbT = value;
+  } else {
+    StatelessFeatureFunction::SetParameter(key, value);
+  }
+}
+
+// Loads both vocabularies and the lexical table, then looks up the factor of
+// the empty word. The vocabularies are local and discarded after loading.
+void Model1Feature::Load()
+{
+  FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading source vocabulary from file " << m_fileNameVcbS << " ...");
+  Model1Vocabulary vcbS;
+  vcbS.Load(m_fileNameVcbS);
+  FEATUREVERBOSE2(2, " Done." << std::endl);
+  FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading target vocabulary from file " << m_fileNameVcbT << " ...");
+  Model1Vocabulary vcbT;
+  vcbT.Load(m_fileNameVcbT);
+  FEATUREVERBOSE2(2, " Done." << std::endl);
+  FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading model 1 lexical translation table from file " << m_fileNameModel1 << " ...");
+  m_model1.Load(m_fileNameModel1,vcbS,vcbT);
+  FEATUREVERBOSE2(2, " Done." << std::endl);
+  FactorCollection &factorCollection = FactorCollection::Instance();
+  m_emptyWord = factorCollection.GetFactor(Model1Vocabulary::GIZANULL,false);
+  UTIL_THROW_IF2(m_emptyWord==NULL, GetScoreProducerDescription()
+                 << ": Factor for GIZA empty word does not exist.");
+}
+
+// Adds to scoreBreakdown, for each target word that is not a non-terminal,
+// TransformScore of its summed probability given the empty word and the inner
+// source words, minus norm.
+// NOTE(review): both loop headers and the statements between them were
+// unreadable in the patch text and are reconstructed -- confirm against the
+// commit. NOTE(review): input is cast to Sentence unchecked -- confirm.
+void Model1Feature::EvaluateWithSourceContext(const InputType &input
+    , const InputPath &inputPath
+    , const TargetPhrase &targetPhrase
+    , const StackVec *stackVec
+    , ScoreComponentCollection &scoreBreakdown
+    , ScoreComponentCollection *estimatedFutureScore) const
+{
+  const Sentence& sentence = static_cast<const Sentence&>(input);
+  float score = 0.0;
+  float norm = TransformScore(1+sentence.GetSize());
+
+  for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT)
+  {
+    const Word &wordT = targetPhrase.GetWord(posT);
+    if ( !wordT.IsNonTerminal() )
+    {
+      float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]); // probability conditioned on empty word
+      for (size_t posS=1; posS+1<sentence.GetSize(); ++posS) // ignore <s> and </s>; posS+1 avoids size_t underflow on an empty sentence
+      {
+        const Word &wordS = sentence.GetWord(posS);
+        float modelProb = m_model1.GetProbability(wordS[0],wordT[0]);
+        FEATUREVERBOSE(3, "p( " << wordT << " | " << wordS << " ) = " << modelProb << std::endl);
+        thisWordProb += modelProb;
+      }
+      score += TransformScore(thisWordProb) - norm;
+    }
+  }
+
+  scoreBreakdown.PlusEquals(this, score);
+}
+
+}
+
diff --git a/moses/FF/Model1Feature.h b/moses/FF/Model1Feature.h
new file mode 100644
--- /dev/null
+++ b/moses/FF/Model1Feature.h
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <string>
+#include <limits>
+#include <vector>
+#include <boost/unordered_map.hpp>
+#include "StatelessFeatureFunction.h"
+#include "FFState.h"
+#include "moses/Factor.h"
+
+namespace Moses
+{
+
+// Two-way mapping between word factors and numeric word IDs.
+class Model1Vocabulary
+{
+public:
+
+  // NOTE(review): a macro is not scoped to the class; it reaches every file that includes this header.
+  #define INVALID_ID std::numeric_limits<unsigned>::max() // UINT_MAX
+  static const std::string GIZANULL;
+
+  Model1Vocabulary();
+  bool Store(const Factor* word, const unsigned id);
+  unsigned StoreIfNew(const Factor* word);
+  unsigned GetWordID(const Factor* word) const;
+  const Factor* GetWord(unsigned id) const;
+  void Load(const std::string& fileName);
+
+protected:
+  boost::unordered_map<const Factor*, unsigned> m_lookup; // word -> ID
+  std::vector< const Factor* > m_vocab; // ID -> word
+  const Factor* m_NULL; // factor of GIZANULL
+};
+
+
+// Table of word translation probabilities p( wordT | wordS ).
+class Model1LexicalTable
+{
+public:
+  // floor is the smallest probability GetProbability() will return.
+  Model1LexicalTable(float floor=1e-7) : m_floor(floor)
+  {}
+
+  void Load(const std::string& fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT);
+
+  // p( wordT | wordS )
+  float GetProbability(const Factor* wordS, const Factor* wordT) const;
+
+protected:
+  boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > > m_ltable;
+  const float m_floor;
+};
+
+
+
+// Stateless feature that scores target phrases with a Model1LexicalTable.
+class Model1Feature : public StatelessFeatureFunction
+{
+public:
+  Model1Feature(const std::string &line);
+
+  bool IsUseable(const FactorMask &mask) const {
+    return true;
+  }
+
+  void SetParameter(const std::string& key, const std::string& value);
+
+  void EvaluateInIsolation(const Phrase &source
+                           , const TargetPhrase &targetPhrase
+                           , ScoreComponentCollection &scoreBreakdown
+                           , ScoreComponentCollection &estimatedFutureScore) const
+  {}
+
+  void EvaluateWithSourceContext(const InputType &input
+                                 , const InputPath &inputPath
+                                 , const TargetPhrase &targetPhrase
+                                 , const StackVec *stackVec
+                                 , ScoreComponentCollection &scoreBreakdown
+                                 , ScoreComponentCollection *estimatedFutureScore = NULL) const;
+
+  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
+      , const TranslationOptionList &translationOptionList) const
+  {}
+
+  void EvaluateWhenApplied(
+    const Hypothesis& cur_hypo,
+    ScoreComponentCollection* accumulator) const
+  {}
+
+  void EvaluateWhenApplied(
+    const ChartHypothesis& cur_hypo,
+    ScoreComponentCollection* accumulator) const
+  {}
+
+private:
+  std::string m_fileNameVcbS;
+  std::string m_fileNameVcbT;
+  std::string m_fileNameModel1;
+  Model1LexicalTable m_model1;
+  const Factor* m_emptyWord;
+
+  void Load();
+};
+
+
+}
+