Skip to content

Commit

Permalink
Model1Feature: a simple IBM Model 1 scorer,
Browse files Browse the repository at this point in the history
source-to-target with global source-sentence context
  • Loading branch information
Matthias Huck committed Feb 25, 2015
1 parent 28fbf07 commit 3c8d48f
Show file tree
Hide file tree
Showing 3 changed files with 313 additions and 0 deletions.
2 changes: 2 additions & 0 deletions moses/FF/Factory.cpp
Expand Up @@ -21,6 +21,7 @@
#include "moses/FF/SourceWordDeletionFeature.h"
#include "moses/FF/GlobalLexicalModel.h"
#include "moses/FF/GlobalLexicalModelUnlimited.h"
#include "moses/FF/Model1Feature.h"
#include "moses/FF/UnknownWordPenaltyProducer.h"
#include "moses/FF/WordTranslationFeature.h"
#include "moses/FF/TargetBigramFeature.h"
Expand Down Expand Up @@ -202,6 +203,7 @@ FeatureRegistry::FeatureRegistry()

MOSES_FNAME(GlobalLexicalModel);
//MOSES_FNAME(GlobalLexicalModelUnlimited); This was commented out in the original
MOSES_FNAME(Model1Feature);
MOSES_FNAME(SourceWordDeletionFeature);
MOSES_FNAME(TargetWordInsertionFeature);
MOSES_FNAME(PhraseBoundaryFeature);
Expand Down
209 changes: 209 additions & 0 deletions moses/FF/Model1Feature.cpp
@@ -0,0 +1,209 @@
#include <assert.h>
#include "util/exception.hh"
#include "Model1Feature.h"
#include "moses/StaticData.h"
#include "moses/InputFileStream.h"
#include "moses/ScoreComponentCollection.h"
#include "moses/Hypothesis.h"
#include "moses/ChartHypothesis.h"
#include "moses/ChartManager.h"
#include "moses/FactorCollection.h"


using namespace std;

namespace Moses
{

// Surface string under which the GIZA empty word is registered as a factor.
const std::string Model1Vocabulary::GIZANULL = "GIZANULL";

/**
 * Builds a vocabulary whose only entry is the GIZA empty word.
 *
 * The empty word is added to the global FactorCollection (as a terminal,
 * i.e. with the second AddFactor argument set to false) and stored under
 * id 0.
 */
Model1Vocabulary::Model1Vocabulary()
  : m_NULL(FactorCollection::Instance().AddFactor(GIZANULL,false))
{
  Store(m_NULL,0);
}

bool Model1Vocabulary::Store(const Factor* word, const unsigned id)
{
boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );
if ( iter != m_lookup.end() ) {
return false;
}
m_lookup[ word ] = id;
if ( m_vocab.size() <= id ) {
m_vocab.resize(id+1);
}
m_vocab[id] = word;
return true;
}

unsigned Model1Vocabulary::StoreIfNew(const Factor* word)
{
boost::unordered_map<const Factor*, unsigned>::iterator iter = m_lookup.find( word );

if ( iter != m_lookup.end() ) {
return iter->second;
}

unsigned id = m_vocab.size();
m_vocab.push_back( word );
m_lookup[ word ] = id;
return id;
}

unsigned Model1Vocabulary::GetWordID(const Factor* word) const
{
boost::unordered_map<const Factor*, unsigned>::const_iterator iter = m_lookup.find( word );
if ( iter == m_lookup.end() ) {
return INVALID_ID;
}
return iter->second;
}

/**
 * Looks up the word stored under `id`.
 *
 * @param id identifier to resolve
 * @return the factor stored at `id`; NULL if `id` lies beyond the table.
 *         Slots inside the table that were never assigned also hold a
 *         null pointer (they are created by vector::resize in Store).
 */
const Factor* Model1Vocabulary::GetWord(unsigned id) const
{
  if ( id < m_vocab.size() ) {
    return m_vocab[ id ];
  }
  return NULL;
}

void Model1Vocabulary::Load(const std::string& fileName)
{
InputFileStream inFile(fileName);
FactorCollection &factorCollection = FactorCollection::Instance();
std::string line;

unsigned i = 0;
while ( getline(inFile, line) )
{
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned id = Scan<unsigned>(tokens[0]);
const Factor* factor = factorCollection.AddFactor(tokens[1],false); // TODO: can we assume that the vocabulary is know and filter the model on loading?
bool stored = Store(factor, id);
UTIL_THROW_IF2(!stored, "Line " << i << " in " << fileName << " overwrites existing vocabulary entry.");
}
inFile.Close();
}


void Model1LexicalTable::Load(const std::string &fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT)
{
InputFileStream inFile(fileName);
std::string line;

unsigned i = 0;
while ( getline(inFile, line) )
{
++i;
std::vector<std::string> tokens = Tokenize(line);
UTIL_THROW_IF2(tokens.size()!=3, "Line " << i << " in " << fileName << " has wrong number of tokens.");
unsigned idS = Scan<unsigned>(tokens[0]);
unsigned idT = Scan<unsigned>(tokens[1]);
const Factor* wordS = vcbS.GetWord(idS);
const Factor* wordT = vcbT.GetWord(idT);
float prob = Scan<float>(tokens[2]);
if ( (wordS != NULL) && (wordT != NULL) ) {
m_ltable[ wordS ][ wordT ] = prob;
}
UTIL_THROW_IF2((wordS == NULL) || (wordT == NULL), "Line " << i << " in " << fileName << " has unknown vocabulary."); // TODO: can we assume that the vocabulary is know and filter the model on loading? Then remove this line.
}
inFile.Close();
}

// p( wordT | wordS )
float Model1LexicalTable::GetProbability(const Factor* wordS, const Factor* wordT) const
{
float prob = m_floor;

boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > >::const_iterator iter1 = m_ltable.find( wordS );

if ( iter1 != m_ltable.end() ) {
boost::unordered_map< const Factor*, float >::const_iterator iter2 = iter1->second.find( wordT );
if ( iter2 != iter1->second.end() ) {
prob = iter2->second;
if ( prob < m_floor ) {
prob = m_floor;
}
}
}
return prob;
}


/**
 * Constructs the feature from its configuration line.
 *
 * The base class is given 1 and the configuration line (the 1 is presumably
 * the number of score components -- confirm against
 * StatelessFeatureFunction). ReadParameters() is then called; it is
 * presumably what dispatches the key/value pairs to SetParameter().
 *
 * m_emptyWord is set to NULL here so that it never holds an indeterminate
 * value; the actual factor is resolved in Load().
 *
 * @param line feature configuration line
 */
Model1Feature::Model1Feature(const std::string &line)
  : StatelessFeatureFunction(1, line)
  , m_emptyWord(NULL)
{
  VERBOSE(1, "Initializing feature " << GetScoreProducerDescription() << " ...");
  ReadParameters();
  VERBOSE(1, " Done.");
}

/**
 * Handles one key/value pair of the feature configuration.
 *
 * Recognized keys:
 *   - "path":             file name of the model 1 lexical translation table
 *   - "sourceVocabulary": file name of the source vocabulary
 *   - "targetVocabulary": file name of the target vocabulary
 * Any other key is forwarded to StatelessFeatureFunction::SetParameter.
 *
 * @param key   parameter name
 * @param value parameter value
 */
void Model1Feature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "path") {
    m_fileNameModel1 = value;
    return;
  }
  if (key == "sourceVocabulary") {
    m_fileNameVcbS = value;
    return;
  }
  if (key == "targetVocabulary") {
    m_fileNameVcbT = value;
    return;
  }
  StatelessFeatureFunction::SetParameter(key, value);
}

void Model1Feature::Load()
{
FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading source vocabulary from file " << m_fileNameVcbS << " ...");
Model1Vocabulary vcbS;
vcbS.Load(m_fileNameVcbS);
FEATUREVERBOSE2(2, " Done." << std::endl);
FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading target vocabulary from file " << m_fileNameVcbT << " ...");
Model1Vocabulary vcbT;
vcbT.Load(m_fileNameVcbT);
FEATUREVERBOSE2(2, " Done." << std::endl);
FEATUREVERBOSE(2, GetScoreProducerDescription() << ": Loading model 1 lexical translation table from file " << m_fileNameModel1 << " ...");
m_model1.Load(m_fileNameModel1,vcbS,vcbT);
FEATUREVERBOSE2(2, " Done." << std::endl);
FactorCollection &factorCollection = FactorCollection::Instance();
m_emptyWord = factorCollection.GetFactor(Model1Vocabulary::GIZANULL,false);
UTIL_THROW_IF2(m_emptyWord==NULL, GetScoreProducerDescription()
<< ": Factor for GIZA empty word does not exist.");
}

/**
 * Scores a target phrase with IBM Model 1, source-to-target, against the
 * whole input sentence.
 *
 * For every terminal target word t the model probabilities p(t | s) are
 * summed over the empty word and over the input positions 1 .. size-2 (the
 * first and last position, <s> and </s>, are skipped). The contribution of t
 * is TransformScore(sum) - TransformScore(1 + sentence size). Non-terminals
 * contribute nothing. The total is added to `scoreBreakdown`.
 *
 * NOTE(review): the normalization uses the full sentence size, including the
 * two skipped boundary positions -- confirm that this is intended.
 *
 * @param input                input sentence; cast to Sentence without a
 *                             check, so it must really be a Sentence
 * @param inputPath            unused
 * @param targetPhrase         target phrase to score
 * @param stackVec             unused
 * @param scoreBreakdown       receives the score of this feature
 * @param estimatedFutureScore unused
 */
void Model1Feature::EvaluateWithSourceContext(const InputType &input
    , const InputPath &inputPath
    , const TargetPhrase &targetPhrase
    , const StackVec *stackVec
    , ScoreComponentCollection &scoreBreakdown
    , ScoreComponentCollection *estimatedFutureScore) const
{
  const Sentence& sentence = static_cast<const Sentence&>(input);
  const size_t sentenceSize = sentence.GetSize();
  const size_t targetSize = targetPhrase.GetSize();
  const float norm = TransformScore(1+sentenceSize);
  float score = 0.0;

  for (size_t posT=0; posT<targetSize; ++posT) {
    const Word &wordT = targetPhrase.GetWord(posT);
    if ( wordT.IsNonTerminal() ) {
      continue;
    }

    float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]); // probability conditioned on empty word

    // Ignore <s> and </s>. The bound is written as posS+1 < size rather than
    // posS < size-1 because the size is unsigned: size-1 would wrap around
    // for an empty input and the loop would read far out of range.
    for (size_t posS=1; posS+1<sentenceSize; ++posS) {
      const Word &wordS = sentence.GetWord(posS);
      const float modelProb = m_model1.GetProbability(wordS[0],wordT[0]);
      FEATUREVERBOSE(3, "p( " << wordT << " | " << wordS << " ) = " << modelProb << std::endl);
      thisWordProb += modelProb;
    }

    score += TransformScore(thisWordProb) - norm;
  }

  scoreBreakdown.PlusEquals(this, score);
}

}

102 changes: 102 additions & 0 deletions moses/FF/Model1Feature.h
@@ -0,0 +1,102 @@
#pragma once

#include <string>
#include <limits>
#include <boost/unordered_map.hpp>
#include "StatelessFeatureFunction.h"
#include "FFState.h"
#include "moses/Factor.h"

namespace Moses
{

// Two-way mapping between word factors and the numeric word ids used in the
// model 1 vocabulary and lexical table files.
class Model1Vocabulary
{
public:

// Value returned by GetWordID() for a word that is not in the vocabulary.
// NOTE(review): this is a preprocessor macro, so despite being written inside
// the class it is not scoped to it and is visible to every file that includes
// this header.
#define INVALID_ID std::numeric_limits<unsigned>::max() // UINT_MAX
// Surface string of the GIZA empty word.
static const std::string GIZANULL;

// Creates a vocabulary that contains only the GIZA empty word, stored under id 0.
Model1Vocabulary();
// Stores `word` under `id`; returns false without storing if `word` is already present.
bool Store(const Factor* word, const unsigned id);
// Returns the id of `word`, appending the word with the next free id if it is unknown.
unsigned StoreIfNew(const Factor* word);
// Returns the id of `word`, or INVALID_ID if the word is unknown.
unsigned GetWordID(const Factor* word) const;
// Returns the word stored under `id`, or NULL if `id` is beyond the table.
const Factor* GetWord(unsigned id) const;
// Reads a vocabulary file; each line must have three tokens, of which the
// first is the id and the second the word. Throws on malformed lines.
void Load(const std::string& fileName);

protected:
// word -> id
boost::unordered_map<const Factor*, unsigned> m_lookup;
// id -> word, indexed by id
std::vector< const Factor* > m_vocab;
// factor of the GIZA empty word
const Factor* m_NULL;
};


// Lexical translation table p( target word | source word ) of an IBM model 1,
// keyed by the factors of the source and target words.
class Model1LexicalTable
{
public:
  // Creates an empty table.
  // `floor` is the smallest probability GetProbability() ever returns; it is
  // also the value returned for word pairs that are not in the table.
  // The constructor is explicit so that a plain float is never silently
  // converted into a lexical table.
  explicit Model1LexicalTable(float floor=1e-7) : m_floor(floor)
  {}

  // Reads the table from `fileName`. Each line must have three tokens:
  // source word id, target word id, probability. The ids are resolved to
  // factors through `vcbS` and `vcbT`. Throws on malformed lines and on ids
  // that are missing from the vocabularies.
  void Load(const std::string& fileName, const Model1Vocabulary& vcbS, const Model1Vocabulary& vcbT);

  // p( wordT | wordS ), never smaller than the floor value.
  float GetProbability(const Factor* wordS, const Factor* wordT) const;

protected:
  // source word -> ( target word -> probability )
  boost::unordered_map< const Factor*, boost::unordered_map< const Factor*, float > > m_ltable;
  // lower bound applied to every returned probability
  const float m_floor;
};



// Stateless feature function that scores target phrases with a simple IBM
// model 1, source-to-target, using the whole source sentence as context.
// All scoring happens in EvaluateWithSourceContext(); the other evaluation
// hooks are intentionally empty.
class Model1Feature : public StatelessFeatureFunction
{
public:
  // Constructs the feature from its configuration line.
  Model1Feature(const std::string &line);

  // This feature declares itself usable for every factor mask.
  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

  // Handles the keys "path", "sourceVocabulary" and "targetVocabulary";
  // everything else is passed on to the base class.
  void SetParameter(const std::string& key, const std::string& value);

  // No-op: the score needs the full source sentence.
  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedFutureScore) const
  {}

  // Computes the model 1 score of `targetPhrase` given the input sentence
  // and adds it to `scoreBreakdown`.
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedFutureScore = NULL) const;

  // No-op.
  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const
  {}

  // No-op (phrase-based hypotheses).
  void EvaluateWhenApplied(
    const Hypothesis& cur_hypo,
    ScoreComponentCollection* accumulator) const
  {}

  // No-op (chart hypotheses).
  void EvaluateWhenApplied(
    const ChartHypothesis& cur_hypo,
    ScoreComponentCollection* accumulator) const
  {}

private:
  // Loads the vocabularies and the lexical table and resolves m_emptyWord.
  void Load();

  std::string m_fileNameVcbS;   // source vocabulary file ("sourceVocabulary")
  std::string m_fileNameVcbT;   // target vocabulary file ("targetVocabulary")
  std::string m_fileNameModel1; // lexical translation table file ("path")
  Model1LexicalTable m_model1;  // p( target word | source word )
  const Factor* m_emptyWord;    // factor of the GIZA empty word, set in Load()
};


}

0 comments on commit 3c8d48f

Please sign in to comment.