Skip to content

Commit

Permalink
Merge branch 'master' of github.com:moses-smt/mosesdecoder
Browse files Browse the repository at this point in the history
  • Loading branch information
hieuhoang committed Nov 21, 2014
2 parents c0be182 + c46fb10 commit 573f030
Show file tree
Hide file tree
Showing 11 changed files with 422 additions and 58 deletions.
23 changes: 20 additions & 3 deletions phrase-extract/extract-ghkm/ExtractGHKM.cpp
Expand Up @@ -31,6 +31,8 @@
#include "ScfgRule.h"
#include "ScfgRuleWriter.h"
#include "Span.h"
#include "StsgRule.h"
#include "StsgRuleWriter.h"
#include "SyntaxTree.h"
#include "tables-core.h"
#include "XmlException.h"
Expand Down Expand Up @@ -133,7 +135,8 @@ int ExtractGHKM::Main(int argc, char *argv[])
Alignment alignment;
XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
// XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options);
ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
size_t lineNum = options.sentenceOffset;
while (true) {
std::getline(targetStream, targetLine);
Expand Down Expand Up @@ -273,6 +276,15 @@ int ExtractGHKM::Main(int argc, char *argv[])

for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
q != rules.end(); ++q) {
// STSG output.
if (options.stsg) {
StsgRule rule(**q);
if (rule.Scope() <= options.maxScope) {
stsgWriter.Write(rule);
}
continue;
}
// SCFG output.
ScfgRule *r = 0;
if (options.sourceLabels) {
r = new ScfgRule(**q, &sourceSyntaxTree);
Expand All @@ -282,9 +294,9 @@ int ExtractGHKM::Main(int argc, char *argv[])
// TODO Can scope pruning be done earlier?
if (r->Scope() <= options.maxScope) {
if (!options.treeFragments) {
writer.Write(*r,false);
scfgWriter.Write(*r,false);
} else {
writer.Write(*r,**q,false);
scfgWriter.Write(*r,**q,false);
}
if (options.phraseOrientation) {
fwdExtractStream << " {{Orientation ";
Expand Down Expand Up @@ -449,6 +461,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
"include score based on PCFG scores in target corpus")
("PhraseOrientation",
"output phrase orientation information")
("STSG",
"output STSG rules (default is SCFG)")
("T2S",
"enable tree-to-string rule extraction (string-to-tree is assumed by default)")
("TreeFragments",
Expand Down Expand Up @@ -558,6 +572,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("PhraseOrientation")) {
options.phraseOrientation = true;
}
if (vm.count("STSG")) {
options.stsg = true;
}
if (vm.count("T2S")) {
options.t2s = true;
}
Expand Down
2 changes: 2 additions & 0 deletions phrase-extract/extract-ghkm/Options.h
Expand Up @@ -43,6 +43,7 @@ struct Options {
, phraseOrientation(false)
, sentenceOffset(0)
, sourceLabels(false)
, stsg(false)
, t2s(false)
, treeFragments(false)
, unknownWordMinRelFreq(0.03f)
Expand Down Expand Up @@ -71,6 +72,7 @@ struct Options {
bool sourceLabels;
std::string sourceLabelSetFile;
std::string sourceUnknownWordFile;
bool stsg;
bool t2s;
std::string targetUnknownWordFile;
bool treeFragments;
Expand Down
41 changes: 41 additions & 0 deletions phrase-extract/extract-ghkm/Rule.cpp
@@ -0,0 +1,41 @@
#include "Rule.h"

#include "Node.h"
#include "Subgraph.h"

namespace Moses
{
namespace GHKM
{

int Rule::Scope(const std::vector<Symbol> &symbols)
{
int scope = 0;
bool predIsNonTerm = false;
if (symbols[0].GetType() == NonTerminal) {
++scope;
predIsNonTerm = true;
}
for (std::size_t i = 1; i < symbols.size(); ++i) {
bool isNonTerm = symbols[i].GetType() == NonTerminal;
if (isNonTerm && predIsNonTerm) {
++scope;
}
predIsNonTerm = isNonTerm;
}
if (predIsNonTerm) {
++scope;
}
return scope;
}

// Comparison function for ordering nodes by their spans: returns true if the
// first element of a's span is less than the first element of b's span.
// Both spans are required to be non-empty (checked by assertion).
bool Rule::PartitionOrderComp(const Node *a, const Node *b)
{
  const Span &aSpan = a->GetSpan();
  const Span &bSpan = b->GetSpan();
  // Dereferencing begin() below is only valid for non-empty spans.
  assert(!aSpan.empty() && !bSpan.empty());
  const bool aComesFirst = *aSpan.begin() < *bSpan.begin();
  return aComesFirst;
}

} // namespace GHKM
} // namespace Moses
58 changes: 58 additions & 0 deletions phrase-extract/extract-ghkm/Rule.h
@@ -0,0 +1,58 @@
#pragma once
#ifndef EXTRACT_GHKM_RULE_H_
#define EXTRACT_GHKM_RULE_H_

#include <string>
#include <vector>

#include "Alignment.h"

namespace Moses
{
namespace GHKM
{

class Node;

// The two kinds of symbol that can appear in a rule.
enum SymbolType { Terminal, NonTerminal };

// A rule symbol: a string value tagged with a SymbolType.  The class only
// exposes read accessors for the two fields.
class Symbol
{
public:
  // Stores a copy of v together with the type t.
  Symbol(const std::string &v, SymbolType t)
    : m_value(v)
    , m_type(t)
  {}

  // Returns a reference to the stored value string.
  const std::string &GetValue() const { return m_value; }

  // Returns the stored symbol type.
  SymbolType GetType() const { return m_type; }

private:
  std::string m_value;
  SymbolType m_type;
};

// Abstract base class shared by ScfgRule and StsgRule.  It owns the rule's
// alignment and declares helper functions for use by derived classes.
class Rule
{
public:
  virtual ~Rule() {}

  // Returns a reference to the rule's alignment.
  const Alignment &GetAlignment() const { return m_alignment; }

  // Returns the scope of the rule.  Each derived class supplies the
  // implementation.
  virtual int Scope() const = 0;

protected:
  // Computes the scope of the given symbol sequence.
  static int Scope(const std::vector<Symbol> &);

  // Comparison function that orders nodes by the first element of their
  // spans.
  static bool PartitionOrderComp(const Node *, const Node *);

  // Alignment data returned by GetAlignment().
  Alignment m_alignment;
};

} // namespace GHKM
} // namespace Moses

#endif
29 changes: 0 additions & 29 deletions phrase-extract/extract-ghkm/ScfgRule.cpp
Expand Up @@ -194,34 +194,5 @@ void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map<std::
}
}

// Computes the scope of this rule from its source-side right-hand side
// (m_sourceRHS).
//
// The result is the sum of:
//   - 1 if the first symbol is a non-terminal,
//   - 1 for every non-terminal that directly follows another non-terminal,
//   - 1 if the last symbol is a non-terminal.
// An empty right-hand side has scope 0.
int ScfgRule::Scope() const
{
  // Reading m_sourceRHS[0] on an empty vector is undefined behaviour, so
  // handle the empty case explicitly.
  if (m_sourceRHS.empty()) {
    return 0;
  }

  int scope = 0;

  // Tracks whether the previously visited symbol was a non-terminal.
  bool predIsNonTerm = false;

  // A non-terminal at the left boundary contributes to the scope.
  if (m_sourceRHS[0].GetType() == NonTerminal) {
    ++scope;
    predIsNonTerm = true;
  }

  // Each pair of adjacent non-terminals contributes to the scope.
  for (size_t i = 1; i < m_sourceRHS.size(); ++i) {
    bool isNonTerm = m_sourceRHS[i].GetType() == NonTerminal;
    if (isNonTerm && predIsNonTerm) {
      ++scope;
    }
    predIsNonTerm = isNonTerm;
  }

  // A non-terminal at the right boundary contributes to the scope.
  if (predIsNonTerm) {
    ++scope;
  }
  return scope;
}

// Comparison function for ordering nodes by their spans: returns true if the
// first element of a's span is less than the first element of b's span.
// Both spans are required to be non-empty (checked by assertion).
bool ScfgRule::PartitionOrderComp(const Node *a, const Node *b)
{
  const Span &aSpan = a->GetSpan();
  const Span &bSpan = b->GetSpan();
  // Dereferencing begin() below is only valid for non-empty spans.
  assert(!aSpan.empty() && !bSpan.empty());
  const bool aComesFirst = *aSpan.begin() < *bSpan.begin();
  return aComesFirst;
}

} // namespace GHKM
} // namespace Moses
31 changes: 5 additions & 26 deletions phrase-extract/extract-ghkm/ScfgRule.h
Expand Up @@ -22,6 +22,7 @@
#define EXTRACT_GHKM_SCFG_RULE_H_

#include "Alignment.h"
#include "Rule.h"
#include "SyntaxTree.h"

#include <string>
Expand All @@ -38,25 +39,7 @@ namespace GHKM
class Node;
class Subgraph;

// The two kinds of symbol that can appear in a rule.
enum SymbolType { Terminal, NonTerminal };

// A rule symbol: a string value tagged with a SymbolType.  The class only
// exposes read accessors for the two fields.
class Symbol
{
public:
  // Stores a copy of v together with the type t.
  Symbol(const std::string &v, SymbolType t)
    : m_value(v)
    , m_type(t)
  {}

  // Returns a reference to the stored value string.
  const std::string &GetValue() const { return m_value; }

  // Returns the stored symbol type.
  SymbolType GetType() const { return m_type; }

private:
  std::string m_value;
  SymbolType m_type;
};

class ScfgRule
class ScfgRule : public Rule
{
public:
ScfgRule(const Subgraph &fragment,
Expand All @@ -74,9 +57,6 @@ class ScfgRule
const std::vector<Symbol> &GetTargetRHS() const {
return m_targetRHS;
}
const Alignment &GetAlignment() const {
return m_alignment;
}
float GetPcfgScore() const {
return m_pcfgScore;
}
Expand All @@ -92,11 +72,11 @@ class ScfgRule
void UpdateSourceLabelCoocCounts(std::map< std::string, std::map<std::string,float>* > &coocCounts,
float count) const;

int Scope() const;
int Scope() const {
return Rule::Scope(m_sourceRHS);
}

private:
static bool PartitionOrderComp(const Node *, const Node *);

void PushSourceLabel(const MosesTraining::SyntaxTree *sourceSyntaxTree,
const Node *node,
const std::string &nonMatchingLabel);
Expand All @@ -105,7 +85,6 @@ class ScfgRule
Symbol m_targetLHS;
std::vector<Symbol> m_sourceRHS;
std::vector<Symbol> m_targetRHS;
Alignment m_alignment;
float m_pcfgScore;
bool m_hasSourceLabels;
std::vector<std::string> m_sourceLabels;
Expand Down
95 changes: 95 additions & 0 deletions phrase-extract/extract-ghkm/StsgRule.cpp
@@ -0,0 +1,95 @@
#include "StsgRule.h"

#include "Node.h"
#include "Subgraph.h"
#include "SyntaxTree.h"

#include <algorithm>

namespace Moses
{
namespace GHKM
{

// Constructs an STSG rule from a subgraph fragment.
//
// The target side is built by the m_targetSide initializer.
// NOTE(review): the meaning of the 'true' argument is defined by the
// constructor of m_targetSide's type, which is not visible here.
//
// The body then fills in:
//   - m_sourceSide: one symbol per fragment leaf ("sink") with a non-empty
//     span, in the order given by PartitionOrderComp.  TREE sinks become the
//     non-terminal "X"; SOURCE sinks become terminals carrying the node's
//     label.
//   - m_alignment: (source index, target leaf index) pairs, where the target
//     leaf index is the position within the vector produced by
//     m_targetSide.GetTargetLeaves().
//   - m_nonTermAlignment: indexed by the rank of a non-terminal on the
//     source side; the value is the rank of the same node among the target
//     leaves that are TREE nodes with non-empty spans.
StsgRule::StsgRule(const Subgraph &fragment)
  : m_targetSide(fragment, true)
{
  // Source side

  const std::set<const Node *> &sinkNodes = fragment.GetLeaves();

  // Collect the subset of sink nodes that excludes nodes with empty spans.
  std::vector<const Node *> productiveSinks;
  productiveSinks.reserve(sinkNodes.size());
  for (std::set<const Node *>::const_iterator p = sinkNodes.begin();
       p != sinkNodes.end(); ++p) {
    const Node *sink = *p;
    if (!sink->GetSpan().empty()) {
      productiveSinks.push_back(sink);
    }
  }

  // Sort them into the order defined by their spans.
  std::sort(productiveSinks.begin(), productiveSinks.end(), PartitionOrderComp);

  // Build a map from target nodes to source-order indices, so that we
  // can construct the Alignment object later.
  std::map<const Node *, std::vector<int> > sinkToSourceIndices;
  // Maps each TREE sink to its rank among the source-side non-terminals.
  std::map<const Node *, int> nonTermSinkToSourceIndex;

  m_sourceSide.reserve(productiveSinks.size());
  int srcIndex = 0;
  int nonTermCount = 0;
  for (std::vector<const Node *>::const_iterator p = productiveSinks.begin();
       p != productiveSinks.end(); ++p, ++srcIndex) {
    const Node &sink = **p;
    if (sink.GetType() == TREE) {
      // A TREE sink is itself the target-side node for this source position.
      m_sourceSide.push_back(Symbol("X", NonTerminal));
      sinkToSourceIndices[&sink].push_back(srcIndex);
      nonTermSinkToSourceIndex[&sink] = nonTermCount++;
    } else {
      assert(sink.GetType() == SOURCE);
      m_sourceSide.push_back(Symbol(sink.GetLabel(), Terminal));
      // Add all aligned target words to the sinkToSourceIndices map
      const std::vector<Node *> &parents(sink.GetParents());
      for (std::vector<Node *>::const_iterator q = parents.begin();
           q != parents.end(); ++q) {
        if ((*q)->GetType() == TARGET) {
          sinkToSourceIndices[*q].push_back(srcIndex);
        }
      }
    }
  }

  // Alignment

  std::vector<const Node *> targetLeaves;
  m_targetSide.GetTargetLeaves(targetLeaves);

  m_alignment.reserve(targetLeaves.size());
  m_nonTermAlignment.resize(nonTermCount);

  // Counts the TREE leaves with non-empty spans seen so far on the target
  // side.
  int tgtNonTermIndex = 0;
  // Index with the vector's own size_type so the loop condition does not
  // compare a signed int against an unsigned size.
  for (std::vector<const Node *>::size_type i = 0;
       i < targetLeaves.size(); ++i) {
    const Node *leaf = targetLeaves[i];
    assert(leaf->GetType() != SOURCE);
    // Leaves with empty spans have no source-side counterpart.
    if (leaf->GetSpan().empty()) {
      continue;
    }
    std::map<const Node *, std::vector<int> >::const_iterator p =
      sinkToSourceIndices.find(leaf);
    assert(p != sinkToSourceIndices.end());
    const std::vector<int> &sourceIndices = p->second;
    const int tgtIndex = static_cast<int>(i);
    for (std::vector<int>::const_iterator r = sourceIndices.begin();
         r != sourceIndices.end(); ++r) {
      m_alignment.push_back(std::make_pair(*r, tgtIndex));
    }
    if (leaf->GetType() == TREE) {
      m_nonTermAlignment[nonTermSinkToSourceIndex[leaf]] = tgtNonTermIndex++;
    }
  }
}

} // namespace GHKM
} // namespace Moses

0 comments on commit 573f030

Please sign in to comment.