From ab35f6d4a38917e26098b2dec8be43fd0e29fe87 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sun, 31 Aug 2014 13:27:17 +0100 Subject: [PATCH 01/13] Back to the standard default max phrase length of 20 with --with-mm. --- moses/TypeDef.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/moses/TypeDef.h b/moses/TypeDef.h index a5c434d4b3..cdcb99dbce 100644 --- a/moses/TypeDef.h +++ b/moses/TypeDef.h @@ -60,12 +60,12 @@ const size_t DEFAULT_MAX_HYPOSTACK_SIZE = 200; const size_t DEFAULT_MAX_TRANS_OPT_CACHE_SIZE = 10000; const size_t DEFAULT_MAX_TRANS_OPT_SIZE = 5000; const size_t DEFAULT_MAX_PART_TRANS_OPT_SIZE = 10000; -#ifdef PT_UG +//#ifdef PT_UG // setting to std::numeric_limits::max() makes the regression test for (deprecated) PhraseDictionaryDynamicSuffixArray fail. - const size_t DEFAULT_MAX_PHRASE_LENGTH = 100000; -#else - const size_t DEFAULT_MAX_PHRASE_LENGTH = 20; -#endif +// const size_t DEFAULT_MAX_PHRASE_LENGTH = 100000; +//#else +const size_t DEFAULT_MAX_PHRASE_LENGTH = 20; +//#endif const size_t DEFAULT_MAX_CHART_SPAN = 10; const size_t ARRAY_SIZE_INCR = 10; //amount by which a phrase gets resized when necessary const float LOWEST_SCORE = -100.0f; From a028fec7af57bb6237295aa505d88729e9c88aa6 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 4 Aug 2014 17:30:46 +0100 Subject: [PATCH 02/13] Work in progress. --- moses/TranslationModel/UG/try-align.cc | 451 +++++++++++++++++++++++-- 1 file changed, 425 insertions(+), 26 deletions(-) diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc index 483ad2c34a..6f5fb15a27 100644 --- a/moses/TranslationModel/UG/try-align.cc +++ b/moses/TranslationModel/UG/try-align.cc @@ -1,34 +1,433 @@ -#include "mmsapt.h" +#include "mm/ug_bitext.h" +#include using namespace std; using namespace Moses; +using namespace ugdiss; -// currently broken +typedef L2R_Token Token; +typedef mmTtrack ttrack_t; +typedef mmTSA tsa_t; + +TokenIndex V1,V2; +boost::shared_ptr T1,T2; +tsa_t I1,I2; + +namespace stats +{ + using namespace Moses::bitext; + float + pmi(size_t j,size_t m1, size_t m2, size_t N) + { + float p1 = lbop(N,m1,.1); + float p2 = lbop(N,m2,.1); + float p12 = lbop(N,j,.1); + return log(p12) - log(p1) - log(p2); + // return log(j) + log(N) - log(m1) - log(m2); + } + + float + npmi(size_t j,size_t m1, size_t m2, size_t N) + { + + float p1 = lbop(N,m1,.1); + float p2 = lbop(N,m2,.1); + float p12 = lbop(N,j,.1); + return (log(p12) - log(p1) - log(p2)) / -log(p12); + // return pmi(j,m1,m2,N) / (log(N) - log(j)); + } + + float + mi(size_t j,size_t m1, size_t m2, size_t N) + { + float ret = 0; + if (j) ret += float(j)/N * pmi(j,m1,m2,N); + if (m1>j) ret += float(m1-j)/N * pmi(m1-j,m1,N-m2,N); + if (m2>j) ret += float(m2-j)/N * pmi(m2-j,N-m1,m2,N); + if (N>m1+m2-j) ret += float(N-m1-m2+j)/N * pmi(N-m1-m2+j,N-m1,N-m2,N); + return ret; + } +} + +struct SinglePhrase +{ + typedef map > cache_t; + uint64_t pid; // phrase id + vector occs; // occurrences +}; + + +struct PhrasePair +{ + struct score_t; + uint64_t p1,p2; + ushort s1,e1,s2,e2; + int parent; + + struct stats_t + { + typedef map, sptr > cache_t; + size_t m1,m2,j; + float npmi; // normalized point-wise mutual information + float pmi; // point-wise mutual information + float mi; // mutual information + float score; + + void + set(vector const& o1, + vector const& o2, + size_t const N) + { + m1 = m2 = j = 0; + size_t i1=0,i2=0; + while (i1 < o1.size() && i2 < o2.size()) + { + if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; } + if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; } + + if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; } + else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; } + else { ++i2; ++m2; } + } + // for (++i1; i1 < o1.size(); ++i1) + // if (o1[i1-1].sid != o1[i1].sid) ++m1; + // for (++i2; i2 < o2.size(); ++i2) + // if (o2[i2-1].sid != o2[i2].sid) ++m2; + + m1 = 1; m2 = 1; + for (i1=1; i1 < o1.size(); ++i1) + if (o1[i1-1].sid != o1[i1].sid) ++m1; + for (i2=1; i2 < o2.size(); ++i2) + if (o2[i2-1].sid != o2[i2].sid) ++m2; + + this->mi = stats::mi(j,m1,m2,N); + this->pmi = stats::pmi(j,m1,m2,N); + this->npmi = stats::npmi(j,m1,m2,N); + // float z = float(m1)/N * float(m2)/N; + float hmean = 2.*j/(m1+m2); + this->score = npmi; // hmean; // /sqrt(z); + } + } stats; + + PhrasePair(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0) + : s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { } + + + bool + operator<(PhrasePair const& other) const + { + return (this->stats.score == other.stats.score + ? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2) + : (this->stats.score > other.stats.score)); + } + +}; + +SinglePhrase::cache_t cache1,cache2; +PhrasePair::stats_t::cache_t ppcache; + + +struct SortByPositionInCorpus +{ + bool + operator()(ttrack::Position const& a, + ttrack::Position const& b) const + { + return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset; + } +}; + + +void +getoccs(tsa_t::tree_iterator const& m, + vector& occs) +{ + occs.clear(); + occs.reserve(m.approxOccurrenceCount()+10); + tsa::ArrayEntry I(m.lower_bound(-1)); + char const* stop = m.upper_bound(-1); + do { + m.root->readEntry(I.next,I); + occs.push_back(I); + } while (I.next != stop); + sort(occs.begin(),occs.end(),SortByPositionInCorpus()); +} + +void +lookup_phrases(vector const& snt, + TokenIndex& V, ttrack_t const& T, + tsa_t const& I, SinglePhrase::cache_t& cache, + vector > >& dest) +{ + dest.resize(snt.size()); + for (size_t i = 0; i < snt.size(); ++i) + { + tsa_t::tree_iterator m(&I); + dest[i].clear(); + for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k) + { + sptr& o = cache[m.getPid()]; + if (!o) + { + o.reset(new SinglePhrase()); + o->pid = m.getPid(); + getoccs(m,o->occs); + } + dest[i].push_back(o); + } + } +} + +struct +RowIndexSorter +{ + vector > const& M; + size_t const my_col; + RowIndexSorter(vector > const& m, size_t const c) + : M(m), my_col(c) { } + + template + bool + operator()(T const& a, T const& b) const + { + return M.at(a).at(my_col) > M.at(b).at(my_col); + } +}; + +struct +ColIndexSorter +{ + vector > const& M; + size_t const my_row; + ColIndexSorter(vector > const& m, size_t const r) + : M(m), my_row(r) { } + + template + bool + operator()(T const& a, T const& b) const + { + return M.at(my_row).at(a) > M[my_row].at(b); + } + +}; -Mmsapt* PT; int main(int argc, char* argv[]) { - // string base = argv[1]; - // string L1 = argv[2]; - // string L2 = argv[3]; - // ostringstream buf; - // buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base=" - // << base << " L1=" << L1 << " L2=" << L2; - // string configline = buf.str(); - // PT = new Mmsapt(configline); - // PT->Load(); - // float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856, 0.647506 }; - // vector weights(w,w+5); - // PT->setWeights(weights); - // // these values are taken from a moses.ini file; - // // is there a convenient way of accessing them from within mmsapt ??? - // string eline,fline; - // // TokenIndex V; V.open("crp/trn/mm/de.tdx"); - // while (getline(cin,eline) && getline(cin,fline)) - // { - // cout << eline << endl; - // cout << fline << endl; - // PT->align(eline,fline); - // } - // delete PT; + string base = argv[1]; + string L1 = argv[2]; + string L2 = argv[3]; + + T1.reset(new ttrack_t()); + T2.reset(new ttrack_t()); + + V1.open(base + L1 + ".tdx"); + T1->open(base + L1 + ".mct"); + I1.open(base + L1 + ".sfa", T1); + + V2.open(base + L2 + ".tdx"); + T2->open(base + L2 + ".mct"); + I2.open(base + L2 + ".sfa", T2); + + tsa_t::tree_iterator m1(&I1); + tsa_t::tree_iterator m2(&I1); + string line1, line2; + while (getline(cin,line1) and getline(cin,line2)) + { + cout << line1 << "\n" << line2 << endl; + vector > > M1,M2; + vector snt1,snt2; + V1.fillIdSeq(line1,snt1); + V2.fillIdSeq(line2,snt2); + lookup_phrases(snt1,V1,*T1,I1,cache1,M1); + lookup_phrases(snt2,V2,*T2,I2,cache2,M2); + + vector pp_all,pp_good; + vector a1(snt1.size(),-1); + vector a2(snt2.size(),-1); + + vector > z1(snt1.size(),vector(snt1.size(),-1)); + vector > z2(snt2.size(),vector(snt2.size(),-1)); + vector > >ppm1(M1.size()),ppm2(M2.size()); + vector > M(snt1.size(), vector(snt2.size(),0)); + vector > best1(snt1.size()), best2(snt2.size()); + for (size_t i1 = 0; i1 < M1.size(); ++i1) + { + PhrasePair pp; + pp.s1 = i1; + ppm1[i1].resize(M1[i1].size()); + for (size_t i2 = 0; i2 < M2.size(); ++i2) + { + pp.s2 = i2; + pp.stats.j = 1; + ppm2[i2].resize(M2[i2].size()); + for (size_t k1 = 0; k1 < M1[i1].size(); ++k1) + { + pp.e1 = i1 + k1 + 1; + // if (pp.stats.j == 0) break; + for (size_t k2 = 0; k2 < M2[i2].size(); ++k2) + { + pp.e2 = i2 + k2 + 1; + sptr & s + = ppcache[make_pair(M1[i1][k1]->pid,M2[i2][k2]->pid)]; + if (!s) + { + s.reset(new PhrasePair::stats_t()); + s->set(M1[i1][k1]->occs,M2[i2][k2]->occs,T1->size()); + } + pp.stats = *s; + if (pp.stats.j == 0) break; + // ppm1[i1][k1].push_back(pp); + // ppm2[i2][k2].push_back(pp); + if (pp.stats.score > 0) + pp_all.push_back(pp); + } + } + } + } + sort(pp_all.begin(),pp_all.end()); + BOOST_FOREACH(PhrasePair const& pp,pp_all) + { + if (pp.stats.npmi < 0) continue; + for (size_t r = pp.s1; r < pp.e1; ++r) + for (size_t c = pp.s2; c < pp.e2; ++c) + { + // M[r][c] += log(1-pp.stats.npmi); + M[r][c] += log(1-pp.stats.mi); + } + } + for (size_t r = 0; r < M.size(); ++r) + for (size_t c = 0; c < M[r].size(); ++c) + M[r][c] = 1.-exp(M[r][c]); + for (size_t r = 0; r < best1.size(); ++r) + { + best1[r].resize(snt2.size()); + for (size_t c = 0; c < best1[r].size(); ++c) + best1[r][c] = c; + sort(best1[r].begin(),best1[r].end(),ColIndexSorter(M,r)); + } + for (size_t c = 0; c < best2.size(); ++c) + { + best2[c].resize(snt1.size()); + for (size_t r = 0; r < best2[c].size(); ++r) + best2[c][r] = r; + sort(best2[c].begin(),best2[c].end(),RowIndexSorter(M,c)); + } + for (size_t r = 0; r < best1.size(); ++r) + { + cout << V1[snt1[r]] << ":"; + for (size_t i = 0; i < min(3UL,M[r].size()); ++i) + { + size_t k = best1[r][i]; + // if (M[r][k] >= M[best2[k][min(2UL,M.size())]][k]) + cout << " " << k << ":" << V2[snt2[k]] << " " << M[r][k]; + } + cout << endl; + } + +#if 0 + for (size_t k = 1; k < pp_all.size(); ++k) + for (size_t i = k; i--;) + if (pp_all[i].s1 >= pp_all[k].s1 && + pp_all[i].e1 <= pp_all[k].e1 && + pp_all[i].s2 >= pp_all[k].s2 && + pp_all[i].e2 <= pp_all[k].e2) + pp_all[i].stats.score += pp_all[k].stats.score; + sort(pp_all.begin(),pp_all.end()); +#endif + +#if 1 + for (size_t p = 0; p < pp_all.size(); ++p) + { + PhrasePair const& x = pp_all[p]; + // if (x.stats.npmi < .7) break; + // if (z1[x.s1][x.e1-1] >= 0 || z2[x.s2][x.e2-1] >=0) + // continue; + z1[x.s1][x.e1-1] = p; + z2[x.s2][x.e2-1] = p; + cout << (boost::format("%.4f %.4f %.4f") + % x.stats.score + % x.stats.mi + % x.stats.npmi); + for (size_t z = x.s1; z < x.e1; ++z) + cout << " " << V1[snt1[z]]; + cout << " :::"; + for (size_t z = x.s2; z < x.e2; ++z) + cout << " " << V2[snt2[z]]; + cout << " [" + << x.stats.m1 << "/" << x.stats.j << "/" << x.stats.m2 + << "]" << endl; + } +#endif + // sort(pp_all.begin(),pp_all.end()); + // BOOST_FOREACH(PhrasePair const& pp, pp_all) + // { + // while (ppm1[pp.s1].size() < pp.e1 - pp.s1) + // ppm1[pp.s1].push_back(vector()); + // vector& v1 = ppm1[pp.s1][pp.e1-pp.s1-1]; + // if (v1.size() && v1[0].stats.score > pp.stats.score) + // continue; + // while (ppm2[pp.s2].size() < pp.e2 - pp.s2) + // ppm2[pp.s2].push_back(vector()); + // vector& v2 = ppm2[pp.s2][pp.e2-pp.s2-1]; + // if (v2.size() && v2[0].stats.score > pp.stats.score) + // continue; + // v1.push_back(pp); + // v2.push_back(pp); + // } + + + // BOOST_FOREACH(vector >& vv, ppm1) + // { + // BOOST_FOREACH(vector& v, vv) + // { + // sort(v.begin(),v.end()); + // if (v.size() > 1 && v[0].stats.score == v[1].stats.score) + // v.clear(); + // } + // } + // for (size_t i2 = 0; i2 < ppm2.size(); ++i2) + // { + // for (size_t k2 = 0; k2 < ppm2[i2].size(); ++k2) + // { + // vector& v2 = ppm2[i2][k2]; + // sort(v2.begin(),v2.end()); + // if (v2.size() > 1 && v2[0].stats.score == v2[1].stats.score) + // { + // v2.clear(); + // continue; + // } + // ushort i1 = v2[0].s1; + // ushort k1 = v2[0].e1 - i1 -1; + + // if (ppm1[i1][k1].size() == 0 || + // ppm1[i1][k1][0].s2 != i2 || + // ppm1[i1][k1][0].e2 != i2 + k2 + 1) + // { v2.clear(); } + // else pp_good.push_back(ppm2[i2][k2][0]); + // } + // } + // BOOST_FOREACH(PhrasePair const& pp, pp_good) + // { + // cout << pp.stats.mi << " "; + // for (size_t z = pp.s1; z < pp.e1; ++z) + // cout << V1[snt1[z]] << " "; + // cout << " ::: "; + // for (size_t z = pp.s2; z < pp.e2; ++z) + // cout << V2[snt2[z]] << " "; + // cout << pp.stats.m1 << "/" << pp.stats.j << "/" << pp.stats.m2 << endl; + // } + // // cout << string(80,'=') << endl; + // // sort(pp_all.begin(),pp_all.end()); + // // BOOST_FOREACH(PhrasePair const& pp, pp_all) + // // { + // // cout << pp.mi << " "; + // // for (size_t z = pp.s1; z < pp.e1; ++z) + // // cout << V1[snt1[z]] << " "; + // // cout << " ::: "; + // // for (size_t z = pp.s2; z < pp.e2; ++z) + // // cout << V2[snt2[z]] << " "; + // // cout << pp.m1 << "/" << pp.j << "/" << pp.m2 << endl; + // // } + + } } From 9af3a61678cde6d8073211f7d7af85e40731dfb3 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 30 Aug 2014 07:18:48 +0100 Subject: [PATCH 03/13] Added try-align2. --- moses/TranslationModel/UG/Jamfile | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile index c36d4a072b..2f1816f51e 100644 --- a/moses/TranslationModel/UG/Jamfile +++ b/moses/TranslationModel/UG/Jamfile @@ -9,6 +9,18 @@ $(TOP)/moses/TranslationModel/UG//mmsapt $(TOP)/util//kenutil ; +exe try-align2 : +try-align2.cc +$(TOP)/moses//moses +$(TOP)/moses/TranslationModel/UG/generic//generic +$(TOP)/moses/TranslationModel/UG/generic//stringdist +$(TOP)//boost_iostreams +$(TOP)//boost_program_options +$(TOP)/moses/TranslationModel/UG/mm//mm +$(TOP)/moses/TranslationModel/UG//mmsapt +$(TOP)/util//kenutil +; + exe ptable-lookup : ptable-lookup.cc $(TOP)/moses//moses @@ -53,6 +65,6 @@ $(TOP)/moses/TranslationModel/UG//mmsapt $(TOP)/util//kenutil ; -install $(PREFIX)/bin : try-align ; +install $(PREFIX)/bin : try-align try-align2 ; fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ; From 90c91ae9bb45cd69b9b44e285b3a05c13b59da93 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 30 Aug 2014 07:19:32 +0100 Subject: [PATCH 04/13] Added fakelib stringdist. --- moses/TranslationModel/UG/generic/Jamfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/generic/Jamfile b/moses/TranslationModel/UG/generic/Jamfile index ed7b551d4d..2a118c9c03 100644 --- a/moses/TranslationModel/UG/generic/Jamfile +++ b/moses/TranslationModel/UG/generic/Jamfile @@ -1 +1,2 @@ -fakelib generic : [ glob */*.cc */*.cpp ] ; +fakelib generic : [ glob */*.cc */*.cpp : stringdist/* ] ; +fakelib stringdist : [ glob stringdist/*.cc ] ; From 2405293aaa11b114a7de5b71db89069f22468317 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 30 Aug 2014 07:21:22 +0100 Subject: [PATCH 05/13] Fiddling around with the code. Not for production. --- moses/TranslationModel/UG/try-align.cc | 113 +++++++++++++++++++++---- 1 file changed, 97 insertions(+), 16 deletions(-) diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc index 6f5fb15a27..daafec5457 100644 --- a/moses/TranslationModel/UG/try-align.cc +++ b/moses/TranslationModel/UG/try-align.cc @@ -12,28 +12,35 @@ TokenIndex V1,V2; boost::shared_ptr T1,T2; tsa_t I1,I2; +float lbop_level = .05; +#define smooth 1 namespace stats { using namespace Moses::bitext; float pmi(size_t j,size_t m1, size_t m2, size_t N) { - float p1 = lbop(N,m1,.1); - float p2 = lbop(N,m2,.1); - float p12 = lbop(N,j,.1); +#if smooth + float p1 = lbop(N,m1,lbop_level); + float p2 = lbop(N,m2,lbop_level); + float p12 = lbop(N,j,lbop_level); return log(p12) - log(p1) - log(p2); - // return log(j) + log(N) - log(m1) - log(m2); +#else + return log(j) + log(N) - log(m1) - log(m2); +#endif } float npmi(size_t j,size_t m1, size_t m2, size_t N) { - - float p1 = lbop(N,m1,.1); - float p2 = lbop(N,m2,.1); - float p12 = lbop(N,j,.1); +#if smooth + float p1 = lbop(N,m1,lbop_level); + float p2 = lbop(N,m2,lbop_level); + float p12 = lbop(N,j,lbop_level); return (log(p12) - log(p1) - log(p2)) / -log(p12); - // return pmi(j,m1,m2,N) / (log(N) - log(j)); +#else + return pmi(j,m1,m2,N) / (log(N) - log(j)); +#endif } float @@ -103,8 +110,8 @@ struct PhrasePair this->pmi = stats::pmi(j,m1,m2,N); this->npmi = stats::npmi(j,m1,m2,N); // float z = float(m1)/N * float(m2)/N; - float hmean = 2.*j/(m1+m2); - this->score = npmi; // hmean; // /sqrt(z); + // float hmean = 2.*j/(m1+m2); + this->score = npmi; // npmi; // hmean; // /sqrt(z); } } stats; @@ -120,6 +127,13 @@ struct PhrasePair : (this->stats.score > other.stats.score)); } + size_t len1() const { return e1 - s1; } + size_t len2() const { return e2 - s2; } + bool includes(PhrasePair const& o) const + { + return s1 <= o.s1 && e1 >= o.e1 && s2 <= o.s2 && e2 >= o.e2; + } + }; SinglePhrase::cache_t cache1,cache2; @@ -165,6 +179,8 @@ lookup_phrases(vector const& snt, dest[i].clear(); for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k) { + if (m.approxOccurrenceCount() < 3) break; + // if (k - i > 0) break; sptr& o = cache[m.getPid()]; if (!o) { @@ -232,7 +248,7 @@ int main(int argc, char* argv[]) string line1, line2; while (getline(cin,line1) and getline(cin,line2)) { - cout << line1 << "\n" << line2 << endl; + cout << "\n" << line1 << "\n" << line2 << endl; vector > > M1,M2; vector snt1,snt2; V1.fillIdSeq(line1,snt1); @@ -277,13 +293,17 @@ int main(int argc, char* argv[]) if (pp.stats.j == 0) break; // ppm1[i1][k1].push_back(pp); // ppm2[i2][k2].push_back(pp); - if (pp.stats.score > 0) - pp_all.push_back(pp); + size_t J = pp.stats.j * 100; + if (pp.stats.score > 0 + && J >= pp.stats.m1 + && J > pp.stats.m2) + { pp_all.push_back(pp); } } } } } sort(pp_all.begin(),pp_all.end()); +#if 0 BOOST_FOREACH(PhrasePair const& pp,pp_all) { if (pp.stats.npmi < 0) continue; @@ -322,7 +342,7 @@ int main(int argc, char* argv[]) } cout << endl; } - +#endif #if 0 for (size_t k = 1; k < pp_all.size(); ++k) for (size_t i = k; i--;) @@ -335,15 +355,39 @@ int main(int argc, char* argv[]) #endif #if 1 + vector assoc1(snt1.size(),-1), assoc2(snt2.size(),-1); for (size_t p = 0; p < pp_all.size(); ++p) { PhrasePair const& x = pp_all[p]; // if (x.stats.npmi < .7) break; // if (z1[x.s1][x.e1-1] >= 0 || z2[x.s2][x.e2-1] >=0) // continue; + for (size_t i = x.s1; i < x.e1; ++i) + { + if (assoc1[i] < 0) + assoc1[i] = p; + else + { + // PhrasePair& y = pp_all[assoc1[i]]; + // if (y.includes(x)) + // assoc1[i] = p; + } + } + for (size_t i = x.s2; i < x.e2; ++i) + { + if (assoc2[i] < 0) + assoc2[i] = p; + else + { + // PhrasePair& y = pp_all[assoc2[i]]; + // if (y.includes(x)) + // assoc2[i] = p; + } + } z1[x.s1][x.e1-1] = p; z2[x.s2][x.e2-1] = p; - cout << (boost::format("%.4f %.4f %.4f") + continue; + cout << (boost::format("%.4f %.8f %.4f") % x.stats.score % x.stats.mi % x.stats.npmi); @@ -356,6 +400,43 @@ int main(int argc, char* argv[]) << x.stats.m1 << "/" << x.stats.j << "/" << x.stats.m2 << "]" << endl; } + vector done(pp_all.size(),false); + for (size_t i = 0; i < snt1.size(); ++i) + { + if (assoc1[i] < 0 || done[assoc1[i]]) + continue; + // for (size_t k = 0; k < snt2.size(); ++k) + // if (assoc1[i] == assoc2[k]) + { + done[assoc1[i]] = true; + PhrasePair& p = pp_all[assoc1[i]]; + for (size_t j = p.s1; j < p.e1; ++j) + cout << j << ":" << V1[snt1[j]] << " "; + cout << " ::: "; + for (size_t j = p.s2; j < p.e2; ++j) + cout << j << ":" << V2[snt2[j]] << " "; + cout << "[" + << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2 + << "] "<< p.stats.score << endl; + // break; + } + } + cout << endl; + for (size_t i = 0; i < snt2.size(); ++i) + { + if (assoc2[i] < 0 || done[assoc2[i]]) + continue; + done[assoc2[i]] = true; + PhrasePair& p = pp_all[assoc2[i]]; + for (size_t j = p.s1; j < p.e1; ++j) + cout << j << ":" << V1[snt1[j]] << " "; + cout << " ::: "; + for (size_t j = p.s2; j < p.e2; ++j) + cout << j << ":" << V2[snt2[j]] << " "; + cout << "[" + << p.stats.m1 << "/" << p.stats.j << "/" << p.stats.m2 + << "] "<< p.stats.score << endl; + } #endif // sort(pp_all.begin(),pp_all.end()); // BOOST_FOREACH(PhrasePair const& pp, pp_all) From b588df77f0e76905d25b055830b5ad564a1e803f Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 30 Aug 2014 07:23:05 +0100 Subject: [PATCH 06/13] Bug fix related to threading. --- moses/TranslationModel/UG/mm/ug_bitext.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index 2a3fe50ec8..8c27db784d 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -30,7 +30,8 @@ namespace Moses pstats:: ~pstats() { - --active; + try { --active; } catch (...) {} + // counter may not exist any more at destruction time } void From d7645cb7f15c699400e88d22e547d7daeec01aeb Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 30 Aug 2014 07:25:14 +0100 Subject: [PATCH 07/13] Commented out unused variable. --- contrib/server/mosesserver.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp index 0ec412157f..8e8854d529 100644 --- a/contrib/server/mosesserver.cpp +++ b/contrib/server/mosesserver.cpp @@ -281,7 +281,7 @@ class Translator : public xmlrpc_c::method const ChartHypothesis *hypo = manager.GetBestHypothesis(); outputChartHypo(out,hypo); if (addGraphInfo) { - const size_t translationId = tinput.GetTranslationId(); + // const size_t translationId = tinput.GetTranslationId(); std::ostringstream sgstream; manager.OutputSearchGraphMoses(sgstream); retData.insert(pair("sg", xmlrpc_c::value_string(sgstream.str()))); From 015d690b6f42d2483557fc746d3ea28595d7d5d3 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 30 Aug 2014 07:27:32 +0100 Subject: [PATCH 08/13] Added try-align2 to --with-mm and new switch --with-icu. --- Jamroot | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Jamroot b/Jamroot index 7635d7a87b..e47e6d628e 100644 --- a/Jamroot +++ b/Jamroot @@ -125,6 +125,19 @@ if [ option.get "with-cmph" : : "yes" ] { requirements += HAVE_CMPH ; } +if [ option.get "with-icu" : : "yes" ] +{ + external-lib icuuc ; + external-lib icuio ; + external-lib icui18n ; + requirements += icuuc/shared ; + requirements += icuio/shared ; + requirements += icui18n/shared ; + requirements += -fPIC ; + requirements += 64 ; + requirements += shared ; +} + if [ option.get "with-probing-pt" : : "yes" ] { external-lib boost_serialization ; @@ -173,6 +186,7 @@ if [ option.get "with-mm" : : "yes" ] moses/TranslationModel/UG/mm//mtt-count-words moses/TranslationModel/UG/mm//calc-coverage moses/TranslationModel/UG//try-align + moses/TranslationModel/UG//try-align2 ; } else From a87a9ff2070d32410ce2e22468de0e2f8cd5086b Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 30 Aug 2014 07:28:47 +0100 Subject: [PATCH 09/13] Moved class PhrasePair back to ug_bitext. Moved function expand() from mmsapt.cc to ug_bitext.h. Added new lookup function to class Bitext. Bug fixes related to inverse lookup in class Bitext. --- moses/TranslationModel/UG/mm/ug_bitext.h | 535 +++++++++++++++---- moses/TranslationModel/UG/mm/ug_phrasepair.h | 244 +-------- moses/TranslationModel/UG/mmsapt.cpp | 43 +- 3 files changed, 463 insertions(+), 359 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 01d8187d60..9d80d32fa8 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -47,6 +47,8 @@ #include "ug_corpus_token.h" #include "tpt_pickler.h" #include "ug_lexical_phrase_scorer2.h" +#include "ug_phrasepair.h" +#include "ug_lru_cache.h" #define PSTATS_CACHE_THRESHOLD 50 @@ -57,6 +59,7 @@ namespace Moses { namespace bitext { template class Bitext; + template class PhrasePair; using namespace ugdiss; template class Bitext; @@ -160,6 +163,246 @@ namespace Moses { }; + template + string + toString(TokenIndex const& V, Token const* x, size_t const len) + { + if (!len) return ""; + UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!"); + ostringstream buf; + buf << V[x->id()]; + size_t i = 1; + for (x = x->next(); x && i < len; ++i, x = x->next()) + buf << " " << V[x->id()]; + UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!"); + return buf.str(); + } + + template + class + PhrasePair + { + public: + class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; }; + Token const* start1; + Token const* start2; + uint32_t len1; + uint32_t len2; + uint64_t p1, p2; + uint32_t raw1,raw2,sample1,sample2,good1,good2,joint; + vector fvals; + float dfwd[po_other+1]; // distortion counts // counts or probs? + float dbwd[po_other+1]; // distortion counts + vector aln; + float score; + bool inverse; + PhrasePair() { }; + PhrasePair(PhrasePair const& o); + + PhrasePair const& operator+=(PhrasePair const& other); + + bool operator<(PhrasePair const& other) const; + bool operator>(PhrasePair const& other) const; + bool operator<=(PhrasePair const& other) const; + bool operator>=(PhrasePair const& other) const; + + void init(); + void init(uint64_t const pid1, bool is_inverse, + Token const* x, uint32_t const len, + pstats const* ps = NULL, size_t const numfeats=0); + + // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats); + // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, + // size_t const numfeats); + + // PhrasePair const& + // update(uint64_t const pid2, size_t r2 = 0); + + PhrasePair const& + update(uint64_t const pid2, Token const* x, + uint32_t const len, jstats const& js); + + // PhrasePair const& + // update(uint64_t const pid2, jstats const& js1, jstats const& js2); + + // PhrasePair const& + // update(uint64_t const pid2, size_t const raw2extra, jstats const& js); + + // float + // eval(vector const& w); + + class SortByTargetIdSeq + { + public: + int cmp(PhrasePair const& a, PhrasePair const& b) const; + bool operator()(PhrasePair const& a, PhrasePair const& b) const; + }; + }; + + template + void + PhrasePair:: + init(uint64_t const pid1, bool is_inverse, Token const* x, uint32_t const len, + pstats const* ps, size_t const numfeats) + { + inverse = is_inverse; + start1 = x; len1 = len; + p1 = pid1; + p2 = 0; + if (ps) + { + raw1 = ps->raw_cnt; + sample1 = ps->sample_cnt; + good1 = ps->good; + } + else raw1 = sample1 = good1 = 0; + joint = 0; + good2 = 0; + sample2 = 0; + raw2 = 0; + fvals.resize(numfeats); + } + + template + PhrasePair const& + PhrasePair:: + update(uint64_t const pid2, + Token const* x, uint32_t const len, jstats const& js) + { + p2 = pid2; + start2 = x; len2 = len; + raw2 = js.cnt2(); + joint = js.rcnt(); + assert(js.aln().size()); + if (js.aln().size()) + aln = js.aln()[0].second; + float total_fwd = 0, total_bwd = 0; + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast(i); + total_fwd += js.dcnt_fwd(po)+1; + total_bwd += js.dcnt_bwd(po)+1; + } + + // should we do that here or leave the raw counts? + for (int i = po_first; i <= po_other; i++) + { + PhraseOrientation po = static_cast(i); + dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; + dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; + } + + return *this; + } + + template + bool + PhrasePair:: + operator<(PhrasePair const& other) const + { return this->score < other.score; } + + template + bool + PhrasePair:: + operator>(PhrasePair const& other) const + { return this->score > other.score; } + + template + bool + PhrasePair:: + operator<=(PhrasePair const& other) const + { return this->score <= other.score; } + + template + bool + PhrasePair:: + operator>=(PhrasePair const& other) const + { return this->score >= other.score; } + + template + PhrasePair const& + PhrasePair:: + operator+=(PhrasePair const& o) + { + raw1 += o.raw1; + raw2 += o.raw2; + sample1 += o.sample1; + sample2 += o.sample2; + good1 += o.good1; + good2 += o.good2; + joint += o.joint; + return *this; + } + + template + PhrasePair:: + PhrasePair(PhrasePair const& o) + : start1(o.start1) + , start2(o.start2) + , len1(o.len1) + , len2(o.len2) + , p1(o.p1) + , p2(o.p2) + , raw1(o.raw1) + , raw2(o.raw2) + , sample1(o.sample1) + , sample2(o.sample2) + , good1(o.good1) + , good2(o.good2) + , joint(o.joint) + , fvals(o.fvals) + , aln(o.aln) + , score(o.score) + , inverse(o.inverse) + { + for (size_t i = 0; i <= po_other; ++i) + { + dfwd[i] = o.dfwd[i]; + dbwd[i] = o.dbwd[i]; + } + } + + template + int + PhrasePair:: + SortByTargetIdSeq:: + cmp(PhrasePair const& a, PhrasePair const& b) const + { + size_t i = 0; + Token const* x = a.start2; + Token const* y = b.start2; + while (i < a.len2 && i < b.len2 && x->id() == y->id()) + { + x = x->next(); + y = y->next(); + ++i; + } + if (i == a.len2 && i == b.len2) return 0; + if (i == a.len2) return -1; + if (i == b.len2) return 1; + return x->id() < y->id() ? -1 : 1; + } + + template + bool + PhrasePair:: + SortByTargetIdSeq:: + operator()(PhrasePair const& a, PhrasePair const& b) const + { + return this->cmp(a,b) < 0; + } + + template + void + PhrasePair:: + init() + { + inverse = false; + len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; + start1 = start2 = NULL; + p1 = p2 = 0; + } + template class Bitext { @@ -210,9 +453,14 @@ namespace Moses { #endif mutable pcache_t cache1,cache2; protected: + typedef typename + lru_cache::LRU_Cache > > + pplist_cache_t; + size_t default_sample_size; size_t num_workers; size_t m_pstats_cache_threshold; + mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; private: sptr prep2(iter const& phrase, size_t const max_sample) const; @@ -235,6 +483,14 @@ namespace Moses { // sptr lookup(Phrase const& phrase, size_t factor) const; sptr lookup(iter const& phrase) const; sptr lookup(iter const& phrase, size_t const max_sample) const; + + void + lookup(vector const& snt, TSA& idx, + vector > > > >& dest, + vector >* pidmap = NULL, + typename PhrasePair::Scorer* scorer=NULL, + bool multithread=true) const; + void prep(iter const& phrase) const; void setDefaultSampleSize(size_t const max_samples); @@ -487,7 +743,8 @@ namespace Moses { } else if (!ag.bt.find_trg_phr_bounds (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd, - NULL,NULL,true)) + // NULL,NULL,true)) + &aln,NULL,true)) continue; j->stats->lock.lock(); j->stats->good += 1; @@ -495,7 +752,8 @@ namespace Moses { ++j->stats->ofwd[po_fwd]; ++j->stats->obwd[po_bwd]; j->stats->lock.unlock(); - for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) + // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) + for (size_t k = 1; k < aln.size(); k += 2) aln[k] += s2 - s1; Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid); float sample_weight = 1./((s2-s1+1)*(e2-e1+1)); @@ -567,8 +825,10 @@ namespace Moses { #endif } } - if (j->fwd && s < s2) - for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) + // if (j->fwd && s < s2) + // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) + if (s < s2) + for (size_t k = 1; k < aln.size(); k += 2) --aln[k]; } // j->stats->lock.unlock(); @@ -584,7 +844,8 @@ namespace Moses { ~job() { if (stats) stats.reset(); - --active; + try { --active; } catch (...) {} + // counter may not exist any more at destruction time } template @@ -981,9 +1242,18 @@ namespace Moses { assert(T2); assert(Tx); - bitvector forbidden((flip ? T1 : T2)->sntLen(sid)); - size_t slen1 = (*T1).sntLen(sid); - size_t slen2 = (*T2).sntLen(sid); + size_t slen1,slen2; + if (flip) + { + slen1 = T2->sntLen(sid); + slen2 = T1->sntLen(sid); + } + else + { + slen1 = T1->sntLen(sid); + slen2 = T2->sntLen(sid); + } + bitvector forbidden(slen2); if (full_alignment) { if (slen1*slen2 > full_alignment->size()) @@ -1002,17 +1272,11 @@ namespace Moses { if (flip) { p = binread(p,trg); assert(p= slen1 || trg >= slen2) - { - ostringstream buf; - buf << "Alignment range error at sentence " << sid << "!" << endl - << src << "/" << slen1 << " " << trg << "/" << slen2 << endl; - cerr << buf.str() << endl; - UTIL_THROW(util::Exception, buf.str().c_str()); - } - + UTIL_THROW_IF2((src >= slen1 || trg >= slen2), + "Alignment range error at sentence " << sid << "!\n" + << src << "/" << slen1 << " " << + trg << "/" << slen2); + if (src < start || src >= stop) forbidden.set(trg); else @@ -1022,22 +1286,11 @@ namespace Moses { } if (core_alignment) { - if (flip) - { - aln1[trg].push_back(src); - aln2[src].push_back(trg); - } - else - { - aln1[src].push_back(trg); - aln2[trg].push_back(src); - } + aln1[src].push_back(trg); + aln2[trg].push_back(src); } if (full_alignment) - { - if (flip) full_alignment->set(trg*slen2 + src); - else full_alignment->set(src*slen2 + trg); - } + full_alignment->set(src*slen2 + trg); } for (size_t i = lft; i <= rgt; ++i) @@ -1051,67 +1304,17 @@ namespace Moses { if (core_alignment) { core_alignment->clear(); - if (flip) + for (size_t i = start; i < stop; ++i) { - for (size_t i = lft; i <= rgt; ++i) + BOOST_FOREACH(ushort x, aln1[i]) { - sort(aln1[i].begin(),aln1[i].end()); - BOOST_FOREACH(ushort x, aln1[i]) - { - core_alignment->push_back(i-lft); - core_alignment->push_back(x-start); - } + core_alignment->push_back(i-start); + core_alignment->push_back(x-lft); } } - else - { - for (size_t i = start; i < stop; ++i) - { - BOOST_FOREACH(ushort x, aln1[i]) - { - core_alignment->push_back(i-start); - core_alignment->push_back(x-lft); - } - } - } - // now determine fwd and bwd phrase orientation - if (flip) - { - po_fwd = find_po_fwd(aln2,aln1,start,stop,s1,e2); - po_bwd = find_po_bwd(aln2,aln1,start,stop,s1,e2); - } - else - { - po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2); - po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2); - } -#if 0 - // if (e1 - s1 > 3) - { - lock_guard guard(this->lock); - Token const* t1 = T1->sntStart(sid); - Token const* t2 = T2->sntStart(sid); - cout << "[" << start << ":" << stop << "] => [" - << s1 << ":" << s2 << ":" - << e1 << ":" << e2 << "]" << endl; - for (size_t k = start; k < stop; ++k) - cout << k-start << "." << (*V1)[t1[k].id()] << " "; - cout << endl; - for (size_t k = s1; k < e2;) - { - if (k == s2) cout << "["; - cout << int(k)-int(s2) << "." << (*V2)[t2[k].id()]; - if (++k == e1) cout << "] "; - else cout << " "; - } - cout << endl; - for (size_t k = 0; k < core_alignment->size(); k += 2) - cout << int((*core_alignment)[k]) << "-" << int((*core_alignment)[k+1]) << " "; - cout << "\n" << __FILE__ << ":" << __LINE__ << endl; - - } -#endif + po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2); + po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2); } return lft <= rgt; } @@ -1143,9 +1346,10 @@ namespace Moses { max_sample == this->default_sample_size && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) { - // need to test what a good caching threshold is + // still need to test what a good caching threshold is // is caching here the cause of the apparent memory leak in - // confusion network decoding ???? + // confusion network decoding ???? No, it isn't. + // That was because of naive, brute-force input path generation. uint64_t pid = phrase.getPid(); pcache_t & cache(phrase.root == &(*this->I1) ? cache1 : cache2); pcache_t::value_type entry(pid,sptr()); @@ -1170,6 +1374,124 @@ namespace Moses { return ret; } + // worker for scoring and sorting phrase table entries in parallel + template + class pstats2pplist + { + Ttrack const& m_other; + sptr m_pstats; + vector >& m_pplist; + typename PhrasePair::Scorer const* m_scorer; + PhrasePair m_pp; + Token const* m_token; + size_t m_len; + uint64_t m_pid1; + bool m_is_inverse; + public: + + // CONSTRUCTOR + pstats2pplist(typename TSA::tree_iterator const& m, + Ttrack const& other, + sptr const& ps, + vector >& dest, + typename PhrasePair::Scorer const* scorer) + : m_other(other) + , m_pstats(ps) + , m_pplist(dest) + , m_scorer(scorer) + , m_token(m.getToken(0)) + , m_len(m.size()) + , m_pid1(m.getPid()) + , m_is_inverse(false) + { } + + // WORKER + void + operator()() + { + // wait till all statistics have been collected + boost::unique_lock lock(m_pstats->lock); + while (m_pstats->in_progress) + m_pstats->ready.wait(lock); + + m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0); + + // convert pstats entries to phrase pairs + pstats::trg_map_t::iterator a; + for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a) + { + uint32_t sid,off,len; + parse_pid(a->first, sid, off, len); + m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second); + m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),m_pp.joint); + size_t J = m_pp.joint<<7; // hard coded threshold of 1/128 + if (m_pp.good1 > J || m_pp.good2 > J) continue; + if (m_scorer) + { + (*m_scorer)(m_pp); + } + m_pplist.push_back(m_pp); + } + greater > sorter; + if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter); + } + }; + + template + void + Bitext:: + lookup(vector const& snt, TSA& idx, + vector > > > >& dest, + vector >* pidmap, + typename PhrasePair::Scorer* scorer, + bool multithread) const + { + typedef vector > > > > ret_t; + + dest.clear(); + dest.resize(snt.size()); + if (pidmap) { pidmap->clear(); pidmap->resize(snt.size()); } + + // collect statistics in parallel, then build PT entries as + // the sampling finishes + bool fwd = &idx == I1.get(); + vector workers; // background threads doing the lookup + pplist_cache_t& C = (fwd ? m_pplist_cache1 : m_pplist_cache2); + if (C.capacity() < 100000) C.reserve(100000); + for (size_t i = 0; i < snt.size(); ++i) + { + dest[i].reserve(snt.size()-i); + typename TSA::tree_iterator m(&idx); + for (size_t k = i; k < snt.size() && m.extend(snt[k].id()); ++k) + { + uint64_t key = m.getPid(); + if (pidmap) (*pidmap)[i].push_back(key); + sptr > > pp = C.get(key); + if (pp) + dest[i].push_back(pp); + else + { + pp.reset(new vector >()); + C.set(key,pp); + dest[i].push_back(pp); + sptr x = prep2(m, this->default_sample_size); + pstats2pplist w(m,*(fwd?T2:T1),x,*pp,scorer); + if (multithread) + { + boost::thread* t = new boost::thread(w); + workers.push_back(t); + } + else w(); + } + } + } + for (size_t w = 0; w < workers.size(); ++w) + { + workers[w]->join(); + delete workers[w]; + } + } + template sptr Bitext:: @@ -1242,6 +1564,37 @@ namespace Moses { agenda:: job::active; + + template + void + expand(typename Bitext::iter const& m, + Bitext const& bt, + pstats const& ps, vector >& dest) + { + bool fwd = m.root == bt.I1.get(); + dest.reserve(ps.trg.size()); + PhrasePair pp; + pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0); + // cout << HERE << " " << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << endl; + pstats::trg_map_t::const_iterator a; + for (a = ps.trg.begin(); a != ps.trg.end(); ++a) + { + uint32_t sid,off,len; + parse_pid(a->first, sid, off, len); + pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off, + len, a->second); + dest.push_back(pp); + } +#if 0 + typename PhrasePair::SortByTargetIdSeq sorter; + sort(dest.begin(), dest.end(),sorter); + BOOST_FOREACH(PhrasePair const& p, dest) + cout << toString (*(fwd ? bt.V1 : bt.V2),p.start1,p.len1) << " ::: " + << toString (*(fwd ? bt.V2 : bt.V1),p.start2,p.len2) << " " + << p.joint << endl; +#endif + } + } // end of namespace bitext } // end of namespace moses #endif diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h index 8cd43dc187..a966d00dc1 100644 --- a/moses/TranslationModel/UG/mm/ug_phrasepair.h +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -1,243 +1,13 @@ //-*- c++ -*- #pragma once -#include "ug_bitext.h" -using namespace ugdiss; -using namespace std; +// using namespace ugdiss; +// using namespace std; -namespace Moses { - namespace bitext - { +// namespace Moses { +// namespace bitext +// { - template - string - toString(TokenIndex const& V, Token const* x, size_t const len) - { - if (!len) return ""; - UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!"); - ostringstream buf; - buf << V[x->id()]; - size_t i = 1; - for (x = x->next(); x && i < len; ++i, x = x->next()) - buf << " " << V[x->id()]; - UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!"); - return buf.str(); - } - template - class - PhrasePair - { - public: - Token const* start1; - Token const* start2; - uint32_t len1; - uint32_t len2; - // uint64_t p1, p2; - uint32_t raw1,raw2,sample1,sample2,good1,good2,joint; - vector fvals; - float dfwd[po_other+1]; // distortion counts // counts or probs? - float dbwd[po_other+1]; // distortion counts - vector aln; - float score; - PhrasePair() { }; - PhrasePair(PhrasePair const& o); - - PhrasePair const& operator+=(PhrasePair const& other); - - bool operator<(PhrasePair const& other) const; - bool operator>(PhrasePair const& other) const; - bool operator<=(PhrasePair const& other) const; - bool operator>=(PhrasePair const& other) const; - - void init(); - void init(Token const* x, uint32_t const len, - pstats const* ps = NULL, size_t const numfeats=0); - - // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats); - // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, - // size_t const numfeats); - - // PhrasePair const& - // update(uint64_t const pid2, size_t r2 = 0); - - PhrasePair const& - update(Token const* x, uint32_t const len, jstats const& js); - - // PhrasePair const& - // update(uint64_t const pid2, jstats const& js1, jstats const& js2); - - // PhrasePair const& - // update(uint64_t const pid2, size_t const raw2extra, jstats const& js); - - // float - // eval(vector const& w); - - class SortByTargetIdSeq - { - public: - int cmp(PhrasePair const& a, PhrasePair const& b) const; - bool operator()(PhrasePair const& a, PhrasePair const& b) const; - }; - }; - - template - void - PhrasePair:: - init(Token const* x, uint32_t const len, - pstats const* ps, size_t const numfeats) - { - start1 = x; len1 = len; - // p1 = pid1; - // p2 = 0; - if (ps) - { - raw1 = ps->raw_cnt; - sample1 = ps->sample_cnt; - good1 = ps->good; - } - else raw1 = sample1 = good1 = 0; - joint = 0; - good2 = 0; - sample2 = 0; - raw2 = 0; - fvals.resize(numfeats); - } - - template - PhrasePair const& - PhrasePair:: - update(Token const* x, uint32_t const len, jstats const& js) - { - // p2 = pid2; - start2 = x; len2 = len; - raw2 = js.cnt2(); - joint = js.rcnt(); - assert(js.aln().size()); - if (js.aln().size()) - aln = js.aln()[0].second; - float total_fwd = 0, total_bwd = 0; - for (int i = po_first; i <= po_other; i++) - { - PhraseOrientation po = static_cast(i); - total_fwd += js.dcnt_fwd(po)+1; - total_bwd += js.dcnt_bwd(po)+1; - } - - // should we do that here or leave the raw counts? - for (int i = po_first; i <= po_other; i++) - { - PhraseOrientation po = static_cast(i); - dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; - dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; - } - - return *this; - } - - template - bool - PhrasePair:: - operator<(PhrasePair const& other) const - { return this->score < other.score; } - - template - bool - PhrasePair:: - operator>(PhrasePair const& other) const - { return this->score > other.score; } - - template - bool - PhrasePair:: - operator<=(PhrasePair const& other) const - { return this->score <= other.score; } - - template - bool - PhrasePair:: - operator>=(PhrasePair const& other) const - { return this->score >= other.score; } - - template - PhrasePair const& - PhrasePair:: - operator+=(PhrasePair const& o) - { - raw1 += o.raw1; - raw2 += o.raw2; - sample1 += o.sample1; - sample2 += o.sample2; - good1 += o.good1; - good2 += o.good2; - joint += o.joint; - return *this; - } - - template - PhrasePair:: - PhrasePair(PhrasePair const& o) - : start1(o.start1) - , start2(o.start2) - , len1(o.len1) - , len2(o.len2) - , raw1(o.raw1) - , raw2(o.raw2) - , sample1(o.sample1) - , sample2(o.sample2) - , good1(o.good1) - , good2(o.good2) - , joint(o.joint) - , fvals(o.fvals) - , aln(o.aln) - , score(o.score) - { - for (size_t i = 0; i <= po_other; ++i) - { - dfwd[i] = o.dfwd[i]; - dbwd[i] = o.dbwd[i]; - } - } - - template - int - PhrasePair:: - SortByTargetIdSeq:: - cmp(PhrasePair const& a, PhrasePair const& b) const - { - size_t i = 0; - Token const* x = a.start2; - Token const* y = b.start2; - while (i < a.len2 && i < b.len2 && x->id() == y->id()) - { - x = x->next(); - y = y->next(); - ++i; - } - if (i == a.len2 && i == b.len2) return 0; - if (i == a.len2) return -1; - if (i == b.len2) return 1; - return x->id() < y->id() ? -1 : 1; - } - - template - bool - PhrasePair:: - SortByTargetIdSeq:: - operator()(PhrasePair const& a, PhrasePair const& b) const - { - return this->cmp(a,b) < 0; - } - - template - void - PhrasePair:: - init() - { - len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; - start1 = start2 = NULL; - } - - - } // namespace bitext -} // namespace Moses +// } // namespace bitext +// } // namespace Moses diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index e59d4c61a8..459c64fa19 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -796,34 +796,6 @@ namespace Moses assert(this->refCount == 0); } - template - void - expand(typename Bitext::iter const& m, - Bitext const& bt, - pstats const& ps, vector >& dest) - { - dest.reserve(ps.trg.size()); - PhrasePair pp; - pp.init(m.getToken(0), m.size(), &ps, 0); - // cout << HERE << " " << toString(*(bt.V1),pp.start1,pp.len1) << endl; - pstats::trg_map_t::const_iterator a; - for (a = ps.trg.begin(); a != ps.trg.end(); ++a) - { - uint32_t sid,off,len; - parse_pid(a->first, sid, off, len); - pp.update(bt.T2->sntStart(sid)+off, len, a->second); - dest.push_back(pp); - } - typename PhrasePair::SortByTargetIdSeq sorter; - sort(dest.begin(), dest.end(),sorter); -#if 0 - BOOST_FOREACH(PhrasePair const& p, dest) - cout << toString (*bt.V1,p.start1,p.len1) << " ::: " - << toString (*bt.V2,p.start2,p.len2) << " " - << p.joint << endl; -#endif - } - // This is not the most efficient way of phrase lookup! TargetPhraseCollection const* Mmsapt:: @@ -889,9 +861,18 @@ namespace Moses if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(mdyn); vector > ppfix,ppdyn; - if (sfix) expand(mfix, btfix, *sfix, ppfix); - if (sdyn) expand(mdyn, *dyn, *sdyn, ppdyn); - + PhrasePair::SortByTargetIdSeq sort_by_tgt_id; + if (sfix) + { + expand(mfix, btfix, *sfix, ppfix); + sort(ppfix.begin(), ppfix.end(),sort_by_tgt_id); + } + if (sdyn) + { + expand(mdyn, *dyn, *sdyn, ppdyn); + sort(ppdyn.begin(), ppdyn.end(),sort_by_tgt_id); + } + // now we have two lists of Phrase Pairs, let's merge them TargetPhraseCollectionWrapper* ret; ret = new TargetPhraseCollectionWrapper(revision,phrasekey); From cef646098121a7b4ebf3fcedf6e65ad99f1b2df2 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 30 Aug 2014 07:36:37 +0100 Subject: [PATCH 10/13] Initial check-in. --- .../UG/generic/stringdist/ug_stringdist.cc | 434 +++++++++ .../UG/generic/stringdist/ug_stringdist.h | 87 ++ moses/TranslationModel/UG/mm/ug_lru_cache.h | 106 +++ moses/TranslationModel/UG/try-align2.cc | 886 ++++++++++++++++++ 4 files changed, 1513 insertions(+) create mode 100644 moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc create mode 100644 moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h create mode 100644 moses/TranslationModel/UG/mm/ug_lru_cache.h create mode 100644 moses/TranslationModel/UG/try-align2.cc diff --git a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc new file mode 100644 index 0000000000..4b61ecd60e --- /dev/null +++ b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc @@ -0,0 +1,434 @@ +#include +#include +#include +#include +#include "ug_stringdist.h" +// string distance measures +// Code by Ulrich Germann + +namespace stringdist +{ + + UErrorCode strip_accents(UnicodeString & trg) + { + UErrorCode status = U_ZERO_ERROR; + static Transliterator *stripper + = Transliterator::createInstance("NFD; [:M:] Remove; NFC", + UTRANS_FORWARD, status); + stripper->transliterate(trg); + return status; + } + + char const* + StringDiff:: + Segment:: + elabel[] = { "same", "cap", "flip", "permutation", + "accent", "duplication", + "insertion", "deletion", + "mismatch", "noinit" }; + + StringDiff:: + StringDiff() + {} + + StringDiff:: + StringDiff(string const& a, string const& b) + { + set_a(a); + set_b(b); + align(); + } + + StringDiff:: + Segment:: + Segment() + : start_a(-1), end_a(-1), start_b(-1), end_b(-1), match(noinit), dist(0) + {} + + UnicodeString const& + StringDiff:: + set_a(string const& a) + { + this->a = a.c_str(); + return this->a; + } + + UnicodeString const& + StringDiff:: + set_b(string const& b) + { + this->b = b.c_str(); + return this->b; + } + + UnicodeString const& + StringDiff:: + get_a() const + { + return this->a; + } + + UnicodeString const& + StringDiff:: + get_b() const + { + return this->b; + } + + size_t + StringDiff:: + size() + { + return this->difflist.size(); + } + + // float + // StringDiff:: + // levelshtein(bool force) + // { + // align(force); + // float ret = 0; + // for (size_t i = 0; i < difflist.size(); +++i) + // { + // Segment const& s = difflist[i]; + // if (s.match == same) continue; + // else if (s.match == insertion) ret += s.end_b - s.start_b; + // else if (s.match == deletion) ret += s.end_a - s.start_a; + + // } + // } + + void + StringDiff:: + fillAlignmentMatrix(vector > & M) const + { + assert(a.length() && b.length()); + M.assign(a.length(),vector(b.length(),0)); + int i = 0,j; + while (i < b.length() && b[i] != a[0]) ++i; + while (i < b.length()) M[0][i++] = 1; + i = 0; + while (i < a.length() && a[i] != b[0]) ++i; + while (i < a.length()) M[i++][0] = 1; + for (i = 1; i < a.length(); ++i) + { + for (j = 1; j < b.length(); ++j) + { + float & s = M[i][j]; + s = max(M[i-1][j],M[i][j-1]); + if (a[i] == b[j]) + s = max(s,M[i-1][j-1] + 1 + (a[i-1] == b[j-1] ? .1f : 0)); + } + } +#if 0 + string abuf,bbuf; + a.toUTF8String(abuf); + b.toUTF8String(bbuf); + cout << " " << bbuf[0]; + for (int x = 1; x < b.length(); ++x) + cout << " " << bbuf[x]; + cout << endl; + for (int x = 0; x < a.length(); ++x) + { + cout << abuf[x] << " "; + for (int y = 0; y < b.length(); ++y) + cout << int(M[x][y]) << " "; + cout << endl; + } +#endif + } + + float + fillAlignmentMatrix(UChar const* a, size_t const lenA, + UChar const* b, size_t const lenB, + vector > & M) + { + M.assign(lenA,vector(lenB,0)); + assert(lenA); assert(lenB); + size_t i = 0; + while (i < lenB && b[i] != a[0]) ++i; + while (i < lenB) M[0][i++] = 1; + i = 0; + while (i < lenA && a[i] != b[0]) ++i; + while (i < lenA) M[i++][0] = 1; + for (i = 1; i < lenA; ++i) + { + for (size_t j = 1; j < lenB; ++j) + { + float & s = M[i][j]; + s = max(M[i-1][j], M[i][j-1]); + if (a[i] == b[j]) + s = max(s, M[i-1][j-1] + 1); + } + } + return M.back().back(); + } + + float + levenshtein(UChar const* a, size_t const lenA, + UChar const* b, size_t const lenB) + { + vector > M; + fillAlignmentMatrix(a,lenA,b,lenB,M); + size_t ret = 0; +#define DEBUGME 0 +#if DEBUGME + for (size_t i = 0; i < M.size(); ++i) + { + for (size_t j = 0; j < M[i].size(); ++j) + cout << M[i][j] << " "; + cout << endl; + } + cout << string(25,'-') << endl; +#endif + + int i = M.size() -1; + int j = M.back().size() -1; + int I=i, J=j; + for (;i >= 0 || j >= 0; --i, --j) + { + I=i, J=j; + if (j>=0) while (i > 0 && M[i-1][j] == M[i][j]) --i; + if (i>=0) while (j > 0 && M[i][j-1] == M[i][j]) --j; + size_t ilen = I >= 0 ? I - i : 0; + size_t jlen = J >= 0 ? J - j : 0; + ret += max(ilen,jlen); +#if DEBUGME + cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl; +#endif + I=i, J=j; + } + size_t ilen = I >= 0 ? I - i : 0; + size_t jlen = J >= 0 ? J - j : 0; + ret += max(ilen,jlen); +#if DEBUGME + cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl; +#endif + return ret; + } + + + + StringDiff:: + Segment:: + Segment(size_t const as, size_t const ae, + size_t const bs, size_t const be, + UnicodeString const& a, + UnicodeString const& b) + { + dist = 0; + start_a = as; end_a = ae; + start_b = bs; end_b = be; + if (as == ae) + match = bs == be ? same : insertion; + else if (bs == be) + match = deletion; + else if (be-bs != ae-as) + { + match = mismatch; + dist = stringdist::levenshtein(a.getBuffer() + as, ae - as, + b.getBuffer() + bs, be - bs); + } + else + { + match = same; + size_t stop = ae-as; + for (size_t i = 0; i < stop && match == same; ++i) + if (a[as+i] != b[bs+i]) match = mismatch; + if (match == mismatch) + { + if (ae-as == 2 && a[as] == b[bs+1] && a[as+1] == b[bs]) + match = flip; + else + { + vector x(a.getBuffer() + as, a.getBuffer() + ae); + vector y(b.getBuffer() + bs, b.getBuffer() + be); + sort(x.begin(),x.end()); + sort(y.begin(),y.end()); + if (x == y) match = permutation; + else dist = stringdist::levenshtein(a.getBuffer() + as, ae - as, + b.getBuffer() + bs, be - bs); + } + } + } + if (match == insertion) + { + dist = be-bs; + } + else if (match == deletion) + { + dist = ae-as; + } + else if (match == flip) dist = 1; + else if (match == permutation) dist = ae-as-1; + if (match == mismatch) + { + UnicodeString ax(a,as,ae-as); + UnicodeString bx(b,bs,be-bs); + if (ax.toLower() == bx.toLower()) + match = cap; + else + { + strip_accents(ax); + strip_accents(bx); + if (ax == bx) match = accent; + } + } + } + + size_t + StringDiff:: + align(bool force) + { + if (force) difflist.clear(); + if (difflist.size()) return 0; + vector > M; + fillAlignmentMatrix(M); + // now backtrack + int i = a.length() - 1; + int j = b.length() - 1; + vector A(a.length(), -1); + vector B(b.length(), -1); + while (i + j) + { + while (i && M[i-1][j] == M[i][j]) --i; + while (j && M[i][j-1] == M[i][j]) --j; + if (a[i] == b[j]) { A[i] = j; B[j] = i; } + if (i) --i; + if (j) --j; + } + i = a.length() - 1; + j = b.length() - 1; + vector A2(a.length(), -1); + vector B2(b.length(), -1); + while (i + j) + { + while (j && M[i][j-1] == M[i][j]) --j; + while (i && M[i-1][j] == M[i][j]) --i; + if (a[i] == b[j]) { A2[i] = j; B2[j] = i; } + if (i) --i; + if (j) --j; + } + for (size_t k = 0; k < A.size(); ++k) + A[k] = min(A[k],A2[k]); + for (size_t k = 0; k < B.size(); ++k) + B[k] = min(B[k],B2[k]); + + if (a[i] == b[j]) { A[i] = j; B[j] = i; } + i = 0; + j = 0; + size_t I, J; + while (i < a.length() and j < b.length()) + { + if (A[i] < 0) + { + I = i + 1; + while (I < A.size() and A[I] < 0) ++I; + if (i) + { for (J = j = A[i-1]+1; J < B.size() && B[J] < 0; ++J); } + else if (I < A.size()) + { for (j = J = A[I]; j && B[j-1] < 0; --j); } + else J = B.size(); + difflist.push_back(Segment(i,I,j,J,a,b)); + i = I; j = J; + } + else if (B[j] < 0) + { + for (J = j + 1; J < B.size() && B[J] < 0; ++J); + difflist.push_back(Segment(i,i,j,J,a,b)); + j = J; + } + else + { + I = i; + J = j; + while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0) + { ++I; ++J; } + difflist.push_back(Segment(i,I,j,J,a,b)); + i = I; j = J; + } + } + if (i < a.length() || j < b.length()) + difflist.push_back(Segment(i,a.length(),j,b.length(),a,b)); + + diffcnt.assign(noinit,0); + for (size_t i = 0; i < difflist.size(); ++i) + { + Segment & s = difflist[i]; + if (s.match == insertion and + ((s.start_a and a[s.start_a - 1] == b[s.start_b]) or + (s.end_a < a.length() and a[s.end_a] == b[s.start_b]))) + { + bool sameletter = true; + for (int i = s.start_b + 1; sameletter and i < s.end_b; ++i) + sameletter = b[i] == b[i-1]; + if (sameletter) s.match = duplication; + } + else if (s.match == deletion and + ((s.start_b and b[s.start_b - 1] == a[s.start_a]) or + (s.end_b < b.length() and b[s.end_b] == a[s.start_a]))) + { + bool sameletter = true; + for (int i = s.start_a + 1; sameletter and i < s.end_a; ++i) + sameletter = a[i] == a[i-1]; + if (sameletter) s.match= duplication; + } + ++diffcnt[s.match]; + } + return 0; + } + + void + StringDiff:: + showDiff(std::ostream& out) + { + if (difflist.size() == 0) align(); + vector fromEnd(difflist.size(),0); + for (int d = difflist.size()-1; d-- > 0;) + { + fromEnd[d] = a.length() - difflist[d].end_a; + // cout << d << " " << fromEnd[d] << " " + // << difflist[d].start_a << "-" + // << difflist[d].end_a << endl; + } + for (size_t d = 0; d < difflist.size(); ++d) + { + Segment const& s = difflist[d]; + UnicodeString aseg,bseg; + a.extract(s.start_a, s.end_a - s.start_a, aseg); + b.extract(s.start_b, s.end_b - s.start_b, bseg); + string abuf,bbuf; + aseg.toUTF8String(abuf); + bseg.toUTF8String(bbuf); + out << abuf << " "; + out << bbuf << " "; + out << s.label() << " " + << s.dist << " " + << fromEnd[d] + << endl; + } + } + + char const* + StringDiff:: + Segment:: + label() const + { + return elabel[this->match]; + } + + StringDiff::Segment const& + StringDiff:: + operator[](uint32_t const i) const + { + return difflist.at(i); + } + + vector const& + StringDiff:: + getFeatures() const + { + return diffcnt; + } + +} diff --git a/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h new file mode 100644 index 0000000000..43fb089f18 --- /dev/null +++ b/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h @@ -0,0 +1,87 @@ +//-*- c++ -*- +#pragma once + +// string distance measures +// Code by Ulrich Germann +#include + + +#include +#include +#include +#include +#include +#include +#include + +#include "moses/TranslationModel/UG/mm/tpt_typedefs.h" + + +using namespace std; +//using namespace boost; +using namespace ugdiss; + +namespace stringdist +{ + float + levenshtein(UChar const* a, size_t const lenA, + UChar const* b, size_t const lenB); + + UErrorCode strip_accents(UnicodeString & trg); + + float + fillAlignmentMatrix(UChar const* a, size_t const lenA, + UChar const* b, size_t const lenB, + vector > & M); + + class StringDiff + { + public: + enum MATCHTYPE + { + same, // a and b are identical + cap, // a and b differ only in capitalization + flip, // two-letter flip + permutation, // a and b have same letters but in different order + accent, // a and b are the same basic letters, ignoring accents + duplication, // a is empty + insertion, // a is empty + deletion, // b is empty + mismatch, // none of the above + noinit // not initialized + }; + + struct Segment + { + static char const* elabel[]; + int start_a, end_a; + int start_b, end_b; + MATCHTYPE match; + float dist; + Segment(); + Segment(size_t const as, size_t const ae, + size_t const bs, size_t const be, + UnicodeString const& a, + UnicodeString const& b); + char const* label() const; + }; + private: + UnicodeString a,b; + vector difflist; + vector diffcnt; + public: + UnicodeString const& set_a(string const& a); + UnicodeString const& set_b(string const& b); + UnicodeString const& get_a() const; + UnicodeString const& get_b() const; + StringDiff(string const& a, string const& b); + StringDiff(); + size_t size(); + size_t align(bool force=false); // returns the levenshtein distance + void showDiff(std::ostream& out); + float levenshtein(); + Segment const& operator[](uint32_t i) const; + void fillAlignmentMatrix(vector > & M) const; + vector const& getFeatures() const; + }; +} diff --git a/moses/TranslationModel/UG/mm/ug_lru_cache.h b/moses/TranslationModel/UG/mm/ug_lru_cache.h new file mode 100644 index 0000000000..d1c9a97678 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_lru_cache.h @@ -0,0 +1,106 @@ +//-*- c++ -*- +#pragma once +#include +#include +#include +#include +#include +#include + + +#ifndef sptr +#define sptr boost::shared_ptr +#endif + +namespace lru_cache +{ + using namespace std; + using namespace boost; + + template + class LRU_Cache + { + public: + typedef unordered_map map_t; + private: + struct Record + { + uint32_t prev,next; + KEY key; + // timeval tstamp; // time stamp + typename boost::shared_ptr ptr; // cached shared ptr + }; + + mutable boost::shared_mutex m_lock; + uint32_t m_qfront, m_qback; + vector m_recs; + map_t m_idx; + + void + update_queue(KEY const& key, uint32_t const p) + { + // CALLER MUST LOCK! + // "remove" item in slot p from it's current position of the + // queue (which is different from the slot position) and move it + // to the end + Record& r = m_recs[p]; + if (m_recs.size() == 1) + r.next = r.prev = m_qback = m_qfront = 0; + + if (r.key != key || p == m_qback) return; + + if (m_qfront == p) + m_qfront = m_recs[r.next].prev = r.next; + else + { + m_recs[r.prev].next = r.next; + m_recs[r.next].prev = r.prev; + } + r.prev = m_qback; + m_recs[r.prev].next = m_qback = r.next = p; + } + + public: + LRU_Cache(size_t capacity=1) : m_qfront(0), m_qback(0) { reserve(capacity); } + size_t capacity() const { return m_recs.capacity(); } + void reserve(size_t s) { m_recs.reserve(s); } + + sptr + get(KEY const& key) + { + uint32_t p; + { // brackets needed for lock scoping + boost::shared_lock rlock(m_lock); + typename map_t::const_iterator i = m_idx.find(key); + if (i == m_idx.end()) return sptr(); + p = i->second; + } + boost::lock_guard guard(m_lock); + update_queue(key,p); + return m_recs[p].ptr; + } + + void + set(KEY const& key, sptr const& ptr) + { + boost::lock_guard lock(m_lock); + pair foo; + foo = m_idx.insert(make_pair(key,m_recs.size())); + + uint32_t p = foo.first->second; + if (foo.second) // was not in the cache + { + if (m_recs.size() < m_recs.capacity()) + m_recs.push_back(Record()); + else + { + foo.first->second = p = m_qfront; + m_idx.erase(m_recs[p].key); + } + m_recs[p].key = key; + } + update_queue(key,p); + m_recs[p].ptr = ptr; + } + }; +} diff --git a/moses/TranslationModel/UG/try-align2.cc b/moses/TranslationModel/UG/try-align2.cc new file mode 100644 index 0000000000..68078a2fb5 --- /dev/null +++ b/moses/TranslationModel/UG/try-align2.cc @@ -0,0 +1,886 @@ +#include "mm/ug_bitext.h" +#include +// #include +#include +#include +#include +#include +#include +#include "moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h" + +using namespace std; +using namespace Moses; +using namespace ugdiss; +using namespace Moses::bitext; + +typedef L2R_Token Token; +typedef mmTtrack ttrack_t; +typedef mmTSA tsa_t; +typedef vector > pplist_t; +typedef pair span_t; + +TokenIndex V1,V2; +boost::shared_ptr T1,T2; +tsa_t I1,I2; +mmBitext BT; + +float lbop_level = .05; +#define smooth 1 +namespace stats +{ + using namespace Moses::bitext; + float + pmi(size_t j,size_t m1, size_t m2, size_t N) + { +#if smooth + float p1 = lbop(N,m1,lbop_level); + float p2 = lbop(N,m2,lbop_level); + float p12 = lbop(N,j,lbop_level); + return log(p12) - log(p1) - log(p2); +#else + return log(j) + log(N) - log(m1) - log(m2); +#endif + } + + float + npmi(size_t j,size_t m1, size_t m2, size_t N) + { +#if smooth + // cout << j << " " << m1 << " " << m2 << " " << N << endl; + float p1 = lbop(N,m1,lbop_level); + float p2 = lbop(N,m2,lbop_level); + float p12 = lbop(N,j,lbop_level); + return (log(p12) - log(p1) - log(p2)) / -log(p12); +#else + return pmi(j,m1,m2,N) / (log(N) - log(j)); +#endif + } + + float + mi(size_t j,size_t m1, size_t m2, size_t N) + { + float ret = 0; + if (j) ret += float(j)/N * pmi(j,m1,m2,N); + if (m1>j) ret += float(m1-j)/N * pmi(m1-j,m1,N-m2,N); + if (m2>j) ret += float(m2-j)/N * pmi(m2-j,N-m1,m2,N); + if (N>m1+m2-j) ret += float(N-m1-m2+j)/N * pmi(N-m1-m2+j,N-m1,N-m2,N); + return ret; + } +} + +struct SinglePhrase +{ + typedef map > cache_t; + uint64_t pid; // phrase id + vector occs; // occurrences +}; + + +struct PhrasePair2 +{ + struct score_t; + uint64_t p1,p2; + ushort s1,e1,s2,e2; + int parent; + + struct stats_t + { + typedef map, sptr > cache_t; + size_t m1,m2,j; + float npmi; // normalized point-wise mutual information + float pmi; // point-wise mutual information + float mi; // mutual information + float score; + + void + set(vector const& o1, + vector const& o2, + size_t const N) + { + m1 = m2 = j = 0; + size_t i1=0,i2=0; + while (i1 < o1.size() && i2 < o2.size()) + { + if (i1 && o1[i1].sid == o1[i1-1].sid) { ++i1; continue; } + if (i2 && o2[i2].sid == o2[i2-1].sid) { ++i2; continue; } + + if (o1[i1].sid == o2[i2].sid) { ++j; ++i1; ++i2; ++m1; ++m2; } + else if (o1[i1].sid < o2[i2].sid) { ++i1; ++m1; } + else { ++i2; ++m2; } + } + // for (++i1; i1 < o1.size(); ++i1) + // if (o1[i1-1].sid != o1[i1].sid) ++m1; + // for (++i2; i2 < o2.size(); ++i2) + // if (o2[i2-1].sid != o2[i2].sid) ++m2; + + m1 = 1; m2 = 1; + for (i1=1; i1 < o1.size(); ++i1) + if (o1[i1-1].sid != o1[i1].sid) ++m1; + for (i2=1; i2 < o2.size(); ++i2) + if (o2[i2-1].sid != o2[i2].sid) ++m2; + + this->mi = stats::mi(j,m1,m2,N); + this->pmi = stats::pmi(j,m1,m2,N); + this->npmi = stats::npmi(j,m1,m2,N); + // float z = float(m1)/N * float(m2)/N; + // float hmean = 2.*j/(m1+m2); + this->score = npmi; // npmi; // hmean; // /sqrt(z); + } + } stats; + + PhrasePair2(ushort s1_=0, ushort e1_=0, ushort s2_=0, ushort e2_=0) + : s1(s1_), e1(e1_), s2(s2_), e2(e2_), parent(-1) { } + + + bool + operator<(PhrasePair2 const& other) const + { + return (this->stats.score == other.stats.score + ? (e1-s1 + e2-s2 > other.e1-other.s1 + other.e2-other.s2) + : (this->stats.score > other.stats.score)); + } + + size_t len1() const { return e1 - s1; } + size_t len2() const { return e2 - s2; } + bool includes(PhrasePair2 const& o) const + { + return s1 <= o.s1 && e1 >= o.e1 && s2 <= o.s2 && e2 >= o.e2; + } + +}; + +SinglePhrase::cache_t cache1,cache2; +PhrasePair2::stats_t::cache_t ppcache; + + +struct SortByPositionInCorpus +{ + bool + operator()(ttrack::Position const& a, + ttrack::Position const& b) const + { + return a.sid != b.sid ? a.sid < b.sid : a.offset < b.offset; + } +}; + + +void +getoccs(tsa_t::tree_iterator const& m, + vector& occs) +{ + occs.clear(); + occs.reserve(m.approxOccurrenceCount()+10); + tsa::ArrayEntry I(m.lower_bound(-1)); + char const* stop = m.upper_bound(-1); + do { + m.root->readEntry(I.next,I); + occs.push_back(I); + } while (I.next != stop); + sort(occs.begin(),occs.end(),SortByPositionInCorpus()); +} + +void +lookup_phrases(vector const& snt, + TokenIndex& V, ttrack_t const& T, + tsa_t const& I, SinglePhrase::cache_t& cache, + vector > >& dest) +{ + dest.resize(snt.size()); + for (size_t i = 0; i < snt.size(); ++i) + { + tsa_t::tree_iterator m(&I); + dest[i].clear(); + for (size_t k = i; k < snt.size() && m.extend(snt[k]); ++k) + { + if (m.approxOccurrenceCount() < 3) break; + // if (k - i > 0) break; + sptr& o = cache[m.getPid()]; + if (!o) + { + o.reset(new SinglePhrase()); + o->pid = m.getPid(); + getoccs(m,o->occs); + } + dest[i].push_back(o); + } + } +} + + +struct +RowIndexSorter +{ + vector > const& M; + size_t const my_col; + RowIndexSorter(vector > const& m, size_t const c) + : M(m), my_col(c) { } + + template + bool + operator()(T const& a, T const& b) const + { + return M.at(a).at(my_col) > M.at(b).at(my_col); + } +}; + +struct +ColIndexSorter +{ + vector > const& M; + size_t const my_row; + ColIndexSorter(vector > const& m, size_t const r) + : M(m), my_row(r) { } + + template + bool + operator()(T const& a, T const& b) const + { + return M.at(my_row).at(a) > M[my_row].at(b); + } + +}; + +template +class +npmi_scorer1 : public Moses::bitext::PhrasePair::Scorer +{ +public: + float operator()(PhrasePair& pp) const + { +#if 0 + cout << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " + << pp.raw2 << " " << pp.sample2 << " " << pp.good2 << " " + << pp.joint << " " << __FILE__ << ":" << __LINE__ << endl; +#endif + pp.good2 = ceil(pp.raw2 * float(pp.good1)/pp.raw1); + size_t N = ceil(BT.T1->numTokens() * float(pp.good1)/pp.raw1); + return pp.score = stats::npmi(pp.joint,pp.good1,pp.good2,N); + } +}; + + +class Alnhyp +{ + ushort s1,s2,e1,e2; + float score; +}; + + +size_t +lcs(string const a, string const b) +{ + using namespace stringdist; + if (a == b) return a.size(); + StringDiff diff(a,b); + size_t ret = 0; + size_t len = 0; + // size_t th = min(size_t(4),min(a.size(),b.size())); + for (size_t i = 0; i < diff.size(); ++i) + { + StringDiff::Segment const& s = diff[i]; + if (s.match != StringDiff::same && s.match != StringDiff::cap) + { + if (len > ret) ret = len; + len = 0; + continue; + } + len += s.end_a - s.start_a; + } + if (len > ret) ret = len; + return ret; +} + +size_t +mapstring(string const& utf8, + UnicodeString& U, + vector& c2w, + vector* wlen=NULL) +{ + static UChar space = UnicodeString(" ")[0]; + assert(utf8.at(0) != ' '); + U = UnicodeString(utf8.c_str()).toLower(); + stringdist::strip_accents(U); + c2w.assign(U.length(),-1); + size_t k = 0; + size_t z = 0; + for (int i = 0; i < U.length(); ++i) + if (U[i] == space) { if (wlen) wlen->push_back(i-z); z = ++k; } + else c2w[i] = k; + assert(c2w.back() >= 0); + if (wlen) wlen->push_back(U.length()-z); + return k+1; +} + +void +align_letters(UnicodeString const& A, vector const& a2p, + UnicodeString const& B, vector const& b2p, + vector >& W) +{ + vector > M(A.length(),vector(B.length(),0)); + for (int a = 0; a < A.length(); ++a) + { + for (int b = 0; b < B.length(); ++b) + { + if (A[a] != B[b] || a2p[a] < 0 || b2p[b] < 0) + continue; + M[a][b] = (a && b) ? M[a-1][b-1] + 1 : 1; + int& x = W[a2p[a]][b2p[b]]; + x = max(x,M[a][b]); + } + } + // string astring; A.toUTF8String(astring); + // string bstring; B.toUTF8String(bstring); + // cout << astring << "\n" << bstring << endl; + // for (size_t r = 0; r < W.size(); ++r) + // { + // BOOST_FOREACH(int x, W[r]) cout << setw(3) << x; + // cout << endl; + // } +} + +void +map_back(vector > const& W, + vector > & X, + vector const & aln) +{ + for (size_t i = 0; i < aln.size(); i += 2) + { + vector const& w = W.at(aln[i+1]); + vector& x = X.at(aln[i]); + assert(x.size() == w.size()); + for (size_t k = 0; k < x.size(); ++k) + x[k] = max(w[k],x[k]); + } +} + + +void trymatch3(vector > const& tcands, + UnicodeString const& T, size_t const tlen, + vector const& t2p, + TokenIndex const& V2, vector >&X) +{ + BOOST_FOREACH(PhrasePair const& pp, tcands) + { + UnicodeString H; vector h2p; + string hstring = toString(V2, pp.start2, pp.len2); + size_t hlen = mapstring(hstring,H,h2p); + vector > W(hlen,vector(tlen,0)); + align_letters(H, h2p, T, t2p, W); +#if 0 + string s; S.toUTF8String(s); + string h; H.toUTF8String(h); + string t; T.toUTF8String(t); + cout << s << endl << h << endl << t << endl; + cout << slen << " " << tlen << endl; + cout << "W: " << W.size() << " rows; " << W[0].size() << " cols" << endl; + cout << "X: " << X.size() << " rows; " << X[0].size() << " cols" << endl; + cout << "aln: "; + for (size_t a = 0; a < pp.aln.size(); a +=2) + cout << int(pp.aln[a]) << "-" << int(pp.aln[a+1]) << " "; + cout << endl; +#endif + map_back(W,X,pp.aln); + } +} + +void minmatch_filter(vector > & X, + vector const& len1, + vector const& len2) +{ + // compute marginals + vector m1(X.size(),0), m2(X.at(0).size(),0); + for (size_t r = 0; r < X.size(); ++r) + for (size_t c = 0; c < X[r].size(); ++c) + { + if (X[r][c] == 0) continue; + m1[r] += X[r][c]; + m2[c] += X[r][c]; + } + + bool sth_changed = true; + while (sth_changed) + { + sth_changed = false; + for (size_t r = 0; r < m1.size(); ++r) + { + if (m1[r] && m1[r] < max(2,min(5,len1[r]/2))) + { + sth_changed = true; + for (size_t c = 0; c < X[r].size(); ++c) + { + m2[c] -= X[r][c]; + X[r][c] = 0; + } + m1[r] = 0; + } + } + + for (size_t c = 0; c < m2.size(); ++c) + { + if (m2[c] && m2[c] < max(2,min(5,len2[c]/2))) + { + sth_changed = true; + for (size_t r = 0; r < m1.size(); ++r) + { + m1[r] -= X[r][c]; + X[r][c] = 0; + } + m2[c] = 0; + } + } + } +} + + +void +trymatch2(TokenIndex& V1, // source language vocab + TokenIndex& V2, // target language vocab + string const& source, // source phrase + string const& target, // observed target candidate + vector > const* const tcands, + vector >& X) // destination alignment matrix + // tcands: translations for source +{ + UnicodeString S,T; + vector t2p, s2p; // maps from character position in string to word pos. + vector wlen_t, wlen_s; // individual word lengths + size_t slen = mapstring(source, S, s2p, &wlen_s); + size_t tlen = mapstring(target, T, t2p, &wlen_t); + + X.assign(slen,vector(tlen,0)); + if (slen == 1 && tlen ==1 && S == T) + X[0][0] = S.length(); + else + { + align_letters(S,s2p,T,t2p,X); + if (tcands) trymatch3(*tcands, T, tlen, t2p, V2, X); + } + + minmatch_filter(X, wlen_s, wlen_t); + bool hit = false; + for (size_t r = 0; !hit && r < X.size(); ++r) + for (size_t c = 0; !hit && c < X[r].size(); ++c) + hit = X[r][c] > min(S.length(),T.length())/2; + + // if (hit) + // { + // cout << source << " ::: " << target; + // for (size_t r = 0; r < X.size(); ++r) + // for (size_t c = 0; c < X[r].size(); ++c) + // cout << boost::format(" %u-%u:%d") % r % c % X[r][c]; + // cout << endl; + // } +} + + + +// float +// trymatch(string const a, string const b, +// vector > const* atrans, +// vector > const* btrans) +// { +// if (a == b) return a.size(); +// float score = 0; +// float bar = lcs(a,b); +// // score = max(bar/min(a.size(),b.size()),score); +// score = max(bar,score); +// // cout << "\n[" << bar << "] " << a << " ::: " << b << endl; +// if (atrans) +// { +// BOOST_FOREACH(PhrasePair const& pp, *atrans) +// { +// // if (!pp.aln.size()) continue; +// ushort L = pp.aln[1], R = pp.aln[1]; +// for (size_t k = 3; k < pp.aln.size(); k += 2) +// { +// if (L > pp.aln[k]) L = pp.aln[k]; +// if (R < pp.aln[k]) R = pp.aln[k]; +// } +// if (L || R+1U < pp.len2) continue; +// string foo = toString(*BT.V2,pp.start2,pp.len2); +// // float bar = float(lcs(foo,b))/min(foo.size(),b.size()); +// float bar = float(lcs(foo,b)); + +// if (bar > .5) +// { +// // score = max(pp.score * bar,score); +// score = max(bar,score); +// // cout << "[" << bar << "] " << foo << " ::: " << b +// // << " (" << a << ") " << pp.score << endl; +// } +// } +// } +// if (btrans) +// { +// BOOST_FOREACH(PhrasePair const& pp, *btrans) +// { +// // if (!pp.aln.size()) continue; +// ushort L = pp.aln[1], R = pp.aln[1]; +// for (size_t k = 3; k < pp.aln.size(); k += 2) +// { +// if (L > pp.aln[k]) L = pp.aln[k]; +// if (R < pp.aln[k]) R = pp.aln[k]; +// } +// if (L || R+1U < pp.len2) continue; +// string foo = toString(*BT.V1,pp.start2,pp.len2); +// // float bar = float(lcs(a,foo))/min(a.size(),foo.size()); +// float bar = float(lcs(a,foo)); +// if (bar > .5) +// { +// score = max(bar,score); +// // cout << "[" << bar<< "] " << a << " ::: " << foo +// // << " (" << b << ") " << pp.score << endl; +// } +// } +// } +// return score; +// } + +struct ahyp +{ + ushort s1,s2,e1,e2; + float score; + bool operator<(ahyp const& o) const { return score < o.score; } + bool operator>(ahyp const& o) const { return score > o.score; } +}; + +struct AlnPoint +{ + enum status { no = 0, yes = 1, maybe = -1, undef = -7 }; + float score; + status state; + AlnPoint() : score(0), state(undef) {} +}; + +bool overlap(span_t const& a, span_t const& b) +{ + return !(a.second <= b.first || b.second <= a.first); +} + +class AlnMatrix +{ + vector A1,A2; // final alignment matrix + vector S1,S2; // shadow alignment matrix +public: + vector m1,m2; // margins + AlnMatrix(size_t const rows, size_t const cols); + bitvector const& + operator[](size_t const r) const + { return A1.at(r); } + + bool + incorporate(span_t const& rspan, span_t const& cspan, + vector const& aln, bool const flip); + + size_t size() const { return A1.size(); } +}; + +AlnMatrix:: +AlnMatrix(size_t const rows, size_t const cols) +{ + A1.assign(rows,bitvector(cols)); + S1.assign(rows,bitvector(cols)); + A2.assign(cols,bitvector(rows)); + S2.assign(cols,bitvector(rows)); + m1.assign(rows,NULL); + m2.assign(cols,NULL); +} + +bool +AlnMatrix:: +incorporate(span_t const& rspan, + span_t const& cspan, + vector const& aln, + bool const flip) +{ + for (size_t r = rspan.first; r < rspan.second; ++r) + S1[r].reset(); + for (size_t c = cspan.first; c < cspan.second; ++c) + S2[c].reset(); + if (flip) + { + for (size_t i = 0; i < aln.size(); i += 2) + { + size_t r = rspan.first + aln[i]; + size_t c = cspan.first + aln[i+1]; + S1[r].set(c); + S2[c].set(r); + } + } + else + { + for (size_t i = 0; i < aln.size(); i += 2) + { + size_t r = rspan.first + aln[i+1]; + size_t c = cspan.first + aln[i]; + S1[r].set(c); + S2[c].set(r); + } + } + // check compatibility with existing alignment + for (size_t r = rspan.first; r < rspan.second; ++r) + if (m1[r] && (*m1[r]) != S1[r]) return false; + for (size_t c = cspan.first; c < cspan.second; ++c) + if (m2[c] && (*m2[c]) != S2[c]) return false; + + // all good, add new points + for (size_t r = rspan.first; r < rspan.second; ++r) + if (!m1[r]) { A1[r] = S1[r]; m1[r] = &A1[r]; } + for (size_t c = cspan.first; c < cspan.second; ++c) + if (!m2[c]) { A2[c] = S2[c]; m2[c] = &A2[c]; } + + return true; +} + +struct alink +{ + size_t r,c,m; + bool operator<(alink const& o) const { return m < o.m; } + bool operator>(alink const& o) const { return m > o.m; } +}; + +int main(int argc, char* argv[]) +{ + string base = argc > 1 ? argv[1] : "crp/trn/mm/"; + string L1 = argc > 1 ? argv[2] : "de"; + string L2 = argc > 1 ? argv[3] : "en"; + BT.open(base,L1,L2); + BT.V1->setDynamic(true); + BT.V2->setDynamic(true); + string line1, line2; + npmi_scorer1 scorer; + while (getline(cin,line1) and getline(cin,line2)) + { + cout << "\n" << line1 << "\n" << line2 << endl; + vector snt1,snt2; + fill_token_seq(*BT.V1,line1,snt1); + fill_token_seq(*BT.V2,line2,snt2); + vector > > > > pt1,pt2; + vector > pm1,pm2; + BT.lookup(snt1,*BT.I1,pt1,&pm1,&scorer); + BT.lookup(snt2,*BT.I2,pt2,&pm2,&scorer); + + // build map from phrases to positions + typedef boost::unordered_map > + p2s_map_t; + typedef p2s_map_t::iterator p2s_iter; + p2s_map_t p2s1,p2s2; + for (ushort i = 0; i < pm1.size(); ++i) + for (ushort k = 0; k < pm1[i].size(); ++k) + p2s1[pm1[i][k]].push_back(make_pair(i,i+k+1)); + for (ushort i = 0; i < pm2.size(); ++i) + for (ushort k = 0; k < pm2[i].size(); ++k) + p2s2[pm2[i][k]].push_back(make_pair(i,i+k+1)); + + boost::unordered_map > > > all1,all2; + vector > pp_all; + for (size_t i = 0; i < pt2.size(); ++i) + for (size_t k = 0; k < pt2[i].size(); ++k) + all2[pm2[i][k]] = pt2[i][k]; + for (size_t i = 0; i < pt1.size(); ++i) + for (size_t k = 0; k < pt1[i].size(); ++k) + { + all1[pm1[i][k]] = pt1[i][k]; + BOOST_FOREACH(PhrasePair const& pp, *pt1[i][k]) + { + if (pp.score < 0) break; + if (p2s2.find(pp.p2) != p2s2.end()) + pp_all.push_back(pp); + } + } + sort(pp_all.begin(), pp_all.end(), greater >()); + vector a1(snt1.size(),-1), a2(snt2.size(),-1); + + vector R(snt1.size(),bitvector(snt2.size())); + vector C(snt2.size(),bitvector(snt1.size())); + vector myR(snt1.size(),bitvector(snt2.size())); + vector myC(snt2.size(),bitvector(snt1.size())); + vector m1(snt1.size(),NULL); + vector m2(snt2.size(),NULL); + + // vector > M(snt1.size(),vector(snt2.size())); + AlnMatrix A(snt1.size(),snt2.size()); + for (size_t p = 0; p < pp_all.size(); ++p) + { + PhrasePair const& pp = pp_all[p]; +#if 0 + cout << (boost::format("%30s ::: %-30s ") + % BT.toString(pp.p1,0).c_str() + % BT.toString(pp.p2,1).c_str()); + cout << (boost::format("%.4f [%d/%d/%d]") + % pp.score % pp.good1 % pp.joint % pp.good2); + for (size_t a = 0; a < pp.aln.size(); a += 2) + cout << " " << int(pp.aln[a]) << "-" << int(pp.aln[a+1]); + cout << endl; +#endif + + vector& v1 = p2s1[pp.p1]; + vector& v2 = p2s2[pp.p2]; + if (v1.size() == 1) + for (size_t i = v1[0].first; i < v1[0].second; ++i) + if (a1[i] < 0) a1[i] = p; + if (v2.size() == 1) + for (size_t i = v2[0].first; i < v2[0].second; ++i) + if (a2[i] < 0) a2[i] = p; + + if (v1.size() == 1 && v2.size() == 1) + A.incorporate(v1[0],v2[0],pp.aln,pp.inverse); + } + + for (size_t i = 0; i < A.size(); ++i) + { + cout << (*BT.V2)[snt1[i].id()] << ": "; + for (size_t k=A[i].find_first(); k < A[i].size(); k=A[i].find_next(k)) + cout << boost::format(" %d:%s") % k % (*BT.V2)[snt2[k].id()]; + cout << endl; + } + + + + vector > const* atrans, *btrans; + ahyp h; + vector hyps; + vector > L(snt1.size(),vector(snt2.size(),0)); + // L: matches by letter overlap + + for (h.s1 = 0; h.s1 < a1.size(); ++h.s1) + { + if (a1[h.s1] >= 0) continue; + ostringstream buf1; + for (h.e1 = h.s1; h.e1 < a1.size() && a1[h.e1] < 0; ++h.e1) + { + if (h.e1 > h.s1) + { + if (pt1[h.s1].size() + h.s1 <= h.e1) break; + buf1 << " "; + } + buf1 << (*BT.V1)[snt1[h.e1].id()]; + atrans = pt1[h.s1].size() ? pt1[h.s1].at(h.e1-h.s1).get() : NULL; + for (h.s2 = 0; h.s2 < a2.size(); ++h.s2) + { + ostringstream buf2; + if (a2[h.s2] >= 0) continue; + for (h.e2 = h.s2; h.e2 < a2.size() && a2[h.e2] < 0; ++h.e2) + { + if (h.e2 > h.s2) + { + if (pt2[h.s2].size() + h.s2 <= h.e2) break; + buf2 << " "; + } + buf2 << (*BT.V2)[snt2[h.e2].id()]; + btrans = (pt2[h.s2].size() + ? pt2[h.s2].at(h.e2-h.s2).get() + : NULL); + + vector > aln; + trymatch2(*BT.V1, *BT.V2, buf1.str(),buf2.str(), + atrans,aln); + for (size_t i = 0; i < aln.size(); ++i) + for (size_t k = 0; k < aln[i].size(); ++k) + L[h.s1+i][h.s2+k] = max(L[h.s1+i][h.s2+k],aln[i][k]); + trymatch2(*BT.V2, *BT.V1, buf2.str(),buf1.str(), + btrans,aln); + for (size_t i = 0; i < aln[0].size(); ++i) + for (size_t k = 0; k < aln.size(); ++k) + L[h.s1+i][h.s2+k] = max(L[h.s1+i][h.s2+k],aln[k][i]); + // h.score = trymatch(buf1.str(), buf2.str(), atrans, btrans); + // hyps.push_back(h); + } + } + } + } + + vector links; + + alink x; + for (x.r = 0; x.r < L.size(); ++x.r) + { + + for (x.c = 0; x.c < L[x.r].size(); ++x.c) + { + x.m = L[x.r][x.c]; + if (x.m) links.push_back(x); + } + } + + sort(links.begin(),links.end(),greater()); + + BOOST_FOREACH(alink& x, links) + { + if (L[x.r][x.c]) + { + cout << (*BT.V1)[snt1[x.r].id()] << " ::: " + << (*BT.V2)[snt2[x.c].id()] << " ::: " + << L[x.r][x.c] << endl; + } + } + + // sort(hyps.begin(),hyps.end(),greater()); + // BOOST_FOREACH(ahyp const& h, hyps) + // { + // if (h.score < .5) break; + // for (size_t i = h.s1; i <= h.e1; ++i) + // cout << i << ":" << (*BT.V1)[snt1[i].id()] << " "; + // cout << " ::: "; + // for (size_t i = h.s2; i <= h.e2; ++i) + // cout << i << ":" << (*BT.V2)[snt2[i].id()] << " "; + // cout << h.score << endl; + // } + + } +} + + +// for (size_t i = 0; i < pt1.size(); ++i) +// { +// for (size_t k = 0; k < pt1[i].size(); ++k) +// { +// size_t d1 = 0; +// bool first = true; +// BOOST_FOREACH(PhrasePair const& pt, *pt1[i][k]) +// { +// TSA::tree_iterator m(BT.I2.get(),pt.start2,pt.len2); +// if (pt.score < 0) break; +// int left = pt.aln[1], right = pt.aln[1]; +// bool match = p2s2.find(m.getPid()) != p2s2.end(); +// if (!match) +// { +// for (size_t a = 3; a < pt.aln.size(); a += 2) +// { +// if (left > pt.aln[a]) left = pt.aln[a]; +// if (right < pt.aln[a]) right = pt.aln[a]; +// } +// } +// #if 0 +// if (match) +// { +// if (first) +// { +// cout << BT.toString(pm1[i][k],0) << endl; +// first = false; +// } +// cout << boost::format("%.4f") % pt.score << " " +// << setw(5) << d1 << " " << (match ? "* " : " ") +// << toString(*BT.V2, pt.start2, pt.len2) << " [" +// << pt.good1 << "/" << pt.joint << "/" +// << pt.good2 << "]"; +// for (size_t a = 0; a < pt.aln.size(); a += 2) +// cout << " " << int(pt.aln[a]) << "-" << int(pt.aln[a+1]); +// cout << " [" << left << ":" << right << "]" << endl; +// } +// #endif +// if (!match) +// { +// if (left == 0 && pt.len2 - right == 1) +// d1 += pt.joint; +// } +// else +// { +// pp_all.push_back(pt); +// // pp_all.back().m1 -= d1; +// } + +// } +// if (!first) cout << endl; +// } + From a86d49fc88631acf7e543d0a324ee1195f2212b7 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 6 Sep 2014 03:39:23 +0100 Subject: [PATCH 11/13] Added bias to bitext sampling. --- Jamroot | 1 + moses/TranslationModel/UG/Jamfile | 10 + moses/TranslationModel/UG/mm/ug_bitext.cc | 8 +- moses/TranslationModel/UG/mm/ug_bitext.h | 88 +++++--- moses/TranslationModel/UG/mmsapt.cpp | 12 +- moses/TranslationModel/UG/mmsapt.h | 3 + .../UG/spe-check-coverage3.cc | 194 ++++++++++++++++++ moses/TranslationModel/UG/try-align2.cc | 2 +- 8 files changed, 285 insertions(+), 33 deletions(-) create mode 100644 moses/TranslationModel/UG/spe-check-coverage3.cc diff --git a/Jamroot b/Jamroot index e47e6d628e..ce3a42b9f6 100644 --- a/Jamroot +++ b/Jamroot @@ -172,6 +172,7 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses if [ option.get "with-mm" : : "yes" ] { alias mm : + moses/TranslationModel/UG//spe-check-coverage3 moses/TranslationModel/UG//spe-check-coverage2 moses/TranslationModel/UG//ptable-lookup moses/TranslationModel/UG//sim-pe diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile index 2f1816f51e..ff2911ef69 100644 --- a/moses/TranslationModel/UG/Jamfile +++ b/moses/TranslationModel/UG/Jamfile @@ -65,6 +65,16 @@ $(TOP)/moses/TranslationModel/UG//mmsapt $(TOP)/util//kenutil ; +exe spe-check-coverage3 : +spe-check-coverage3.cc +$(TOP)/moses//moses +$(TOP)/moses/TranslationModel/UG/generic//generic +$(TOP)//boost_iostreams +$(TOP)//boost_program_options +$(TOP)/moses/TranslationModel/UG/mm//mm +$(TOP)/moses/TranslationModel/UG//mmsapt +$(TOP)/util//kenutil +; install $(PREFIX)/bin : try-align try-align2 ; fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ; diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index 8c27db784d..45c2b6a9b6 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -11,7 +11,9 @@ namespace Moses namespace bitext { +#if UG_BITEXT_TRACK_ACTIVE_THREADS ThreadSafeCounter pstats::active; +#endif pstats:: pstats() @@ -23,15 +25,15 @@ namespace Moses { ofwd[0] = ofwd[1] = ofwd[2] = ofwd[3] = ofwd[4] = ofwd[5] = ofwd[6] = 0; obwd[0] = obwd[1] = obwd[2] = obwd[3] = obwd[4] = obwd[5] = obwd[6] = 0; - // if (++active%5 == 0) - // cerr << size_t(active) << " active pstats at " << __FILE__ << ":" << __LINE__ << endl; } pstats:: ~pstats() { +#if UG_BITEXT_TRACK_ACTIVE_THREADS + // counter may not exist any more at destruction time, so try ... catch try { --active; } catch (...) {} - // counter may not exist any more at destruction time +#endif } void diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 9d80d32fa8..4bc7b8bb33 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -15,6 +15,10 @@ // // - use multiple agendas for better load balancing and to avoid // competition for locks +// + + +#define UG_BITEXT_TRACK_ACTIVE_THREADS 0 #include #include @@ -133,7 +137,10 @@ namespace Moses { struct pstats { + +#if UG_BITEXT_TRACK_ACTIVE_THREADS static ThreadSafeCounter active; +#endif boost::mutex lock; // for parallel gathering of stats boost::condition_variable ready; // consumers can wait for this data structure to be ready. @@ -463,7 +470,8 @@ namespace Moses { mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2; private: sptr - prep2(iter const& phrase, size_t const max_sample) const; + prep2(iter const& phrase, size_t const max_sample, + vector const* const bias) const; public: Bitext(size_t const max_sample =1000, size_t const xnum_workers =16); @@ -481,17 +489,19 @@ namespace Moses { virtual void open(string const base, string const L1, string const L2) = 0; // sptr lookup(Phrase const& phrase, size_t factor) const; - sptr lookup(iter const& phrase) const; - sptr lookup(iter const& phrase, size_t const max_sample) const; + sptr lookup(iter const& phrase,vector const* const bias=NULL) const; + sptr lookup(iter const& phrase, size_t const max_sample, + vector const* const bias) const; void lookup(vector const& snt, TSA& idx, vector > > > >& dest, vector >* pidmap = NULL, typename PhrasePair::Scorer* scorer=NULL, + vector const* const bias=NULL, bool multithread=true) const; - void prep(iter const& phrase) const; + void prep(iter const& phrase, vector const* const bias) const; void setDefaultSampleSize(size_t const max_samples); size_t getDefaultSampleSize() const; @@ -576,7 +586,9 @@ namespace Moses { boost::mutex lock; class job { +#if UG_BITEXT_TRACK_ACTIVE_THREADS static ThreadSafeCounter active; +#endif boost::mutex lock; friend class agenda; boost::taus88 rnd; // every job has its own pseudo random generator @@ -594,10 +606,13 @@ namespace Moses { size_t len; // phrase length bool fwd; // if true, source phrase is L1 sptr stats; // stores statistics collected during sampling + vector const* bias; // sentence-level bias for sampling + bool step(uint64_t & sid, uint64_t & offset); // select another occurrence bool done() const; job(typename TSA::tree_iterator const& m, - sptr > const& r, size_t maxsmpl, bool isfwd); + sptr > const& r, size_t maxsmpl, bool isfwd, + vector const* const bias); ~job(); }; public: @@ -622,7 +637,9 @@ namespace Moses { sptr add_job(typename TSA::tree_iterator const& phrase, - size_t const max_samples); + size_t const max_samples, + vector const* const bias); + sptr get_job(); }; @@ -641,6 +658,8 @@ namespace Moses { next = root->readOffset(next,stop,offset); boost::lock_guard sguard(stats->lock); if (stats->raw_cnt == ctr) ++stats->raw_cnt; + if (bias && bias->at(sid) == 0) + return false; stats->sample_cnt++; } else @@ -654,14 +673,21 @@ namespace Moses { boost::lock_guard sguard(stats->lock); if (stats->raw_cnt == ctr) ++stats->raw_cnt; size_t scalefac = (stats->raw_cnt - ctr++); - size_t rnum = scalefac*(rnd()/(rnd.max()+1.)); + size_t rnum = scalefac * (rnd()/(rnd.max()+1.)); + size_t th = (bias == NULL ? max_samples + : bias->at(sid) * bias->size() * max_samples); #if 0 cerr << rnum << "/" << scalefac << " vs. " << max_samples - stats->good << " (" << max_samples << " - " << stats->good << ")" - << endl; + << " th=" << th; + if (bias) + cerr << " with bias " << bias->at(sid) + << " => " << bias->at(sid) * bias->size(); + else cerr << " without bias"; + cerr << endl; #endif - if (rnum < max_samples - stats->good) + if (rnum + stats->good < th) { stats->sample_cnt++; ret = true; @@ -743,8 +769,7 @@ namespace Moses { } else if (!ag.bt.find_trg_phr_bounds (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd, - // NULL,NULL,true)) - &aln,NULL,true)) + &aln,NULL,true)) // NULL,NULL,true)) continue; j->stats->lock.lock(); j->stats->good += 1; @@ -844,7 +869,9 @@ namespace Moses { ~job() { if (stats) stats.reset(); +#if UG_BITEXT_TRACK_ACTIVE_THREADS try { --active; } catch (...) {} +#endif // counter may not exist any more at destruction time } @@ -853,7 +880,8 @@ namespace Moses { agenda:: job:: job(typename TSA::tree_iterator const& m, - sptr > const& r, size_t maxsmpl, bool isfwd) + sptr > const& r, size_t maxsmpl, + bool isfwd, vector const* const sntbias) : rnd(0) , rnddenom(rnd.max() + 1.) , min_diverse(10) @@ -865,12 +893,15 @@ namespace Moses { , ctr(0) , len(m.size()) , fwd(isfwd) + , bias(sntbias) { stats.reset(new pstats()); stats->raw_cnt = m.approxOccurrenceCount(); +#if UG_BITEXT_TRACK_ACTIVE_THREADS // if (++active%5 == 0) ++active; // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl; +#endif } template @@ -878,12 +909,12 @@ namespace Moses { Bitext:: agenda:: add_job(typename TSA::tree_iterator const& phrase, - size_t const max_samples) + size_t const max_samples, vector const* const bias) { boost::unique_lock lk(this->lock); static boost::posix_time::time_duration nodelay(0,0,0,0); bool fwd = phrase.root == bt.I1.get(); - sptr j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd)); + sptr j(new job(phrase, fwd ? bt.I1 : bt.I2, max_samples, fwd, bias)); j->stats->register_worker(); joblist.push_back(j); @@ -1322,15 +1353,16 @@ namespace Moses { template void Bitext:: - prep(iter const& phrase) const + prep(iter const& phrase, vector const* const bias) const { - prep2(phrase, this->default_sample_size); + prep2(phrase, this->default_sample_size,bias); } template sptr Bitext:: - prep2(iter const& phrase, size_t const max_sample) const + prep2(iter const& phrase, size_t const max_sample, + vector const* const bias) const { boost::lock_guard guard(this->lock); if (!ag) @@ -1343,7 +1375,7 @@ namespace Moses { #if 1 // use pcache only for plain sentence input if (StaticData::Instance().GetInputType() == SentenceInput && - max_sample == this->default_sample_size && + max_sample == this->default_sample_size && bias == NULL && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) { // still need to test what a good caching threshold is @@ -1360,7 +1392,7 @@ namespace Moses { // cerr << "NEW FREQUENT PHRASE: " // << phrase.str(V1.get()) << " " << phrase.approxOccurrenceCount() // << " at " << __FILE__ << ":" << __LINE__ << endl; - foo.first->second = ag->add_job(phrase, max_sample); + foo.first->second = ag->add_job(phrase, max_sample,NULL); assert(foo.first->second); } assert(foo.first->second); @@ -1369,7 +1401,7 @@ namespace Moses { } else #endif - ret = ag->add_job(phrase, max_sample); + ret = ag->add_job(phrase, max_sample,bias); assert(ret); return ret; } @@ -1443,8 +1475,8 @@ namespace Moses { lookup(vector const& snt, TSA& idx, vector > > > >& dest, vector >* pidmap, - typename PhrasePair::Scorer* scorer, - bool multithread) const + typename PhrasePair::Scorer* scorer, + vector const* const bias, bool multithread) const { typedef vector > > > > ret_t; @@ -1474,7 +1506,7 @@ namespace Moses { pp.reset(new vector >()); C.set(key,pp); dest[i].push_back(pp); - sptr x = prep2(m, this->default_sample_size); + sptr x = prep2(m, this->default_sample_size,bias); pstats2pplist w(m,*(fwd?T2:T1),x,*pp,scorer); if (multithread) { @@ -1495,9 +1527,9 @@ namespace Moses { template sptr Bitext:: - lookup(iter const& phrase) const + lookup(iter const& phrase, vector const* const bias) const { - sptr ret = prep2(phrase, this->default_sample_size); + sptr ret = prep2(phrase, this->default_sample_size, bias); assert(ret); boost::lock_guard guard(this->lock); if (this->num_workers <= 1) @@ -1514,7 +1546,8 @@ namespace Moses { template sptr Bitext:: - lookup(iter const& phrase, size_t const max_sample) const + lookup(iter const& phrase, size_t const max_sample, + vector const* const bias) const { sptr ret = prep2(phrase, max_sample); boost::lock_guard guard(this->lock); @@ -1558,12 +1591,13 @@ namespace Moses { return (max_samples && stats->good >= max_samples) || next == stop; } +#if UG_BITEXT_TRACK_ACTIVE_THREADS template ThreadSafeCounter Bitext:: agenda:: job::active; - +#endif template void diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 459c64fa19..e765d0c7e4 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -1072,6 +1072,13 @@ namespace Moses bool Mmsapt:: PrefixExists(Moses::Phrase const& phrase) const + { + return PrefixExists(phrase,NULL); + } + + bool + Mmsapt:: + PrefixExists(Moses::Phrase const& phrase, vector const* const bias) const { if (phrase.GetSize() == 0) return false; vector myphrase; @@ -1080,7 +1087,7 @@ namespace Moses TSA::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size()); if (mfix.size() == myphrase.size()) { - btfix.prep(mfix); + btfix.prep(mfix,bias); // cerr << phrase << " " << mfix.approxOccurrenceCount() << endl; return true; } @@ -1096,7 +1103,8 @@ namespace Moses { for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i) mdyn.extend(myphrase[i]); - if (mdyn.size() == myphrase.size()) dyn->prep(mdyn); + // let's assume a uniform bias over the foreground corpus + if (mdyn.size() == myphrase.size()) dyn->prep(mdyn,NULL); } return mdyn.size() == myphrase.size(); } diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h index 2c088dd18a..eb469784bd 100644 --- a/moses/TranslationModel/UG/mmsapt.h +++ b/moses/TranslationModel/UG/mmsapt.h @@ -264,6 +264,9 @@ namespace Moses ProvidesPrefixCheck() const; /// return true if prefix /phrase/ exists + bool + PrefixExists(Phrase const& phrase, vector const* const bias) const; + bool PrefixExists(Phrase const& phrase) const; diff --git a/moses/TranslationModel/UG/spe-check-coverage3.cc b/moses/TranslationModel/UG/spe-check-coverage3.cc new file mode 100644 index 0000000000..ea8c85e996 --- /dev/null +++ b/moses/TranslationModel/UG/spe-check-coverage3.cc @@ -0,0 +1,194 @@ +#include "mmsapt.h" +#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h" +#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace Moses; +using namespace bitext; +using namespace std; +using namespace boost; + +typedef L2R_Token Token; +typedef mmBitext mmbitext; +typedef imBitext imbitext; +typedef Bitext::iter iter; + +mmbitext bg; +vector src,trg,aln; + +void +show(ostream& out, iter& f) +{ + iter b(bg.I2.get(),f.getToken(0),f.size()); + if (b.size() == f.size()) + out << setw(12) << int(round(b.approxOccurrenceCount())); + else + out << string(12,' '); + out << " " << setw(5) << int(round(f.approxOccurrenceCount())) << " "; + out << f.str(bg.V1.get()) << endl; +} + + +void +dump(ostream& out, iter& f) +{ + float cnt = f.size() ? f.approxOccurrenceCount() : 0; + if (f.down()) + { + cnt = f.approxOccurrenceCount(); + do { dump(out,f); } + while (f.over()); + f.up(); + } + if (f.size() && cnt < f.approxOccurrenceCount() && f.approxOccurrenceCount() > 1) + show(out,f); +} + + +void +read_data(string fname, vector& dest) +{ + ifstream in(fname.c_str()); + string line; + while (getline(in,line)) dest.push_back(line); + in.close(); +} + +void +show_snt(ostream& out, TokenIndex const& V, vector const& snt, + vector > const& a) +{ + for (size_t i = 0; i < snt.size(); ++i) + { + cout << format("%d:%s[") % i % V[snt[i].id()]; + for (size_t k = 0; k < a[i].size(); ++k) + cout << (k?",":"") << a[i][k]; + cout << "] "; + } + cout << endl; +} + + +void show_pair(size_t const sid) +{ + vector s,t; + fill_token_seq(*bg.V1,src[sid],s); + fill_token_seq(*bg.V2,trg[sid],t); + vector > a1(s.size()),a2(t.size()); + istringstream buf(aln[sid]); + cout << aln[sid] << endl; + int i,k; char c; + while (buf >> i >> c >> k) + { + a1[i].push_back(k); + a2[k].push_back(i); + cout << i << "-" << k << " "; + } + cout << endl; + show_snt(cout,*bg.V1,s,a1); + show_snt(cout,*bg.V2,t,a2); +} + +int main(int argc, char* argv[]) +{ + if (argc < 5) + { + cerr << "usage: " << argv[0] + << " " + << endl; + exit(1); + } + bg.open(argv[1],argv[2],argv[3]); + sptr fg(new imbitext(bg.V1,bg.V2)); + string base = argv[4]; + if (*base.rbegin() != '.') base += '.'; + string srcfile = base + argv[2]; + string trgfile = base + argv[3]; + string alnfile = base + "symal"; + read_data(srcfile,src); + read_data(trgfile,trg); + read_data(alnfile,aln); + fg = fg->add(src,trg,aln); + + vector bias(src.size(),1./(src.size()-1)); + for (size_t sid = 0; sid < src.size(); ++sid) + { + bias[sid] = 0; + // cout << src[sid] << endl << trg[sid] << endl; + // show_pair(sid); + vector snt; + fill_token_seq(*bg.V1,src[sid],snt); + vector > > > > FG,BG; + fg->lookup(snt,*fg->I1,FG,NULL,NULL,&bias,true); + bg.lookup(snt,*bg.I1,BG,NULL,NULL,NULL,true); + set > > > seen; + for (size_t i = 0; i < snt.size(); ++i) + { + Bitext::iter m0(fg->I1.get()); + Bitext::iter m1(bg.I1.get()); + for (size_t k = 0; k < FG[i].size(); ++k) + { + if (!m0.extend(snt[i+k].id())) break; + if (k && m0.approxOccurrenceCount() < 2) break; + if (m1.size() == k && (!m1.extend(snt[i+k].id()) || + m1.approxOccurrenceCount() < 25)) + { + cout << toString((*fg->V1), m0.getToken(0), m0.size()) << " " + << int(m0.approxOccurrenceCount()); + if (m1.size() == k + 1) + cout << " "<< int(m1.approxOccurrenceCount()); + else if (m1.size()) + cout << " ["<< int(m1.approxOccurrenceCount()) << "]"; + else + cout << " NEW!"; + cout << endl; + } + if (m0.approxOccurrenceCount() < 2) break; + BOOST_FOREACH(PhrasePair const& pp, *FG[i][k]) + { + if (pp.joint < 2) continue; + sptr bgstats; + jstats const* bgjstats = NULL; + Bitext::iter m2(bg.I2.get(), pp.start2, pp.len2); + if (m1.approxOccurrenceCount() > 5000 || + m2.approxOccurrenceCount() > 5000) + continue; + if (m1.size() == pp.len1 && m2.size() == pp.len2) + { + bgstats = bg.lookup(m1,NULL); + if (bgstats) + { + pstats::trg_map_t::const_iterator mx; + mx = bgstats->trg.find(m2.getPid()); + if (mx != bgstats->trg.end()) + bgjstats = &mx->second; + } + } + cout << toString(*fg->V1, pp.start1, pp.len1) << " ::: " + << toString(*fg->V2, pp.start2, pp.len2) << " " + << format("[%u/%u/%u]") % pp.good1 % pp.joint % pp.good2; + if (bgjstats) + cout << " " << (format("[%u/%u/%u]") + % bgstats->good % bgjstats->rcnt() + % (bgjstats->cnt2() * bgstats->good + / bgstats->raw_cnt)); + else if (m1.size() == pp.len1) + cout << " " << int(m1.approxOccurrenceCount()); + cout << endl; + + } + } + } + bias[sid] = 1./(src.size()-1); + } + exit(0); +} + + + diff --git a/moses/TranslationModel/UG/try-align2.cc b/moses/TranslationModel/UG/try-align2.cc index 68078a2fb5..57cf25035f 100644 --- a/moses/TranslationModel/UG/try-align2.cc +++ b/moses/TranslationModel/UG/try-align2.cc @@ -729,7 +729,7 @@ int main(int argc, char* argv[]) for (size_t i = 0; i < A.size(); ++i) { - cout << (*BT.V2)[snt1[i].id()] << ": "; + cout << (*BT.V1)[snt1[i].id()] << ": "; for (size_t k=A[i].find_first(); k < A[i].size(); k=A[i].find_next(k)) cout << boost::format(" %d:%s") % k % (*BT.V2)[snt2[k].id()]; cout << endl; From 4941314a48bdc5cd73fc0b2af60731df641a3f58 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Sep 2014 09:13:50 +0100 Subject: [PATCH 12/13] Branch biased bitext sampling: Initial check-in. --- Jamroot | 3 --- 1 file changed, 3 deletions(-) diff --git a/Jamroot b/Jamroot index ce3a42b9f6..0b6ce3cb5f 100644 --- a/Jamroot +++ b/Jamroot @@ -172,8 +172,6 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses if [ option.get "with-mm" : : "yes" ] { alias mm : - moses/TranslationModel/UG//spe-check-coverage3 - moses/TranslationModel/UG//spe-check-coverage2 moses/TranslationModel/UG//ptable-lookup moses/TranslationModel/UG//sim-pe moses/TranslationModel/UG//spe-check-coverage @@ -187,7 +185,6 @@ if [ option.get "with-mm" : : "yes" ] moses/TranslationModel/UG/mm//mtt-count-words moses/TranslationModel/UG/mm//calc-coverage moses/TranslationModel/UG//try-align - moses/TranslationModel/UG//try-align2 ; } else From 5571ec91c609109825aaa329f57e9e781b185205 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Mon, 8 Sep 2014 09:25:18 +0100 Subject: [PATCH 13/13] Code cleanup. --- moses/TranslationModel/UG/mmsapt.cpp | 303 --------------------------- 1 file changed, 303 deletions(-) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index e765d0c7e4..834e07f386 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -13,27 +13,6 @@ namespace Moses using namespace std; using namespace boost; - - // uint64_t - // pack_phrasekey(uint64_t const shard_id, uint64_t const snt_id, - // uint64_t const offset, uint64_t const len) - // { - // uint64_t one = 1; - // // 8 bits - 256 shards - // // 13 bits - max offset - // // 11 bits - max len - // // 32 bits - max sentence id - // UTIL_TRHOW_IF2(shard_id >= 256, "[" << HERE << "] " - // << "Sentence ID exceeds limit."); - // UTIL_THROW_IF2(snt_id >= 4294967296, "[" << HERE << "] " - // << "Sentence ID exceeds limit."); - // UTIL_TRHOW_IF2(offset >= 8192, "[" << HERE << "]" - // << "Phrase offset exceeds limit."); - // UTIL_TRHOW_IF2(offset >= 2048, "[" << HERE << "]" - // << "Phrase length exceeds limit."); - // return ((shard_id<<56)+(snt_id<<24)+(offset<<11)+len); - // } - void fillIdSeq(Phrase const& mophrase, size_t const ifactor, TokenIndex const& V, vector& dest) @@ -87,13 +66,6 @@ namespace Moses Mmsapt:: Mmsapt(string const& line) : PhraseDictionary(line) - // , m_lex_alpha(1.0) - // , withLogCountFeatures(false) - // , withCoherence(true) - // , m_pfwd_features("g") - // , m_pbwd_features("g") - // , withPbwd(true) - // , poolCounts(true) , ofactor(1,0) , m_tpc_ctr(0) { @@ -508,281 +480,6 @@ namespace Moses return tp; } - // TargetPhrase* - // Mmsapt:: - // mkTPhrase(Phrase const& src, - // Bitext const& bt, - // PhrasePair const& pp) const - // { - // Word w; uint32_t sid,off,len; - // TargetPhrase* tp = new TargetPhrase(); - // parse_pid(pp.p2, sid, off, len); - // Token const* x = bt.T2->sntStart(sid) + off; - // for (uint32_t k = 0; k < len; ++k) - // { - // // cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl; - // StringPiece wrd = (*bt.V2)[x[k].id()]; - // // if ((off+len) > bt.T2->sntLen(sid)) - // // cerr << off << ";" << len << " " << bt.T2->sntLen(sid) << endl; - // assert(off+len <= bt.T2->sntLen(sid)); - // w.CreateFromString(Output,ofactor,wrd,false); - // tp->AddWord(w); - // } - // tp->GetScoreBreakdown().Assign(this, pp.fvals); - // tp->Evaluate(src); - // return tp; - // } - - // // process phrase stats from a single parallel corpus - // void - // Mmsapt:: - // process_pstats - // (Phrase const& src, - // uint64_t const pid1, - // pstats const& stats, - // Bitext const & bt, - // TargetPhraseCollection* tpcoll - // ) const - // { - // PhrasePair pp; - // pp.init(pid1, stats, this->m_numScoreComponents); - // pstats::trg_map_t::const_iterator t; - // for (t = stats.trg.begin(); t != stats.trg.end(); ++t) - // { - // pp.update(t->first,t->second); - // BOOST_FOREACH(sptr const& ff, m_active_ff_fix) - // (*ff)(bt,pp); - // BOOST_FOREACH(sptr const& ff, m_active_ff_common) - // (*ff)(bt,pp); - // tpcoll->Add(mkTPhrase(src,bt,pp)); - // } - // } - - // void - // Mmsapt:: - // ScorePPfix(PhrasePair& pp) const - // { - // BOOST_FOREACH(sptr const& ff, m_active_ff_fix) - // (*ff)(btfix,pp); - // BOOST_FOREACH(sptr const& ff, m_active_ff_common) - // (*ff)(btfix,pp); - // } - -// // process phrase stats from a single parallel corpus -// bool -// Mmsapt:: -// pool_pstats(Phrase const& src, -// uint64_t const pid1a, -// pstats * statsa, -// Bitext const & bta, -// uint64_t const pid1b, -// pstats const* statsb, -// Bitext const & btb, -// TargetPhraseCollection* tpcoll) const -// { -// PhrasePair pp; -// if (statsa && statsb) -// pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents); -// else if (statsa) -// pp.init(pid1a, *statsa, this->m_numScoreComponents); -// else if (statsb) -// pp.init(pid1b, *statsb, this->m_numScoreComponents); -// else return false; // throw "no stats for pooling available!"; - -// pstats::trg_map_t::const_iterator b; -// pstats::trg_map_t::iterator a; -// if (statsb) -// { -// for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b) -// { -// uint32_t sid,off,len; -// parse_pid(b->first, sid, off, len); -// Token const* x = btb.T2->sntStart(sid) + off; -// TSA::tree_iterator m(bta.I2.get(),x,x+len); -// if (m.size() == len) -// { -// ; -// if (statsa && ((a = statsa->trg.find(m.getPid())) -// != statsa->trg.end())) -// { -// pp.update(b->first,a->second,b->second); -// a->second.invalidate(); -// } -// else -// pp.update(b->first,m.approxOccurrenceCount(), -// b->second); -// } -// else pp.update(b->first,b->second); -// BOOST_FOREACH(sptr const& ff, m_active_ff_fix) -// (*ff)(btb,pp); -// BOOST_FOREACH(sptr const& ff, m_active_ff_common) -// (*ff)(btb,pp); -// tpcoll->Add(mkTPhrase(src,btb,pp)); -// } -// } -// if (!statsa) return statsb != NULL; -// for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a) -// { -// uint32_t sid,off,len; -// if (!a->second.valid()) continue; -// parse_pid(a->first, sid, off, len); -// if (btb.T2) -// { -// Token const* x = bta.T2->sntStart(sid) + off; -// TSA::tree_iterator m(btb.I2.get(), x, len); -// if (m.size() == len) -// pp.update(a->first,m.approxOccurrenceCount(),a->second); -// else -// pp.update(a->first,a->second); -// } -// else pp.update(a->first,a->second); -// #if 0 -// // jstats const& j = a->second; -// cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " -// << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl; -// cerr << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " -// << pp.joint << " " << pp.raw2 << endl; -// #endif - -// UTIL_THROW_IF2(pp.raw2 == 0, -// "OOPS" << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " -// << bta.T2->pid2str(bta.V2.get(),pp.p2) << ": " -// << pp.raw1 << " " << pp.sample1 << " " -// << pp.good1 << " " << pp.joint << " " -// << pp.raw2); -// BOOST_FOREACH(sptr const& ff, m_active_ff_fix) -// (*ff)(bta,pp); -// BOOST_FOREACH(sptr const& ff, m_active_ff_common) -// (*ff)(bta,pp); -// tpcoll->Add(mkTPhrase(src,bta,pp)); -// } -// return true; -// } - - - - - // // process phrase stats from a single parallel corpus - // bool - // Mmsapt:: - // combine_pstats - // (Phrase const& src, - // uint64_t const pid1a, pstats * statsa, Bitext const & bta, - // uint64_t const pid1b, pstats const* statsb, Bitext const & btb, - // TargetPhraseCollection* tpcoll) const - // { - // if (!statsa && !statsb) return false; - - // PhrasePair ppfix,ppdyn,pool; Word w; - // // ppfix: counts from btfix - // // ppdyn: counts from btdyn - // // pool: pooled counts from both - - // pstats::trg_map_t::const_iterator b; - // pstats::trg_map_t::iterator a; - - - // set check; - // if (statsb) - // { - // ppdyn.init(pid1b,*statsb,this->m_numScoreComponents); - // if (statsa) - // { - // pool.init(pid1b, *statsa, *statsb, 0); - // ppfix.init(pid1a,*statsa, 0); - // } - // else - // { - // pool.init(pid1b, *statsb,0); - // ppfix.init(); - // } - - // for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b) - // { - // ppdyn.update(b->first,b->second); - // BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) - // (*ff)(btb,ppdyn); - - // uint32_t sid,off,len; - // parse_pid(b->first, sid, off, len); - // Token const* x = btb.T2->sntStart(sid) + off; - // TSA::tree_iterator m(bta.I2.get(),x,len); - - // Token const* y = m.getToken(0); - // for (size_t i = 0; i < len; ++i) - // cout << x[i].id() << " " << endl; - // for (size_t i = 0; i < m.size(); ++i) - // cout << y[i].id() << " " << endl; - - // if (statsa && m.size() && - // ((a = statsa->trg.find(m.getPid())) != statsa->trg.end())) - // { // i.e., phrase pair found also in btfix - // ppfix.update(a->first,a->second); - // pool.update(b->first, b->second, a->second); - // BOOST_FOREACH(sptr const& ff, m_active_ff_fix) - // (*ff)(bta, ppfix, &ppdyn.fvals); - // check.insert(a->first); - // } - // else // phrase pair was not found in btfix - // { - // if (m.size()) // ... but the source phrase was - // { - // pool.update(b->first, m.approxOccurrenceCount(), b->second); - // ppfix.update(b->first,m.approxOccurrenceCount()); - // } - // else // ... and not even the source phrase - // { - // pool.update(b->first, b->second); - // ppfix.update(b->first,0); - // } - // BOOST_FOREACH(sptr const& ff, m_active_ff_fix) - // (*ff)(btb, ff->allowPooling() ? pool : ppfix, &ppdyn.fvals); - // } - // BOOST_FOREACH(sptr const& ff, m_active_ff_common) - // (*ff)(btb, pool, &ppdyn.fvals); - // tpcoll->Add(mkTPhrase(src,btb,ppdyn)); - // } - // } - - // // now deal with all phraise pairs that are ONLY in btfix - // // (the ones that are in both were dealt with above) - // if (statsa) - // { - // ppfix.init(pid1a, *statsa, this->m_numScoreComponents); - // pool.init(pid1a, *statsa, 0); - // ppdyn.init(); - // for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a) - // { - // if (check.find(a->first) != check.end()) - // continue; - - // ppfix.update(a->first, a->second); - // BOOST_FOREACH(sptr const& ff, m_active_ff_fix) - // (*ff)(bta, ppfix); - - // if (btb.I2) - // { - // uint32_t sid,off,len; - // parse_pid(a->first, sid, off, len); - // Token const* x = bta.T2->sntStart(sid) + off; - // TSA::tree_iterator m(btb.I2.get(), x, len); - // if (m.size()) - // pool.update(a->first, m.approxOccurrenceCount(), a->second); - // else - // pool.update(a->first, a->second); - // } - // else pool.update(a->first, a->second); - // BOOST_FOREACH(sptr const& ff, m_active_ff_dyn) - // (*ff)(btb, ff->allowPooling() ? pool : ppdyn, &ppfix.fvals); - // BOOST_FOREACH(sptr const& ff, m_active_ff_common) - // (*ff)(bta, pool, &ppfix.fvals); - // if (ppfix.p2) - // tpcoll->Add(mkTPhrase(src, bta, ppfix)); - // } - // } - // return true; - // } - Mmsapt:: TargetPhraseCollectionWrapper:: TargetPhraseCollectionWrapper(size_t r, uint64_t k)