Skip to content

Commit

Permalink
SQE: Added MT mode. Added grammar-based filtering to CLIR mode (H6). …
Browse files Browse the repository at this point in the history
…some refactoring. A few changes to OpenNLPTokenizer.
  • Loading branch information
ferhanture committed Jan 25, 2012
1 parent dc0d48b commit 2255634
Show file tree
Hide file tree
Showing 8 changed files with 675 additions and 364 deletions.
203 changes: 5 additions & 198 deletions src/java/main/ivory/sqe/querygenerator/CLPhraseQueryGenerator.java
Expand Up @@ -84,7 +84,7 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
LOG.info("H3 = " + H3);


phraseTable = generatePhraseTable(conf.get(Constants.SCFGPath));
phraseTable = Utils.generatePhraseTable(conf);
tokenizer = TokenizerFactory.createTokenizer(fs, conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
}

Expand Down Expand Up @@ -213,221 +213,28 @@ protected JSONArray getPhraseTranslations(String phrase) {
}

JSONArray phraseTranslationsArr = new JSONArray();
float sumProbEF = 0;
float sumProb = 0;
for (String translation : translation2prob.keySet()) {
try {
float prob = translation2prob.get(translation);
sumProbEF += prob;
sumProb += prob;
phraseTranslationsArr.put(prob);
phraseTranslationsArr.put(translation);
} catch (JSONException e) {
throw new RuntimeException("Error adding translation and prob values");
}
}

// normalize weights
for (int i = 0; i < phraseTranslationsArr.length(); i=i+2){
try {
float pr = (float) phraseTranslationsArr.getDouble(i);
phraseTranslationsArr.put(i, pr/sumProbEF);
phraseTranslationsArr.put(i, pr/sumProb);
} catch (JSONException e1) {
throw new RuntimeException("Error normalizing");
}
}

return phraseTranslationsArr;
}

/**
 * Parses an SCFG grammar file and builds the phrase translation table.
 *
 * Each line of the grammar file is a rule of the form
 * {@code ... ||| lhs ||| rhs ||| probs ||| alignments}; rules with fewer
 * than two alignment pairs cannot yield a phrase pair and are skipped.
 *
 * @param grammarFile
 *          grammar file that contains a SCFG grammar that has been extracted from
 *          GIZA++ alignments using Hiero w.r.t set of queries
 * @return set of (source_phrase --> X) maps, where X is a set of
 *         (phrase_trans --> score) maps
 */
private Map<String, HMapKF<String>> generatePhraseTable(String grammarFile) {
  // phrase2score table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> score) maps
  Map<String, HMapKF<String>> phrase2score = new HashMap<String, HMapKF<String>>();

  // phrase2count table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps
  Map<String, HMapKI<String>> phrase2count = new HashMap<String, HMapKI<String>>();

  BufferedReader r = null;
  try {
    r = new BufferedReader(new InputStreamReader(new FileInputStream(grammarFile), "UTF-8"));
    String rule = null;
    while ((rule = r.readLine()) != null) {
      // LOG.info("SCFG rule = " + rule);
      String[] parts = rule.split("\\|\\|\\|");
      String[] lhs = parts[1].trim().split(" ");
      String[] rhs = parts[2].trim().split(" ");
      String[] probs = parts[3].trim().split(" ");
      String[] alignments = parts[4].trim().split(" ");

      // early termination: need more than 1 alignment pair to get a phrase translation
      if (alignments.length < 2) {
        continue;
      }

      HMapIV<ArrayListOfInts> one2manyAlign = readAlignments(alignments);

      // we have parsed all components of this grammar rule. now, update phrase table accordingly
      updatePhraseTable(phrase2score, phrase2count, lhs, rhs, probs, one2manyAlign);
    }
  } catch (IOException e) {
    // best-effort: return whatever was read so far (preserves original behavior)
    e.printStackTrace();
  } finally {
    // fix: the reader was previously never closed, leaking the file handle
    if (r != null) {
      try {
        r.close();
      } catch (IOException ignored) {
        // nothing sensible to do if close fails
      }
    }
  }
  return phrase2score;
}

/**
 * Scans one grammar rule and adds every well-formed phrase pair it licenses to the
 * phrase table, combining scores across rules according to heuristic H3.
 *
 * @param phrase2score
 * set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> score) maps
 * @param phrase2count
 * set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps; only used when H3 = AVG
 * @param lhs
 * LHS of grammar rule (source-side tokens)
 * @param rhs
 * RHS of grammar rule (target-side tokens)
 * @param probs
 * Phrase translation probabilities from grammar rule; probs[0] is read as a negative log-probability
 * @param one2manyAlign
 * map of alignments ( LHS token id x --> List of RHS token ids aligned to x )
 */
private void updatePhraseTable(Map<String, HMapKF<String>> phrase2score, Map<String, HMapKI<String>> phrase2count, String[] lhs, String[] rhs, String[] probs, HMapIV<ArrayListOfInts> one2manyAlign) {
// window size w: a source phrase spans tokens [start, start+w], so w=1 means
// two-token windows. If H1 is ON, w=0 (single-token windows) is allowed too.
int MaxWindow = 1;
int MinWindow = MaxWindow;
if (H1 == ON) {
MinWindow = 0;
}
// probs[0] is a negative log-prob; convert back to a plain probability
float prob = (float) Math.pow(Math.E, -Float.parseFloat(probs[0]));

for (int w = MinWindow; w <= MaxWindow; w++){
// f is the beginning point of the phrase
// w is the size of the window, starting after f

int numWindows = lhs.length - w;
for (int start = 0; start < numWindows; start++) {
//LOG.info("w="+w);
//LOG.info("start="+start);
// phrase window = [f,f+w]

// accumulates the RHS token ids aligned to any token in the window
ArrayListOfInts phraseTranslationIds = new ArrayListOfInts();

String fPhrase = "";
int cnt = 0; // number of source tokens successfully added to the window
for (int cur = start; cur <= start + w; cur++) {
ArrayListOfInts translationIds = one2manyAlign.get(cur);
if (translationIds == null) {
// if there are any unaligned source terms in this window, move to next f.
// setting start = cur makes the outer loop resume at cur+1, skipping every
// window that would still contain this unaligned term
start = cur;
break;
}
String fTerm = lhs[cur];

// if H2, don't break
// nonterminals ([X,n]) and sentence-boundary markers end the phrase
if (fTerm.matches("\\[X,\\d+\\]") || fTerm.equals("<s>") || fTerm.equals("</s>")) break;
phraseTranslationIds = phraseTranslationIds.mergeNoDuplicates(translationIds);
fPhrase += fTerm+" ";
cnt++;
}

// String fPhrase = getPhraseTranslation(f, w, lhs, one2manyAlign, phraseTranslationIds);

// if there was no source well-defined phrase at [f, f+w] (i.e., previous loop hit a 'break' case), move to next value of f
if (cnt < w+1) {
continue;
}

//LOG.info("Found source phrase " + fPhrase + "\n -->" + phraseTranslationIds);

// check if the translation of [f, f+w-1] is a well-defined phrase as well
// allow 1-to-many and many-to-1 phrase pairs if H1 is set
if ((phraseTranslationIds.size() > 1 || (H1 == ON && w > 0)) && isConsecutive(phraseTranslationIds)) {
String transPhrase = "";
boolean ignore = false;
for (int e : phraseTranslationIds) {
String eTerm = rhs[e];
if (eTerm.matches("\\[X,\\d+\\]") || eTerm.equals("<s>") || eTerm.equals("</s>")) {
// target side contains a nonterminal or sentence marker: not a usable phrase
ignore = true;
break;
}
transPhrase += eTerm + " ";
}

// add phrase pair to table
if (!ignore) {
fPhrase = fPhrase.trim();
transPhrase = transPhrase.trim();

//LOG.info("Found translation phrase " + transPhrase);

if (!phrase2score.containsKey(fPhrase)) {
phrase2score.put(fPhrase, new HMapKF<String>());
}
// H3 = if same phrase extracted from multiple rules, add, average or take max of prob.s

HMapKF<String> scoreTable = phrase2score.get(fPhrase);
if (H3 == SUM) {
// H3 = sum: accumulate probabilities across rules
scoreTable.increment(transPhrase, prob); // sum
}else if (H3 == AVG) {
// H3 = average: keep a running mean, using phrase2count for the denominators
if (!phrase2count.containsKey(fPhrase)) {
phrase2count.put(fPhrase, new HMapKI<String>());
}
HMapKI<String> countTable = phrase2count.get(fPhrase);

// case1 : first time we've seen phrase (fPhrase, transPhrase)
if (!scoreTable.containsKey(transPhrase)) {
// LOG.debug("Phrase = "+fPhrase+" -> " +transPhrase);
scoreTable.put(transPhrase, prob); // update score in table
countTable.increment(transPhrase, 1); // update count in table
}else { // case2 : we've seen phrase (fPhrase, transPhrase) before. update the average prob.
int count = countTable.get(transPhrase); // get current count
float scoreUpdated = (scoreTable.get(transPhrase)*count + prob) / (count+1); // compute updated average
scoreTable.put(transPhrase, scoreUpdated); // update score in table
countTable.increment(transPhrase, 1); // update count in table
}
} else {
// H3 = take max

// if first occurrence, OR if current prob is greater than max, set score to prob
if (!scoreTable.containsKey(transPhrase) || prob > scoreTable.get(transPhrase)) {
scoreTable.put(transPhrase, prob);
}
}

}
}
}
}
}

// private String getPhraseTranslation(int start, int size, String[] lhs, HMapIV<ArrayListOfInts> one2manyAlign, ArrayListOfInts phraseTranslationIds) {
//
// }

/**
 * Checks whether the given id list contains no forward gaps: returns false as
 * soon as an element exceeds its predecessor by more than one, true otherwise.
 * An empty or single-element list is trivially consecutive.
 */
private static boolean isConsecutive(ArrayListOfInts lst) {
  int previous = -1;
  for (int current : lst) {
    boolean hasGap = previous != -1 && current > previous + 1;
    if (hasGap) {
      return false;
    }
    previous = current;
  }
  return true;
}

/**
 * Converts "f-e" alignment-pair strings into a one-to-many map from each LHS
 * token id f to the list of RHS token ids e aligned to it.
 */
private static HMapIV<ArrayListOfInts> readAlignments(String[] alignments) {
  HMapIV<ArrayListOfInts> one2manyAlign = new HMapIV<ArrayListOfInts>();
  for (String alignment : alignments) {
    String[] pair = alignment.split("-");
    int sourceId = Integer.parseInt(pair[0]);
    int targetId = Integer.parseInt(pair[1]);
    // first time we see this source id, start an empty target list for it
    if (!one2manyAlign.containsKey(sourceId)) {
      one2manyAlign.put(sourceId, new ArrayListOfInts());
    }
    one2manyAlign.get(sourceId).add(targetId);
  }
  return one2manyAlign;
}

}
Expand Up @@ -4,16 +4,12 @@
import ivory.core.tokenize.TokenizerFactory;
import ivory.sqe.retrieval.Constants;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.mortbay.log.Log;


public class CLWordAndPhraseQueryGenerator implements QueryGenerator {
Expand All @@ -26,7 +22,7 @@ public class CLWordAndPhraseQueryGenerator implements QueryGenerator {
// private int H4;
private static final int OFF = 0, ON = 1;
private static final int COMBINE = 0, PERTOKEN = 1, COVER = 2;
private int H1, H4;
private int H1, H4, H6;

public CLWordAndPhraseQueryGenerator() throws IOException {
super();
Expand All @@ -38,6 +34,7 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
LOG.info(conf.get(Constants.Heuristic1));
LOG.info(conf.get(Constants.Heuristic3));
LOG.info(conf.get(Constants.Heuristic4));
LOG.info(conf.get(Constants.Heuristic6));

String h4 = conf.get(Constants.Heuristic4);
if (h4.equals("combine")) {
Expand All @@ -62,8 +59,8 @@ public void init(FileSystem fs, Configuration conf) throws IOException {

tokenizer = TokenizerFactory.createTokenizer(fs, conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
clGenerator = new CLWordQueryGenerator();
phraseGenerator = new CLPhraseQueryGenerator();
clGenerator.init(fs, conf);
phraseGenerator = new CLPhraseQueryGenerator();
phraseGenerator.init(fs, conf);
}

Expand All @@ -82,22 +79,6 @@ public JSONObject parseQuery(String query) {
// iterate over tokens and phrases, and create weighted representation for each.
// save representations into resp. arrays

// word
JSONObject[] weightObjects = new JSONObject[length];
for (int start = 0; start < length; start++) {
// create a #weight JSonObject
String token = tokens[start];
JSONArray weights = clGenerator.getTranslations(token);
if (weights != null) {
JSONObject wordTrans = new JSONObject();
wordTrans.put("#weight", weights);
weightObjects[start] = wordTrans;
}else {
// token doesn't appear in vocab
//LOG.info("Skipped "+token);
}
}

// phrase
int numWindowSizes = maxWindow-minWindow+1;
boolean isCovered[][] = new boolean[numWindowSizes][length];
Expand All @@ -106,7 +87,7 @@ public JSONObject parseQuery(String query) {
int wIndex = windowSize - minWindow;

// extract phrases
String[] phrases = extractPhrases(tokens, windowSize);
String[] phrases = Utils.extractPhrases(tokens, windowSize);
pweightObjects[wIndex] = new JSONObject[phrases.length];

// find translations for each phrase.
Expand All @@ -127,7 +108,10 @@ public JSONObject parseQuery(String query) {
ignore = true;
// LOG.info("already covered "+wIndex+","+cur);
}
isCovered[wIndex][cur] = true;
if (windowSize > 0) { // if phrase has single term, no need to mark as covered
LOG.info("covered "+wIndex+","+cur);
isCovered[wIndex][cur] = true;
}
}
}
if (!ignore) {
Expand All @@ -136,8 +120,30 @@ public JSONObject parseQuery(String query) {
pweightObjects[wIndex][start] = phraseTrans;
}
}

}

// word
JSONObject[] weightObjects = new JSONObject[length];
for (int start = 0; start < length; start++) {
// // we want to check coverage from phrases with length > 0 (not single term)
// if(minWindow == 0){
// if (isCovered[1][start]) continue;
// }else {
// if (isCovered[0][start]) continue;
// }
// create a #weight JSonObject
String token = tokens[start];
// LOG.info("Token "+token+" at "+start+" not covered by pweight");
JSONArray weights = clGenerator.getTranslations(token);
if (weights != null) {
JSONObject wordTrans = new JSONObject();
wordTrans.put("#weight", weights);
weightObjects[start] = wordTrans;
}else {
// token doesn't appear in vocab
//LOG.info("Skipped "+token);
}
}

// represent each token with an array of #pweight and #weight objects = tokenArr
// represent query by #combine of token representations = queryArr
Expand Down Expand Up @@ -195,21 +201,6 @@ public JSONObject parseQuery(String query) {
return queryJson;
}

/**
 * Extracts every phrase of (windowSize+1) consecutive tokens from the query,
 * joining tokens with single spaces.
 *
 * NOTE(review): the window count is based on the instance field {@code length}
 * (the tokenized query length), not {@code tokens.length} — assumes the two
 * agree; confirm against the caller that sets {@code length}.
 *
 * @param tokens     query tokens
 * @param windowSize number of tokens in each phrase, minus one
 * @return one phrase per valid window start position
 */
private String[] extractPhrases(String[] tokens, int windowSize) {
  int numWindows = length - windowSize;
  String[] phrases = new String[numWindows];
  for (int start = 0; start < numWindows; start++) {
    // fix: build with StringBuilder instead of O(n^2) String concatenation
    StringBuilder phrase = new StringBuilder();
    for (int k = 0; k <= windowSize; k++) {
      phrase.append(tokens[start + k]).append(' ');
    }
    phrases[start] = phrase.toString().trim();
  }
  return phrases;
}

/** Returns the token count of the current query, as stored in {@code length}. */
public int getQueryLength() {
  return this.length;
}
Expand Down

0 comments on commit 2255634

Please sign in to comment.