Skip to content

Commit

Permalink
SQE: Added MT mode. Added grammar-based filtering to CLIR mode (H6). …
Browse files Browse the repository at this point in the history
…some refactoring. A few changes to OpenNLPTokenizer.
  • Loading branch information
ferhanture committed Jan 25, 2012
1 parent dc0d48b commit 2255634
Show file tree
Hide file tree
Showing 8 changed files with 675 additions and 364 deletions.
203 changes: 5 additions & 198 deletions src/java/main/ivory/sqe/querygenerator/CLPhraseQueryGenerator.java
Expand Up @@ -84,7 +84,7 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
LOG.info("H3 = " + H3);


phraseTable = generatePhraseTable(conf.get(Constants.SCFGPath));
phraseTable = Utils.generatePhraseTable(conf);
tokenizer = TokenizerFactory.createTokenizer(fs, conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
}

Expand Down Expand Up @@ -213,221 +213,28 @@ protected JSONArray getPhraseTranslations(String phrase) {
}

JSONArray phraseTranslationsArr = new JSONArray();
float sumProbEF = 0;
float sumProb = 0;
for (String translation : translation2prob.keySet()) {
try {
float prob = translation2prob.get(translation);
sumProbEF += prob;
sumProb += prob;
phraseTranslationsArr.put(prob);
phraseTranslationsArr.put(translation);
} catch (JSONException e) {
throw new RuntimeException("Error adding translation and prob values");
}
}

// normalize weights
for (int i = 0; i < phraseTranslationsArr.length(); i=i+2){
try {
float pr = (float) phraseTranslationsArr.getDouble(i);
phraseTranslationsArr.put(i, pr/sumProbEF);
phraseTranslationsArr.put(i, pr/sumProb);
} catch (JSONException e1) {
throw new RuntimeException("Error normalizing");
}
}

return phraseTranslationsArr;
}

/**
 * Parses an SCFG grammar file and builds the phrase translation table.
 *
 * Each line of the grammar file is a rule of the form
 * {@code ... ||| lhs ||| rhs ||| probs ||| alignments}; rules with fewer
 * than two alignment pairs cannot yield a phrase pair and are skipped.
 *
 * @param grammarFile
 *          grammar file that contains a SCFG grammar that has been extracted from
 *          GIZA++ alignments using Hiero w.r.t set of queries
 * @return set of (source_phrase --> X) maps, where X is a set of
 *         (phrase_trans --> score) maps
 */
private Map<String, HMapKF<String>> generatePhraseTable(String grammarFile) {
  // phrase2score table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> score) maps
  Map<String, HMapKF<String>> phrase2score = new HashMap<String, HMapKF<String>>();

  // phrase2count table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps
  Map<String, HMapKI<String>> phrase2count = new HashMap<String, HMapKI<String>>();

  BufferedReader r = null;
  try {
    r = new BufferedReader(new InputStreamReader(new FileInputStream(grammarFile), "UTF-8"));
    String rule = null;
    while ((rule = r.readLine()) != null) {
      // LOG.info("SCFG rule = " + rule);
      String[] parts = rule.split("\\|\\|\\|");
      String[] lhs = parts[1].trim().split(" ");
      String[] rhs = parts[2].trim().split(" ");
      String[] probs = parts[3].trim().split(" ");
      String[] alignments = parts[4].trim().split(" ");

      // early termination: need more than 1 alignment pair to get a phrase translation
      if (alignments.length < 2) {
        continue;
      }

      HMapIV<ArrayListOfInts> one2manyAlign = readAlignments(alignments);

      // we have parsed all components of this grammar rule. now, update phrase table accordingly
      updatePhraseTable(phrase2score, phrase2count, lhs, rhs, probs, one2manyAlign);
    }
  } catch (IOException e) {
    // best-effort: return whatever was read so far (preserves original behavior)
    e.printStackTrace();
  } finally {
    // fix: the reader was previously never closed, leaking the file handle
    if (r != null) {
      try {
        r.close();
      } catch (IOException ignored) {
        // nothing sensible to do if close fails
      }
    }
  }
  return phrase2score;
}

/**
 * Scans one grammar rule and adds every well-formed phrase pair it licenses to the
 * phrase table, combining scores across rules according to heuristic H3.
 *
 * @param phrase2score
 * set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> score) maps
 * @param phrase2count
 * set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps; only used when H3 = AVG
 * @param lhs
 * LHS of grammar rule (source-side tokens)
 * @param rhs
 * RHS of grammar rule (target-side tokens)
 * @param probs
 * Phrase translation probabilities from grammar rule; probs[0] is read as a negative log-probability
 * @param one2manyAlign
 * map of alignments ( LHS token id x --> List of RHS token ids aligned to x )
 */
private void updatePhraseTable(Map<String, HMapKF<String>> phrase2score, Map<String, HMapKI<String>> phrase2count, String[] lhs, String[] rhs, String[] probs, HMapIV<ArrayListOfInts> one2manyAlign) {
// window size w: a source phrase spans tokens [start, start+w], so w=1 means
// two-token windows. If H1 is ON, w=0 (single-token windows) is allowed too.
int MaxWindow = 1;
int MinWindow = MaxWindow;
if (H1 == ON) {
MinWindow = 0;
}
// probs[0] is a negative log-prob; convert back to a plain probability
float prob = (float) Math.pow(Math.E, -Float.parseFloat(probs[0]));

for (int w = MinWindow; w <= MaxWindow; w++){
// f is the beginning point of the phrase
// w is the size of the window, starting after f

int numWindows = lhs.length - w;
for (int start = 0; start < numWindows; start++) {
//LOG.info("w="+w);
//LOG.info("start="+start);
// phrase window = [f,f+w]

// accumulates the RHS token ids aligned to any token in the window
ArrayListOfInts phraseTranslationIds = new ArrayListOfInts();

String fPhrase = "";
int cnt = 0; // number of source tokens successfully added to the window
for (int cur = start; cur <= start + w; cur++) {
ArrayListOfInts translationIds = one2manyAlign.get(cur);
if (translationIds == null) {
// if there are any unaligned source terms in this window, move to next f.
// setting start = cur makes the outer loop resume at cur+1, skipping every
// window that would still contain this unaligned term
start = cur;
break;
}
String fTerm = lhs[cur];

// if H2, don't break
// nonterminals ([X,n]) and sentence-boundary markers end the phrase
if (fTerm.matches("\\[X,\\d+\\]") || fTerm.equals("<s>") || fTerm.equals("</s>")) break;
phraseTranslationIds = phraseTranslationIds.mergeNoDuplicates(translationIds);
fPhrase += fTerm+" ";
cnt++;
}

// String fPhrase = getPhraseTranslation(f, w, lhs, one2manyAlign, phraseTranslationIds);

// if there was no source well-defined phrase at [f, f+w] (i.e., previous loop hit a 'break' case), move to next value of f
if (cnt < w+1) {
continue;
}

//LOG.info("Found source phrase " + fPhrase + "\n -->" + phraseTranslationIds);

// check if the translation of [f, f+w-1] is a well-defined phrase as well
// allow 1-to-many and many-to-1 phrase pairs if H1 is set
if ((phraseTranslationIds.size() > 1 || (H1 == ON && w > 0)) && isConsecutive(phraseTranslationIds)) {
String transPhrase = "";
boolean ignore = false;
for (int e : phraseTranslationIds) {
String eTerm = rhs[e];
if (eTerm.matches("\\[X,\\d+\\]") || eTerm.equals("<s>") || eTerm.equals("</s>")) {
// target side contains a nonterminal or sentence marker: not a usable phrase
ignore = true;
break;
}
transPhrase += eTerm + " ";
}

// add phrase pair to table
if (!ignore) {
fPhrase = fPhrase.trim();
transPhrase = transPhrase.trim();

//LOG.info("Found translation phrase " + transPhrase);

if (!phrase2score.containsKey(fPhrase)) {
phrase2score.put(fPhrase, new HMapKF<String>());
}
// H3 = if same phrase extracted from multiple rules, add, average or take max of prob.s

HMapKF<String> scoreTable = phrase2score.get(fPhrase);
if (H3 == SUM) {
// H3 = sum: accumulate probabilities across rules
scoreTable.increment(transPhrase, prob); // sum
}else if (H3 == AVG) {
// H3 = average: keep a running mean, using phrase2count for the denominators
if (!phrase2count.containsKey(fPhrase)) {
phrase2count.put(fPhrase, new HMapKI<String>());
}
HMapKI<String> countTable = phrase2count.get(fPhrase);

// case1 : first time we've seen phrase (fPhrase, transPhrase)
if (!scoreTable.containsKey(transPhrase)) {
// LOG.debug("Phrase = "+fPhrase+" -> " +transPhrase);
scoreTable.put(transPhrase, prob); // update score in table
countTable.increment(transPhrase, 1); // update count in table
}else { // case2 : we've seen phrase (fPhrase, transPhrase) before. update the average prob.
int count = countTable.get(transPhrase); // get current count
float scoreUpdated = (scoreTable.get(transPhrase)*count + prob) / (count+1); // compute updated average
scoreTable.put(transPhrase, scoreUpdated); // update score in table
countTable.increment(transPhrase, 1); // update count in table
}
} else {
// H3 = take max

// if first occurrence, OR if current prob is greater than max, set score to prob
if (!scoreTable.containsKey(transPhrase) || prob > scoreTable.get(transPhrase)) {
scoreTable.put(transPhrase, prob);
}
}

}
}
}
}
}

// private String getPhraseTranslation(int start, int size, String[] lhs, HMapIV<ArrayListOfInts> one2manyAlign, ArrayListOfInts phraseTranslationIds) {
//
// }

/**
 * Checks whether the given id list contains no forward gaps: returns false as
 * soon as an element exceeds its predecessor by more than one, true otherwise.
 * An empty or single-element list is trivially consecutive.
 */
private static boolean isConsecutive(ArrayListOfInts lst) {
  int previous = -1;
  for (int current : lst) {
    boolean hasGap = previous != -1 && current > previous + 1;
    if (hasGap) {
      return false;
    }
    previous = current;
  }
  return true;
}

/**
 * Converts "f-e" alignment-pair strings into a one-to-many map from each LHS
 * token id f to the list of RHS token ids e aligned to it.
 */
private static HMapIV<ArrayListOfInts> readAlignments(String[] alignments) {
  HMapIV<ArrayListOfInts> one2manyAlign = new HMapIV<ArrayListOfInts>();
  for (String alignment : alignments) {
    String[] pair = alignment.split("-");
    int sourceId = Integer.parseInt(pair[0]);
    int targetId = Integer.parseInt(pair[1]);
    // first time we see this source id, start an empty target list for it
    if (!one2manyAlign.containsKey(sourceId)) {
      one2manyAlign.put(sourceId, new ArrayListOfInts());
    }
    one2manyAlign.get(sourceId).add(targetId);
  }
  return one2manyAlign;
}

}
Expand Up @@ -4,16 +4,12 @@
import ivory.core.tokenize.TokenizerFactory;
import ivory.sqe.retrieval.Constants;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.mortbay.log.Log;


public class CLWordAndPhraseQueryGenerator implements QueryGenerator {
Expand All @@ -26,7 +22,7 @@ public class CLWordAndPhraseQueryGenerator implements QueryGenerator {
// private int H4;
private static final int OFF = 0, ON = 1;
private static final int COMBINE = 0, PERTOKEN = 1, COVER = 2;
private int H1, H4;
private int H1, H4, H6;

public CLWordAndPhraseQueryGenerator() throws IOException {
super();
Expand All @@ -38,6 +34,7 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
LOG.info(conf.get(Constants.Heuristic1));
LOG.info(conf.get(Constants.Heuristic3));
LOG.info(conf.get(Constants.Heuristic4));
LOG.info(conf.get(Constants.Heuristic6));

String h4 = conf.get(Constants.Heuristic4);
if (h4.equals("combine")) {
Expand All @@ -62,8 +59,8 @@ public void init(FileSystem fs, Configuration conf) throws IOException {

tokenizer = TokenizerFactory.createTokenizer(fs, conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
clGenerator = new CLWordQueryGenerator();
phraseGenerator = new CLPhraseQueryGenerator();
clGenerator.init(fs, conf);
phraseGenerator = new CLPhraseQueryGenerator();
phraseGenerator.init(fs, conf);
}

Expand All @@ -82,22 +79,6 @@ public JSONObject parseQuery(String query) {
// iterate over tokens and phrases, and create weighted representation for each.
// save representations into resp. arrays

// word
JSONObject[] weightObjects = new JSONObject[length];
for (int start = 0; start < length; start++) {
// create a #weight JSonObject
String token = tokens[start];
JSONArray weights = clGenerator.getTranslations(token);
if (weights != null) {
JSONObject wordTrans = new JSONObject();
wordTrans.put("#weight", weights);
weightObjects[start] = wordTrans;
}else {
// token doesn't appear in vocab
//LOG.info("Skipped "+token);
}
}

// phrase
int numWindowSizes = maxWindow-minWindow+1;
boolean isCovered[][] = new boolean[numWindowSizes][length];
Expand All @@ -106,7 +87,7 @@ public JSONObject parseQuery(String query) {
int wIndex = windowSize - minWindow;

// extract phrases
String[] phrases = extractPhrases(tokens, windowSize);
String[] phrases = Utils.extractPhrases(tokens, windowSize);
pweightObjects[wIndex] = new JSONObject[phrases.length];

// find translations for each phrase.
Expand All @@ -127,7 +108,10 @@ public JSONObject parseQuery(String query) {
ignore = true;
// LOG.info("already covered "+wIndex+","+cur);
}
isCovered[wIndex][cur] = true;
if (windowSize > 0) { // if phrase has single term, no need to mark as covered
LOG.info("covered "+wIndex+","+cur);
isCovered[wIndex][cur] = true;
}
}
}
if (!ignore) {
Expand All @@ -136,8 +120,30 @@ public JSONObject parseQuery(String query) {
pweightObjects[wIndex][start] = phraseTrans;
}
}

}

// word
JSONObject[] weightObjects = new JSONObject[length];
for (int start = 0; start < length; start++) {
// // we want to check coverage from phrases with length > 0 (not single term)
// if(minWindow == 0){
// if (isCovered[1][start]) continue;
// }else {
// if (isCovered[0][start]) continue;
// }
// create a #weight JSonObject
String token = tokens[start];
// LOG.info("Token "+token+" at "+start+" not covered by pweight");
JSONArray weights = clGenerator.getTranslations(token);
if (weights != null) {
JSONObject wordTrans = new JSONObject();
wordTrans.put("#weight", weights);
weightObjects[start] = wordTrans;
}else {
// token doesn't appear in vocab
//LOG.info("Skipped "+token);
}
}

// represent each token with an array of #pweight and #weight objects = tokenArr
// represent query by #combine of token representations = queryArr
Expand Down Expand Up @@ -195,21 +201,6 @@ public JSONObject parseQuery(String query) {
return queryJson;
}

/**
 * Extracts every phrase of (windowSize+1) consecutive tokens from the query,
 * joining tokens with single spaces.
 *
 * NOTE(review): the window count is based on the instance field {@code length}
 * (the tokenized query length), not {@code tokens.length} — assumes the two
 * agree; confirm against the caller that sets {@code length}.
 *
 * @param tokens     query tokens
 * @param windowSize number of tokens in each phrase, minus one
 * @return one phrase per valid window start position
 */
private String[] extractPhrases(String[] tokens, int windowSize) {
  int numWindows = length - windowSize;
  String[] phrases = new String[numWindows];
  for (int start = 0; start < numWindows; start++) {
    // fix: build with StringBuilder instead of O(n^2) String concatenation
    StringBuilder phrase = new StringBuilder();
    for (int k = 0; k <= windowSize; k++) {
      phrase.append(tokens[start + k]).append(' ');
    }
    phrases[start] = phrase.toString().trim();
  }
  return phrases;
}

/** Returns the token count of the current query, as stored in {@code length}. */
public int getQueryLength() {
  return this.length;
}
Expand Down

0 comments on commit 2255634

Please sign in to comment.