Merge branch 'ferhanture-master' into clpwsim-lsh

mt3 · Jan 26, 2012 · e474286 · e474286
2 parents c8e3d1a + 778e0c4
commit e474286
Show file tree

Hide file tree

Showing 3 changed files with 133 additions and 19 deletions.
diff --git a/...y/core/driver/PreprocessTREC9Chinese.java → ...ry/core/driver/PreprocessTRECChinese.java b/...y/core/driver/PreprocessTREC9Chinese.java → ...ry/core/driver/PreprocessTRECChinese.java
@@ -39,11 +39,11 @@
 import edu.umd.cloud9.collection.trec.TrecDocnoMapping;
 import edu.umd.cloud9.collection.trec.TrecDocumentInputFormat2;
 
-public class PreprocessTREC9Chinese extends Configured implements Tool {
-  private static final Logger LOG = Logger.getLogger(PreprocessTREC9Chinese.class);
+public class PreprocessTRECChinese extends Configured implements Tool {
+  private static final Logger LOG = Logger.getLogger(PreprocessTRECChinese.class);
 
   private static int printUsage() {
-    System.out.println("usage: [input-path] [index-path] [stanford-segmenter-model-path]");
+    System.out.println("usage: [input-path] [index-path] [tokenizer-class] [stanford-segmenter-model-path]");
     ToolRunner.printGenericCommandUsage(System.out);
     return -1;
   }
@@ -52,17 +52,22 @@ private static int printUsage() {
    * Runs this tool.
    */
   public int run(String[] args) throws Exception {
-    if (args.length != 3) {
+    if (args.length < 3) {
       printUsage();
       return -1;
     }
 
     String collection = args[0];
     String indexRootPath = args[1];
-    String tokenizerPath = args[2];
-    LOG.info("Tool name: " + PreprocessTREC9Chinese.class.getCanonicalName());
+    String tokenizerClass = args[2];
+    String tokenizerPath = null;
+    if (args.length == 4) {
+      tokenizerPath = args[3];
+    }
+    LOG.info("Tool name: " + PreprocessTRECChinese.class.getCanonicalName());
     LOG.info(" - Collection path: " + collection);
     LOG.info(" - Index path: " + indexRootPath);
+    LOG.info(" - Tokenizer class: " + tokenizerClass);
     LOG.info(" - Tokenizer path: " + tokenizerPath);
 
     Configuration conf = getConf();
@@ -96,8 +101,10 @@ public int run(String[] args) throws Exception {
     conf.set(Constants.CollectionPath, collection);
     conf.set(Constants.IndexPath, indexRootPath);
     conf.set(Constants.InputFormat, TrecDocumentInputFormat2.class.getCanonicalName());
-    conf.set(Constants.Tokenizer, StanfordChineseTokenizer.class.getCanonicalName());
-    conf.set(Constants.TokenizerData, tokenizerPath);
+    conf.set(Constants.Tokenizer, tokenizerClass);
+    if (tokenizerPath != null) {
+      conf.set(Constants.TokenizerData, tokenizerPath);
+    }
     conf.set(Constants.DocnoMappingClass, TrecDocnoMapping.class.getCanonicalName());
     conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());
 
@@ -121,6 +128,6 @@ public int run(String[] args) throws Exception {
    * Dispatches command-line arguments to the tool via the {@code ToolRunner}.
    */
   public static void main(String[] args) throws Exception {
-    ToolRunner.run(new Configuration(), new PreprocessTREC9Chinese(), args);
+    ToolRunner.run(new Configuration(), new PreprocessTRECChinese(), args);
   }
 }
diff --git a/src/java/main/ivory/core/tokenize/BigramChineseTokenizer.java b/src/java/main/ivory/core/tokenize/BigramChineseTokenizer.java
@@ -0,0 +1,92 @@
+package ivory.core.tokenize;
+
+import ivory.core.Constants;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.mortbay.log.Log;
+
+import edu.stanford.nlp.ie.crf.CRFClassifier;
+import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
+
+/**
+ * Tokenizer that emits overlapping character bigrams instead of segmented words.
+ *
+ * <p>Input is split on whitespace, every chunk is lower-cased, and each pair of adjacent
+ * {@code char} values inside a chunk becomes one token. Bigrams never span a whitespace
+ * boundary, and a chunk shorter than two characters yields no token. Because the unit is a
+ * Java {@code char}, a character outside the Basic Multilingual Plane is handled as its two
+ * surrogate halves.
+ *
+ * <p>No model or other external data is read, so both {@code configure} methods are no-ops.
+ */
+public class BigramChineseTokenizer extends Tokenizer {
+  private static final Logger LOG = Logger.getLogger(BigramChineseTokenizer.class);
+  static {
+    LOG.setLevel(Level.INFO);
+  }
+
+  public BigramChineseTokenizer() {
+    super();
+  }
+
+  /** No-op: this tokenizer has nothing to configure. */
+  @Override
+  public void configure(Configuration conf) { }
+
+  /** No-op: this tokenizer has nothing to configure and reads nothing from {@code fs}. */
+  @Override
+  public void configure(Configuration conf, FileSystem fs) { }
+
+  /**
+   * Breaks {@code text} into lower-cased character bigrams.
+   *
+   * @param text text to tokenize; must not be {@code null}
+   * @return the bigrams in order of appearance; empty when no whitespace-delimited chunk has
+   *         at least two characters
+   */
+  @Override
+  public String[] processContent(String text) {
+    List<String> tokens = new ArrayList<String>();
+    for (String chunk : text.split("\\s+")) {
+      chunk = chunk.toLowerCase();
+      // Start at 1 so that every step pairs the current character with its predecessor.
+      for (int i = 1; i < chunk.length(); i++) {
+        tokens.add(chunk.substring(i - 1, i + 1));
+      }
+    }
+    return tokens.toArray(new String[tokens.size()]);
+  }
+
+  /**
+   * Command-line driver: tokenizes a UTF-8 text file line by line and writes, for each input
+   * line, one output line holding its bigrams, each followed by a single space.
+   *
+   * @param args {@code args[0]} is the input file, {@code args[1]} is the output file
+   * @throws IOException if either file cannot be opened, read or written
+   */
+  public static void main(String[] args) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException{
+    if (args.length < 2) {
+      System.err.println("usage: [input] [output-file]");
+      System.exit(-1);
+    }
+    Tokenizer tokenizer = new BigramChineseTokenizer();
+    // Open the input first so that an unreadable input does not truncate the output file.
+    BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF8"));
+    try {
+      BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), "UTF8"));
+      try {
+        String line;
+        while ((line = in.readLine()) != null) {
+          StringBuilder sb = new StringBuilder();
+          for (String token : tokenizer.processContent(line)) {
+            sb.append(token).append(' ');
+          }
+          sb.append('\n');
+          out.write(sb.toString());
+        }
+      } finally {
+        // Closing also flushes buffered output, even when tokenizing or reading failed.
+        out.close();
+      }
+    } finally {
+      in.close();
+    }
+  }
+
+}
diff --git a/src/java/main/ivory/sqe/querygenerator/DefaultBagOfWordQueryGenerator.java b/src/java/main/ivory/sqe/querygenerator/DefaultBagOfWordQueryGenerator.java
@@ -2,18 +2,22 @@
 
 import java.io.IOException;
 
+import ivory.core.tokenize.BigramChineseTokenizer;
 import ivory.core.tokenize.GalagoTokenizer;
 import ivory.core.tokenize.Tokenizer;
 import ivory.core.tokenize.TokenizerFactory;
 import ivory.sqe.retrieval.Constants;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.Logger;
 import org.json.JSONArray;
 import org.json.JSONException;
 import org.json.JSONObject;
 
 public class DefaultBagOfWordQueryGenerator implements QueryGenerator {
+  // Keyed to this class so that log output is attributed to DefaultBagOfWordQueryGenerator.
+  private static final Logger LOG = Logger.getLogger(DefaultBagOfWordQueryGenerator.class);
   Tokenizer tokenizer;
   int length;
 
@@ -22,16 +26,27 @@ public DefaultBagOfWordQueryGenerator() {
   }
 
 	public void init(FileSystem fs, Configuration conf) throws IOException {
-		String lang = conf.get(Constants.Language);
-	  if(lang == null || lang.equals(Constants.English)){
-			tokenizer = new GalagoTokenizer();		
-		}else if(lang.equals(Constants.German)){
-			tokenizer = TokenizerFactory.createTokenizer(conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
-		}else if(lang.equals(Constants.Chinese)){
-			tokenizer = TokenizerFactory.createTokenizer(conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
-		}else{
-			throw new RuntimeException("Language code "+conf.get(Constants.Language)+ " not known");
-		}
+    // Chooses the tokenizer from the configured language and tokenizer-data path.
+    //  - no language configured: GalagoTokenizer
+    //  - English: factory tokenizer if the data path exists, otherwise GalagoTokenizer
+    //  - German: always the factory tokenizer
+    //  - Chinese: factory tokenizer if the data path exists, otherwise BigramChineseTokenizer
+    //  - any other language code: RuntimeException
+    // An unset tokenizer-data path is treated the same as a path that does not exist.
+    String lang = conf.get(Constants.Language);
+    String tokenizerPath = conf.get(Constants.TokenizerData);
+    if (lang == null) {
+      // An unset language has always meant the English default; keep that instead of an NPE.
+      tokenizer = new GalagoTokenizer();
+    }else if (lang.equals(Constants.English)) {
+      // The null check comes first because Path rejects a null string.
+      if (tokenizerPath == null || !fs.exists(new Path(tokenizerPath))) {
+        LOG.info("Tokenizer path "+tokenizerPath+" doesn't exist -- using GalagoTokenizer");
+        tokenizer = new GalagoTokenizer();
+      }else {
+        tokenizer = TokenizerFactory.createTokenizer(lang, tokenizerPath, null);
+      }
+    }else if (lang.equals(Constants.German)) {
+      tokenizer = TokenizerFactory.createTokenizer(lang, tokenizerPath, null);
+    }else if (lang.equals(Constants.Chinese)) {
+      if (tokenizerPath == null || !fs.exists(new Path(tokenizerPath))) {
+        LOG.info("Tokenizer path "+tokenizerPath+" doesn't exist -- using BigramChineseTokenizer");
+        tokenizer = new BigramChineseTokenizer();
+      }else {
+        tokenizer = TokenizerFactory.createTokenizer(lang, tokenizerPath, null);
+      }
+    }else {
+      throw new RuntimeException("Language code "+lang+ " not known");
+    }
 	}
 
   public JSONObject parseQuery(String query){