Skip to content

Commit

Permalink
Merge branch 'ferhanture-master' into clpwsim-lsh
Browse files Browse the repository at this point in the history
  • Loading branch information
jimmy0017 committed Jan 26, 2012
2 parents c8e3d1a + 778e0c4 commit e474286
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@
import edu.umd.cloud9.collection.trec.TrecDocnoMapping;
import edu.umd.cloud9.collection.trec.TrecDocumentInputFormat2;

public class PreprocessTREC9Chinese extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(PreprocessTREC9Chinese.class);
public class PreprocessTRECChinese extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(PreprocessTRECChinese.class);

private static int printUsage() {
System.out.println("usage: [input-path] [index-path] [stanford-segmenter-model-path]");
System.out.println("usage: [input-path] [index-path] [tokenizer-class] [stanford-segmenter-model-path]");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
Expand All @@ -52,17 +52,22 @@ private static int printUsage() {
* Runs this tool.
*/
public int run(String[] args) throws Exception {
if (args.length != 3) {
if (args.length < 3) {
printUsage();
return -1;
}

String collection = args[0];
String indexRootPath = args[1];
String tokenizerPath = args[2];
LOG.info("Tool name: " + PreprocessTREC9Chinese.class.getCanonicalName());
String tokenizerClass = args[2];
String tokenizerPath = null;
if (args.length == 4) {
tokenizerPath = args[3];
}
LOG.info("Tool name: " + PreprocessTRECChinese.class.getCanonicalName());
LOG.info(" - Collection path: " + collection);
LOG.info(" - Index path: " + indexRootPath);
LOG.info(" - Tokenizer class: " + tokenizerClass);
LOG.info(" - Tokenizer path: " + tokenizerPath);

Configuration conf = getConf();
Expand Down Expand Up @@ -96,8 +101,10 @@ public int run(String[] args) throws Exception {
conf.set(Constants.CollectionPath, collection);
conf.set(Constants.IndexPath, indexRootPath);
conf.set(Constants.InputFormat, TrecDocumentInputFormat2.class.getCanonicalName());
conf.set(Constants.Tokenizer, StanfordChineseTokenizer.class.getCanonicalName());
conf.set(Constants.TokenizerData, tokenizerPath);
conf.set(Constants.Tokenizer, tokenizerClass);
if (tokenizerPath != null) {
conf.set(Constants.TokenizerData, tokenizerPath);
}
conf.set(Constants.DocnoMappingClass, TrecDocnoMapping.class.getCanonicalName());
conf.set(Constants.DocnoMappingFile, env.getDocnoMappingData().toString());

Expand All @@ -121,6 +128,6 @@ public int run(String[] args) throws Exception {
* Dispatches command-line arguments to the tool via the {@code ToolRunner}.
*/
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new PreprocessTREC9Chinese(), args);
ToolRunner.run(new Configuration(), new PreprocessTRECChinese(), args);
}
}
92 changes: 92 additions & 0 deletions src/java/main/ivory/core/tokenize/BigramChineseTokenizer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package ivory.core.tokenize;

import ivory.core.Constants;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.mortbay.log.Log;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;

/**
 * Tokenizer that emits overlapping character bigrams. Input text is split on
 * whitespace; each chunk is lowercased and every pair of adjacent characters
 * within a chunk becomes one token. Chunks shorter than two characters
 * contribute no tokens. Requires no external model or configuration.
 */
public class BigramChineseTokenizer extends Tokenizer {
  private static final Logger LOG = Logger.getLogger(BigramChineseTokenizer.class);
  static {
    LOG.setLevel(Level.INFO);
  }

  public BigramChineseTokenizer() {
    super();
  }

  // No configuration needed: bigram segmentation is purely character-based.
  @Override
  public void configure(Configuration conf) { }

  @Override
  public void configure(Configuration conf, FileSystem fs) { }

  /**
   * Tokenizes {@code text} into lowercase character bigrams.
   *
   * @param text input text; chunks are delimited by runs of whitespace
   * @return array of bigram tokens in order of appearance (empty if no chunk
   *         has at least two characters)
   */
  @Override
  public String[] processContent(String text) {
    List<String> tokens = new ArrayList<String>();
    for (String chunk : text.split("\\s+")) {
      chunk = chunk.toLowerCase();
      // One token per adjacent character pair: chars [i-1, i+1).
      for (int i = 1; i < chunk.length(); i++) {
        tokens.add(chunk.substring(i - 1, i + 1));
      }
    }
    return tokens.toArray(new String[tokens.size()]);
  }

  /**
   * Command-line driver: reads UTF-8 text from {@code args[0]}, writes the
   * space-joined bigram tokens of each line (with a trailing space before the
   * newline, matching the original output format) to {@code args[1]}.
   */
  public static void main(String[] args) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    if (args.length < 2) {
      System.err.println("usage: [input] [output-file]");
      System.exit(-1);
    }
    Tokenizer tokenizer = new BigramChineseTokenizer();
    BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF8"));
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), "UTF8"));
    try {
      String line = null;
      while ((line = in.readLine()) != null) {
        // Build each output line with StringBuilder instead of repeated
        // String concatenation (avoids O(n^2) copying on long lines).
        StringBuilder sb = new StringBuilder();
        for (String token : tokenizer.processContent(line)) {
          sb.append(token).append(' ');
        }
        sb.append('\n');
        out.write(sb.toString());
      }
    } finally {
      // Close both streams even on error; the original leaked the reader.
      in.close();
      out.close();
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,22 @@

import java.io.IOException;

import ivory.core.tokenize.BigramChineseTokenizer;
import ivory.core.tokenize.GalagoTokenizer;
import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import ivory.sqe.retrieval.Constants;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

public class DefaultBagOfWordQueryGenerator implements QueryGenerator {
private static final Logger LOG = Logger.getLogger(CLWordQueryGenerator.class);
Tokenizer tokenizer;
int length;

Expand All @@ -22,16 +26,27 @@ public DefaultBagOfWordQueryGenerator() {
}

public void init(FileSystem fs, Configuration conf) throws IOException {
String lang = conf.get(Constants.Language);
if(lang == null || lang.equals(Constants.English)){
tokenizer = new GalagoTokenizer();
}else if(lang.equals(Constants.German)){
tokenizer = TokenizerFactory.createTokenizer(conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
}else if(lang.equals(Constants.Chinese)){
tokenizer = TokenizerFactory.createTokenizer(conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
}else{
throw new RuntimeException("Language code "+conf.get(Constants.Language)+ " not known");
}
String lang = conf.get(Constants.Language);
String tokenizerPath = conf.get(Constants.TokenizerData);
if (lang.equals(Constants.English)) {
if (!fs.exists(new Path(tokenizerPath))) {
LOG.info("Tokenizer path "+tokenizerPath+" doesn't exist -- using GalagoTokenizer");
tokenizer = new GalagoTokenizer();
}else {
tokenizer = TokenizerFactory.createTokenizer(lang, tokenizerPath, null);
}
}else if (lang.equals(Constants.German)) {
tokenizer = TokenizerFactory.createTokenizer(conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
}else if (lang.equals(Constants.Chinese)) {
if (!fs.exists(new Path(tokenizerPath))) {
LOG.info("Tokenizer path "+tokenizerPath+" doesn't exist -- using BigramChineseTokenizer");
tokenizer = new BigramChineseTokenizer();
}else {
tokenizer = TokenizerFactory.createTokenizer(conf.get(Constants.Language), conf.get(Constants.TokenizerData), null);
}
}else {
throw new RuntimeException("Language code "+lang+ " not known");
}
}

public JSONObject parseQuery(String query){
Expand Down

0 comments on commit e474286

Please sign in to comment.