Compared results to standard Ivory SMRF with DefaultBoW and verified correctness, on robust04
commit 69552f8bb739f8084bd0cf2f815d43240a88c24c (1 parent: d294956)
authored by @ferhanture
src/java/main/ivory/core/tokenize/TokenizerFactory.java (27 changes)
@@ -33,6 +33,33 @@ public static Tokenizer createTokenizer(String lang, String modelPath, Vocabular
}
}
+ public static Tokenizer createTokenizer(FileSystem fs, String lang, String modelPath, VocabularyWritable vocab){
+ Configuration conf = new Configuration();
+ try {
+ if(lang.equals("zh")){
+ StanfordChineseTokenizer fTok = new StanfordChineseTokenizer();
+ conf.set("Ivory.TokenizerModel", modelPath); //can't use tokenizer file because it points to local path, which isn't supported in StanfordTokenizer at the moment
+ fTok.configure(conf, fs);
+ return fTok;
+ }else if(lang.equals("de") || lang.equals("en")){
+ OpenNLPTokenizer fTok = new OpenNLPTokenizer();
+ conf.set("Ivory.Lang", lang);
+ conf.set("Ivory.TokenizerModel", modelPath);
+ fTok.configure(conf, fs);
+ if(vocab!=null){
+ fTok.setVocab(vocab);
+ }
+
+ return fTok;
+ }else{
+ throw new RuntimeException("Unknown language code: "+lang);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw new RuntimeException(e);
+ }
+ }
+
public static Tokenizer createTokenizer(FileSystem fs, Configuration conf, String lang, String modelPath, VocabularyWritable vocab){
try {
if(lang.equals("zh")){
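For context, a minimal sketch of calling the new Configuration-free overload; the model path below is hypothetical, and passing a null vocabulary simply skips vocabulary filtering:

import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class TokenizerFactoryDemo {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // "en" routes to OpenNLPTokenizer in the new overload; the model path is hypothetical.
    Tokenizer tok = TokenizerFactory.createTokenizer(fs, "en", "/path/to/en-token.bin", null);
    for (String token : tok.processContent("compared results on robust04")) {
      System.out.println(token);
    }
  }
}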
src/java/main/ivory/sqe/querygenerator/ClQueryGenerator.java (13 changes)
@@ -22,13 +22,16 @@
private TTable_monolithic_IFAs f2eProbs;
private int length;
- public ClQueryGenerator(FileSystem fs, Configuration conf) throws IOException {
+ public ClQueryGenerator() throws IOException {
super();
- fVocab_f2e = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("Ivory.F_Vocab_F2E")), fs);
- eVocab_f2e = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("Ivory.E_Vocab_F2E")), fs);
+ }
+
+ public void init(FileSystem fs, String[] args) throws IOException {
+ fVocab_f2e = (VocabularyWritable) HadoopAlign.loadVocab(new Path(args[2]), fs);
+ eVocab_f2e = (VocabularyWritable) HadoopAlign.loadVocab(new Path(args[3]), fs);
- tokenizer = TokenizerFactory.createTokenizer(fs, conf, "en", conf.get("Ivory.TokenizerModel"), fVocab_f2e);
- f2eProbs = new TTable_monolithic_IFAs(fs, new Path(conf.get("Ivory.TTable_F2E")), true);
+ f2eProbs = new TTable_monolithic_IFAs(fs, new Path(args[4]), true);
+ tokenizer = TokenizerFactory.createTokenizer(fs, "en", args[5], fVocab_f2e);
}
public JSONObject parseQuery(String query) {
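ClQueryGenerator now receives its resources through init(fs, args) instead of a Configuration. A minimal sketch of the new call sequence, with the argument layout mirroring usage 1 of QueryEngineHDFS (all paths hypothetical):

import ivory.sqe.querygenerator.ClQueryGenerator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class ClQueryGeneratorDemo {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    ClQueryGenerator gen = new ClQueryGenerator();
    gen.init(fs, new String[] {
        "/path/queries.xml",     // args[0], unused by init
        "/path/models.xml",      // args[1], unused by init
        "/path/vocab.f",         // args[2]: F-side vocabulary (F2E)
        "/path/vocab.e",         // args[3]: E-side vocabulary (F2E)
        "/path/ttable.f2e",      // args[4]: translation table
        "/path/en-token.bin"     // args[5]: tokenizer model
    });
    System.out.println(gen.parseQuery("sample query"));
  }
}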
src/java/main/ivory/sqe/querygenerator/DefaultBagOfWordQueryGenerator.java (9 changes)
@@ -1,8 +1,12 @@
package ivory.sqe.querygenerator;
+import java.io.IOException;
+
import ivory.core.tokenize.GalagoTokenizer;
import ivory.core.tokenize.Tokenizer;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@@ -13,9 +17,12 @@
public DefaultBagOfWordQueryGenerator() {
super();
- tokenizer = new GalagoTokenizer();
}
+ public void init(FileSystem fs, String[] args) throws IOException {
+ tokenizer = new GalagoTokenizer();
+ }
+
public JSONObject parseQuery(String query){
String[] tokens = tokenizer.processContent(query);
length = tokens.length;
src/java/main/ivory/sqe/querygenerator/QueryGenerator.java (4 changes)
@@ -1,9 +1,11 @@
package ivory.sqe.querygenerator;
+import java.io.IOException;
+import org.apache.hadoop.fs.FileSystem;
import org.json.JSONObject;
public interface QueryGenerator {
public JSONObject parseQuery(String query);
-
+ public void init(FileSystem fs, String[] args) throws IOException;
public int getQueryLength();
}
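The interface now prescribes a two-phase lifecycle: a no-arg constructor followed by init(fs, args) before any parseQuery call. A hypothetical implementation sketch (the class and operator name below are illustrative only):

package ivory.sqe.querygenerator;

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

public class WhitespaceQueryGenerator implements QueryGenerator {
  private int length;

  public void init(FileSystem fs, String[] args) throws IOException {
    // a real generator would load models or vocabularies from args here
  }

  public JSONObject parseQuery(String query) {
    String[] tokens = query.trim().split("\\s+");
    length = tokens.length;
    try {
      JSONArray arr = new JSONArray();
      for (String t : tokens) {
        arr.put(t);
      }
      JSONObject json = new JSONObject();
      json.put("#combine", arr); // operator name is illustrative
      return json;
    } catch (JSONException e) {
      throw new RuntimeException(e);
    }
  }

  public int getQueryLength() {
    return length;
  }
}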
src/java/main/ivory/sqe/retrieval/PostingsReaderWrapper.java (47 changes)
@@ -30,12 +30,12 @@
protected boolean endOfList = true; // Whether or not we're at the end of the postings list.
protected int lastScoredDocno = 0;
- protected String operator;
+ protected String operator, term;
protected JSONArray values;
protected List<PostingsReaderWrapper> children;
-
- protected boolean ignore = false;
-
+ protected GlobalTermEvidence gte;
+ protected GlobalEvidence ge;
+
public PostingsReaderWrapper(JSONObject query, RetrievalEnvironment env, ScoringFunction scoringFunction, GlobalEvidence ge) throws JSONException {
this.operator = query.keys().next();
this.values = query.getJSONArray(operator);
@@ -79,12 +79,11 @@ public PostingsReaderWrapper(String term, RetrievalEnvironment env, ScoringFunct
////LOG.info("leaf node");
operator = "term";
+ this.term = term;
PostingsList pl = env.getPostingsList(term);
- if (pl == null) {
- ignore = true;
- return;
- }
+
+
//----------BEGIN LOCAL_USAGE
// PostingsListDocSortedNonPositional pl = new PostingsListDocSortedNonPositional();
@@ -112,8 +111,8 @@ public PostingsReaderWrapper(String term, RetrievalEnvironment env, ScoringFunct
//---------END LOCAL_USAGE
postingsReader = pl.getPostingsReader();
- GlobalTermEvidence gte = new GlobalTermEvidence(pl.getDf(), pl.getCf());
- scoringFunction.initialize(gte, ge);
+ gte = new GlobalTermEvidence(pl.getDf(), pl.getCf());
+ this.ge = ge;
lastScoredDocno = 0;
////LOG.info("leaf done.");
@@ -121,21 +120,16 @@ public PostingsReaderWrapper(String term, RetrievalEnvironment env, ScoringFunct
public float computeScore(int curDocno) {
- //shouldn't happen normally, but I'm testing with queries from wt10g and index from trec, so this is possible. i'll just avoid it by returning a score of 0
- if (ignore) {
- return 0;
- }
-
- //System.out.print("Scoring...");
+ //LOG.info("Scoring...");
float score = 0;
// If this is not a leaf node, compute scores from children and combine them w.r.t operator
if (!operator.equals("term")) {
- //System.out.println("non-leaf node");
+ //LOG.info("non-leaf node");
float[] scores = new float[children.size()];
for (int i = 0; i < children.size(); i++) {
scores[i] = children.get(i).computeScore(curDocno);
- //System.out.println("Child "+ i + " score = " + scores[i]);
+ //LOG.info("Child "+ i + " score = " + scores[i]);
}
score = runOperator(scores);
//System.out.println("non-leaf score = " + score);
@@ -154,15 +148,15 @@ public float computeScore(int curDocno) {
if (curDocno == postingsReader.getDocno()) {
tf = postingsReader.getTf();
}
- //System.out.println("term " + operator + " : tf,docno = "+tf+","+curDocno);
((BM25ScoringFunction) scoringFunction).setB(0.3f);
((BM25ScoringFunction) scoringFunction).setK1(0.5f);
int docLen = env.getDocumentLength(curDocno);
- score = scoringFunction.getScore(tf, docLen);
+ this.scoringFunction.initialize(gte, ge);
+ score = scoringFunction.getScore(tf, docLen);
lastScoredDocno = curDocno;
- //System.out.println("leaf score = " + score);
+ //LOG.info("leaf score = " + score);
}
return score;
}
@@ -187,10 +181,6 @@ private float runOperator(float[] scores) {
* @return next smallest docno from posting lists of leaf nodes
*/
public int getNextCandidate(int docno) {
- //shouldn't happen normally, but I'm testing with queries from wt10g and index from trec, so this is possible. i'll just avoid it by returning a score of 0
- if (ignore) {
- return docno;
- }
if (postingsReader == null) { // not a leaf node
for (int i = 0; i < children.size(); i++) {
int nextDocno = children.get(i).getNextCandidate(docno);
@@ -230,12 +220,7 @@ public float getMaxScore() {
return scoringFunction.getMaxScore();
}
- public void setNextCandidate(int docno) {
- //shouldn't happen normally, but I'm testing with queries from wt10g and index from trec, so this is possible. i'll just avoid it by returning a score of 0
- if (ignore) {
- return;
- }
-
+ public void setNextCandidate(int docno) {
// Advance postings reader. Invariant: curPosting will always point to
// the next posting that has not yet been scored.
while (!endOfList && postingsReader.getDocno() < docno) {
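Because a single ScoringFunction instance is now (re)initialized immediately before each getScore call rather than once at construction, every leaf caches its own GlobalTermEvidence. A minimal sketch of that pattern (hypothetical helper class; package locations assumed from the Ivory source):

import ivory.smrf.model.GlobalEvidence;
import ivory.smrf.model.GlobalTermEvidence;
import ivory.smrf.model.score.ScoringFunction;

class LeafScorer {
  private final ScoringFunction scoringFunction; // shared across leaf nodes
  private final GlobalTermEvidence gte;          // per-leaf df/cf, cached at construction
  private final GlobalEvidence ge;               // collection-level statistics

  LeafScorer(ScoringFunction sf, GlobalTermEvidence gte, GlobalEvidence ge) {
    this.scoringFunction = sf;
    this.gte = gte;
    this.ge = ge;
  }

  float score(int tf, int docLen) {
    scoringFunction.initialize(gte, ge); // rebind this leaf's statistics, as in computeScore
    return scoringFunction.getScore(tf, docLen);
  }
}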
src/java/main/ivory/sqe/retrieval/QueryEngine.java (61 changes)
@@ -7,8 +7,12 @@
import ivory.smrf.retrieval.QueryRunner;
import ivory.sqe.querygenerator.ClQueryGenerator;
import ivory.sqe.querygenerator.DefaultBagOfWordQueryGenerator;
+import ivory.sqe.querygenerator.QueryGenerator;
+
import java.io.IOException;
import java.util.Map;
+import java.util.Set;
+
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.conf.Configuration;
@@ -32,14 +36,20 @@
private StructuredQueryRanker ranker;
private Map<String, String> queries;
private FileSystem fs;
- private Configuration conf;
+ private QueryGenerator generator;
- public QueryEngine(String[] args, FileSystem fs, Configuration conf) {
+ public QueryEngine(String[] args, FileSystem fs) {
try {
this.fs = fs;
- this.conf = conf;
- ranker = new StructuredQueryRanker(args[0], fs, 1000);
+ ranker = new StructuredQueryRanker(args[0], fs, 10);
queries = parseQueries(args[1], fs);
+ if (args.length == 6) {
+ generator = new ClQueryGenerator();
+ } else {
+ generator = new DefaultBagOfWordQueryGenerator();
+ }
+ generator.init(fs, args);
+
} catch (IOException e) {
e.printStackTrace();
} catch (ConfigurationException e) {
@@ -106,54 +116,61 @@ public QueryEngine(String[] args, FileSystem fs, Configuration conf) {
return queries;
}
- private void printResults(StructuredQueryRanker ranker, ResultWriter resultWriter) throws IOException {
+ private void printResults(String queryID, StructuredQueryRanker ranker, ResultWriter resultWriter) throws IOException {
DocnoMapping mapping = ranker.getDocnoMapping();
- for (String queryID : queries.keySet()) {
+// for (String queryID : queries.keySet()) {
// Get the ranked list for this query.
Accumulator[] list = ranker.getResults(queryID);
if (list == null) {
LOG.info("null results for: " + queryID);
- continue;
+ return;
}
for ( int i=0; i<list.length; i++) {
resultWriter.println(queryID + " Q0 " + mapping.getDocid(list[i].docno) + " " + (i + 1) + " "
- + list[i].score + " Ivory");
+ + list[i].score + " sqe-bow-robust04");
}
- }
+// }
}
public void runQueries() {
try {
- ClQueryGenerator generator = new ClQueryGenerator(fs, conf );
LOG.info("Parsed "+queries.size()+" queries");
+ ResultWriter resultWriter = new ResultWriter("sqe-bow-trec.txt", false, fs);
for ( String qid : queries.keySet()) {
String query = queries.get(qid);
LOG.info("Query "+qid+" = "+query);
JSONObject structuredQuery = generator.parseQuery(query);
-
- LOG.info(structuredQuery);
-
+
long start = System.currentTimeMillis();
ranker.rank(qid, structuredQuery, generator.getQueryLength());
long end = System.currentTimeMillis();
LOG.info("Ranking " + qid + ": " + ( end - start) + "ms");
+ printResults(qid, ranker, resultWriter);
}
-
- // Where should we output these results?
-// Node model = models.get(modelID);
-// String fileName = XMLTools.getAttributeValue(model, "output", null);
-// boolean compress = XMLTools.getAttributeValue(model, "compress", false);
-
- ResultWriter resultWriter = new ResultWriter("sqe-bow-trec.txt", false, fs);
- printResults(ranker, resultWriter);
- resultWriter.flush();
+ resultWriter.flush();
} catch (IOException e) {
e.printStackTrace();
}
}
+
+
+
+ public Map<String, Accumulator[]> getResults() {
+ return ranker.getResults();
+ }
+
+ public DocnoMapping getDocnoMapping() {
+ try {
+ return ranker.getDocnoMapping();
+ } catch (IOException e) {
+ e.printStackTrace();
+ throw new RuntimeException(e);
+ }
+ }
+
}
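A minimal sketch of driving the reworked QueryEngine directly in two-argument bag-of-words mode (paths hypothetical; per the diff, args[0] feeds StructuredQueryRanker and args[1] is the queries file):

import ivory.smrf.retrieval.Accumulator;
import ivory.sqe.retrieval.QueryEngine;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class QueryEngineDemo {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    // Two args selects DefaultBagOfWordQueryGenerator; six would select ClQueryGenerator.
    QueryEngine qe = new QueryEngine(new String[] { "/path/to/index", "/path/to/queries.xml" }, fs);
    qe.runQueries(); // ranks each query and writes sqe-bow-trec.txt
    Map<String, Accumulator[]> results = qe.getResults();
    System.out.println("ranked lists for " + results.size() + " queries");
  }
}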
src/java/main/ivory/sqe/retrieval/QueryEngineHDFS.java (18 changes)
@@ -30,7 +30,7 @@ public void run(JobConf conf, Reporter reporter) throws IOException {
QueryEngine qe;
try {
LOG.info("Initializing QueryEngine...");
- qe = new QueryEngine(args, fs, conf);
+ qe = new QueryEngine(args, fs);
LOG.info("Running the queries ...");
long start = System.currentTimeMillis();
qe.runQueries();
@@ -44,8 +44,9 @@ public void run(JobConf conf, Reporter reporter) throws IOException {
}
public int run(String[] args) throws Exception {
- if (args.length != 6) {
- System.out.println("usage: [queries-file] [models-file] [vocab-f-file] [vocab-e-file] [ttable-f2e-file] [tokenizer-model-file]");
+ if (args.length != 2 && args.length != 6) {
+ System.out.println("usage 1: [queries-file] [models-file] [vocab-f-file] [vocab-e-file] [ttable-f2e-file] [tokenizer-model-file]");
+ System.out.println("usage 2: [queries-file] [models-file]");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
@@ -64,10 +65,13 @@ public int run(String[] args) throws Exception {
conf.set("args", argsStr);
conf.set("mapred.child.java.opts", "-Xmx16g");
- conf.set("Ivory.F_Vocab_F2E", args[2]);
- conf.set("Ivory.E_Vocab_F2E", args[3]);
- conf.set("Ivory.TTable_F2E", args[4]);
- conf.set("Ivory.TokenizerModel", args[5]);
+
+// if (args.length == 6) {
+// conf.set("Ivory.F_Vocab_F2E", args[2]);
+// conf.set("Ivory.E_Vocab_F2E", args[3]);
+// conf.set("Ivory.TTable_F2E", args[4]);
+// conf.set("Ivory.TokenizerModel", args[5]);
+// }
LOG.info("argsStr: " + argsStr);
JobClient client = new JobClient(conf);
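A minimal sketch of launching the job in six-argument cross-lingual mode, assuming QueryEngineHDFS implements Hadoop's Tool interface as its run(String[]) signature suggests (all paths hypothetical):

import ivory.sqe.retrieval.QueryEngineHDFS;
import org.apache.hadoop.util.ToolRunner;

public class RunQueryEngineHDFS {
  public static void main(String[] args) throws Exception {
    int rc = ToolRunner.run(new QueryEngineHDFS(), new String[] {
        "/path/queries.xml",   // queries-file
        "/path/models.xml",    // models-file
        "/path/vocab.f",       // vocab-f-file
        "/path/vocab.e",       // vocab-e-file
        "/path/ttable.f2e",    // ttable-f2e-file
        "/path/en-token.bin"   // tokenizer-model-file
    });
    System.exit(rc);
  }
}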
src/java/main/ivory/sqe/retrieval/StructuredQueryRanker.java (32 changes)
@@ -7,13 +7,17 @@
import ivory.smrf.retrieval.Accumulator;
import java.io.IOException;
import java.util.HashMap;
+import java.util.Map;
import java.util.PriorityQueue;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.log4j.Logger;
import org.json.JSONException;
import org.json.JSONObject;
import edu.umd.cloud9.collection.DocnoMapping;
public class StructuredQueryRanker {
+ private static final Logger LOG = Logger.getLogger(StructuredQueryRanker.class);
+
private RetrievalEnvironment env;
private Accumulator[] accumulators = null;
private final PriorityQueue<Accumulator> sortedAccumulators = new PriorityQueue<Accumulator>();
@@ -69,7 +73,9 @@ public StructuredQueryRanker(String indexPath, FileSystem fs, int numResults) th
score = structureReader.computeScore(docno);
cnt++;
if (cnt % 10000 == 0) {
- System.out.println(cnt + " docs processed = "+docno);
+// LOG.info(cnt + " docs processed = "+docno);
+// LOG.info(scoreThreshold);
+// LOG.info(sortedAccumulators);
}
// Keep track of numResults best accumulators.
if (score > scoreThreshold) {
@@ -84,6 +90,12 @@ public StructuredQueryRanker(String indexPath, FileSystem fs, int numResults) th
a = accumulators[sortedAccumulators.size()];
}
}
+// if (cnt % 10000 == 0) {
+// LOG.info(a);
+// LOG.info(scoreThreshold);
+// LOG.info(sortedAccumulators);
+// LOG.info("======================");
+// }
// Advance to next document
docno = Integer.MAX_VALUE;
@@ -94,14 +106,16 @@ public StructuredQueryRanker(String indexPath, FileSystem fs, int numResults) th
}
// Grab the accumulators off the stack, in (reverse) order.
- Accumulator[] results = new Accumulator[Math.min(numResults, sortedAccumulators.size())];
- for (int i = 0; i < results.length; i++) {
- results[results.length - 1 - i] = sortedAccumulators.poll();
+ Accumulator[] accs = new Accumulator[Math.min(numResults, sortedAccumulators.size())];
+ for (int i = 0; i < accs.length; i++) {
+ Accumulator acc = sortedAccumulators.poll();
+// LOG.info((results.length - 1 - i)+"="+acc);
+ accs[accs.length - 1 - i] = acc;
}
- this.results.put(qid, results);
+ this.results.put(qid, accs);
- return results;
+ return accs;
}
public DocnoMapping getDocnoMapping() throws IOException {
@@ -115,4 +129,10 @@ public DocnoMapping getDocnoMapping() throws IOException {
}
+
+public Map<String, Accumulator[]> getResults() {
+ return results;
+}
+
+
}