Skip to content

Commit

Permalink
Fixed issues with Wikipedia integration tests.
Browse files (browse the repository at this point in the history)
  • Loading branch information
jimmy0017 committed Feb 1, 2012
1 parent 033ff7e commit f960e91
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 16 deletions.
2 changes: 1 addition & 1 deletion ivy/ivy.xml
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,7 @@
<exclude name="ant"/>
<exclude name="hadoop-ant"/>
</dependency>
<dependency org="edu.umd" name="cloud9" rev="1.2.10">
<dependency org="edu.umd" name="cloud9" rev="1.2.11">
<exclude name="ant"/>
<exclude org="org.apache.hadoop"/>
<exclude org="com.cloudera.hadoop"/>
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,6 +8,7 @@

import java.util.List;
import java.util.Map;
import java.util.Random;

import junit.framework.JUnit4TestAdapter;

Expand All @@ -25,11 +26,14 @@
import edu.umd.cloud9.io.map.HMapSFW;

public class VerifyWikipediaProcessingCrosslingual {
private static final String vocabPath = "vocab";
private static final Random rand = new Random();
private static final String tmp = "tmp-" + VerifyWikipediaProcessingCrosslingual.class.getSimpleName() + rand.nextInt(10000);

private static final String vocabPath = tmp + "/vocab";
private static final String enwikiPath =
"/shared/collections/wikipedia/raw/enwiki-20110115-pages-articles.xml";
private static final String enwikiRepacked = "enwiki-20110115.repacked";
private static final String enwikiEn = "enwiki.en";
private static final String enwikiRepacked = tmp + "/enwiki-20110115.repacked";
private static final String enwikiEn = tmp + "/enwiki.en";

// en side: part 00000, key = 92101
private ImmutableMap<String, Float> enTermDocVector1 = ImmutableMap.of(
Expand All @@ -49,8 +53,8 @@ public class VerifyWikipediaProcessingCrosslingual {

private static final String dewikiPath =
"/shared/collections/wikipedia/raw/dewiki-20110131-pages-articles.xml";
private static final String dewikiRepacked = "dewiki-20110131.repacked";
private static final String dewikiEn = "dewiki.en";
private static final String dewikiRepacked = tmp + "/dewiki-20110131.repacked";
private static final String dewikiEn = tmp + "/dewiki.en";

// de side: part 00000, key = 1001242228
private ImmutableMap<String, Float> deTermDocVector1 = ImmutableMap.of(
Expand All @@ -77,6 +81,7 @@ public void runBuildIndexEnSide() throws Exception {

fs.delete(new Path(enwikiEn), true);
fs.delete(new Path(enwikiRepacked), true);
fs.delete(new Path(vocabPath), true);

fs.copyFromLocalFile(false, true, new Path("data/vocab"), new Path(vocabPath));

Expand All @@ -100,7 +105,7 @@ public void runBuildIndexEnSide() throws Exception {
IntegrationUtils.D_JT, IntegrationUtils.D_NN,
enwikiEn, enwikiPath, enwikiRepacked,
ivory.core.tokenize.OpenNLPTokenizer.class.getCanonicalName(), "en",
"vocab/en-token.bin", "vocab/vocab.en-de.en"});
vocabPath + "/en-token.bin", vocabPath + "/vocab.en-de.en"});
}

@Test
Expand Down Expand Up @@ -152,6 +157,7 @@ public void runBuildIndexDeSide() throws Exception {

fs.delete(new Path(dewikiEn), true);
fs.delete(new Path(dewikiRepacked), true);
fs.delete(new Path(vocabPath), true);

fs.copyFromLocalFile(false, true, new Path("data/vocab"), new Path(vocabPath));

Expand All @@ -175,8 +181,9 @@ public void runBuildIndexDeSide() throws Exception {
IntegrationUtils.D_JT, IntegrationUtils.D_NN,
dewikiEn, dewikiPath, dewikiRepacked,
ivory.core.tokenize.OpenNLPTokenizer.class.getCanonicalName(), "de",
"vocab/de-token.bin", "vocab/vocab.de-en.de", "vocab/vocab.de-en.en", "vocab/ttable.de-en",
"vocab/vocab.en-de.en", "vocab/vocab.en-de.de", "vocab/ttable.en-de"});
vocabPath + "/de-token.bin",
vocabPath + "/vocab.de-en.de", vocabPath + "/vocab.de-en.en", vocabPath + "/ttable.de-en",
vocabPath + "/vocab.en-de.en", vocabPath + "/vocab.en-de.de", vocabPath + "/ttable.en-de"});
}

@Test
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,6 +8,7 @@

import java.util.List;
import java.util.Map;
import java.util.Random;

import junit.framework.JUnit4TestAdapter;

Expand All @@ -25,10 +26,13 @@
import edu.umd.cloud9.io.map.HMapSFW;

public class VerifyWikipediaProcessingMonolingual {
private static final Random rand = new Random();
private static final String tmp = "tmp-" + VerifyWikipediaProcessingMonolingual.class.getSimpleName() + rand.nextInt(10000);

private static final String collectionPath =
"/shared/collections/wikipedia/raw/enwiki-20110115-pages-articles.xml";
private static final String collectionRepacked = "enwiki-20110115.repacked";
private static final String galagoIndex = "enwiki.galago";
private static final String collectionRepacked = tmp + "/enwiki-20110115.repacked";
private static final String galagoIndex = tmp + "/enwiki.galago";

// Galago: part 00000, key = 92101
private ImmutableMap<String, Float> galagoTermDocVector1 = ImmutableMap.of(
Expand All @@ -46,8 +50,8 @@ public class VerifyWikipediaProcessingMonolingual {
private ImmutableMap<Integer, Float> galagoIntDocVector2 =
ImmutableMap.of(2, 0.003051088f, 156, 0.03952723f, 2726, 0.08285294f, 402710, 0.20997283f);

private static final String opennlpIndex = "enwiki.opennlp";
private static final String vocabPath = "vocab";
private static final String opennlpIndex = tmp + "/enwiki.opennlp";
private static final String vocabPath = tmp + "/vocab";

// Opennlp: part 00000, key = 92101
private ImmutableMap<String, Float> opennlpTermDocVector1 = ImmutableMap.of(
Expand Down Expand Up @@ -146,6 +150,7 @@ public void runBuildIndexOpennlp() throws Exception {

fs.delete(new Path(opennlpIndex), true);
fs.delete(new Path(collectionRepacked), true);
fs.delete(new Path(vocabPath), true);

fs.copyFromLocalFile(false, true, new Path("data/vocab"), new Path(vocabPath));

Expand Down
5 changes: 3 additions & 2 deletions src/java/main/ivory/core/driver/PreprocessWikipedia.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -96,8 +96,8 @@ public int run(String[] args) throws Exception {
String collectionLang = null, tokenizerModel = null, collectionVocab = null,
fVocab_f2e = null, eVocab_f2e = null, fVocab_e2f, eVocab_e2f = null, ttable_f2e = null, ttable_e2f = null;
String indexRootPath = args[0];
String rawCollection = args[1]; //"/shared/Wikipedia/raw/dewiki-20100117-pages-articles.xml";
String seqCollection = args[2]; //"/umd-lin/fture/pwsim/de-wikipedia/compressed.block/de-20100117";
String rawCollection = args[1];
String seqCollection = args[2];
String tokenizerClass = args[3];
if (args.length > 4) {
collectionLang = args[4];
Expand Down Expand Up @@ -144,6 +144,7 @@ public int run(String[] args) throws Exception {
LOG.info(" - Compressed collection path: " + seqCollection);
LOG.info(" - Collection language: " + collectionLang);
LOG.info(" - Tokenizer class: " + tokenizerClass);
LOG.info(" - Tokenizer model: " + tokenizerModel);
LOG.info(" - Minimum # terms per article : " + MinNumTermsPerArticle);

if (mode == CROSS_LINGUAL_E || mode == CROSS_LINGUAL_F) {
Expand Down
2 changes: 1 addition & 1 deletion src/java/main/ivory/core/tokenize/OpenNLPTokenizer.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -872,7 +872,7 @@ public String[] processContent(String text) {
for(String token : tokens){
token = removeNonUnicodeChars(token);
if(isDiscard(token)){
sLogger.warn("Discarded stopword "+token);
// sLogger.warn("Discarded stopword "+token);
continue;
}

Expand Down

0 comments on commit f960e91

Please sign in to comment.