Skip to content
This repository has been archived by the owner on May 30, 2023. It is now read-only.

Commit

Permalink
changed tests to use ISO3166 country alpha2, tweaked algorithm to be …
Browse files Browse the repository at this point in the history
…more permissive, added frequency of mention "aboutness" selection (to test)
  • Loading branch information
rahulbot committed Sep 18, 2013
1 parent f220e6b commit 3d2c310
Show file tree
Hide file tree
Showing 11 changed files with 195 additions and 116 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package edu.mit.civic.clavin.resolver;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.berico.clavin.gazetteer.CountryCode;
import com.berico.clavin.resolver.ResolvedLocation;

/**
 * Once we have selected the candidates, we need to pick what country the document is "about".
 * This is the most naive "Aboutness" strategy; it just picks the most mentioned country.
 *
 * @author rahulb
 */
public class FrequencyOfMentionAboutnessStrategy {

    /**
     * Picks the country that the most resolved locations belong to.
     *
     * @param resolvedLocations the locations already chosen for a document; may be
     *                          {@code null} or empty
     * @return a new mutable list holding the single most-mentioned country, or an empty
     *         list when there is nothing to count (never a list containing {@code null});
     *         when several countries share the top count, the one mentioned first in
     *         {@code resolvedLocations} wins
     */
    public static List<CountryCode> select(List<ResolvedLocation> resolvedLocations){
        List<CountryCode> results = new ArrayList<CountryCode>();
        if (resolvedLocations == null || resolvedLocations.isEmpty()) {
            // nothing was resolved, so the document is not "about" any country
            return results;
        }

        // count country mentions
        HashMap<CountryCode,Integer> countryCounts = new HashMap<CountryCode,Integer>();
        for (ResolvedLocation resolvedLocation : resolvedLocations) {
            CountryCode country = countryOf(resolvedLocation);
            if (country == null) {
                // a place with no country cannot vote for one
                continue;
            }
            Integer seenSoFar = countryCounts.get(country);
            countryCounts.put(country, seenSoFar == null ? 1 : seenSoFar + 1);
        }

        // find the most mentioned; walking the input (not the map's key set) makes ties
        // resolve to the first-mentioned country instead of depending on hash order
        CountryCode primaryCountry = null;
        int highestCount = 0;
        for (ResolvedLocation resolvedLocation : resolvedLocations) {
            CountryCode country = countryOf(resolvedLocation);
            if (country == null) {
                continue;
            }
            int mentions = countryCounts.get(country);
            if (mentions > highestCount) {
                highestCount = mentions;
                primaryCountry = country;
            }
        }

        // return results
        if (primaryCountry != null) {
            results.add(primaryCountry);
        }
        return results;
    }

    /** Returns the primary country of a resolved location, or null if it has none. */
    private static CountryCode countryOf(ResolvedLocation resolvedLocation) {
        if (resolvedLocation == null || resolvedLocation.geoname == null) {
            return null;
        }
        return resolvedLocation.geoname.primaryCountryCode;
    }

}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package edu.mit.civic.clavin.resolver.lucene;
package edu.mit.civic.clavin.resolver;

import java.util.ArrayList;
import java.util.List;
Expand All @@ -11,17 +11,24 @@
import com.berico.clavin.gazetteer.GeoName;
import com.berico.clavin.resolver.ResolvedLocation;

import edu.mit.civic.clavin.resolver.lucene.CustomLuceneLocationResolver;

/**
* Employ a variety of heuristics for picking the best candidate, based on what might work
* better for news articles where we care about what _country_ is being reported on.
*
* This is originally modeled on the common co-location + co-occurrence strategy.
*
* Noted Failures: Africa, Del., "Rocky Mountains", names ("Bristol Palin", "Chad")
* Failures I've noticed:
* Africa
* Del.
* "Rocky Mountains"
* Fla doesn't give you Florida
* names ("Bristol Palin", "Chad")
*/
public class NewsHeuristicsStrategy {
public class HeuristicCandidateSelectionStrategy {

private static final Logger logger = LoggerFactory.getLogger(NewsHeuristicsStrategy.class);
private static final Logger logger = LoggerFactory.getLogger(HeuristicCandidateSelectionStrategy.class);

private static final double EXACT_MATCH_CONFIDENCE = 1.0;

Expand Down Expand Up @@ -67,7 +74,7 @@ public static List<ResolvedLocation> select(CustomLuceneLocationResolver resolve
possibilitiesToDo.remove(toRemove);
}
logger.info("Still have "+possibilitiesToDo.size()+" lists to do");

logger.info("Pass 1: Pick countries that might not be an exact match");
possibilitiesToRemove.clear();
for( List<ResolvedLocation> candidates: possibilitiesToDo){
Expand Down Expand Up @@ -131,7 +138,7 @@ candidate.geoname.population>0 && inSameCountry(candidate, bestCandidates)){
for( List<ResolvedLocation> candidates: possibilitiesToDo){
boolean foundOne = false;
for( ResolvedLocation candidate: candidates) {
if(!foundOne &&
if(!foundOne && (candidate.geoname.population>0) &&
(candidate.geoname.featureClass==FeatureClass.A || candidate.geoname.featureClass==FeatureClass.P)){
bestCandidates.add(candidate);
logger.info(" PICKED: "+candidate.location.text+"@"+candidate.location.position);
Expand All @@ -146,12 +153,14 @@ candidate.geoname.population>0 && inSameCountry(candidate, bestCandidates)){
}
logger.info("Still have "+possibilitiesToDo.size()+" lists to do");


logger.info("Pass 5: Pick the top result, preferrring ones in the a country found already (last ditch effort)");
possibilitiesToRemove.clear();
for( List<ResolvedLocation> candidates: possibilitiesToDo){
boolean foundOne = false;
// check for one in the same country
for( ResolvedLocation candidate: candidates) {
if(!foundOne && inSameCountry(candidate,bestCandidates)){
if(!foundOne && inSameCountry(candidate,bestCandidates) ){
bestCandidates.add(candidate);
logger.info(" PICKED: "+candidate.location.text+"@"+candidate.location.position);
logResolvedLocationInfo(candidate);
Expand All @@ -160,11 +169,11 @@ candidate.geoname.population>0 && inSameCountry(candidate, bestCandidates)){
}
}
if(!foundOne){
ResolvedLocation candidate = candidates.get(0);
ResolvedLocation candidate = candidates.get(0);
bestCandidates.add(candidate);
logger.info(" PICKED: "+candidate.location.text+"@"+candidate.location.position);
logResolvedLocationInfo(candidate);
possibilitiesToRemove.add(candidates);
logResolvedLocationInfo(candidate);
possibilitiesToRemove.add(candidates);
}
}
for (List<ResolvedLocation> toRemove: possibilitiesToRemove){
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
import com.berico.clavin.resolver.LocationResolver;
import com.berico.clavin.resolver.ResolvedLocation;

import edu.mit.civic.clavin.resolver.HeuristicCandidateSelectionStrategy;

/*#####################################################################
*
* CLAVIN (Cartographic Location And Vicinity INdexer)
Expand Down Expand Up @@ -158,6 +160,10 @@ private List<ResolvedLocation> getCandidateMatches(LocationOccurrence locationNa
List<ResolvedLocation> exactMatch = new ArrayList<ResolvedLocation>();
exactMatch.add(getFirstExactMatch("People’s Republic of China", locationName));
return exactMatch;
} else if (locationName.text.equals("Thai")){
List<ResolvedLocation> exactMatch = new ArrayList<ResolvedLocation>();
exactMatch.add(getFirstExactMatch("Thailand", locationName));
return exactMatch;
}

// Lucene query used to look for matches based on the
Expand Down Expand Up @@ -252,7 +258,7 @@ private List<ResolvedLocation> getCandidateMatches(LocationOccurrence locationNa
private List<ResolvedLocation> pickBestCandidates(List<List<ResolvedLocation>> allCandidates) {

// initialize return object
List<ResolvedLocation> bestCandidates = NewsHeuristicsStrategy.select(this, allCandidates);
List<ResolvedLocation> bestCandidates = HeuristicCandidateSelectionStrategy.select(this, allCandidates);

return bestCandidates;
}
Expand Down
15 changes: 8 additions & 7 deletions src/main/java/edu/mit/civic/clavin/server/ParseManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import com.berico.clavin.resolver.ResolvedLocation;
import com.google.gson.Gson;

import edu.mit.civic.clavin.resolver.FrequencyOfMentionAboutnessStrategy;
import edu.mit.civic.clavin.resolver.lucene.CustomLuceneLocationResolver;

/**
Expand Down Expand Up @@ -54,7 +55,7 @@ public static String locate(String text) {
try {
HashMap results = new HashMap();
results.put("status",STATUS_OK);
ArrayList locationList = new ArrayList();
ArrayList places = new ArrayList();
List<ResolvedLocation> resolvedLocations = locateRaw(text);
for (ResolvedLocation resolvedLocation: resolvedLocations){
HashMap loc = new HashMap();
Expand All @@ -63,16 +64,16 @@ public static String locate(String text) {
loc.put("id",place.geonameID);
loc.put("name",place.name);
loc.put("countryCode",place.primaryCountryCode.toString());
//loc.put("lat",place.latitude);
//loc.put("lon",place.longitude);
loc.put("lat",place.latitude);
loc.put("lon",place.longitude);
HashMap sourceInfo = new HashMap();
sourceInfo.put("string",resolvedLocation.location.text);
sourceInfo.put("charIndex",resolvedLocation.location.position);
loc.put("source",sourceInfo);
//loc.put("type",place.featureClass.type);
locationList.add(loc);
}
results.put("results",locationList);
places.add(loc);
}
results.put("results",places);
results.put("primaryCountries", FrequencyOfMentionAboutnessStrategy.select(resolvedLocations));
return gson.toJson(results);
} catch (Exception e) {
return getErrorText(e.toString());
Expand Down
34 changes: 14 additions & 20 deletions src/test/java/edu/mit/civic/clavin/MultipleArticleTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,31 +35,25 @@ public void testNewYorkTimesExamples() throws Exception {
verifyArticles(articles);
}

@Test
public void testBBCExamples() throws Exception {
List<CodedArticle> articles = loadExamplesFromFile("src/test/resources/sample-docs/bbc_annotated.json");
assertEquals(24, articles.size());
verifyArticles(articles);
}

@Test
public void testHuffingtonPostExamples() throws Exception {
List<CodedArticle> articles = loadExamplesFromFile("src/test/resources/sample-docs/huffington_post_annotated.json");
assertEquals(21, articles.size());
verifyArticles(articles);
}

@Test
public void testBBCExamples() throws Exception {
List<CodedArticle> articles = loadExamplesFromFile("src/test/resources/sample-docs/bbc_annotated.json");
assertEquals(24, articles.size());
verifyArticles(articles);
}

private void verifyArticles(List<CodedArticle> articles) throws Exception{
for(CodedArticle article: articles){
logger.info("Testing article "+article.mediacloudId+" (looking for "+article.handCodedPlaceName+" / "+article.primaryPlaceId+")");
/*
List<ResolvedLocation> results = ParseManager.locateRaw(article.text);
for(ResolvedLocation resolvedLocation: results){
logger.info(" "+resolvedLocation.geoname.geonameID+": "+resolvedLocation.geoname.name+", "+resolvedLocation.geoname.primaryCountryCode+" ("+resolvedLocation.location.text+" @ "+resolvedLocation.location.position+")");
}
*/
assertTrue("Didn't find "+article.handCodedPlaceName+" ("+article.primaryPlaceId+") in article "+article.mediacloudId,
article.primaryPlaceIsParsed());
logger.info("Testing article "+article.mediacloudId+" (looking for "+article.handCodedPlaceName+" / "+article.handCodedCountryCode+")");
assertTrue("Didn't find "+article.handCodedPlaceName+" ("+article.handCodedCountryCode+") in article "+article.mediacloudId,
article.isHandCodedCountryInResults());
}
}

Expand All @@ -74,14 +68,14 @@ private class CodedArticle{
public int mediacloudId;
public String text;
public String handCodedPlaceName;
public int primaryPlaceId;
public String handCodedCountryCode;

public boolean primaryPlaceIsParsed() throws Exception{
public boolean isHandCodedCountryInResults() throws Exception{
List<ResolvedLocation> results = ParseManager.locateRaw(text);
if(primaryPlaceId==0){ // no places mentioned in article!
if(handCodedCountryCode.length()==0){ // no places mentioned in article!
return true;
} else {
return TestUtils.resultsContainsPlaceId(results, primaryPlaceId);
return TestUtils.isCountryCodeInResolvedLocations(results, handCodedCountryCode);
}
}
}
Expand Down
9 changes: 8 additions & 1 deletion src/test/java/edu/mit/civic/clavin/SpecificCaseTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,14 @@ public class SpecificCaseTest {
private static final int COUNTRY_NORWAY = 3144096;
private static final int CITY_LONDON = 2643741;
private static final int PLACE_RUSSEL_SQ_LONDON = 6954795;

private static final int COUNTRY_THAILAND = 1605651;

@Test
public void testThailand() throws Exception {
    // Runs the thai.txt sample document through verifyPlacesInFile with Thailand's
    // place id as the only expected id.
    // NOTE(review): presumably verifyPlacesInFile asserts each id is among the
    // resolved places - confirm against its definition.
    final String sampleDocument = "src/test/resources/sample-docs/thai.txt";
    final int[] expectedPlaceIds = new int[] {COUNTRY_THAILAND};
    verifyPlacesInFile(sampleDocument, expectedPlaceIds);
}

@Test
public void testRussellSq() throws Exception {
// picks the right Russel Sq (the one in GB) after we find London in the article
Expand Down
11 changes: 10 additions & 1 deletion src/test/java/edu/mit/civic/clavin/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,14 @@ public static boolean resultsContainsPlaceId(List<ResolvedLocation> results, int
}
return false;
}


/**
 * Tells whether any resolved location's primary country code, rendered with
 * {@code toString()}, equals the given string.
 *
 * @param results       the resolved locations to scan
 * @param countryAlpha2 the country code string to look for
 * @return {@code true} as soon as one location matches, {@code false} if none do
 */
public static boolean isCountryCodeInResolvedLocations(List<ResolvedLocation> results, String countryAlpha2){
    boolean found = false;
    // stop scanning at the first match
    for (int index = 0; index < results.size() && !found; index++) {
        String candidateCode = results.get(index).geoname.primaryCountryCode.toString();
        found = candidateCode.equals(countryAlpha2);
    }
    return found;
}

}
Loading

0 comments on commit 3d2c310

Please sign in to comment.