Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update logic and tests 20-sept-17 #49

Merged
merged 12 commits into from Sep 22, 2017
11 changes: 11 additions & 0 deletions README.md
Expand Up @@ -124,6 +124,17 @@ placeholder > id 85772991
neighbourhood_id: 85772991,
region_id: 85687233 },
names: { eng: [ 'Kelburn' ] } }

placeholder > edges 85632473
[ 85675251,
85675259,
85675261,
85681309,
421182667,
421188405,
890430305,
890441225,
890441463 ]
```

---
Expand Down
2 changes: 1 addition & 1 deletion cmd/jq.filter
@@ -1,6 +1,6 @@
.properties | with_entries(
select( .key |
test("^wof:(id|name|placetype|hierarchy|country_alpha3|abbreviation|superseded_by|label|population|megacity)$"),
test("^wof:(id|name|placetype|hierarchy|parent_id|country_alpha3|abbreviation|superseded_by|label|population|megacity)$"),
test("^lbl:(bbox|latitude|longitude)$"),
test("^geom:(area|bbox|latitude|longitude)$"),
test("^iso:(country)$"),
Expand Down
10 changes: 9 additions & 1 deletion cmd/repl.js
Expand Up @@ -34,11 +34,19 @@ var commands = {
console.timeEnd('took');
cb();
},
edges: function( id, cb ){
console.log( 'edges', '"' + id + '"' );
console.time('took');
console.log( ph.graph.outEdges( id ) );
console.timeEnd('took');
cb();
},
id: function( id, cb ){
console.time('took');
ph.store.get( id, function( err, doc ){
if( err ){ return console.error( err ); }
console.log( ' -', [ doc.id, doc.placetype + ' ', doc.name ].join('\t') );
// console.log( ' -', [ doc.id, doc.placetype + ' ', doc.name ].join('\t') );
console.log( doc );
console.timeEnd('took');
cb();
});
Expand Down
12 changes: 9 additions & 3 deletions cmd/s3_upload.sh
Expand Up @@ -5,6 +5,7 @@ set -euo pipefail
DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd );
DATA_DIR=${PLACEHOLDER_DATA:-"${DIR}/../data"};
BUCKET='s3://pelias-data/placeholder/';
TODAY=`date +%Y-%m-%d`;

echo '--- gzip data files ---';
if type pigz >/dev/null
Expand All @@ -23,8 +24,13 @@ aws s3 cp "${DATA_DIR}/graph.json.gz" "${BUCKET}" --region us-east-1 --acl publi
aws s3 cp "${DATA_DIR}/store.sqlite3.gz" "${BUCKET}" --region us-east-1 --acl public-read;
aws s3 cp "${DATA_DIR}/wof.extract.gz" "${BUCKET}" --region us-east-1 --acl public-read;

echo '--- create archive ---';
aws s3 cp "${BUCKET}graph.json.gz" "${BUCKET}archive/${TODAY}/graph.json.gz" --region us-east-1 --acl public-read;
aws s3 cp "${BUCKET}store.sqlite3.gz" "${BUCKET}archive/${TODAY}/store.sqlite3.gz" --region us-east-1 --acl public-read;
aws s3 cp "${BUCKET}wof.extract.gz" "${BUCKET}archive/${TODAY}/wof.extract.gz" --region us-east-1 --acl public-read;

echo '--- list files ---';
aws s3 ls --human-readable "${BUCKET}";
# 2017-05-05 15:19:08 13.3 MiB graph.json.gz
# 2017-05-05 15:19:48 43.2 MiB store.sqlite3.gz
# 2017-05-05 15:20:30 46.0 MiB wof.extract.gz

echo '--- list archive files ---';
aws s3 ls --human-readable "${BUCKET}archive/${TODAY}/";
22 changes: 19 additions & 3 deletions lib/analysis.js
Expand Up @@ -10,14 +10,30 @@ var lowercase = require('lower-case'),
function normalize( input ){

// sanity check arguments
if( !input || !input.length ){ return []; }
if( typeof input !== 'string' ){ return []; }

// trim input of superfluous whitespace
input = input.trim();

// string is empty
if( 0 === input.length ){ return []; }

// whosonfirst sometimes uses '-1' instead of ''
if( input === '-1' ){ return []; }

// input consists of only numbers
if( /^\d+$/.test( input ) ){ return []; }

// remove certain punctuation
input = input.replace(/[\.]+/g,'');

// replace certain punctuation with spaces
input = input.replace(/[",]+/g,' ');

// remove 'disambiguation' tokens from name suffix
// see: https://github.com/whosonfirst-data/whosonfirst-data/issues/885
input = input.replace(/(\s[-֊־‐‑﹣]|[\(\[]).*$/, '');

// generic synonym contractions
input = input.replace(/\b(sainte)\b/gi, 'ste')
.replace(/\b(saint)\b/gi, 'st')
Expand Down Expand Up @@ -75,9 +91,9 @@ function normalize( input ){
);
}

// replace multiple spaces with a single space
// replace multiple spaces with a single space and trim tokens
return synonyms.map( function( synonym ){
return synonym.replace(/\s{2,}/g, ' ');
return synonym.replace(/\s{2,}/g, ' ').trim();
})
// basic normalization
// note: lowercase MUST be run before removeAccents, please don't change the order
Expand Down
2 changes: 1 addition & 1 deletion prototype/io.js
Expand Up @@ -19,7 +19,7 @@ module.exports.load = function( opts ){
}
};

// load data from disk
// save data to disk
module.exports.save = function( path ){
fs.writeFileSync( graphPath, JSON.stringify( this.graph ) );
this.close();
Expand Down
24 changes: 22 additions & 2 deletions prototype/wof.js
Expand Up @@ -101,11 +101,23 @@ function insertWofRecord( wof, next ){
}

// --- graph ---

// parent_id property (some records have this property set but no hierarchy)
var parentId;
if( wof.hasOwnProperty('wof:parent_id') ){
parentId = wof['wof:parent_id'];
if( 'string' === typeof parentId ){ parentId = parseInt( parentId, 10 ); }
if( !isNaN( parentId ) && parentId !== id && parentId > 0 ){
this.graph.setEdge( parentId, id ); // is child of
}
}

// hierarchy properties
for( var h in wof['wof:hierarchy'] ){
for( var i in wof['wof:hierarchy'][h] ){
var pid = wof['wof:hierarchy'][h][i];
if( 'string' === typeof pid ){ pid = parseInt( pid, 10 ); }
if( pid === id || pid <= 0 ){ continue; }
if( pid === id || pid <= 0 || pid === parentId ){ continue; }
// this.graph.setEdge( id, pid, 'p' ); // has parent
this.graph.setEdge( pid, id ); // is child of
}
Expand Down Expand Up @@ -142,7 +154,15 @@ function isValidWofRecord( id, wof ){
return false;
}

// skip non-current records
/**
skip non-current records

0 signifies a non-current record
1 signifies a current record
-1 signifies an indeterminate state, someone needs to look at this record and decide

note: we are considering -1 values as current (for now)
**/
var isCurrent = wof['mz:is_current'];
if( isCurrent === '0' || isCurrent === 0 ){
return false;
Expand Down
12 changes: 6 additions & 6 deletions test/cases/capitalCities.txt
Expand Up @@ -36,13 +36,13 @@
101750367 London, United Kingdom
890451719 St. George's, Grenada
1091680781 Cayenne, French Guiana
85681223 St Peter Port, Guernsey
85632547 St Peter Port, Guernsey
421168965 Accra, Ghana
101753853 Gibraltar, Gibraltar
101870623 Nuuk, Greenland
421167921 Banjul, Gambia
890444429 Conakry, Guinea
890442797 Basse-Terre, Guadeloupe
890420199 Basse-Terre, Guadeloupe
421178347 Malabo, Equatorial Guinea
421197943 Bissau, Guinea-Bissau
890437279 Hong Kong, Hong Kong
Expand All @@ -69,7 +69,7 @@
890451463 Majuro, Marshall Islands
890491957 Skopje, Macedonia
85681291 Macao, Macao
101750297 Plymouth, Montserrat
1108960813 Plymouth, Montserrat
101752423 Valletta, Malta
85674093 Male, Maldives
421168781 Lilongwe, Malawi
Expand All @@ -80,8 +80,8 @@
101751893 Amsterdam, Netherlands
101751917 Oslo, Norway
85675677 Yaren, Nauru
85632493 Alofi, Niue
85632179 Panama City, Panama
1141909453 Alofi, Niue
890445081 Panama City, Panama
890435983 Saint-Pierre, Saint Pierre and Miquelon
85676471 Melekeok, Palau
890441607 Doha, Qatar
Expand All @@ -91,7 +91,7 @@
101752307 Stockholm, Sweden
102032341 Singapore, Singapore
101752073 Ljubljana, Slovenia
101913495 Bratislava, Slovakia
1108800123 Bratislava, Slovakia
890452049 Freetown, Sierra Leone
85677205 San Marino, San Marino
890449737 Mogadishu, Somalia
Expand Down
12 changes: 6 additions & 6 deletions test/cases/citySearch.txt
Expand Up @@ -181,7 +181,7 @@
101912525 Cacak
101750149 Caen
890458845 Calgary
85675853 Callao
85675817 Callao
85923249 Camarillo
890435969 Camayenne
101750465 Cambridge, England
Expand Down Expand Up @@ -245,7 +245,7 @@
421174425 Chitungwiza
85678795 Chon Buri
102027689 Chongqing
101915501 Christchurch
101914521 Christchurch
85922285 Chula Vista
85940877 Cicero
101712203 Cincinnati
Expand Down Expand Up @@ -345,7 +345,7 @@
101748713 Duisburg
404513247 Duluth
85950181 Dundalk
101914509 Dunedin
101914269 Dunedin
890456493 Durham
404512099 Eagan
101732375 Eau Claire
Expand Down Expand Up @@ -472,7 +472,7 @@
85899245 Hamburg-Mitte
85899247 Hamburg-Nord
101735515 Hamilton, Canada
101914771 Hamilton, NZ
101914271 Hamilton, NZ
101712657 Hamilton, OH
101748739 Hamm
85823723 Hammond
Expand Down Expand Up @@ -662,7 +662,7 @@
85923517 Los Angeles
85929677 Loveland
85950327 Lowell
101916199 Lower Hutt
101914319 Lower Hutt
101750443 Lowestoft
85667669 Luanda
101725261 Lubbock
Expand Down Expand Up @@ -1131,7 +1131,7 @@
102018849 Tangerang
102027111 Tangshan
101751591 Taunton
101915917 Tauranga
101914325 Tauranga
101728059 Taylorsville
85670879 Tebessa
85672313 Tehran
Expand Down
6 changes: 5 additions & 1 deletion test/functional.js
Expand Up @@ -11,8 +11,12 @@ module.exports.tokenize = function(test, util) {

assert('Kelburn Wellington New Zealand', [85772991]);
assert('North Sydney', [85771181, 85784821, 101931469, 102048877, 404225393]);
assert('Sydney New South Wales Australia', [101932003, 404226357]);
assert('Sydney New South Wales Australia', [101932003, 102049151, 404226357]);
assert('ケープタウン 南アフリカ', [101928027]);
assert('경기도 광명시', [890472589]);
assert('서울 마포구', [890473201]);
assert('부산광역시 부산진구', [890475779]);
assert('전라북도 전주시 완산구', [890476473]);

assert('london on', [ 101735809 ]);
assert('paris, tx', [ 101725293 ]);
Expand Down
24 changes: 24 additions & 0 deletions test/lib/analysis.js
Expand Up @@ -60,11 +60,35 @@ module.exports.normalize = function(test, common) {

assert( 'City of the Sun', [ 'city of the sun' ] );
assert( 'City of Sun', [ 'city of sun', 'sun' ] );

// remove 'disambiguation' tokens from name suffix
// see: https://github.com/whosonfirst-data/whosonfirst-data/issues/885
assert( 'St Kilda (Vic.)', [ 'st kilda' ] );
assert( 'Spring Mountain (Qld)', [ 'spring mountain' ] );
assert( 'Mónaco - Monaco', [ 'monaco' ] );
assert( 'Monako (peyi)', [ 'monako' ] );
assert( 'Monako [peyi]', [ 'monako' ] );
assert( 'Port Phillip (C)', [ 'port phillip' ] );
assert( 'Portland (Oregon)', [ 'portland' ] );
assert( 'Sutherland Shire (A)', [ 'sutherland shire' ] );
assert( 'Cocos- [Keeling] eilande', [ 'cocos' ] );

// remove tokens that *only* contain numbers
assert( '1', [] );
assert( '22', [] );
assert( '333', [] );
assert( '22nd', ['22nd'] );
assert( 'a12', ['a12'] );
assert( '-1', [] ); // special case: handle '-1' values
};

module.exports.tokenize = function(test, common) {
var assert = runner.bind(null, test, 'tokenize');

// invalid type
assert( [], [] );
assert( {}, [] );

// delimiters
assert( 'Foo Bar', [[ 'foo', 'bar' ]] );
assert( 'Foo,,Bar', [[ 'foo', 'bar' ]] );
Expand Down
2 changes: 1 addition & 1 deletion test/prototype/query.js
Expand Up @@ -11,7 +11,7 @@ module.exports.query = function(test, util) {

assert([['kelburn', 'wellington', 'new zealand']], [85772991]);
assert([['north sydney']], [85771181, 85784821, 101931469, 102048877, 404225393]);
assert([['sydney', 'new south wales', 'australia']], [101932003, 404226357]);
assert([['sydney', 'new south wales', 'australia']], [101932003, 102049151, 404226357]);
assert([['ケープタウン', '南アフリカ']], [101928027]);
};

Expand Down
12 changes: 11 additions & 1 deletion test/prototype/tokenize.js
Expand Up @@ -16,14 +16,24 @@ module.exports.tokenize = function(test, util) {
// duplicates
assert('lancaster lancaster pa', [['lancaster', 'lancaster', 'pa']]);

// korean place names
assert('세종특별자치시', [['세종특별자치시']]);

// synonymous groupings
// see: https://github.com/pelias/placeholder/issues/28
assert('Le Cros-d’Utelle, France', [['le cros','d','utelle','france']]);
// note: the 'Le Cros-d’Utelle, France' example (as at 20-09-17) no longer dedupes
// to a single grouping due to the introduction of the token 'le' from 85685547
assert('Le Cros-d’Utelle, France', [['le','france'],['le cros','d','utelle','france']]);
assert('luxemburg luxemburg', [['luxemburg', 'luxemburg']]); // does not remove duplicate tokens

// ambiguous parses
// @note: these are the glorious future:

// assert('Adams North Brunswick', [
// [ 'adams north', 'brunswick' ],
// [ 'adams', 'north brunswick' ]
// ]);
//
// assert('Heritage East San Jose', [
// [ 'heritage east', 'san jose' ],
// [ 'heritage', 'east san jose' ]
Expand Down
28 changes: 28 additions & 0 deletions test/prototype/wof.js
Expand Up @@ -583,6 +583,10 @@ module.exports.isValidWofRecord = function(test, util) {
t.true( wof.isValidWofRecord( 1, params({ 'mz:is_current': 1 }) ) );
t.true( wof.isValidWofRecord( 1, params({ 'mz:is_current': '1' }) ) );
t.true( wof.isValidWofRecord( 1, params({ 'mz:is_current': '' }) ) );

// we are considering -1 values as current (for now)
t.true( wof.isValidWofRecord( 1, params({ 'mz:is_current': -1 }) ) );
t.true( wof.isValidWofRecord( 1, params({ 'mz:is_current': '-1' }) ) );
t.end();
});

Expand Down Expand Up @@ -865,6 +869,30 @@ module.exports.set_edges = function(test, util) {
});
});

test( 'from parent_id', function(t) {
// the record sets wof:id 100 and wof:parent_id 200; exactly one
// setEdge call is expected, with arguments (200, 100)
var instance = new Mock();
var record = params({
'wof:id': 100,
'wof:parent_id': 200
});
instance.insertWofRecord( record, function(){
var edgeCalls = instance._calls.setEdge;
t.equal( edgeCalls.length, 1 );
t.equal( edgeCalls[0][0], 200 );
t.equal( edgeCalls[0][1], 100 );
t.end();
});
});

test( 'from parent_id - same value', function(t) {
// wof:parent_id equals the record's own wof:id here, so no setEdge
// calls are expected to have been recorded
var instance = new Mock();
var record = params({
'wof:id': 100,
'wof:parent_id': 100
});
instance.insertWofRecord( record, function(){
var edgeCalls = instance._calls.setEdge;
t.deepEqual( edgeCalls, [] );
t.end();
});
});

test( 'hierarchy: single lineage', function(t) {
var mock = new Mock();
mock.insertWofRecord(params({
Expand Down