In [None]:
import pandas as pd
import numpy as np 

import spacy 
import en_core_web_sm 

from nltk.parse import CoreNLPParser

# Directory (relative to the notebook's working directory) that input files are read from.
path_to_data = './data/'

### Load spaCy

In [None]:
# Ask spaCy to use a GPU if one is available. Keep this call ahead of the
# model load below: the spaCy docs say it should run before any pipeline is loaded.
spacy.prefer_gpu()
# Load the pipeline shipped in the installed `en_core_web_sm` package.
nlp = en_core_web_sm.load()

### Load US city names

In [None]:
# Read the city table, then coerce every 'City alias' cell to its string form
# (a non-string cell, e.g. a missing value parsed as NaN, becomes text such as 'nan')
# so the NLP calls further down always receive a str.
cities = pd.read_csv(f'{path_to_data}us_cities_states_counties.csv')
cities['City alias'] = cities['City alias'].map(str)

### Test how many cities spaCy knows

In [None]:
%%timeit -n1 -r1 
# GPE = Countries, cities, states.
count = 0
passed = 0
for i, city in enumerate(cities['City alias'].values):
    try:
        doc = nlp(city)
        for X in doc.ents: 
            if X.label_=='GPE': 
                count+=1
    except:
        passed +=1
        pass
    if i% 5000 == 0: print (i, count, passed)
print(f'Spacy knows {count} out of {cities.shape[0]}')
print('couldnt process:', passed)

# Result: spacy knows 7946 out of 63211 = 12.5% 

### Load Stanford NER Tagger

### Test how many cities Stanford NER knows

##### Fast version: https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK

In [None]:
# Instructions:
# 1. Run getCoreNLP.sh.
# 2. Run runCoreNLP.sh.
# 3. Now you can use the fast-version API.

In [None]:
# Address of the CoreNLP server; defined once so both clients always point
# at the same endpoint.
CORENLP_URL = 'http://localhost:9000'

# Client configured with tagtype='ner'; used below to tag token lists.
ner_tagger = CoreNLPParser(url=CORENLP_URL, tagtype='ner')
# Client with default settings; used below to tokenize the raw city strings.
parser = CoreNLPParser(url=CORENLP_URL)

In [None]:
def formatted_entities(classified_paragraphs_list):
    """Return the values of all entries labelled 'LOCATION'.

    Args:
        classified_paragraphs_list: iterable of paragraphs, each an iterable
            of indexable entries where index 0 is the entry's value and
            index 1 is its type label.

    Returns:
        A flat list of the values (index 0) of every entry whose type label
        (index 1) equals 'LOCATION', in encounter order.
    """
    return [
        tagged[0]
        for paragraph in classified_paragraphs_list
        for tagged in paragraph
        if tagged[1] == 'LOCATION'
    ]

In [None]:
%%timeit -n1 -r1 
count = 0
passed = 0
for i, city in enumerate(cities['City alias'].values):
    try:         
        city_ = parser.tokenize(city)     
        classified_paragraphs_list = ner_tagger.tag_sents([city_]) 
        formatted_result = formatted_entities(classified_paragraphs_list)  
        if len(formatted_result)>0:
            count+=1
    except Exception as e:  
        passed +=1
        print(i, city, 'error:', e)
        pass
    if i% 5000 == 0: 
        print (i, count, passed, city, 'result:', ' '.join(formatted_result)) 
print(f'Stanford knows {count} out of {cities.shape[0]}')
print('couldnt process:', passed)

# Result: Stanford Ner knows 15503 out of 63211 = 24.5%