In [1]:
import sys
import pandas
import pickle
import pysolr
import spacy
from tmdbMovies import indexableMovies
pandas.set_option('display.max_rows', 5000)

In [2]:
def extract_independent_locations(text,model):
    """Return the text of every token in ``text`` that is tagged as a GPE.

    Args:
        text: Raw string to analyse.
        model: Callable (e.g. a loaded spaCy pipeline) that turns ``text``
            into an iterable of tokens exposing ``ent_type_`` and ``text``.

    Returns:
        list[str]: Token texts, in document order, whose entity type is
        'GPE'. No dependency information is consulted.
    """
    # Fun fact: "GPE" stands for Geo-Political Entity.
    return [token.text for token in model(text) if token.ent_type_ == 'GPE']

In [3]:
def extract_dependent_locations(text,model):
    """Return GPE entity texts from ``text``, filtered by dependency role.

    Each named entity is first merged into a single token so that a
    multi-word location is reported once. A GPE token is then kept unless
    it is an ``attr``/``dobj`` whose head has no ``nsubj`` child on its
    left; every other GPE token (including prepositional objects) is kept.

    Args:
        text: Raw string to analyse.
        model: Callable (e.g. a loaded spaCy pipeline) returning a document
            that exposes ``ents`` and iterates over tokens with
            ``ent_type_``, ``dep_``, ``text`` and ``head``.

    Returns:
        list[str]: Texts of the retained GPE tokens, in document order.
    """
    #debug here:
    # https://explosion.ai/demos/displacy?text=Kevin%20McAllister%20in%20New%20York%20NY&model=en_core_web_lg&cpu=1&cph=1

    doc = model(text)

    # Merge every entity span into one token. Snapshot the spans first,
    # because merging changes the document's tokenization.
    spans = list(doc.ents)
    if hasattr(doc, "retokenize"):
        # Span.merge() was deprecated in spaCy 2.1 and removed in 3.0;
        # the retokenizer context manager is the supported replacement.
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
    else:
        # Older spaCy releases without Doc.retokenize().
        for span in spans:
            span.merge()

    relations = []
    for token in doc:
        if token.ent_type_ != 'GPE':
            continue
        if token.dep_ in ('attr', 'dobj'):
            # Attribute / direct-object locations only count when the
            # governing word has an explicit subject to its left.
            if not any(w.dep_ == 'nsubj' for w in token.head.lefts):
                continue
        relations.append(token.text)

    return relations

In [4]:
def enrich_dataset(df,extractor,model):
    """Run ``extractor`` over every row's text and store the joined output.

    Only the "text" column is read, so the frame does not need an
    "annotation" column for extraction to work.

    Args:
        df: pandas DataFrame with a "text" column. It is modified in place:
            a "result" column is added (or overwritten).
        extractor: Callable ``extractor(text, model)`` returning an iterable
            of strings.
        model: Opaque object passed through to ``extractor``.

    Returns:
        The same ``df`` object, with ``df['result']`` holding the extracted
        strings joined by single spaces ("" when nothing was extracted).
    """
    # " ".join of an empty sequence is already "", so no special case is needed.
    df['result'] = [
        " ".join(extractor(row["text"], model))
        for _, row in df.iterrows()
    ]
    return df

In [5]:
def tptnfpfn(df):
    """Label each row as tp / tn / fp / fn and tally the labels.

    A row's "annotation" (expected value) is compared with its "result"
    (extracted value); both are treated as "present" when non-empty:

    * both present and equal      -> 'tp'
    * both present but different  -> 'fp'
    * annotation only             -> 'fn'
    * result only                 -> 'fp'
    * neither                     -> 'tn'

    Args:
        df: pandas DataFrame with "annotation" and "result" columns. It is
            modified in place: a "type" column with the labels is added.

    Returns:
        tuple: ``(total, tp, tn, fp, fn, df)`` where ``total`` is the number
        of rows and ``df`` is the same frame that was passed in.
    """
    tally = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0}
    labels = []
    for _, record in df.iterrows():
        expected = record["annotation"]
        found = record["result"]
        expected_present = bool(len(expected))
        found_present = bool(len(found))

        if expected_present and found_present:
            # Something was extracted: correct only if it matches exactly.
            label = 'tp' if expected == found else 'fp'
        elif expected_present:
            # A location was expected but nothing was extracted.
            label = 'fn'
        elif found_present:
            # A location was extracted where none was expected.
            label = 'fp'
        else:
            # Nothing expected, nothing extracted.
            label = 'tn'

        tally[label] += 1
        labels.append(label)

    df['type'] = labels
    return len(labels),tally['tp'],tally['tn'],tally['fp'],tally['fn'],df

def precision(df):
    """Compute precision, tp / (tp + fp), for an enriched dataset.

    Args:
        df: DataFrame accepted by ``tptnfpfn`` ("annotation" and "result"
            columns).

    Returns:
        tuple: ``(P, tp, tn, fp, fn, df)``. ``P`` is 0.0 when there are no
        positive predictions (tp + fp == 0) instead of raising
        ZeroDivisionError.
    """
    total,tp,tn,fp,fn,df = tptnfpfn(df)
    predicted_positive = tp + fp
    # Precision is undefined with no positive predictions; report 0.0.
    P = tp / predicted_positive if predicted_positive else 0.0
    return P,tp,tn,fp,fn,df

def recall(df):
    """Compute recall, tp / (tp + fn), for an enriched dataset.

    Args:
        df: DataFrame accepted by ``tptnfpfn`` ("annotation" and "result"
            columns).

    Returns:
        tuple: ``(R, tp, tn, fp, fn, df)``. ``R`` is 0.0 when tp + fn == 0
        instead of raising ZeroDivisionError.
    """
    total,tp,tn,fp,fn,df = tptnfpfn(df)
    actual_positive = tp + fn
    # Recall is undefined when tp + fn is zero; report 0.0.
    R = tp / actual_positive if actual_positive else 0.0
    return R,tp,tn,fp,fn,df

def f1(df):
    """Compute F1 together with precision, recall and the raw counts.

    Args:
        df: DataFrame accepted by ``tptnfpfn`` ("annotation" and "result"
            columns).

    Returns:
        tuple: ``(F, P, R, tp, tn, fp, fn, df)``. Any of ``P``, ``R`` or
        ``F`` whose denominator is zero is reported as 0.0 rather than
        raising ZeroDivisionError (for example when there are no true
        positives, which makes P + R zero).
    """
    total,tp,tn,fp,fn,df = tptnfpfn(df)
    P = tp / (tp + fp) if (tp + fp) else 0.0
    R = tp / (tp + fn) if (tp + fn) else 0.0
    # With tp == 0 both P and R are 0, so the harmonic mean is taken as 0.
    F = 2 * ((P * R) / (P + R)) if (P + R) else 0.0
    return F,P,R,tp,tn,fp,fn,df

In [6]:
def test(filename,model_name,classifier):
    """Evaluate one extractor / spaCy model pair against an annotated CSV.

    Args:
        filename: Path of the CSV to load; missing values are replaced
            with empty strings before evaluation.
        model_name: Name handed to ``spacy.load``.
        classifier: Extraction function with signature ``(text, model)``;
            its ``__name__`` is recorded in the summary.

    Returns:
        tuple: ``(df, summary)`` where ``df`` is the frame returned by
        ``f1`` and ``summary`` is the list
        ``[classifier name, model_name, P, R, F, tp, tn, fp, fn]``.
    """
    nlp = spacy.load(model_name)
    frame = pandas.read_csv(filename).fillna('')
    frame = enrich_dataset(frame, classifier, nlp)
    F,P,R,tp,tn,fp,fn,scored = f1(frame)
    summary = [classifier.__name__, model_name, P, R, F, tp, tn, fp, fn]
    return scored, summary

In [7]:
# Empty results table: one row per evaluated (method, model) combination.
summaries = pandas.DataFrame(
    {column: [] for column in ("Method", "Model", "P", "R", "F1", "tp", "tn", "fp", "fn")}
)

In [8]:
# Evaluate both extractors against the same test file with three spaCy models.
# Each call returns the per-row frame plus a summary list, which is appended
# as a new row at the end of `summaries`.

# Small model ('en'), dependency-aware extractor
df1,summary1 = test('locations/test_title_locations.csv','en',extract_dependent_locations)
summaries.loc[len(summaries)]=summary1
# Medium model, dependency-aware extractor
df2,summary2 = test('locations/test_title_locations.csv','en_core_web_md',extract_dependent_locations)
summaries.loc[len(summaries)]=summary2
# Large model, dependency-aware extractor
df3,summary3 = test('locations/test_title_locations.csv','en_core_web_lg',extract_dependent_locations)
summaries.loc[len(summaries)]=summary3

# Small model ('en'), independent (entity-type only) extractor
df4,summary4 = test('locations/test_title_locations.csv','en',extract_independent_locations)
summaries.loc[len(summaries)]=summary4
# Medium model, independent (entity-type only) extractor
df5,summary5 = test('locations/test_title_locations.csv','en_core_web_md',extract_independent_locations)
summaries.loc[len(summaries)]=summary5
# Large model, independent (entity-type only) extractor
df6,summary6 = test('locations/test_title_locations.csv','en_core_web_lg',extract_independent_locations)
summaries.loc[len(summaries)]=summary6

In [9]:
# Show the evaluated configurations ordered from lowest to highest F1.
summaries.sort_values(by="F1")

Unnamed: 0,Method,Model,P,R,F1,tp,tn,fp,fn
0,extract_dependent_locations,en,0.381679,0.520833,0.440529,50.0,1822.0,81.0,46.0
3,extract_independent_locations,en,0.383459,0.53125,0.445415,51.0,1821.0,82.0,45.0
1,extract_dependent_locations,en_core_web_md,0.485714,0.536842,0.51,51.0,1850.0,54.0,44.0
4,extract_independent_locations,en_core_web_md,0.495327,0.557895,0.524752,53.0,1850.0,54.0,42.0
2,extract_dependent_locations,en_core_web_lg,0.652174,0.625,0.638298,60.0,1871.0,32.0,36.0
5,extract_independent_locations,en_core_web_lg,0.659574,0.645833,0.652632,62.0,1871.0,32.0,34.0


In [10]:
def count_annotations(filename):
    """Print how many rows of a CSV have a non-empty annotation.

    Args:
        filename: Path of a CSV file with an "annotation" column; missing
            values are treated as empty strings.

    Prints:
        Two numbers separated by a space: the count of rows whose
        annotation is non-empty, then the total number of rows.
    """
    frame = pandas.read_csv(filename).fillna('')
    annotated = sum(1 for _, record in frame.iterrows() if len(record["annotation"]))
    print(annotated,len(frame))

In [11]:
# Print the number of annotated rows and the total number of rows in the test file.
count_annotations("locations/test_title_locations.csv")

98 1999


In [13]:
# Per-row results for the en_core_web_lg model with extract_dependent_locations.
df3

Unnamed: 0,id,text,annotation,note,result,type
0,374430,Black Mirror: White Christmas,,,,tn
1,19404,The Brave-Hearted Will Take the Bride,,,,tn
2,278,The Shawshank Redemption,,,,tn
3,372058,Your Name.,,,,tn
4,238,The Godfather,,,,tn
5,360814,Dangal,,,,tn
6,244786,Whiplash,,,,tn
7,424,Schindler's List,,,,tn
8,129,Spirited Away,,,,tn
9,240,The Godfather: Part II,,,,tn
