# Imports

In [None]:
# ! pip install en_core_web_sm
# ! python -m spacy download en

In [3]:
import spacy
import re
import pandas as pd

from spacy import displacy
from spacy.attrs import LOWER 
from collections import Counter
from spacy.matcher import Matcher

# Load the English pipeline through the 'en' shortcut name.
# This single `nlp` object is shared by the extraction functions and the Matcher below.
nlp = spacy.load('en')

In [6]:
# Get dataframes from csv
#   df         -> transcripts used by the baseline NER extraction
#   df_context -> transcripts used by the road-name Matcher extraction
#   df_roads   -> road names; its 'road_name' column supplies one Matcher rule per road
# NOTE(review): the saved output of this cell is a FileNotFoundError for
# './datasets/roads.csv' - confirm the file is present before a Restart & Run All.
df = pd.read_csv('./datasets/transcripts.csv')
df_context = pd.read_csv('./datasets/almost_complete_context.csv')
df_roads = pd.read_csv('./datasets/roads.csv')
list_of_roads = list(df_roads['road_name'])

FileNotFoundError: [Errno 2] File b'./datasets/roads.csv' does not exist: b'./datasets/roads.csv'

In [17]:
# Check first five rows
# NOTE(review): the saved output already shows a 'location_extraction' column and a
# non-contiguous index, so this cell was probably last run after the Matcher section
# (or the CSV already contains that column) - confirm with Restart & Run All.
df_context.head()

Unnamed: 0.1,Unnamed: 0,transcripts,confidence,location_extraction
0,0,Stetson,0.542913,Stetson Court
4,4,Pratt for theis wonderful,0.744394,"Pratt Avenue , Theis Lane"
8,8,Airport,0.358415,Airport Road
9,9,Glory,0.447006,Glory Road
11,11,royal succession available sisters respond there,0.813985,Royal Court


# Named Entity Recognition

## Baseline

[spaCy](https://spacy.io/) finds entities in a document by tokenizing the text and assigning each token a tag. It then looks for patterns to identify entities and classifies them with labels. Here we use spaCy to extract the entities whose labels relate to locations (GPE and FAC) from our transcripts. GPE stands for "geo-political entity" (countries, cities, states) and FAC for "facility" (buildings, airports, highways, bridges, etc.).

In [20]:
# function to extract locations using spaCy pre trained labels
def location_extraction(string_in):
    """Return the location-like entities spaCy finds in a transcript.

    The text is run through the shared `nlp` pipeline and the text of every
    entity labelled 'FAC' or 'GPE' is collected in document order.

    Parameters
    ----------
    string_in : str
        Transcript text to analyse.

    Returns
    -------
    list of str or None
        The matching entity texts, or None when no such entity was found.
    """
    wanted_labels = ('FAC', 'GPE')
    found = [ent.text for ent in nlp(string_in).ents if ent.label_ in wanted_labels]
    # An empty result is reported as None rather than as an empty list
    return found if found else None

# Add a column with the extracted locations
# (location_extraction returns a list of entity strings, or None when nothing was found)
df['location_extraction'] = df['transcripts'].map(location_extraction)

##  spaCy Matcher

Here we use spaCy's Matcher to define our own set of rules to look for in the text. Each rule is made of one or more patterns; a pattern is a sequence of token specifications, each combining a word, a condition the token must satisfy, and an optional operator that determines how many times the token may occur.
Here we are looking for entities that correspond to a road name in Butte County. Since we already have a complete list of road names, we set one rule per road: every word of the name has to match exactly once, except when the name ends with a generic word like "Street" or "Road", in which case that last word may match zero or more times.

In [7]:
# Building the Matcher entity

# Instantiate the rule-based Matcher on the vocabulary of the loaded `nlp` pipeline
matcher = Matcher(nlp.vocab)

# specifies what spacy does when it finds a match in the document. Here we just want to return the matches
def on_match(matcher, doc, id, matches):
    """Callback attached to every road rule; hands `matches` back unchanged.

    The extraction code reads its matches from calling the matcher on a
    document directly, so no extra work is done here.

    Parameters
    ----------
    matcher, doc, id :
        Supplied by the caller of the callback; not used.
    matches :
        The matches found so far.

    Returns
    -------
    The very same `matches` object that was passed in.
    """
    return matches

# building patterns for every road name, the condition being that the lowercase form of each token in the doc
# should match the lowercase version of the road name, so that capitalization wouldn't affect the model
def build_pattern(road_name):
    """Build the spaCy Matcher token pattern for one road name.

    Every word of the name becomes a case-insensitive ``{'LOWER': word}``
    token specification. When the last word is a generic road type (lane,
    road, street, ...), that final token also carries ``'op': '*'`` so the
    rule still matches transcripts that leave the word out.

    The name is split on any run of whitespace, so leading, trailing or
    doubled spaces in the road list do not produce empty-string tokens, which
    could never match and would silently disable the rule.

    Parameters
    ----------
    road_name : str
        Road name as it appears in the road list, e.g. 'Pratt Avenue'.

    Returns
    -------
    list of dict
        One token specification per word, in order. A blank name yields
        ``[{'LOWER': ''}]``, a pattern that never matches.
    """
    # general words that appear a lot in the list.
    # The reason why we do this is to still get a match if they are not present
    roads_general = ('lane', 'road',
                     'court', 'drive',
                     'avenue', 'way',
                     'street', 'circle',
                     'place', 'highway', 'trail')
    # split() without an argument collapses whitespace; the `or ['']` fallback
    # keeps the behaviour for blank names (a single, never-matching token).
    list_words = [word.lower() for word in road_name.split()] or ['']
    *leading_words, last_word = list_words

    pattern = [{'LOWER': word} for word in leading_words]
    if last_word in roads_general:
        # NOTE(review): for a name made only of a generic word the whole
        # pattern is optional - confirm how the Matcher treats that case.
        pattern.append({'op': '*', 'LOWER': last_word})
    else:
        pattern.append({'LOWER': last_word})
    return pattern

# Get a pattern of every road
# The road name itself is used as the rule key, so a match id can later be mapped back to its pattern
for road in list_of_roads:
    matcher.add(road, on_match, build_pattern(road))
    
# This function takes a string as input and returns it with every word capitalized
def capitalize_string(string_in):
    """Return `string_in` with every space-separated word capitalized.

    Words are delimited by single spaces. ``str.capitalize`` is applied to
    each one (first character upper-cased, the rest lower-cased) and the
    spacing of the input is preserved.

    Parameters
    ----------
    string_in : str
        Text to capitalize.

    Returns
    -------
    str
        The capitalized text.
    """
    return ' '.join(piece.capitalize() for piece in string_in.split(' '))
    
# Look for locations in the transcript, then extract them
def location_extraction_context(string_in):
    """Extract known road names from a transcript using the road Matcher.

    The transcript is run through the shared `nlp` pipeline and the
    module-level `matcher`. Matches that end on the same token are reduced to
    one per end position, and the surviving rules are rendered as a single
    capitalized string.

    Parameters
    ----------
    string_in : str
        Transcript text to search.

    Returns
    -------
    str or None
        The matched road names, every word capitalized, separated by ' , '
        (e.g. 'Pratt Avenue , Theis Lane'); None when no rule matched.
    """
    # Each match is a (match_id, start, end) tuple
    matches = matcher(nlp(string_in))
    if not matches:
        return None

    # Some road names have words in common, so one location can trigger several
    # rules. Among matches that end on the same token we keep a single one: the
    # match that starts earliest (the longest). When two of them cover exactly
    # the same span, the one later in the list is kept.
    # Matches that only overlap, or only share a start token, are left alone.
    indices_to_drop = set()
    for a in range(len(matches)):
        for b in range(a + 1, len(matches)):
            if matches[a][2] == matches[b][2]:
                if matches[a][1] < matches[b][1]:
                    indices_to_drop.add(b)
                else:
                    indices_to_drop.add(a)
    matches_final = [match for index, match in enumerate(matches)
                     if index not in indices_to_drop]

    # The names are rebuilt from the rule patterns stored in the Matcher (looked
    # up by match id) and not from the transcript text, so that every extracted
    # location follows the same format.
    road_names = []
    for match in matches_final:
        first_pattern = matcher.get(match[0])[1][0]
        road_names.append(' '.join(token['LOWER'] for token in first_pattern))
    return capitalize_string(' , '.join(road_names))

# Add a column with the extracted locations
df_context['location_extraction'] = df_context['transcripts'].map(location_extraction_context)

# Since we don't care about transcripts where we didn't find any locations we drop all NAs
# NOTE(review): dropna() without `subset` removes a row when ANY column is missing, not only
# 'location_extraction' - confirm that this is intended.
df_context.dropna(inplace=True)

In [16]:
# Save them as csv
# NOTE(review): to_csv writes the index as an unnamed first column by default, which
# presumably explains the 'Unnamed: 0' columns visible in the df_context preview -
# confirm whether index=False is wanted here.
df.to_csv('./datasets/mitch.csv')
df_context.to_csv('./datasets/location_context.csv')