# Part 4: Matching Violations to Google Business Address Information

In [1]:
# import statements
import pandas as pd
import json
import glob
import re
import time
import operator
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

In [2]:
# view settings
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Test code for reading in the json files

In [3]:
# # # Test to open a file
# # with open('../data/google_places_results/results_01.json') as fi:
# #     result = json.load(fi)

# # # Alternate approach to reading in the json file
# google_results = pd.read_json(r'../data/google_places_results/results_01.json')
# google_results.head()

In [4]:
# # Create an empty dataframe with the columns we want, including all of the data from inside of, and outside of, the 'results' blob
# column_names = ['mapped_location', 'address', 'name', 'types', 'address', 'lat', 'long']
# result_addresses = pd.DataFrame(columns = column_names)

# # Go through and open each json results file
# for file in list(glob.glob('../data/google_places_results/*.json')):
#     with open(file) as fi:
#         result = json.load(fi)
# # Write the contents of the 'results' field to a dataframe       
#         google_results = pd.json_normalize(result)
# # Clean up the dataframe columns
#         google_results = google_results.drop(['results'], axis = 1)
#         google_results.columns = ['mapped_location', 'address']
# # Append the contents of each json file to the results dataframe
#         result_addresses = result_addresses.append(google_results)      

### Read in the json files in two parts

One dataframe, `result_addresses`, should only include the `mapped location` and `address` fields.  
The second dataframe, `results`, should only include the fields we want from the `results` field.  
To get only one business name per address, merge the `result_addresses` and `results` fields on the `address` field and run `pd.drop_duplicates()` to get the first business name at the same address.

In [5]:
# Create an empty dataframe with the columns we want
column_names = ['mapped_location', 'address']
result_addresses = pd.DataFrame(columns = column_names)

# Go through and open each json results file
for file in list(glob.glob('../data/google_places_results/*.json')):
    with open(file) as fi:
        result = json.load(fi)
# Write the contents of the 'results' field to a dataframe       
        google_results = pd.json_normalize(result)
# Clean up the dataframe columns
        google_results = google_results.drop(['results'], axis = 1)
        google_results.columns = ['mapped_location', 'address']
# Append the contents of each json file to the results dataframe
        result_addresses = result_addresses.append(google_results)      

In [6]:
result_addresses.head()

Unnamed: 0,mapped_location,address
0,"(36.0685457, -86.6838975)","2045 Antioch Pike, Antioch"
1,"(36.211269429469425, -86.6943024067459)","2800 Opryland Dr, Nashville"
2,"(36.1235, -86.79013)","2607 12th Ave S, Nashville"
3,"(36.1941628, -86.8313936)","3200 Clarksville Pike, Nashville"
4,"(36.15843971160494, -86.7880380629013)","20 Grundy St, Nashville"


In [7]:
# Check to see how many addresses were passed through to the Google API
result_addresses.shape

(4829, 2)

In [8]:
# Check how many unique values there are
print(result_addresses.mapped_location.nunique())
print(result_addresses.address.nunique())
# It looks like some mapped locations have different addresses

4766
4829


In [9]:
# Reset the index so that there is a UID for each address/point - might be useful for deduping later
result_addresses = result_addresses.reset_index()
result_addresses.head(50)

Unnamed: 0,index,mapped_location,address
0,0,"(36.0685457, -86.6838975)","2045 Antioch Pike, Antioch"
1,1,"(36.211269429469425, -86.6943024067459)","2800 Opryland Dr, Nashville"
2,2,"(36.1235, -86.79013)","2607 12th Ave S, Nashville"
3,3,"(36.1941628, -86.8313936)","3200 Clarksville Pike, Nashville"
4,4,"(36.15843971160494, -86.7880380629013)","20 Grundy St, Nashville"
5,5,"(36.175009161611996, -86.75854459325407)","800 Main St, Nashville"
6,6,"(36.16502, -86.78968)","130 LifeWay Plaza, Nashville"
7,7,"(36.2328375, -86.75830260000001)","3131 Dickerson Rd, Nashville"
8,8,"(36.1105402, -86.7564862)","651 Thompson Ln, Nashville"
9,9,"(36.16497541501497, -86.78485420907694)","401 7th Ave N, Nashville"


In [10]:
# Check the column names
result_addresses.columns

Index(['index', 'mapped_location', 'address'], dtype='object')

In [11]:
# Create an empty dataframe with the columns we want to get the fields we need out of the results blob
column_names = ['name', 'types', 'address', 'lat', 'long']
results = pd.DataFrame(columns = column_names)

# Go through and open each json results file
for file in list(glob.glob('../data/google_places_results/*.json')):
    with open(file) as fi:
        result = json.load(fi)
# Write the contents of the 'results' field to a dataframe       
        google_results = pd.json_normalize(result, 'results')
# Clean up the dataframe columns
        google_results = google_results.drop(['business_status', 'icon', 'place_id', 'rating', 'reference', 'scope', 'user_ratings_total', 'geometry.viewport.northeast.lat', 'geometry.viewport.northeast.lng', 'geometry.viewport.southwest.lat', 'geometry.viewport.southwest.lng', 'opening_hours.open_now', 'plus_code.compound_code', 'plus_code.global_code', 'photos', 'price_level', 'permanently_closed'], axis = 1)
        google_results.columns = ['name', 'types', 'address', 'lat', 'long']
# Append the contents of each json file to the results dataframe
        results = results.append(google_results)      

In [12]:
# Take a look at the dataframe we've created
results.head()

Unnamed: 0,name,types,address,lat,long
0,Los Paisanos,"[night_club, bar, point_of_interest, establish...","2045 Antioch Pike, Antioch",36.068546,-86.683898
1,Sun Hair Salon,"[beauty_salon, hair_care, point_of_interest, e...","2049 Antioch Pike, Antioch",36.068475,-86.683967
2,Solo Style LLC,"[clothing_store, point_of_interest, store, est...","2037 Antioch Pike, Antioch",36.068658,-86.683813
3,Hai Woon Dai,"[restaurant, food, point_of_interest, establis...","2051 Antioch Pike, Antioch",36.068437,-86.683993
4,"Dr. Babajide A. Bamigboye, MD","[doctor, health, point_of_interest, establishm...","2031 Antioch Pike, Antioch",36.06872,-86.684231


In [13]:
result_addresses['address'] = result_addresses['address'].str.replace(r',.+', '')
result_addresses['address'] = result_addresses['address'].fillna('')
result_addresses['address'] = result_addresses['address'].str.replace('nan', '')
results['address'] = results['address'].str.replace(r',.+', '')
results['address'] = results['address'].fillna('')
results['address'] = results['address'].str.replace('nan', '')

In [14]:
results['address'].unique()

array(['2045 Antioch Pike', '2049 Antioch Pike', '2037 Antioch Pike', ...,
       '221 31st Avenue North', '3108 Long Boulevard',
       '3008 Poston Avenue'], dtype=object)

In [30]:
results[results['address'].isna() == True]

Unnamed: 0,name,types,address,lat,long


In [16]:
# Join the queried addresses to the Google API results. Keep all of the queried addresses
result_addresses_joined_01 = result_addresses.merge(results, how = 'left', on = 'address')
result_addresses_joined_01

Unnamed: 0,index,mapped_location,address,name,types,lat,long
0,0,"(36.0685457, -86.6838975)",2045 Antioch Pike,Los Paisanos,"[night_club, bar, point_of_interest, establish...",36.068546,-86.683898
1,0,"(36.0685457, -86.6838975)",2045 Antioch Pike,Los Paisanos,"[night_club, bar, point_of_interest, establish...",36.068546,-86.683898
2,1,"(36.211269429469425, -86.6943024067459)",2800 Opryland Dr,Best Buy Express Kiosk,"[point_of_interest, establishment]",36.211414,-86.694149
3,2,"(36.1235, -86.79013)",2607 12th Ave S,,,,
4,3,"(36.1941628, -86.8313936)",3200 Clarksville Pike,,,,
...,...,...,...,...,...,...,...
7942,27,"(36.1458305, -86.8167581)",221 31st Ave N,,,,
7943,28,"(36.08778474206125, -86.72961779427743)",3806 Nolensville Pike,Bridal & Formal by RJS,"[point_of_interest, clothing_store, store, est...",36.087755,-86.729657
7944,28,"(36.08778474206125, -86.72961779427743)",3806 Nolensville Pike,Quinceanera Dress Store,"[point_of_interest, clothing_store, store, est...",36.088002,-86.729556
7945,28,"(36.08778474206125, -86.72961779427743)",3806 Nolensville Pike,Bridal & Formal by RJS,"[point_of_interest, clothing_store, store, est...",36.087755,-86.729657


In [17]:
# Check the columns that were produced
result_addresses_joined_01.columns

Index(['index', 'mapped_location', 'address', 'name', 'types', 'lat', 'long'], dtype='object')

In [18]:
# Check how many business names were matched based on address
result_addresses_joined_01[result_addresses_joined_01['name'].isna() == False]

Unnamed: 0,index,mapped_location,address,name,types,lat,long
0,0,"(36.0685457, -86.6838975)",2045 Antioch Pike,Los Paisanos,"[night_club, bar, point_of_interest, establish...",36.068546,-86.683898
1,0,"(36.0685457, -86.6838975)",2045 Antioch Pike,Los Paisanos,"[night_club, bar, point_of_interest, establish...",36.068546,-86.683898
2,1,"(36.211269429469425, -86.6943024067459)",2800 Opryland Dr,Best Buy Express Kiosk,"[point_of_interest, establishment]",36.211414,-86.694149
7,6,"(36.16502, -86.78968)",130 LifeWay Plaza,Frankie Pierce Park,"[park, point_of_interest, establishment]",36.165024,-86.789677
11,10,"(36.160849661605994, -86.77805060674591)",418 Broadway,Layla's Honky Tonk,"[bar, point_of_interest, establishment]",36.160833,-86.778056
...,...,...,...,...,...,...,...
7939,24,"(36.25852, -86.7583)",3638 Dickerson Pike,Bellshire Wellness Center,"[health, point_of_interest, establishment]",36.258571,-86.758322
7943,28,"(36.08778474206125, -86.72961779427743)",3806 Nolensville Pike,Bridal & Formal by RJS,"[point_of_interest, clothing_store, store, est...",36.087755,-86.729657
7944,28,"(36.08778474206125, -86.72961779427743)",3806 Nolensville Pike,Quinceanera Dress Store,"[point_of_interest, clothing_store, store, est...",36.088002,-86.729556
7945,28,"(36.08778474206125, -86.72961779427743)",3806 Nolensville Pike,Bridal & Formal by RJS,"[point_of_interest, clothing_store, store, est...",36.087755,-86.729657


In [19]:
result_addresses_joined_01 = result_addresses_joined_01.drop_duplicates(subset='address', keep='first')
result_addresses_joined_01

Unnamed: 0,index,mapped_location,address,name,types,lat,long
0,0,"(36.0685457, -86.6838975)",2045 Antioch Pike,Los Paisanos,"[night_club, bar, point_of_interest, establish...",36.068546,-86.683898
2,1,"(36.211269429469425, -86.6943024067459)",2800 Opryland Dr,Best Buy Express Kiosk,"[point_of_interest, establishment]",36.211414,-86.694149
3,2,"(36.1235, -86.79013)",2607 12th Ave S,,,,
4,3,"(36.1941628, -86.8313936)",3200 Clarksville Pike,,,,
5,4,"(36.15843971160494, -86.7880380629013)",20 Grundy St,,,,
...,...,...,...,...,...,...,...
7937,24,"(36.25852, -86.7583)",3638 Dickerson Pike,Mid-Tenn Sheet Metal,"[general_contractor, point_of_interest, establ...",36.258571,-86.758322
7940,25,"(36.153299711602756, -86.7960281)",1722 West End Ave,,,,
7941,26,"(36.18837, -86.7676)",1001 Meridian St,,,,
7942,27,"(36.1458305, -86.8167581)",221 31st Ave N,,,,


### We need to do some data cleanup

If we do a literal string match with light data cleaning (removing city info), we only get matches for 18% (861) of the original 4,698 addresses identified in the hubNashville 311 violations database.  

Let's try [fuzzy string matching](https://medium.com/tim-black/fuzzy-string-matching-at-scale-41ae6ac452c2) instead.

In [20]:
# A class for matching one list of strings to another
class StringMatch():
    
    def __init__(self, source_names, target_names):
        self.source_names = source_names
        self.target_names = target_names
        self.ct_vect      = None
        self.tfidf_vect   = None
        self.vocab        = None
        self.sprse_mtx    = None
        
        
    def tokenize(self, analyzer='char_wb', n=3):
        '''
        Tokenizes the list of strings, based on the selected analyzer
        :param str analyzer: Type of analyzer ('char_wb', 'word'). Default is trigram
        :param str n: If using n-gram analyzer, the gram length
        '''
        # Create initial count vectorizer & fit it on both lists to get vocab
        self.ct_vect = CountVectorizer(analyzer=analyzer, ngram_range=(n, n))
        self.vocab   = self.ct_vect.fit(self.source_names + self.target_names).vocabulary_
        
        # Create tf-idf vectorizer
        self.tfidf_vect  = TfidfVectorizer(vocabulary=self.vocab, analyzer=analyzer, ngram_range=(n, n))
        
        
    def match(self, ntop=1, lower_bound=0, output_fmt='df'):
        '''
        Main match function. Default settings return only the top candidate for every source string.
        
        :param int ntop: The number of top-n candidates that should be returned
        :param float lower_bound: The lower-bound threshold for keeping a candidate, between 0-1.
                                   Default set to 0, so consider all canidates
        :param str output_fmt: The output format. Either dataframe ('df') or dict ('dict')
        '''
        self._awesome_cossim_top(ntop, lower_bound)
        
        if output_fmt == 'df':
            match_output = self._make_matchdf()
        elif output_fmt == 'dict':
            match_output = self._make_matchdict()
            
        return match_output
        
        
    def _awesome_cossim_top(self, ntop, lower_bound):
        ''' https://gist.github.com/ymwdalex/5c363ddc1af447a9ff0b58ba14828fd6#file-awesome_sparse_dot_top-py '''
        # To CSR Matrix, if needed
        A = self.tfidf_vect.fit_transform(self.source_names).tocsr()
        B = self.tfidf_vect.fit_transform(self.target_names).transpose().tocsr()
        M, _ = A.shape
        _, N = B.shape

        idx_dtype = np.int32

        nnz_max = M * ntop

        indptr = np.zeros(M+1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)

        self.sprse_mtx = csr_matrix((data,indices,indptr), shape=(M,N))
    
    
    def _make_matchdf(self):
        ''' Build dataframe for result return '''
        # CSR matrix -> COO matrix
        cx = self.sprse_mtx.tocoo()

        # COO matrix to list of tuples
        match_list = []
        for row,col,val in zip(cx.row, cx.col, cx.data):
            match_list.append((row, self.source_names[row], col, self.target_names[col], val))

        # List of tuples to dataframe
        colnames = ['Row Idx', 'Title', 'Candidate Idx', 'Candidate Title', 'Score']
        match_df = pd.DataFrame(match_list, columns=colnames)

        return match_df

    
    def _make_matchdict(self):
        ''' Build dictionary for result return '''
        # CSR matrix -> COO matrix
        cx = self.sprse_mtx.tocoo()

        # dict value should be tuple of values
        match_dict = {}
        for row,col,val in zip(cx.row, cx.col, cx.data):
            if match_dict.get(row):
                match_dict[row].append((col,val))
            else:
                match_dict[row] = [(col, val)]

In [26]:
# Application of StringMatch class, using default args
source_titles = result_addresses['address'].apply(str)
target_titles = results['address'].apply(str)
titlematch = StringMatch(source_titles, target_titles)
titlematch.tokenize()
match_df = titlematch.match()

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
all_addresses = result_addresses['address'].to_list()
all_addresses

In [None]:
# Read in the COVID-19 violation reports dataset
violations = pd.read_csv('../data/covid_violations.csv')
violations.head()

In [None]:
violations.info()

In [None]:
violations_addresses = violations['address'].to_list()
violations_addresses

In [None]:
violations_with_businesses = violations.merge(result_addresses, how = 'left', on = 'address')
violations_with_businesses

In [None]:
violations_with_businesses[violations_with_businesses['name'].isna() == False]

In [None]:
violations_with_businesses.address