In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from shapely.geometry import Point
import geopandas as gpd
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster
import math
import geopy.distance
from datetime import datetime
from tqdm.notebook import tqdm
import glob
import json
from fuzzywuzzy import fuzz, process

%matplotlib inline

## Read in Hub-Violations and Clusters DataFrames (geocodio versions)

In [104]:
hub_covid_violations_df = pd.read_csv('../data/hub_covid_violations.csv')

In [105]:
geocodio_df = pd.read_csv('../data/geocodio_input_df_geocodio_d5d27c584d4aa85e57bb523066377b276f89cac3.csv')

In [106]:
hub_geocodio = hub_covid_violations_df.join(geocodio, rsuffix='_geo')

In [109]:
hub_geocodio.columns

Index(['Request #', 'Status', 'Date / Time Opened', 'Date / Time Closed',
       'Contact Type', 'State Issue', 'Closed When Created', 'Address', 'City',
       'Council District', 'ZIP', 'Latitude', 'Longitude', 'Mapped Location',
       'Unnamed: 0', 'Address_geo', 'City_geo', 'ZIP_geo', 'state',
       'Latitude_geo', 'Longitude_geo', 'Accuracy Score', 'Accuracy Type',
       'Number', 'Street', 'Unit Type', 'Unit Number', 'City.1', 'State',
       'County', 'Zip', 'Country', 'Source'],
      dtype='object')

In [111]:
hub_geocodio.drop(['Latitude', 'Longitude'], axis=1)
hub_geocodio.Latitude_geo = hub_geocodio.Latitude
hub_geocodio.Longitude_geo = hub_geocodio.Longitude

In [151]:
hub_geocodio['Longitude'] = round(hub_geocodio['Longitude'], 4)
hub_geocodio['Latitude'] = round(hub_geocodio['Latitude'], 4)

In [145]:
hub_geocodio.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11621,11622,11623,11624,11625,11626,11627,11628,11629,11630
Request #,346122,339374,343513,442996,339423,407667,339307,339479,339684,343647,...,492846,492750,492949,492650,492644,492673,493369,493733,492999,493074
Status,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,...,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed
Date / Time Opened,2020-04-15 11:33:26,2020-04-04 18:13:59,2020-04-10 10:40:08,2020-08-12 02:05:29,2020-04-04 20:12:27,2020-07-04 19:59:51,2020-04-04 14:51:37,2020-04-04 23:54:18,2020-04-05 21:29:17,2020-04-10 14:19:57,...,2020-11-02 00:40:15,2020-11-01 17:42:34,2020-11-02 14:10:39,2020-10-31 23:43:06,2020-10-31 22:55:32,2020-11-01 01:52:56,2020-11-02 17:41:20,2020-11-02 20:43:33,2020-11-02 14:36:22,2020-11-02 15:15:54
Date / Time Closed,2020-04-16 14:55:01,2020-04-09 13:07:22,2020-04-14 19:39:09,2020-08-12 16:03:35,2020-04-09 14:21:09,2020-07-06 17:55:19,2020-04-08 14:07:35,2020-04-10 17:26:46,2020-04-13 13:57:02,2020-04-15 15:10:42,...,2020-11-02 15:38:20,2020-11-02 15:43:56,2020-11-02 16:15:36,2020-11-02 15:41:59,2020-11-02 15:41:42,2020-11-02 16:09:09,2020-11-03 15:46:04,2020-11-03 15:51:51,2020-11-02 16:15:55,2020-11-02 16:16:19
Contact Type,,,,,,,,,,,...,,,,,,,,,,
State Issue,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Closed When Created,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Address,928 6th Ave S,500 Gallatin Ave,5720 Crossings Blvd,7689 Hwy 70 S,5310 Mt View Rd,945 Allen Rd,1635 County Hospital Rd,1524 Gallatin Ave,301 14th Ave N,5824 Nolensville Pike,...,1011 McClurkan Ave,323 Opry Mills Dr,3636 Bell Rd,1919 Division St,2600 Franklin Pike,7102 Charlotte Pike,305 Manchester Ave,2801 Foster Ave,8080 TN-100,1400 Adams St
City,NASHVILLE,NASHVILLE,,NASHVILLE,ANTIOCH,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,...,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE
Council District,17,5,,22,32,15,2,5,19,27,...,5,15,13,19,17,35,6,16,35,19


In [114]:
clusters = pd.read_csv('../data/clusters_corrected-Sheet1-2_geocodio_d08379641999d429faecf949b19c4c6f08c2732c.csv')

In [152]:
clusters['Longitude'] = round(clusters['Longitude'], 4)
clusters['Latitude'] = round(clusters['Latitude'], 4)

In [116]:
clusters = clusters.drop([
    'Latitude_wrong', 'Longitude_wrong', 'Number', 'Street.1', 'Unit Type', 'Unit Number', 'City.1', 'State.1'
], axis=1)

In [146]:
clusters.head()

Unnamed: 0,Cluster Name,Street,City,State,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Accuracy Score,Accuracy Type,County,Zip,Country,Source
0,Vanderbilt Parties,,,TN,Social Gathering,3/11/20,49,35.859,-86.349,1.0,state,,,US,US Census Bureau
1,Event at Clementine Hall,4710 Charlotte Avenue,Nashville,TN,Social Gathering,3/14/20,23,36.152,-86.844,1.0,range_interpolation,Davidson County,37209.0,US,TIGER/Line® dataset from the US Census Bureau
2,Religious Retreat,,,TN,Social Gathering,3/25/20,18,35.859,-86.349,1.0,state,,,US,US Census Bureau
3,The Health Center at Richland Place,504 Elmington Avenue,Nashville,TN,LTCF,4/3/20,47,36.129,-86.818,1.0,range_interpolation,Davidson County,37205.0,US,TIGER/Line® dataset from the US Census Bureau
4,Trevecca Center for Rehab and Healing,329 Murfreesboro Pike,Nashville,TN,LTCF,4/4/20,102,36.145,-86.756,1.0,rooftop,Davidson County,37210.0,US,City of Nashville


In [118]:
clusters[clusters['Cluster Name'] == "Kid Rock's Big Ass Honky Tonk"]

Unnamed: 0,Cluster Name,Street,City,State,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Accuracy Score,Accuracy Type,County,Zip,Country,Source
29,Kid Rock's Big Ass Honky Tonk,221 Broadway,Nashville,TN,Bar,6/26/20,15,36.1614,-86.7758,1.0,rooftop,Davidson County,37201.0,US,City of Nashville


In [119]:
clusters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cluster Name        62 non-null     object 
 1   Street              51 non-null     object 
 2   City                51 non-null     object 
 3   State               56 non-null     object 
 4   Facility Type       62 non-null     object 
 5   Cluster Start Date  62 non-null     object 
 6   # Cases             62 non-null     int64  
 7   Latitude            62 non-null     float64
 8   Longitude           62 non-null     float64
 9   Accuracy Score      62 non-null     float64
 10  Accuracy Type       56 non-null     object 
 11  County              51 non-null     object 
 12  Zip                 51 non-null     float64
 13  Country             56 non-null     object 
 14  Source              56 non-null     object 
dtypes: float64(4), int64(1), object(10)
memory usage: 7.4+ KB


In [120]:
clusters['Facility Type'].value_counts().sort_values(ascending=False)

LTCF                     20
Congregate Living         8
Social Gathering          6
Correctional Facility     6
Bar                       6
Construction              5
Other                     3
Commercial-Warehouse      2
Restaurant                1
Office                    1
School                    1
College / University      1
Church                    1
Gym                       1
Name: Facility Type, dtype: int64

In [121]:
clusters['Cluster Name'].value_counts().sort_values(ascending=False)

CDM Jail                                            2
Lois DeBerry Special Needs Facility                 1
Men of Valor                                        1
Nashville Center for Rehab and Healing              1
Nashville Community Care & Rehabilitation Center    1
                                                   ..
Tyson Foods                                         1
One Stone Church Service                            1
Grand Hyatt Hotel (Power Design Inc) job site       1
Nashville Rescue Mission - Women's Campus           1
Broad West Construction                             1
Name: Cluster Name, Length: 61, dtype: int64

In [122]:
clusters[clusters['Street'].isna()]['Cluster Name'].to_list()

['Vanderbilt Parties',
 'Religious Retreat',
 'Apartments A',
 'Rolling Mill Hill job site',
 'Wedding at Farm (Out of County)',
 'CoreCivic',
 'Holiday Party',
 'University Sports Team B',
 'Middle TN Community Homes',
 'High/Middle-School Teen Party',
 'School Volleyball Team A']

In [123]:
clusters_locations = clusters[clusters['Latitude'].notna()]

In [124]:
clusters_by_type = pd.read_csv('../data/clusters_by_type.csv')

In [125]:
clusters_by_type.head()

Unnamed: 0,Cluster Type,Number of Clusters
0,Bar,7
1,Church,2
2,College / University,5
3,Commercial-Warehouse,12
4,Congregate Living,13


## Merge Clusters and Violations DataFrames
### Using coordinates - best match is 3, but currently rounding is deactivated.

In [154]:
clusters_violations = clusters[clusters['Latitude'].notna()].merge(
    hub_geocodio, 
    on=['Latitude', 'Longitude'], 
    how='inner',
    suffixes=['_clusters', '_violations']
)

In [157]:
clusters_violations[['Street_clusters', 'Number', 'Street_violations']]

Unnamed: 0,Street_clusters,Number,Street_violations
0,4710 Charlotte Avenue,4700,Charlotte Ave
1,4710 Charlotte Avenue,4710,Charlotte Ave
2,201 Cartwright Street,201,Cartwright St
3,201 Cartwright Street,201,Cartwright St
4,201 Cartwright Street,201,Cartwright St
...,...,...,...
298,207 Printers Alley,162,Printers Aly
299,1716 Rosa L Parks Blvd,1716,Rosa L Parks Blvd
300,1716 Rosa L Parks Blvd,1716,Rosa L Parks Blvd
301,1716 Rosa L Parks Blvd,1716,Rosa L Parks Blvd


In [134]:
clusters_violations['Cluster Name'].value_counts()

Kid Rock's Big Ass Honky Tonk                    67
Tootsie's                                        54
Winner's                                         47
Dawghouse Saloon                                 46
Dogwood                                          31
Miss Kelli's                                     27
Hermitage Hall                                    6
Grand Hyatt Hotel (Power Design Inc) job site     4
Nashville Rescue Mission - Women's Campus         3
Tyson Foods                                       3
Event at Clementine Hall                          2
Loser's                                           2
Life Care Center Old Hickory Village              2
One Stone Church Service                          2
Iron Tribe Belmont                                1
Creekside Center for Rehab and Healing            1
Nashville Center for Rehab and Healing            1
Green Hills Center for Rehab and Healing          1
Ahava / Grace Healthcare of White's Creek         1
Blakeford Gr

In [150]:
clusters_violations[clusters_violations['Cluster Name'] == "Kid Rock's Big Ass Honky Tonk"]['Street_y'].value_counts()

Broadway     55
3rd Ave S    11
Us Hwy 70     1
Name: Street_y, dtype: int64

In [158]:
hub_geocodio.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11621,11622,11623,11624,11625,11626,11627,11628,11629,11630
Request #,346122,339374,343513,442996,339423,407667,339307,339479,339684,343647,...,492846,492750,492949,492650,492644,492673,493369,493733,492999,493074
Status,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,...,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed,Closed
Date / Time Opened,2020-04-15 11:33:26,2020-04-04 18:13:59,2020-04-10 10:40:08,2020-08-12 02:05:29,2020-04-04 20:12:27,2020-07-04 19:59:51,2020-04-04 14:51:37,2020-04-04 23:54:18,2020-04-05 21:29:17,2020-04-10 14:19:57,...,2020-11-02 00:40:15,2020-11-01 17:42:34,2020-11-02 14:10:39,2020-10-31 23:43:06,2020-10-31 22:55:32,2020-11-01 01:52:56,2020-11-02 17:41:20,2020-11-02 20:43:33,2020-11-02 14:36:22,2020-11-02 15:15:54
Date / Time Closed,2020-04-16 14:55:01,2020-04-09 13:07:22,2020-04-14 19:39:09,2020-08-12 16:03:35,2020-04-09 14:21:09,2020-07-06 17:55:19,2020-04-08 14:07:35,2020-04-10 17:26:46,2020-04-13 13:57:02,2020-04-15 15:10:42,...,2020-11-02 15:38:20,2020-11-02 15:43:56,2020-11-02 16:15:36,2020-11-02 15:41:59,2020-11-02 15:41:42,2020-11-02 16:09:09,2020-11-03 15:46:04,2020-11-03 15:51:51,2020-11-02 16:15:55,2020-11-02 16:16:19
Contact Type,,,,,,,,,,,...,,,,,,,,,,
State Issue,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Closed When Created,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Address,928 6th Ave S,500 Gallatin Ave,5720 Crossings Blvd,7689 Hwy 70 S,5310 Mt View Rd,945 Allen Rd,1635 County Hospital Rd,1524 Gallatin Ave,301 14th Ave N,5824 Nolensville Pike,...,1011 McClurkan Ave,323 Opry Mills Dr,3636 Bell Rd,1919 Division St,2600 Franklin Pike,7102 Charlotte Pike,305 Manchester Ave,2801 Foster Ave,8080 TN-100,1400 Adams St
City,NASHVILLE,NASHVILLE,,NASHVILLE,ANTIOCH,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,...,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE,NASHVILLE
Council District,17,5,,22,32,15,2,5,19,27,...,5,15,13,19,17,35,6,16,35,19


In [159]:
hub_geocodio[hub_geocodio['Accuracy Score']>.9]

Unnamed: 0,Request #,Status,Date / Time Opened,Date / Time Closed,Contact Type,State Issue,Closed When Created,Address,City,Council District,...,Number,Street,Unit Type,Unit Number,City.1,State,County,Zip,Country,Source
0,346122,Closed,2020-04-15 11:33:26,2020-04-16 14:55:01,,False,False,928 6th Ave S,NASHVILLE,17.0,...,928,6th Ave S,,,Nashville,TN,Davidson County,37203,US,City of Nashville
1,339374,Closed,2020-04-04 18:13:59,2020-04-09 13:07:22,,False,False,500 Gallatin Ave,NASHVILLE,5.0,...,500,Gallatin Ave,,,Nashville,TN,Davidson County,37206,US,City of Nashville
2,343513,Closed,2020-04-10 10:40:08,2020-04-14 19:39:09,,False,False,5720 Crossings Blvd,,,...,5720,Crossings Blvd,,,Antioch,TN,Davidson County,37013,US,TIGER/Line® dataset from the US Census Bureau
3,442996,Closed,2020-08-12 02:05:29,2020-08-12 16:03:35,,False,False,7689 Hwy 70 S,NASHVILLE,22.0,...,7689,Hwy 70 S,,,Nashville,TN,Davidson County,37221,US,TIGER/Line® dataset from the US Census Bureau
5,407667,Closed,2020-07-04 19:59:51,2020-07-06 17:55:19,,False,False,945 Allen Rd,NASHVILLE,15.0,...,945,Allen Rd,,,Nashville,TN,Davidson County,37214,US,City of Nashville
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11626,492673,Closed,2020-11-01 01:52:56,2020-11-02 16:09:09,,False,False,7102 Charlotte Pike,NASHVILLE,35.0,...,7102,Charlotte Pike,,,Nashville,TN,Davidson County,37209,US,TIGER/Line® dataset from the US Census Bureau
11627,493369,Closed,2020-11-02 17:41:20,2020-11-03 15:46:04,,False,False,305 Manchester Ave,NASHVILLE,6.0,...,305,Manchester Ave,,,Nashville,TN,Davidson County,37206,US,City of Nashville
11628,493733,Closed,2020-11-02 20:43:33,2020-11-03 15:51:51,,False,False,2801 Foster Ave,NASHVILLE,16.0,...,2801,Foster Ave,,,Nashville,TN,Davidson County,37210,US,TIGER/Line® dataset from the US Census Bureau
11629,492999,Closed,2020-11-02 14:36:22,2020-11-02 16:15:55,,False,False,8080 TN-100,NASHVILLE,35.0,...,8080,Hwy 100,,,Nashville,TN,Davidson County,37221,US,TIGER/Line® dataset from the US Census Bureau


In [178]:
clusters_clean = clusters[clusters['Street'].notna()]

In [182]:
def fuzzy_merge(df_1, df_2, key1, key2, threshold=95, limit=2):
    """
    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df_2[key2].tolist()

    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df_1['matches'] = m

    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2

    return df_1

In [184]:
clusters_violations_fuzzy = fuzzy_merge(clusters_clean, hub_geocodio, 'Street', 'Address', limit=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['matches'] = m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['matches'] = m2


In [191]:
clusters_violations_fuzzy.tail()

Unnamed: 0,Cluster Name,Street,City,State,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Accuracy Score,Accuracy Type,County,Zip,Country,Source,matches
57,One Stone Church Service,1101 Stainback Ave,Nashville,TN,Church,9/22/20,10,36.19,-86.765,1.0,range_interpolation,Davidson County,37207.0,US,TIGER/Line® dataset from the US Census Bureau,1101 Stainback Ave
58,Miss Kelli's,207 Printers Alley,Nashville,TN,Bar,10/1/20,14,36.164,-86.778,1.0,range_interpolation,Davidson County,37201.0,US,TIGER/Line® dataset from the US Census Bureau,207 Printers Alley
59,Nashville Rescue Mission - Women's Campus,1716 Rosa L Parks Blvd,Nashville,TN,Congregate Living,10/1/20,70,36.182,-86.797,1.0,rooftop,Davidson County,37208.0,US,City of Nashville,1716 ROSA L PARKS BLVD
60,Link Systems Electric,444 McNally Dr,Nashville,TN,Other,10/16/20,12,36.093,-86.74,0.9,range_interpolation,Davidson County,37211.0,US,TIGER/Line® dataset from the US Census Bureau,444 McNally Dr
61,Iron Tribe Belmont,3201 Belmont Blvd,Nashville,TN,Gym,10/20/20,14,36.117,-86.798,1.0,rooftop,Davidson County,37212.0,US,City of Nashville,Belmont Blvd


In [190]:
clusters_violations_fuzzy[clusters_violations_fuzzy['matches'] != ''].shape

(28, 16)

In [167]:
clusters.columns

Index(['Cluster Name', 'Street', 'City', 'State', 'Facility Type',
       'Cluster Start Date', '# Cases', 'Latitude', 'Longitude',
       'Accuracy Score', 'Accuracy Type', 'County', 'Zip', 'Country',
       'Source'],
      dtype='object')

In [168]:
hub_geocodio.columns

Index(['Request #', 'Status', 'Date / Time Opened', 'Date / Time Closed',
       'Contact Type', 'State Issue', 'Closed When Created', 'Address', 'City',
       'Council District', 'ZIP', 'Latitude', 'Longitude', 'Mapped Location',
       'Unnamed: 0', 'Address_geo', 'City_geo', 'ZIP_geo', 'state',
       'Latitude_geo', 'Longitude_geo', 'Accuracy Score', 'Accuracy Type',
       'Number', 'Street', 'Unit Type', 'Unit Number', 'City.1', 'State',
       'County', 'Zip', 'Country', 'Source'],
      dtype='object')

## Merge Fuzzied Clusters to OG Violations DF

In [192]:
fuzzy_merge_df = clusters_violations_fuzzy.merge(hub_geocodio, left_on='matches', right_on='Address')

In [193]:
fuzzy_merge_df

Unnamed: 0,Cluster Name,Street_x,City_x,State_x,Facility Type,Cluster Start Date,# Cases,Latitude_x,Longitude_x,Accuracy Score_x,...,Number,Street_y,Unit Type,Unit Number,City.1,State_y,County_y,Zip_y,Country_y,Source_y
0,Trevecca Center for Rehab and Healing,329 Murfreesboro Pike,Nashville,TN,LTCF,4/4/20,102,36.145,-86.756,1.0,...,329,Murfreesboro Pike,,,Nashville,TN,Davidson County,37210,US,City of Nashville
1,Montgomery Bell Academy job site,4001 Harding Pike,Nashville,TN,Construction,5/8/20,75,36.093,-86.846,0.7,...,4001,Harding Pl,,,Nashville,TN,Davidson County,37215,US,TIGER/Line® dataset from the US Census Bureau
2,Montgomery Bell Academy job site,4001 Harding Pike,Nashville,TN,Construction,5/8/20,75,36.093,-86.846,0.7,...,4001,Harding Pl,,,Nashville,TN,Davidson County,37215,US,TIGER/Line® dataset from the US Census Bureau
3,Montgomery Bell Academy job site,4001 Harding Pike,Nashville,TN,Construction,5/8/20,75,36.093,-86.846,0.7,...,4001,Harding Pl,,,Nashville,TN,Davidson County,37215,US,TIGER/Line® dataset from the US Census Bureau
4,Montgomery Bell Academy job site,4001 Harding Pike,Nashville,TN,Construction,5/8/20,75,36.093,-86.846,0.7,...,4001,Harding Pl,,,Nashville,TN,Davidson County,37215,US,TIGER/Line® dataset from the US Census Bureau
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,Link Systems Electric,444 McNally Dr,Nashville,TN,Other,10/16/20,12,36.093,-86.740,0.9,...,444,Mc Nally Dr,,,Nashville,TN,Davidson County,37211,US,TIGER/Line® dataset from the US Census Bureau
164,Link Systems Electric,444 McNally Dr,Nashville,TN,Other,10/16/20,12,36.093,-86.740,0.9,...,444,Mc Nally Dr,,,Nashville,TN,Davidson County,37211,US,TIGER/Line® dataset from the US Census Bureau
165,Link Systems Electric,444 McNally Dr,Nashville,TN,Other,10/16/20,12,36.093,-86.740,0.9,...,444,Mc Nally Dr,,,Nashville,TN,Davidson County,37211,US,TIGER/Line® dataset from the US Census Bureau
166,Iron Tribe Belmont,3201 Belmont Blvd,Nashville,TN,Gym,10/20/20,14,36.117,-86.798,1.0,...,,Belmont Blvd,,,Nashville,TN,Davidson County,37212,US,TIGER/Line® dataset from the US Census Bureau


In [196]:
#fuzzy_merge_df.to_csv('fuzzy_merge_df.csv')

In [194]:
fuzzy_merge_df['Cluster Name'].value_counts()

Kid Rock's Big Ass Honky Tonk                    47
Winner's                                         27
Tootsie's                                        27
Dawghouse Saloon                                  9
Montgomery Bell Academy job site                  8
Hermitage Hall                                    6
Good Samaritan Health & Healing                   6
Miss Kelli's                                      6
Life Care Center Old Hickory Village              5
Link Systems Electric                             4
Debra Johnson Rehab/TN Prison for Women           3
Grand Hyatt Hotel (Power Design Inc) job site     2
Iron Tribe Belmont                                2
One Stone Church Service                          2
Dogwood                                           1
Lakeshore Meadows                                 1
Green Hills Center for Rehab and Healing          1
Lois DeBerry Special Needs Facility               1
Nashville Center for Rehab and Healing (2)        1
Belmont Vill

In [None]:
def matcher(column1, column2):
    names_array=[]
    ratio_array=[]    
    if column1 in column2:
        return 100
    else:   
        return fuzz.partial_ratio(column1,column2)

In [None]:
all_places_results['match_ratio'] = all_places_results.apply(
    lambda x: matcher(x['vicinity'].split(',')[0], x['orig_address'].split(',')[0]), axis=1)

## Create Buffer Zones around each Cluster Point
### Create a GeoDataFrame with the Clusters DataFrame

In [40]:
clusters_locations['Buffer_Geometry'] = clusters_locations.apply(lambda x: Point((float(x.Longitude),
                                              float(x.Latitude))),
                             axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clusters_locations['Buffer_Geometry'] = clusters_locations.apply(lambda x: Point((float(x.Longitude),


In [41]:
clusters_locations.head()

Unnamed: 0,Cluster Name,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Buffer_Geometry
0,Vanderbilt Parties,Social Gathering,3/11/2020,49,36.125891,-86.822863,POINT (-86.822863 36.1258905)
1,Event at Clementine Hall,Social Gathering,3/14/2020,23,36.152444,-86.846772,POINT (-86.8467716 36.15244420000001)
3,The Health Center at Richland Place,LTCF,4/3/2020,47,36.12875,-86.819533,POINT (-86.8195333 36.12874979999999)
4,Trevecca Center for Rehab and Healing,LTCF,4/4/2020,102,36.144562,-86.756749,POINT (-86.7567485 36.1445623)
5,Tyson Foods,Commercial-Warehouse,4/6/2020,280,36.198993,-89.836757,POINT (-89.8367566 36.1989931)


In [42]:
geometry = clusters_locations['Buffer_Geometry']
clusters_locations_geodf = gpd.GeoDataFrame(clusters_locations, crs="EPSG:4326", geometry=geometry)

In [43]:
clusters_locations_geodf = clusters_locations_geodf.drop('Buffer_Geometry', axis=1)
clusters_locations_geodf['Buffer_Geometry'] = clusters_locations_geodf['geometry']
clusters_locations_geodf = clusters_locations_geodf.drop('geometry', axis=1)

In [44]:
clusters_locations_geodf.head()

Unnamed: 0,Cluster Name,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Buffer_Geometry
0,Vanderbilt Parties,Social Gathering,3/11/2020,49,36.125891,-86.822863,POINT (-86.82286 36.12589)
1,Event at Clementine Hall,Social Gathering,3/14/2020,23,36.152444,-86.846772,POINT (-86.84677 36.15244)
3,The Health Center at Richland Place,LTCF,4/3/2020,47,36.12875,-86.819533,POINT (-86.81953 36.12875)
4,Trevecca Center for Rehab and Healing,LTCF,4/4/2020,102,36.144562,-86.756749,POINT (-86.75675 36.14456)
5,Tyson Foods,Commercial-Warehouse,4/6/2020,280,36.198993,-89.836757,POINT (-89.83676 36.19899)


In [45]:
clusters_locations_geodf['Buffer_Zone'] = clusters_locations_geodf['Buffer_Geometry'].buffer(.0001, resolution=20)


  clusters_locations_geodf['Buffer_Zone'] = clusters_locations_geodf['Buffer_Geometry'].buffer(.0001, resolution=20)


In [46]:
clusters_locations_geodf

Unnamed: 0,Cluster Name,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Buffer_Geometry,Buffer_Zone
0,Vanderbilt Parties,Social Gathering,3/11/2020,49,36.125891,-86.822863,POINT (-86.82286 36.12589),"POLYGON ((-86.82276 36.12589, -86.82276 36.125..."
1,Event at Clementine Hall,Social Gathering,3/14/2020,23,36.152444,-86.846772,POINT (-86.84677 36.15244),"POLYGON ((-86.84667 36.15244, -86.84667 36.152..."
3,The Health Center at Richland Place,LTCF,4/3/2020,47,36.12875,-86.819533,POINT (-86.81953 36.12875),"POLYGON ((-86.81943 36.12875, -86.81943 36.128..."
4,Trevecca Center for Rehab and Healing,LTCF,4/4/2020,102,36.144562,-86.756749,POINT (-86.75675 36.14456),"POLYGON ((-86.75665 36.14456, -86.75665 36.144..."
5,Tyson Foods,Commercial-Warehouse,4/6/2020,280,36.198993,-89.836757,POINT (-89.83676 36.19899),"POLYGON ((-89.83666 36.19899, -89.83666 36.198..."
6,Nashville Center for Rehab and Healing,LTCF,4/8/2020,12,36.134557,-86.783118,POINT (-86.78312 36.13456),"POLYGON ((-86.78302 36.13456, -86.78302 36.134..."
7,CDM Jail,Correctional Facility,4/13/2020,22,36.0882,-86.686168,POINT (-86.68617 36.08820),"POLYGON ((-86.68607 36.08820, -86.68607 36.088..."
8,Bordeaux,LTCF,4/15/2020,47,36.180317,-86.850478,POINT (-86.85048 36.18032),"POLYGON ((-86.85038 36.18032, -86.85038 36.180..."
9,Cargill,Commercial-Warehouse,4/21/2020,22,36.112551,-86.759455,POINT (-86.75946 36.11255),"POLYGON ((-86.75935 36.11255, -86.75936 36.112..."
10,Bethany Center for Rehab and Healing,LTCF,4/27/2020,133,36.049614,-86.71895,POINT (-86.71895 36.04961),"POLYGON ((-86.71885 36.04961, -86.71885 36.049..."


In [47]:
center = [36.16784, -86.78166]
nashville_buffer_map = folium.Map(location = center, zoom_start=11)

In [48]:
folium.GeoJson(clusters_locations_geodf['Buffer_Zone']).add_to(nashville_buffer_map)

<folium.features.GeoJson at 0x7fbfccb48e80>

In [49]:
nashville_buffer_map

## Read in geocodio, try comparing it to clusers instead of the original DF

In [102]:
geocodio = pd.read_csv('../data/geocodio_input_df_geocodio_d5d27c584d4aa85e57bb523066377b276f89cac3.csv')

In [103]:
geocodio.columns

Index(['Unnamed: 0', 'Address', 'City', 'ZIP', 'state', 'Latitude',
       'Longitude', 'Accuracy Score', 'Accuracy Type', 'Number', 'Street',
       'Unit Type', 'Unit Number', 'City.1', 'State', 'County', 'Zip',
       'Country', 'Source'],
      dtype='object')

In [52]:
#geocodio['Longitude'] = round(geocodio['Longitude'], 3)
#geocodio['Latitude'] = round(geocodio['Latitude'], 3)

In [53]:
hub_geocodio = hub_covid_violations_df.join(geocodio, rsuffix='_geo')

In [54]:
hub_geocodio.columns

Index(['Request #', 'Status', 'Date / Time Opened', 'Date / Time Closed',
       'Contact Type', 'State Issue', 'Closed When Created', 'Address', 'City',
       'Council District', 'ZIP', 'Latitude', 'Longitude', 'Mapped Location',
       'Unnamed: 0', 'Address_geo', 'City_geo', 'ZIP_geo', 'state',
       'Latitude_geo', 'Longitude_geo', 'Accuracy Score', 'Accuracy Type',
       'Number', 'Street', 'Unit Type', 'Unit Number', 'City.1', 'State',
       'County', 'Zip', 'Country', 'Source'],
      dtype='object')

In [55]:
clusters_violations_geo = clusters[clusters['Latitude'].notna()].merge(
    hub_geocodio, 
    on=['Latitude', 'Longitude'], 
    how='inner'
)

In [56]:
clusters_violations_geo.head()

Unnamed: 0,Cluster Name,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Request #,Status,Date / Time Opened,Date / Time Closed,...,Number,Street,Unit Type,Unit Number,City.1,State,County,Zip,Country,Source


In [57]:
clusters_violations_geo['Cluster Start Date'] = pd.to_datetime(clusters_violations_geo['Cluster Start Date'])
clusters_violations_geo['Date / Time Opened'] = pd.to_datetime(clusters_violations_geo['Date / Time Opened'])
clusters_violations_geo['Date / Time Closed'] = pd.to_datetime(clusters_violations_geo['Date / Time Closed'])

In [58]:
clusters_violations_geo['Opened_to_Cluster_Delta'] = clusters_violations_geo['Date / Time Opened'] - clusters_violations_geo['Cluster Start Date']

In [59]:
clusters_violations_geo

Unnamed: 0,Cluster Name,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Request #,Status,Date / Time Opened,Date / Time Closed,...,Street,Unit Type,Unit Number,City.1,State,County,Zip,Country,Source,Opened_to_Cluster_Delta


In [60]:
clusters_violations_geo.sort_values('Opened_to_Cluster_Delta')

Unnamed: 0,Cluster Name,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Request #,Status,Date / Time Opened,Date / Time Closed,...,Street,Unit Type,Unit Number,City.1,State,County,Zip,Country,Source,Opened_to_Cluster_Delta


In [61]:
clusters_violations_geo['Opened_to_Cluster_Delta'].sort_values().to_list()

[]

In [62]:
clusters_violations_geo[clusters_violations_geo[
    'Cluster Name']=="Kid Rock's Big Ass Honky Tonk"]['Opened_to_Cluster_Delta'].sort_values().to_list()

[]

In [63]:
clusters_violations_geo[clusters_violations_geo[
    'Cluster Name']!="Kid Rock's Big Ass Honky Tonk"]['Opened_to_Cluster_Delta'].sort_values().to_list()

[]

In [64]:
#clusters_violations_geo.to_csv('clusters_violations_geo_df.csv')

## Look at some Folium Maps

## Starting with just looking at all the clusters

COVID Icon - Icons made by <a href="https://www.flaticon.com/authors/freepik" title="Freepik">Freepik</a> from <a href="https://www.flaticon.com/" title="Flaticon"> www.flaticon.com</a>

In [67]:
clusters_locations

Unnamed: 0,Cluster Name,Facility Type,Cluster Start Date,# Cases,Latitude,Longitude,Buffer_Geometry,geometry
0,Vanderbilt Parties,Social Gathering,3/11/2020,49,36.125891,-86.822863,POINT (-86.822863 36.1258905),POINT (-86.82286 36.12589)
1,Event at Clementine Hall,Social Gathering,3/14/2020,23,36.152444,-86.846772,POINT (-86.8467716 36.15244420000001),POINT (-86.84677 36.15244)
3,The Health Center at Richland Place,LTCF,4/3/2020,47,36.12875,-86.819533,POINT (-86.8195333 36.12874979999999),POINT (-86.81953 36.12875)
4,Trevecca Center for Rehab and Healing,LTCF,4/4/2020,102,36.144562,-86.756749,POINT (-86.7567485 36.1445623),POINT (-86.75675 36.14456)
5,Tyson Foods,Commercial-Warehouse,4/6/2020,280,36.198993,-89.836757,POINT (-89.8367566 36.1989931),POINT (-89.83676 36.19899)
6,Nashville Center for Rehab and Healing,LTCF,4/8/2020,12,36.134557,-86.783118,POINT (-86.78311770000001 36.1345574),POINT (-86.78312 36.13456)
7,CDM Jail,Correctional Facility,4/13/2020,22,36.0882,-86.686168,POINT (-86.68616830000001 36.0882),POINT (-86.68617 36.08820)
8,Bordeaux,LTCF,4/15/2020,47,36.180317,-86.850478,POINT (-86.8504781 36.1803169),POINT (-86.85048 36.18032)
9,Cargill,Commercial-Warehouse,4/21/2020,22,36.112551,-86.759455,POINT (-86.759455 36.1125506),POINT (-86.75946 36.11255)
10,Bethany Center for Rehab and Healing,LTCF,4/27/2020,133,36.049614,-86.71895,POINT (-86.7189497 36.049614),POINT (-86.71895 36.04961)


In [65]:
center = [36.16784, -86.78166]
nashville_cluster_map = folium.Map(location = center, zoom_start=11)

In [66]:
for row_index, row_values in clusters_locations.iterrows():
    loc = [row_values['Latitude'], row_values['Longitude']]
    pop = str(row_values['Cluster Name'])
    icon = folium.features.CustomIcon('covid4.png')
    marker = folium.Marker(
        icon = icon,
        location = loc,
        popup = folium.Popup(pop,
                     min_width=200,
                     max_width=500))
    
    marker.add_to(nashville_cluster_map)
    
nashville_cluster_map

## Look at all reports
### Start by creating DataFrame without repeating coordinate locations

In [None]:
hub_covid_violations_df.head()

In [None]:
hub_covid_violations_df['Coordinates'] = list(zip(hub_covid_violations_df.Latitude, hub_covid_violations_df.Longitude))

In [None]:
hub_covid_map_df = hub_covid_violations_df.groupby('Coordinates').count()

In [None]:
center = [36.1672, -86.7816]
nashville_report_map = folium.Map(location = center, zoom_start=11)

In [None]:
#for row_index, row_values in hub_covid_violations_df[
    (hub_covid_violations_df['City'].notna()) & 
    (hub_covid_violations_df['Latitude'].notna())].iterrows():
    
    loc = [row_values['Latitude'], row_values['Longitude']]
    #pop = str(row_values['Cluster Name'])
    icon = folium.features.CustomIcon('covid4.png')
    marker = folium.Marker(
        icon = icon,
        location = loc)
        #popup = folium.Popup(pop,
                     #min_width=200,
                     #max_width=500))
    
    marker.add_to(nashville_report_map)
    
nashville_report_map

In [None]:
center = [36.16784, -86.78166]
nashville_covid_map = folium.Map(location = center, zoom_start=11)

In [None]:
for row_index, row_values in clusters_violations.iterrows():
    loc = [row_values['Latitude'], row_values['Longitude']]
    pop = str(row_values['Cluster Name'])
    icon = folium.features.CustomIcon('covid4.png')
    marker = folium.Marker(
        icon = icon,
        location = loc,
        popup = folium.Popup(pop,
                     min_width=200,
                     max_width=500))
    
    marker.add_to(nashville_covid_map)

In [None]:
nashville_covid_map

Loser's looks like it's located in the wrong spot - so it may not be correct for a cluster??

## Create a circle arounch each cluster, checking location matches that way

In [None]:
clusters_violations_geo['Cluster_Geometry']

In [None]:
for r in clusters_violations_geo:
    p = r['Cluster_Geometry']
    n_points = 20
    d = 10 # meters
    angles = np.linspace(0, 360, n_points)
    r['Cluster_Polygon'] = geog.propagate(p, angles, d)

In [None]:
n_points = 20
d = 10 * 1000  # meters
angles = np.linspace(0, 360, n_points)
polygon = geog.propagate(p, angles, d)
print(json.dumps(shapely.geometry.mapping(shapely.geometry.Polygon(polygon))))

## Import JSON files with business info

In [None]:
business_glob = glob.glob('../data/google_places_results/*.json')

In [None]:
jsons_list = []
for j in business_glob:
    with open(j) as result_file:
        jsons_list.append(json.load(result_file))

In [None]:
jsons_list[0][0]['results']

In [None]:
def process_json_file(filename):
    with open(filename) as result_file:
        results_json = json.load(result_file)
        results_json_list = []
        for result_json in results_json:
            for result_json_result in result_json['results']:
                result_json_result['orig_mapped_location'] = result_json['mapped_location']
                result_json_result['orig_address'] = result_json['address']
                results_json_list.append(result_json_result)
        return pd.DataFrame(results_json_list)

In [None]:
frames = [process_json_file(file_path) for file_path in json_files]

In [None]:
with open('data/example.json', "r") as fi:
    person = json.load(fi)

In [None]:
businesses[0]

In [None]:
with open('../data/google_places_results/results_32.json', "r") as fi:
    person = json.load(fi)

In [None]:
print(pd.DataFrame(person[0]['results'][0].T)

In [None]:
person[0]['results'][0].keys()

In [None]:
pd.DataFrame(person[0]['results'][0].T)

In [None]:
person[0]['results']