In [1]:
import os
import sys
import csv
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 

sys.path.append('../')
from envir import config

This notebook uses the csv file from running the script named: 'fuzzy_match_business_names.py'


#### connect & load data

In [2]:
engine = create_engine('postgresql+psycopg2:///yelp_abcs')
con = engine.connect

In [3]:
yelp = pd.read_sql("select * from stag.yelp_academic_business where state in ('ON', 'NV');", con=engine )
vegas = pd.read_sql("select * from clean.clean_vegas_violations;", con=engine)
toronto = pd.read_sql("select * from clean.toronto_inspections_new_cleaned;", con=engine)

In [4]:
yelp2 = yelp.loc[yelp['categories'].str.contains("Restaurant")==True][['business_id',
                                                                       'name', 'address']].copy()

yelp2.reset_index(inplace=True)

In [5]:
yelp2[yelp2.business_id=='EBHzEtuOJz474NwRQFqJbg']

Unnamed: 0,index,business_id,name,address
18159,55806,EBHzEtuOJz474NwRQFqJbg,Popeyes Louisiana Kitchen,1955 Queen Street E


#### prep for concat / append tables

In [6]:
len(toronto) + len(vegas)

264386

In [7]:
vegas.drop(columns=['index'], inplace=True)

In [8]:
violations = vegas.append(toronto, ignore_index=True, sort=True)

#### get unique business names from violations

In [9]:
fuzz_match = pd.read_csv(config.shared+'fuzzy_biz_names80.csv')

In [10]:
fuzz_match[(fuzz_match.yelp_name_address == fuzz_match.violations_name_address) & (fuzz_match.match_score < 100)]

Unnamed: 0,yelp_name_address,violations_name_address,match_score


In [11]:
fuzz_match[(fuzz_match.yelp_name_address != fuzz_match.violations_name_address) & (fuzz_match.match_score == 100)]

Unnamed: 0,yelp_name_address,violations_name_address,match_score
106,D-Spot Dessert Cafe 1060 The Queensway,D SPOT DESSERT CAFE 1060 THE QUEENSWAY,100.0
121,Kimchi House 586 Bloor St W,KIMCHI HOUSE 586 BLOOR ST W,100.0
129,Brass Taps Pizza Pub 934 College St,BRASS TAPS PIZZA PUB 934 COLLEGE ST,100.0
243,Pho Vietnam 1280 Kennedy Rd,PHO VIETNAM 1280 KENNEDY RD,100.0
250,Las Tapatias 6132 W Charleston Blvd,LAS TAPATIAS 6132 W Charleston Blvd,100.0
...,...,...,...
30864,5 Spice Dining 2826 Markham Rd,5 SPICE DINING 2826 MARKHAM RD,100.0
30889,C&C Supermarket 888 Don Mills Rd,C & C SUPERMARKET 888 DON MILLS RD,100.0
30965,Pumpernickel's 101 College St,PUMPERNICKEL'S 101 COLLEGE ST,100.0
30987,Peter Piper Pizza 6081 S Eastern Ave,PETER PIPER PIZZA 6081 S EASTERN Ave,100.0


In [12]:
fuzz_match.groupby(['yelp_name_address',
                    'violations_name_address'],as_index=False).size().sort_values(ascending=False)

yelp_name_address                                       violations_name_address                                  
Burger King 3401 E Tropicana Ave                        Burger King #2701 3401 E Tropicana Ave                       2
Ravi Soups 322 Adelaide Street W                        RAVI'S SOUP 322 ADELAIDE ST W                                2
Baja Fresh 8780 W Charleston Blvd, Ste 100              Baja Fresh 8780 W Charleston Blvd                            2
Carl's Jr. 4901 W Craig Rd                              CARL'S JR #7073 4901 W Craig Rd                              2
Pizza Pizza 260 Church Street                           PIZZA PIZZA 260 CHURCH ST                                    2
                                                                                                                    ..
Popeyes Louisiana Kitchen 150 Park Lawn Road            POPEYES LOUISIANA KITCHEN 150 PARK LAWN RD                   1
Popeyes Louisiana Kitchen 1525 Dundas Street E, Unit 

In [13]:
fuzz_match.sort_values(['violations_name_address','match_score'], ascending=False, inplace=True)

In [14]:
fuzz_match.drop_duplicates(['violations_name_address'], inplace=True)

In [15]:
yelp2

Unnamed: 0,index,business_id,name,address
0,0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W
1,4,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens,"1775 E Tropicana Ave, Ste 29"
2,11,tstimHoMcYbkSC4eBA1wEg,Maria's Mexican Restaurant & Bakery,6055 E Lake Mead Blvd
3,15,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W
4,17,SP_YXIEwkFPPl_9anCYmpQ,The Steady Cafe & Bar,1051 Bloor Street W
...,...,...,...,...
22615,69706,sVEE_Mp3EbWW1UIhfActVA,The King's Kitchen,"9275 Hwy 48, Unit 11"
22616,69712,lo8WwqKogC-kz4toRvkdUQ,KOKO! Share Bar,81 Yorkville Avenue
22617,69714,3HAPJH0Qoi8Ix6M9rUMfjg,Indian Hero,8920 Highway 50
22618,69716,oS0CnUbyv0GUoD3L8_3UPQ,Thai Fantasy,578 Yonge Street


In [16]:
yelp2['yelp_name_address'] = yelp2['name'] + ' ' + yelp2['address']

In [17]:
yelp2.drop(columns=['name', 'address', 'index'], inplace=True)

In [18]:
violations['violations_name_address'] = violations['name'] + ' ' + violations['address']

In [19]:
fuzz_match.groupby(['violations_name_address'],as_index=False).size().sort_values(ascending=False)

violations_name_address
iQ FOOD CO. 181 BAY ST                            1
Gusto Pizza 1 BONIS AVE                           1
HAMPTONS 440 S RAMPART Blvd 180                   1
HAN BA TANG RESTAURANT 4862 YONGE ST              1
HANA KOREA 45 BALDWIN ST                          1
                                                 ..
PTS GOLD 47 9050 W Post Rd                        1
PUBLIC HOUSE @ LUXOR 3900 S LAS VEGAS Blvd        1
PUBLICUS 1126 FREMONT St                          1
PUCK N WINGS 5625 YONGE ST                        1
#1 HAWAIIAN BARBECUE 7960 S RAINBOW Blvd 8000G    1
Length: 7720, dtype: int64

In [20]:
matched = violations.merge(fuzz_match,
                           on='violations_name_address', how='inner').merge(yelp2,
                                                                              on='yelp_name_address', how='inner')

In [21]:
matched.columns

Index(['address', 'city', 'crucial', 'establishment_type', 'inspection_date',
       'inspection_result', 'latitude', 'longitude', 'minor', 'name',
       'significant', 'violations_desc', 'violations_name_address',
       'yelp_name_address', 'match_score', 'business_id'],
      dtype='object')

In [22]:
matched_ord = matched.reindex(columns=['business_id','city','inspection_date', 'name','establishment_type',
                                       'address','latitude', 'longitude','inspection_result',
                                       'violations_desc','minor','crucial','significant','violations_desc']).copy()

In [23]:
try:
    matched_ord.to_sql("violations_matched_yelp_id", engine, schema="clean")
except:
    print('table already exists')

table already exists


In [24]:
matched_ord.to_csv(config.shared+'violations_matched_yelp_id.csv')

In [25]:
len(violations)

264386

In [26]:
len(matched_ord)

122548

In [27]:
len(matched_ord)/len(violations)

0.46351924837169894

In [42]:
matched_ord.groupby(by='city', as_index=False).agg({'business_id': pd.Series.nunique})

Unnamed: 0,city,business_id
0,Las Vegas,3420
1,toronto,4314


In [48]:
# compare to API
import requests, json
url='https://api.yelp.com/v3/businesses/search'
headers = {'Authorization': 'Bearer %s' % config.yelpApi}
params = {'location':'Toronto'}
req = requests.get(url, params=params, headers=headers)
res_json = json.loads(req.text)

In [49]:
res_json['total']

10300

In [35]:
matched_ord[matched_ord.city=='Las Vegas'].groupby('business_id').count().sort_values('inspection_date', ascending=False).head(20)

Unnamed: 0_level_0,city,inspection_date,name,establishment_type,address,latitude,longitude,inspection_result,violations_desc,minor,crucial,significant,violations_desc
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
El4FC8jcawUVgw_0EIcbaQ,2812,2805,2812,2787,2812,2812,2812,2805,2812,2812,2812,2812,2812
Ou8pYS24azDWG0ru_vUcqg,2644,2633,2644,2604,2644,2644,2644,2634,2644,2644,2644,2644,2644
eAc9Vd6loOgRQolMXQt6FA,2600,2596,2600,2517,2600,2600,2600,2596,2600,2600,2600,2600,2600
xtYiHTmunjfCN2sUaQxBjA,1741,1737,1741,1652,1741,1741,1741,1738,1741,1741,1741,1741,1741
3Gt3xskppi9jZuTrwrhLNg,1693,1692,1693,1676,1693,1693,1693,1693,1693,1693,1693,1693,1693
HoKXKkW8oC9q8XnjKLulRw,1664,1663,1664,1554,1664,1664,1664,1663,1664,1664,1664,1664,1664
RhEvP5flF6KoPriMHmVYGg,1494,1489,1494,1435,1494,1494,1494,1492,1494,1494,1494,1494,1494
iyoym9B8gU_YPXSb3m31LA,1265,1260,1265,1238,1265,1265,1265,1264,1265,1265,1265,1265,1265
VE5KGq9ztCztivwbmjNlTQ,1130,1130,1130,1086,1130,1130,1130,1130,1130,1130,1130,1130,1130
Kn23LDd740SBVJ7mum0fwg,1121,1119,1121,1108,1121,1121,1121,1119,1121,1121,1121,1121,1121


In [34]:
matched_ord['year'] = pd.DatetimeIndex(matched_ord['inspection_date']).year

In [35]:
matched_ord.groupby(['year', 'business_id']).count()['city'].describe()

count    38269.000000
mean         3.195955
std          9.895729
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max        314.000000
Name: city, dtype: float64

In [36]:
matched_ord.groupby(['year', 'business_id']).count()['city'].quantile([.1, .9,0.95])

0.10    1.0
0.90    4.0
0.95    6.0
Name: city, dtype: float64

In [37]:
matched_ord[matched_ord['business_id']=='Ou8pYS24azDWG0ru_vUcqg']

Unnamed: 0,business_id,city,inspection_date,name,establishment_type,address,latitude,longitude,inspection_result,violations_desc,minor,crucial,significant,violations_desc.1
3351,Ou8pYS24azDWG0ru_vUcqg,Las Vegas,2013-01-10 12:00:00,CAESARS PALACE HOTEL & CASINO,Buffet,3570 S Las Vegas Blvd,36.116156,-115.175058,Compliant,,0,0,0,
3352,Ou8pYS24azDWG0ru_vUcqg,Las Vegas,2014-02-21 13:00:00,CAESARS PALACE HOTEL & CASINO,Pantry,3570 S Las Vegas Blvd,36.116156,-115.175058,Compliant,,0,0,0,
3353,Ou8pYS24azDWG0ru_vUcqg,Las Vegas,2015-02-17 10:20:00,CAESARS PALACE HOTEL & CASINO,Pantry,3570 S Las Vegas Blvd,36.116156,-115.175058,Compliant,,0,0,0,
3354,Ou8pYS24azDWG0ru_vUcqg,Las Vegas,2019-03-01 11:15:00,CAESARS PALACE HOTEL & CASINO,Banquet Kitchen,3570 S Las Vegas Blvd,36.116156,-115.175058,A Grade,,0,0,0,
3355,Ou8pYS24azDWG0ru_vUcqg,Las Vegas,2007-04-09 15:25:00,CAESARS PALACE HOTEL & CASINO,Pantry,3570 S Las Vegas Blvd,36.116156,-115.175058,Compliant,Non-food contact surfaces and/or cooking devic...,2,0,2,Non-food contact surfaces and/or cooking devic...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5990,Ou8pYS24azDWG0ru_vUcqg,Las Vegas,,CAESARS PALACE HOTEL & CASINO,,3570 S Las Vegas Blvd,36.116156,-115.175058,,,0,0,0,
5991,Ou8pYS24azDWG0ru_vUcqg,Las Vegas,,CAESARS PALACE HOTEL & CASINO,,3570 S Las Vegas Blvd,36.116156,-115.175058,,,0,0,0,
5992,Ou8pYS24azDWG0ru_vUcqg,Las Vegas,2019-08-19 11:30:00,CAESARS PALACE HOTEL & CASINO,Bar / Tavern,3570 S Las Vegas Blvd,36.116156,-115.175058,A Grade,Handwashing facilities adequate in number ...,1,0,2,Handwashing facilities adequate in number ...
5993,Ou8pYS24azDWG0ru_vUcqg,Las Vegas,2019-10-16 13:20:00,CAESARS PALACE HOTEL & CASINO,Special Kitchen,3570 S Las Vegas Blvd,36.116156,-115.175058,A Grade,Food protected from potential contamination by...,1,0,2,Food protected from potential contamination by...


In [38]:
matched_ord[matched_ord['business_id']=='EBHzEtuOJz474NwRQFqJbg'].groupby(['year', 'business_id']).count()['city']

year    business_id           
2017.0  EBHzEtuOJz474NwRQFqJbg    1
2018.0  EBHzEtuOJz474NwRQFqJbg    9
2019.0  EBHzEtuOJz474NwRQFqJbg    5
Name: city, dtype: int64

339