In [1]:
import os
import sys
import csv
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 

sys.path.append('../')
from envir import config

This notebook uses the csv file from running the script named: 'fuzzy_match_business_names.py'


#### connect & load data

In [2]:
engine = create_engine('postgresql+psycopg2:///yelp_abcs')
con = engine.connect

In [3]:
yelp = pd.read_sql("select * from stag.yelp_academic_business where state in ('ON', 'NV');", con=engine )
vegas = pd.read_sql("select * from clean.clean_vegas_violations;", con=engine)
toronto = pd.read_sql("select * from clean.toronto_inspections_cleaned;", con=engine)

In [4]:
yelp2 = yelp.loc[yelp['categories'].str.contains("Restaurant")==True][['business_id',
                                                                       'name', 'address']].copy()

yelp2.reset_index(inplace=True)

In [48]:
yelp2[yelp2.business_id=='EBHzEtuOJz474NwRQFqJbg']

Unnamed: 0,business_id,yelp_name_address
18159,EBHzEtuOJz474NwRQFqJbg,Popeyes Louisiana Kitchen 1955 Queen Street E


#### prep for concat / append tables

In [5]:
len(toronto) + len(vegas)

264386

In [6]:
vegas.drop(columns=['index'], inplace=True)

In [7]:
violations = vegas.append(toronto, ignore_index=True, sort=True)

#### get unique business names from violations

In [8]:
fuzz_match = pd.read_csv(config.shared+'fuzzy_biz_names80.csv')

In [9]:
fuzz_match[(fuzz_match.yelp_name_address == fuzz_match.violations_name_address) & (fuzz_match.match_score < 100)]

Unnamed: 0,yelp_name_address,violations_name_address,match_score


In [10]:
fuzz_match[(fuzz_match.yelp_name_address != fuzz_match.violations_name_address) & (fuzz_match.match_score == 100)]

Unnamed: 0,yelp_name_address,violations_name_address,match_score
106,D-Spot Dessert Cafe 1060 The Queensway,D SPOT DESSERT CAFE 1060 THE QUEENSWAY,100.0
121,Kimchi House 586 Bloor St W,KIMCHI HOUSE 586 BLOOR ST W,100.0
129,Brass Taps Pizza Pub 934 College St,BRASS TAPS PIZZA PUB 934 COLLEGE ST,100.0
243,Pho Vietnam 1280 Kennedy Rd,PHO VIETNAM 1280 KENNEDY RD,100.0
250,Las Tapatias 6132 W Charleston Blvd,LAS TAPATIAS 6132 W Charleston Blvd,100.0
...,...,...,...
30864,5 Spice Dining 2826 Markham Rd,5 SPICE DINING 2826 MARKHAM RD,100.0
30889,C&C Supermarket 888 Don Mills Rd,C & C SUPERMARKET 888 DON MILLS RD,100.0
30965,Pumpernickel's 101 College St,PUMPERNICKEL'S 101 COLLEGE ST,100.0
30987,Peter Piper Pizza 6081 S Eastern Ave,PETER PIPER PIZZA 6081 S EASTERN Ave,100.0


In [11]:
fuzz_match.groupby(['yelp_name_address',
                    'violations_name_address'],as_index=False).size().sort_values(ascending=False)

yelp_name_address                                       violations_name_address                                  
Burger King 3401 E Tropicana Ave                        Burger King #2701 3401 E Tropicana Ave                       2
Ravi Soups 322 Adelaide Street W                        RAVI'S SOUP 322 ADELAIDE ST W                                2
Baja Fresh 8780 W Charleston Blvd, Ste 100              Baja Fresh 8780 W Charleston Blvd                            2
Carl's Jr. 4901 W Craig Rd                              CARL'S JR #7073 4901 W Craig Rd                              2
Pizza Pizza 260 Church Street                           PIZZA PIZZA 260 CHURCH ST                                    2
                                                                                                                    ..
Popeyes Louisiana Kitchen 150 Park Lawn Road            POPEYES LOUISIANA KITCHEN 150 PARK LAWN RD                   1
Popeyes Louisiana Kitchen 1525 Dundas Street E, Unit 

In [12]:
fuzz_match.sort_values(['violations_name_address','match_score'], ascending=False, inplace=True)

In [13]:
fuzz_match.drop_duplicates(['violations_name_address'], inplace=True)

In [26]:
yelp2

Unnamed: 0,business_id,yelp_name_address
0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant 30 Eglinton Avenue W
1,PZ-LZzSlhSe9utkQYU8pFg,Carluccio's Tivoli Gardens 1775 E Tropicana Av...
2,tstimHoMcYbkSC4eBA1wEg,Maria's Mexican Restaurant & Bakery 6055 E Lak...
3,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar 1170 Queen Street W
4,SP_YXIEwkFPPl_9anCYmpQ,The Steady Cafe & Bar 1051 Bloor Street W
...,...,...
22615,sVEE_Mp3EbWW1UIhfActVA,"The King's Kitchen 9275 Hwy 48, Unit 11"
22616,lo8WwqKogC-kz4toRvkdUQ,KOKO! Share Bar 81 Yorkville Avenue
22617,3HAPJH0Qoi8Ix6M9rUMfjg,Indian Hero 8920 Highway 50
22618,oS0CnUbyv0GUoD3L8_3UPQ,Thai Fantasy 578 Yonge Street


In [14]:
yelp2['yelp_name_address'] = yelp2['name'] + ' ' + yelp2['address']

In [15]:
yelp2.drop(columns=['name', 'address', 'index'], inplace=True)

In [16]:
violations['violations_name_address'] = violations['name'] + ' ' + violations['address']

In [17]:
fuzz_match.groupby(['violations_name_address'],as_index=False).size().sort_values(ascending=False)

violations_name_address
iQ FOOD CO. 181 BAY ST                            1
Gusto Pizza 1 BONIS AVE                           1
HAMPTONS 440 S RAMPART Blvd 180                   1
HAN BA TANG RESTAURANT 4862 YONGE ST              1
HANA KOREA 45 BALDWIN ST                          1
                                                 ..
PTS GOLD 47 9050 W Post Rd                        1
PUBLIC HOUSE @ LUXOR 3900 S LAS VEGAS Blvd        1
PUBLICUS 1126 FREMONT St                          1
PUCK N WINGS 5625 YONGE ST                        1
#1 HAWAIIAN BARBECUE 7960 S RAINBOW Blvd 8000G    1
Length: 7720, dtype: int64

In [18]:
matched = violations.merge(fuzz_match,
                           on='violations_name_address', how='inner').merge(yelp2,
                                                                              on='yelp_name_address', how='inner')

In [19]:
matched.columns

Index(['address', 'city', 'crucial', 'establishment_type', 'inspection_date',
       'inspection_result', 'latitude', 'longitude', 'minor', 'name',
       'significant', 'violations_desc', 'violations_name_address',
       'yelp_name_address', 'match_score', 'business_id'],
      dtype='object')

In [20]:
matched_ord = matched.reindex(columns=['business_id','city','inspection_date', 'name','establishment_type',
                                       'address','latitude', 'longitude','inspection_result',
                                       'violations_desc','minor','crucial','significant','violations_desc']).copy()

In [21]:
try:
    matched_ord.to_sql("violations_matched_yelp_id2", engine, schema="clean")
except:
    print('table already exists')

In [22]:
matched_ord.to_csv(config.shared+'violations_matched_yelp_id.csv')

In [23]:
len(violations)

264386

In [24]:
len(matched_ord)

122548

In [25]:
len(matched_ord)/len(violations)

0.46351924837169894

In [52]:
matched_ord[matched_ord.city=='toronto'].groupby('business_id').count().sort_values('inspection_date', ascending=False).head(20)

Unnamed: 0_level_0,city,inspection_date,name,establishment_type,address,latitude,longitude,inspection_result,violations_desc,minor,crucial,significant,violations_desc
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
EBHzEtuOJz474NwRQFqJbg,15,15,15,15,15,15,15,15,15,15,15,15,15
R__NLOqjUlw8qf12KD-I5A,14,14,14,14,14,14,14,14,14,14,14,14,14
grzKamxEuXTZrTkiztLQGg,14,14,14,14,14,14,14,14,14,14,14,14,14
u0KWFEkp2KCaB0ebmvi4Kw,13,13,13,13,13,13,13,13,13,13,13,13,13
F-bdXFkJwwENiNpMTG2ntQ,13,13,13,13,13,13,13,13,13,13,13,13,13
F_oPMHJrH42R67xp5eKtQA,13,13,13,13,13,13,13,13,13,13,13,13,13
I6oxEn1HT41S2GPrDKPPvA,12,12,12,12,12,12,12,12,12,12,12,12,12
isYj0rlYyUv3bUd4ZChxlQ,12,12,12,12,12,12,12,12,12,12,12,12,12
OJLR2T3CgXQkUgFmqrkTwQ,12,12,12,12,12,12,12,12,12,12,12,12,12
yX4anPCzNGDQ-v8_uug2yg,12,12,12,12,12,12,12,12,12,12,12,12,12


In [54]:
matched_ord['year'] = pd.DatetimeIndex(matched_ord['inspection_date']).year

In [69]:
matched_ord.groupby(['year', 'business_id']).count()['city'].describe()

count    38269.000000
mean         3.195955
std          9.895729
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max        314.000000
Name: city, dtype: float64

In [70]:
matched_ord.groupby(['year', 'business_id']).count()['city'].quantile([.1, .9,0.95])

0.10    1.0
0.90    4.0
0.95    6.0
Name: city, dtype: float64

In [72]:
matched_ord[matched_ord['business_id']=='EBHzEtuOJz474NwRQFqJbg']

Unnamed: 0,business_id,city,inspection_date,name,establishment_type,address,latitude,longitude,inspection_result,violations_desc,minor,crucial,significant,violations_desc.1,year
106761,EBHzEtuOJz474NwRQFqJbg,toronto,2017-11-13,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Pass,Fail to Ensure the Presence of the Holder of a...,2,2,2,Fail to Ensure the Presence of the Holder of a...,2017.0
106762,EBHzEtuOJz474NwRQFqJbg,toronto,2018-04-04,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Pass,Operator fail to properly wash surfaces in rooms,1,1,1,Operator fail to properly wash surfaces in rooms,2018.0
106763,EBHzEtuOJz474NwRQFqJbg,toronto,2018-06-04,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Conditional Pass,Fail to provide hot and cold water for hand wa...,6,6,6,Fail to provide hot and cold water for hand wa...,2018.0
106764,EBHzEtuOJz474NwRQFqJbg,toronto,2018-06-11,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Conditional Pass,Operator fail to ensure food is not contaminat...,3,3,3,Operator fail to ensure food is not contaminat...,2018.0
106765,EBHzEtuOJz474NwRQFqJbg,toronto,2018-06-15,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Conditional Pass,Operator fail to properly remove solid waste,1,1,1,Operator fail to properly remove solid waste,2018.0
106766,EBHzEtuOJz474NwRQFqJbg,toronto,2018-06-18,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Pass,Operator fail to properly wash surfaces in rooms,1,1,1,Operator fail to properly wash surfaces in rooms,2018.0
106767,EBHzEtuOJz474NwRQFqJbg,toronto,2018-09-18,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Pass,FOOD PREMISE NOT MAINTAINED WITH FOOD HANDLING...,2,2,2,FOOD PREMISE NOT MAINTAINED WITH FOOD HANDLING...,2018.0
106768,EBHzEtuOJz474NwRQFqJbg,toronto,2018-10-29,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Conditional Pass,FAIL TO ENSURE EQUIPMENT SURFACE SANITIZED AS ...,7,7,7,FAIL TO ENSURE EQUIPMENT SURFACE SANITIZED AS ...,2018.0
106769,EBHzEtuOJz474NwRQFqJbg,toronto,2018-11-08,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Pass,,1,1,1,,2018.0
106770,EBHzEtuOJz474NwRQFqJbg,toronto,2018-12-31,POPEYES LOUISIANA KITCHEN,Restaurant,1955 QUEEN ST E,43.669259,-79.302131,Conditional Pass,FAIL TO REMOVE GARBAGE WHEN NECESSARY TO MAINT...,5,5,5,FAIL TO REMOVE GARBAGE WHEN NECESSARY TO MAINT...,2018.0


In [71]:
matched_ord[matched_ord['business_id']=='EBHzEtuOJz474NwRQFqJbg'].groupby(['year', 'business_id']).count()['city']

year    business_id           
2017.0  EBHzEtuOJz474NwRQFqJbg    1
2018.0  EBHzEtuOJz474NwRQFqJbg    9
2019.0  EBHzEtuOJz474NwRQFqJbg    5
Name: city, dtype: int64