In [2]:
import pandas as pd
import numpy as np
from geopy import Nominatim

import warnings

# Desactivar todas las advertencias
warnings.filterwarnings("ignore")

In [90]:
df_rest = pd.read_parquet('Unification/df_restaurant.parquet')
df_rest.shape

(41562, 11)

In [91]:

mapping = {'Cafe': 1, 'Family': 2, 'Night': 3, 'European': 4, 'Vegetarian': 4, 'American':5, 'Asian':6, 'Central American':7,
           'South American':7, 'Fastfood':5, 'No Detail':2, 'African':6, 'Breakfast':1}

df_rest['numerical_category'] = df_rest['category'].map(mapping)

df_rest.head()

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,numerical_category
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Cafe,4.0,80,935 Race St,CA,Philadelphia,19107,39.955505,-75.155564,1
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,Family,4.5,100,2575 E Bay Dr,FL,Largo,33771,27.916116,-82.760461,2
2,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,PA,Indianapolis,46250,39.90432,-86.05308,1
3,sqSqqLy0sN8n2IZrAbzidQ,Domino's Pizza,American,3.5,8,3001 Highway 31 W,CA,White House,37188,36.464747,-86.659187,5
4,Mjboz24M9NlBeiOJKLEd_Q,DeSandro on Main,European,3.0,41,4105 Main St,PA,Philadelphia,19127,40.022466,-75.218314,4


In [92]:
# Obtener variables ficticias para la columna 'category'
dummies = pd.get_dummies(df_rest['numerical_category'])

# Concatenar las variables ficticias al DataFrame original
df_rest = pd.concat([df_rest, dummies], axis=1)
df_rest.head()

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,numerical_category,1,2,3,4,5,6,7
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Cafe,4.0,80,935 Race St,CA,Philadelphia,19107,39.955505,-75.155564,1,True,False,False,False,False,False,False
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,Family,4.5,100,2575 E Bay Dr,FL,Largo,33771,27.916116,-82.760461,2,False,True,False,False,False,False,False
2,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,PA,Indianapolis,46250,39.90432,-86.05308,1,True,False,False,False,False,False,False
3,sqSqqLy0sN8n2IZrAbzidQ,Domino's Pizza,American,3.5,8,3001 Highway 31 W,CA,White House,37188,36.464747,-86.659187,5,False,False,False,False,True,False,False
4,Mjboz24M9NlBeiOJKLEd_Q,DeSandro on Main,European,3.0,41,4105 Main St,PA,Philadelphia,19127,40.022466,-75.218314,4,False,False,False,True,False,False,False


In [93]:
def states(value):
  if value == 'CA':
    value = 'California'
  elif value == 'FL':
    value = 'Florida'
  elif value == 'PA':
    value = 'Pennsylvania'
  elif value == 'TX':
    value = 'Texas'
  return value

df_rest.state = df_rest.state.apply(states)

In [94]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

clusters = 34

# Selecciona las columnas relevantes
data = df_rest[['avg_rating']]
dummies = df_rest[[1, 2, 3, 4, 5, 6, 7]]  # Suponiendo que estas son tus columnas de dummies

# Escala la característica numérica (avg_rating)
scaler = StandardScaler()
scaled_avg_rating = scaler.fit_transform(data)

# Concatena la característica numérica escalada con las columnas de dummies
scaled_data = np.concatenate((scaled_avg_rating, dummies), axis=1)

# Entrenamiento del modelo K-Means
kmeans = KMeans(n_clusters=clusters, random_state=42)
df_rest['cluster'] = kmeans.fit_predict(scaled_data)

In [95]:
df_rest.drop(columns=['numerical_category',1,2,3,4,5,6,7],inplace=True)
df_rest.head()

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Cafe,4.0,80,935 Race St,California,Philadelphia,19107,39.955505,-75.155564,33
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,Family,4.5,100,2575 E Bay Dr,Florida,Largo,33771,27.916116,-82.760461,0
2,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,Pennsylvania,Indianapolis,46250,39.90432,-86.05308,33
3,sqSqqLy0sN8n2IZrAbzidQ,Domino's Pizza,American,3.5,8,3001 Highway 31 W,California,White House,37188,36.464747,-86.659187,16
4,Mjboz24M9NlBeiOJKLEd_Q,DeSandro on Main,European,3.0,41,4105 Main St,Pennsylvania,Philadelphia,19127,40.022466,-75.218314,32


In [96]:
df_counties = pd.read_csv('ZIP-County.csv')
df_counties

Unnamed: 0,ZIP,COUNTYNAME,STATE,STCOUNTYFP,CLASSFP
0,36003,Autauga County,AL,1001,H1
1,36006,Autauga County,AL,1001,H1
2,36067,Autauga County,AL,1001,H1
3,36066,Autauga County,AL,1001,H1
4,36703,Autauga County,AL,1001,H1
...,...,...,...,...,...
52884,850,St. Croix Island,VI,78010,H4
52885,840,St. Croix Island,VI,78010,H4
52886,820,St. Croix Island,VI,78010,H4
52887,830,St. John Island,VI,78020,H4


In [97]:
df_counties.ZIP = df_counties.ZIP.apply(str)

In [None]:
df_counties.drop_duplicates(subset='ZIP',inplace=True)

In [98]:
df_merge = pd.merge(df_rest,df_counties[['ZIP','COUNTYNAME']],left_on='postal_code',right_on='ZIP',how='left')
df_merge.head()

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,ZIP,COUNTYNAME
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Cafe,4.0,80,935 Race St,California,Philadelphia,19107,39.955505,-75.155564,33,19107,Philadelphia County
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,Family,4.5,100,2575 E Bay Dr,Florida,Largo,33771,27.916116,-82.760461,0,33771,Pinellas County
2,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,Pennsylvania,Indianapolis,46250,39.90432,-86.05308,33,46250,Hamilton County
3,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,Pennsylvania,Indianapolis,46250,39.90432,-86.05308,33,46250,Marion County
4,sqSqqLy0sN8n2IZrAbzidQ,Domino's Pizza,American,3.5,8,3001 Highway 31 W,California,White House,37188,36.464747,-86.659187,16,37188,Robertson County


In [99]:
df_merge[df_merge.COUNTYNAME.isna()]

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,ZIP,COUNTYNAME
29,YR0nwxBOKk6DiLHNItfsBQ,Boston Pizza,American,3.5,14,200 St. Albert Rd,Pennsylvania,St Albert,T8N 5H9,53.625071,-113.616645,16,,
31,jcL_qaGJiappzpnn-ifSoA,Fat Jakks,Asian,4.0,8,10126 - 107 Avenue NW,Florida,Edmonton,T5H 0V6,53.551551,-113.495155,2,,
50,JfGfqCnk0-Tnf6PlahPUJQ,Burger Village,American,4.0,10,6187 Currents Drive NW,Florida,Edmonton,T6W 0L9,53.436403,-113.604288,7,,
60,fNssGWMjae-SUzyyGblspQ,Apron,Family,4.5,24,47 E Kings Hwy,Florida,Haddonfield,08033,39.897107,-75.034226,0,,
70,SlLfWzeYOrVBxCd-0QILZQ,The Boiling House,Vegetarian,4.0,238,"1990 Marlton Pike E, Ste 10",Florida,Cherry Hill,08003,39.899661,-74.953300,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45122,0x808de6150ec03e61:0x25c5355734227bb6,Little Napoli,European,4.4,935,"Little Napoli, Dolores St, Carmel-By-The-Sea, ...",California,Carmel-by-the-Sea,92921,36.554031,-121.922531,12,,
45171,0x80dd264734b149b3:0x2a98e839aea58d8,Phở Vie Restaurant,Asian,4.0,297,"Phở Vie Restaurant, 15440 Beach Blvd, Westmins...",California,Westminster,92644,33.737926,-117.988155,2,,
45430,0x88dd08e42284924d:0x892c66e14a264c9,IHOP,American,4.2,1787,"IHOP, 2850 Ridge Way, Lake Wales, FL 33859, Un...",Florida,Lake Wales,33977,27.961174,-81.623398,7,,
45611,0x80857dc4e456b0f9:0x7129701f58df0129,Osmanthus,Asian,4.3,105,"Osmanthus, 6048 College Ave, Oakland, CA 94618...",California,Oakland,94168,37.849062,-122.252093,11,,


In [100]:
from geopy import Nominatim

geolocator = Nominatim(user_agent='pef999@hotmail.com')

def zip(row):
    try:
        if pd.isna(row['COUNTYNAME']):
            address = geolocator.reverse((row['latitude'],row['longitude'])).raw['address']
            zip = address['postcode']
            country = address['country']
            string = country + ' Printed'
            row['ZIP'] = zip
            row['COUNTYNAME'] = string
        return row
    except:
        row['ZIP'] = 'Error'
        return row

In [101]:
df_merge = df_merge.apply(zip,axis=1)

In [None]:
df_merge[df_merge.COUNTYNAME.isna()]

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,ZIP,COUNTYNAME
4800,GXDyR18dh33vu4lgmFiuEw,A'Pizze Tuscan Grill,European,3.5,51,120 Center Square Rd,Florida,Swedesboro,8085,39.737942,-75.328576,19,Error,
25854,0x8649b0e69f9442ef:0x40a536f0bde607d3,Redland Ballfield BBQ,American,4.7,36,"Redland Ballfield BBQ, hwy 64 & VZ County Road...",Texas,No Data,0,32.37849,-95.50357,21,Error,
38741,0x89c25a1b0cd3c8cb:0xe29a00ff230959b8,P.J. Clarke's,American,4.2,1684,"P.J. Clarke's, 250 Vesey St, New York, NY 1028...",New York,New York,0,40.713711,-74.016239,7,Error,
39446,0x8094f90050be4dcb:0x8a52470b825bc0f3,Chuck Wagon,Family,4.6,1118,"Chuck Wagon, 1203 Academy Ave, Sanger, CA 9365...",California,Sanger,0,36.699852,-119.554766,0,Error,
40717,0x80be3dffaf1721d3:0x85df382562b75703,Amigos Mexican Restaurant,Central American,4.2,276,"Amigos Mexican Restaurant, 285 N Main St, Bish...",California,Bishop,0,37.362931,-118.395612,6,Error,
41131,0x80c2a3e3c883c457:0xae5fd9b1d21d56ae,Mastro's Ocean Club,Family,4.5,1297,"Mastro's Ocean Club, 18412 Pacific Coast Hwy, ...",California,Topanga,0,34.039687,-118.576133,0,Error,
45043,0x80db44cec54fbe4f:0x682c8ce6161e7c98,Little Beijing,Asian,4.4,137,"Little Beijing, 1420 Beaumont Ave, Beaumont, C...",California,Beaumont,0,33.947794,-116.975985,11,Error,
45628,0x89ca52d06cb4121b:0x33736af1c5a6aa31,Hoss's,Family,4.2,1104,"Hoss's, 4308 Business 220, Bedford, PA 15522, ...",Pennsylvania,Bedford Township,0,40.051369,-78.510567,23,Error,


In [None]:
df_merge.loc[4800,'ZIP'] = '08085'
df_merge.loc[25854,'ZIP'] = '75754'
df_merge.loc[38741,'ZIP'] = '10281'
df_merge.loc[39446,'ZIP'] = '93657'
df_merge.loc[40717,'ZIP'] = '93514'
df_merge.loc[41131,'ZIP'] = '90265'
df_merge.loc[45043,'ZIP'] = '92223'
df_merge.loc[45628,'ZIP'] = '15522'

In [None]:
df_merge[df_merge.COUNTYNAME.apply(lambda x: 'Printed' in x if isinstance(x,str) else False)]

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,ZIP,COUNTYNAME
29,YR0nwxBOKk6DiLHNItfsBQ,Boston Pizza,American,3.5,14,200 St. Albert Rd,Pennsylvania,St Albert,T8N 5H9,53.625071,-113.616645,16,T8N 0P7,Canada Printed
31,jcL_qaGJiappzpnn-ifSoA,Fat Jakks,Asian,4.0,8,10126 - 107 Avenue NW,Florida,Edmonton,T5H 0V6,53.551551,-113.495155,2,T5H 4L4,Canada Printed
50,JfGfqCnk0-Tnf6PlahPUJQ,Burger Village,American,4.0,10,6187 Currents Drive NW,Florida,Edmonton,T6W 0L9,53.436403,-113.604288,7,T6W 0L9,Canada Printed
60,fNssGWMjae-SUzyyGblspQ,Apron,Family,4.5,24,47 E Kings Hwy,Florida,Haddonfield,08033,39.897107,-75.034226,0,08033,United States Printed
70,SlLfWzeYOrVBxCd-0QILZQ,The Boiling House,Vegetarian,4.0,238,"1990 Marlton Pike E, Ste 10",Florida,Cherry Hill,08003,39.899661,-74.953300,5,08003,United States Printed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44983,0x808580b9c8e6d781:0xc66d155748e1a74b,Jane The Bakery,Breakfast,4.7,608,"Jane The Bakery, 1881 Geary Blvd, San Francisc...",California,San Francisco,95115,37.783744,-122.434276,27,95115,United States Printed
45122,0x808de6150ec03e61:0x25c5355734227bb6,Little Napoli,European,4.4,935,"Little Napoli, Dolores St, Carmel-By-The-Sea, ...",California,Carmel-by-the-Sea,92921,36.554031,-121.922531,12,92921,United States Printed
45171,0x80dd264734b149b3:0x2a98e839aea58d8,Phở Vie Restaurant,Asian,4.0,297,"Phở Vie Restaurant, 15440 Beach Blvd, Westmins...",California,Westminster,92644,33.737926,-117.988155,2,92644,United States Printed
45430,0x88dd08e42284924d:0x892c66e14a264c9,IHOP,American,4.2,1787,"IHOP, 2850 Ridge Way, Lake Wales, FL 33859, Un...",Florida,Lake Wales,33977,27.961174,-81.623398,7,33977,United States Printed


In [None]:
df_merge = df_merge[df_merge.COUNTYNAME.apply(lambda x: 'Canada' not in x if isinstance(x,str) else True)]

In [None]:
df_merge.shape

(48065, 14)

In [None]:
df_merge.loc[4800,'postal_code'] = '08085'
df_merge.loc[25854,'postal_code'] = '75754'
df_merge.loc[38741,'postal_code'] = '10281'
df_merge.loc[39446,'postal_code'] = '93657'
df_merge.loc[40717,'postal_code'] = '93514'
df_merge.loc[41131,'postal_code'] = '90265'
df_merge.loc[45043,'postal_code'] = '92223'
df_merge.loc[45628,'postal_code'] = '15522'

In [None]:
df_merge = pd.merge(df_merge.drop(columns=['ZIP','COUNTYNAME']),df_counties[['ZIP','COUNTYNAME']],left_on='postal_code',right_on='ZIP',how='left')

In [None]:
df_merge.isna().sum()

business_id         0
business_name       0
category            0
avg_rating          0
review_count        0
address             0
state               0
city                0
postal_code         0
latitude            0
longitude           0
cluster             0
ZIP              1119
COUNTYNAME       1119
dtype: int64

In [None]:
def county(row):
    try:
        if pd.isna(row['COUNTYNAME']):
            address = geolocator.reverse((row['latitude'],row['longitude'])).raw['address']
            county = address['county']
            row['COUNTYNAME'] = county
        return row
    except:
        row['ZIP'] = 'Error'
        return row

In [None]:
df_merge = df_merge.apply(county,axis=1)

In [None]:
df_merge[df_merge.ZIP.isna()]

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,ZIP,COUNTYNAME
75,fNssGWMjae-SUzyyGblspQ,Apron,Family,4.5,24,47 E Kings Hwy,Florida,Haddonfield,08033,39.897107,-75.034226,0,,Camden County
87,SlLfWzeYOrVBxCd-0QILZQ,The Boiling House,Vegetarian,4.0,238,"1990 Marlton Pike E, Ste 10",Florida,Cherry Hill,08003,39.899661,-74.953300,5,,Camden County
113,KsDY9IYqvumohkp74w5i6A,Jersey Mike's Subs,Fastfood,4.0,21,910 Haddonfield-Berlin Rd,Florida,Voorhees,08043,39.848349,-74.977130,7,,Camden County
115,mYbEGFsRcA0tfUNSYcBJMA,Golden River Restaurant,American,5.0,8,"8 E Scott St, Ste 10",Florida,Riverside,08075,40.037703,-74.959281,21,,Burlington County
116,Mt1FB8hM4XTi3Tk4i0q5gQ,Nicola's Pizza,European,4.0,54,8 N Franklin St,Florida,Lambertville,08530,40.366418,-74.941678,5,,Hunterdon County
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59953,0x80857de82ea86dc7:0x205107e7c902f3f0,Rockridge Cafe,American,4.3,328,"Rockridge Cafe, 5492 College Ave, Oakland, CA ...",California,Oakland,94168,37.842317,-122.251453,7,,Alameda County
60170,0x808de6150ec03e61:0x25c5355734227bb6,Little Napoli,European,4.4,935,"Little Napoli, Dolores St, Carmel-By-The-Sea, ...",California,Carmel-by-the-Sea,92921,36.554031,-121.922531,12,,Monterey County
60223,0x80dd264734b149b3:0x2a98e839aea58d8,Phở Vie Restaurant,Asian,4.0,297,"Phở Vie Restaurant, 15440 Beach Blvd, Westmins...",California,Westminster,92644,33.737926,-117.988155,2,,Orange County
60538,0x88dd08e42284924d:0x892c66e14a264c9,IHOP,American,4.2,1787,"IHOP, 2850 Ridge Way, Lake Wales, FL 33859, Un...",Florida,Lake Wales,33977,27.961174,-81.623398,7,,Polk County


In [None]:
df_rest = df_merge.drop(columns='ZIP')

In [None]:
df_rest.head()

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,COUNTYNAME
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Cafe,4.0,80,935 Race St,California,Philadelphia,19107,39.955505,-75.155564,33,Philadelphia County
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,Family,4.5,100,2575 E Bay Dr,Florida,Largo,33771,27.916116,-82.760461,0,Pinellas County
2,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,Pennsylvania,Indianapolis,46250,39.90432,-86.05308,33,Hamilton County
3,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,Pennsylvania,Indianapolis,46250,39.90432,-86.05308,33,Marion County
4,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,Pennsylvania,Indianapolis,46250,39.90432,-86.05308,33,Hamilton County


In [None]:
df_merge.shape

(65167, 14)

In [None]:
df_merge.duplicated().sum()

17331

In [None]:
df_merge.shape[0] - df_merge.duplicated().sum()

47836

In [None]:
df_merge = df_merge.drop_duplicates()

In [None]:
df_merge.shape

(47836, 14)

In [None]:
df_merge.head()

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,ZIP,COUNTYNAME
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Cafe,4.0,80,935 Race St,California,Philadelphia,19107,39.955505,-75.155564,33,19107,Philadelphia County
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,Family,4.5,100,2575 E Bay Dr,Florida,Largo,33771,27.916116,-82.760461,0,33771,Pinellas County
2,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,Pennsylvania,Indianapolis,46250,39.90432,-86.05308,33,46250,Hamilton County
3,kfNv-JZpuN6TVNSO6hHdkw,Hibachi Express,Cafe,4.0,20,6625 E 82nd St,Pennsylvania,Indianapolis,46250,39.90432,-86.05308,33,46250,Marion County
6,sqSqqLy0sN8n2IZrAbzidQ,Domino's Pizza,American,3.5,8,3001 Highway 31 W,California,White House,37188,36.464747,-86.659187,16,37188,Robertson County


In [None]:
df_rest = df_merge.drop(columns='ZIP')

In [None]:
df_rest['County-cluster_count'] = df_rest.groupby(['COUNTYNAME', 'cluster'])['COUNTYNAME'].transform('count')
df_rest['County_count'] = df_rest.groupby(['COUNTYNAME'])['COUNTYNAME'].transform('count')

In [None]:
df_rest['%_Competencia'] = df_rest['County-cluster_count']/df_rest['County_count']*100
df_rest.sort_values(by='%_Competencia',ascending=False).head(20)

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,COUNTYNAME,County-cluster_count,County_count,%_Competencia
58930,0x86f85fd3a9de44b7:0x5a09dbfe01394d24,The Windmill City Cafe,No Detail,4.2,64,"The Windmill City Cafe, 610 4th St, Sterling C...",Texas,Sterling City,76951,31.836659,-100.987682,23,Sterling County,1.0,1.0,100.0
18308,ifBUgOthmXRjrO3rmI8k6w,Grille Billy's,American,3.0,9,1701 Fairview Blvd,Pennsylvania,Fairview,37062,35.990983,-87.092953,1,Dickson County,1.0,1.0,100.0
36795,0x87025e58e5ceb8bf:0x41b00346e4adc859,Panda Chinese Buffet,Asian,3.5,28,"Panda Chinese Buffet, 1820 W American Blvd, Mu...",Texas,Muleshoe,79347,34.237087,-102.744601,28,Bailey County,1.0,1.0,100.0
49057,0x86f8ba4240f65a57:0xf1e33cd9e7f733c4,Stoneys BBQ and Catering,American,4.5,35,"Stoneys BBQ and Catering, 1724 Hickory St, Col...",Texas,Colorado City,79512,32.404647,-100.860269,21,Mitchell County,1.0,1.0,100.0
43927,0x866825f740f430fb:0x63b65c6eaa0711b9,Castro's Cafe Restaurant,No Detail,4.5,36,"Castro's Cafe Restaurant, 201 S Alamo St, Refu...",Texas,Refugio,78377,28.303738,-97.275128,0,Refugio County,1.0,1.0,100.0
40678,0x87012afac47047b7:0x3e3bfdcdf37bba5f,Brickstreet 200,No Detail,4.0,74,"Brickstreet 200, 200 Main St, Panhandle, TX 79068",Texas,Panhandle,79068,35.343032,-101.378788,23,Carson County,1.0,1.0,100.0
14768,qKtq_mAOMLBonDn73AKXzQ,Thai Phooket II,Asian,4.0,192,915 Rivergate Pkwy,Pennsylvania,Goodletsville,37012,36.298126,-86.700302,2,DeKalb County,1.0,1.0,100.0
23140,0x8638fe34bff40e07:0xd0c3a649e173a315,Wise Eats & Treats,American,4.3,28,"Wise Eats & Treats, 219 S Magnolia St, Woodvil...",Texas,Woodville,75979,30.773648,-94.414333,7,Tyler County,1.0,1.0,100.0
46834,0x86fe4334ed7711b7:0x46abba40ac5adff2,Mama Jo's Steak House,Family,4.1,28,"Mama Jo's Steak House, 1250 S 9th St, Slaton, ...",Texas,Slaton,79364,33.427201,-101.642636,23,Lynn County,1.0,1.0,100.0
52658,0x87a84477754054b1:0x24c3a22236589341,Nana's Cafe,American,4.9,25,"Nana's Cafe, 101 S Young St, Follett, TX 79034",Texas,Follett,79034,36.433516,-100.138039,21,Lipscomb County,1.0,1.0,100.0


In [None]:
df_rest.shape

(47836, 16)

In [None]:
df_rest[df_rest.County_count > 10].shape

(47023, 16)

In [None]:
def competencia(row):
    if row['County_count'] < 3:
        row['%_Competencia'] = 0
    else:
        row['%_Competencia'] = (row['County-cluster_count']/row['County_count']) *100
    return row

In [None]:
df_rest.shape

(47836, 16)

In [None]:
df_rest = df_rest[~df_rest.COUNTYNAME.isna()]
df_rest = df_rest.apply(competencia,axis=1)

In [None]:
df_rest.sort_values(by='%_Competencia',ascending=False).head()

Unnamed: 0,business_id,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,COUNTYNAME,County-cluster_count,County_count,%_Competencia
49817,0x809a87c9b203ffff:0x12b8e850f8273e47,Grumpy Jeff's Public House,No Detail,4.8,138,"Grumpy Jeff's Public House, 12 W Main St, Ione...",California,Ione,95640,38.352519,-120.933466,25,Amador County,2.0,3.0,66.666667
31795,0x8650fbe839b91065:0xd799ff1b49cec5de,Soda Shop,Cafe,4.8,33,"Soda Shop, 107 N Houston St, Comanche, TX 76442",Texas,Comanche,76442,31.897749,-98.605382,27,Mills County,2.0,3.0,66.666667
44527,0x809a7bbb932b6ca1:0xc34ed8ff87bb17a,Villa Privata,No Detail,4.7,38,"Villa Privata, 1218 Jackson Gate Rd, Jackson, ...",California,Jackson,95642,38.364627,-120.774068,25,Amador County,2.0,3.0,66.666667
30546,0x86fb19e8b3781fcb:0x375c717a34e9c80,El Chuco Mexican Food,No Detail,4.3,56,"El Chuco Mexican Food, 841 TX-302, Kermit, TX ...",Texas,Kermit,79745,31.852301,-103.067642,0,Winkler County,2.0,3.0,66.666667
32259,0x865086a0deae3a25:0x44296765dfbc5525,Aroma of Hope Coffee Shop,Cafe,4.7,18,"Aroma of Hope Coffee Shop, next to Family Doll...",Texas,Goldthwaite,76844,31.450837,-98.56957,27,Mills County,2.0,3.0,66.666667


In [None]:
df_rest.columns

Index(['business_id', 'business_name', 'category', 'avg_rating',
       'review_count', 'address', 'state', 'city', 'postal_code', 'latitude',
       'longitude', 'cluster', 'COUNTYNAME', 'County-cluster_count',
       'County_count', '%_Competencia'],
      dtype='object')

In [None]:
df_rest.drop(columns=['County_count','County-cluster_count'],inplace=True)

In [None]:
df_rest.describe()

Unnamed: 0,avg_rating,review_count,latitude,longitude,cluster,%_Competencia
count,47831.0,47831.0,47831.0,47831.0,47831.0,47831.0
mean,3.983036,100.599235,36.308442,-89.39653,14.113295,6.279163
std,0.740331,242.718605,5.07959,16.523745,9.934404,4.927916
min,1.0,5.0,24.547407,-124.263391,0.0,0.0
25%,3.6,24.0,32.26083,-97.669158,5.0,3.061224
50%,4.2,44.0,38.593022,-82.514307,12.0,4.972875
75%,4.5,84.0,40.339964,-75.257297,23.0,7.854153
max,5.0,7470.0,44.990914,-71.919502,33.0,66.666667


In [None]:
df_rest.shape

(47831, 14)

In [None]:
df_rest.COUNTYNAME.isna().sum()

0

In [None]:
df_reviews = pd.read_parquet('Unification/df_reviews.parquet')

In [None]:
df_reviews

Unnamed: 0,review_id,user_id,business_id,rating,year,month,sentiment_analysis
0,0,3-1va0IQfK-9tUMzfHWfTA,MTSW4McQd7CbVtyjqoe9mw,5.0,2018,5,5
1,1,KQSRUu4Aapl0hG6eu2v8iw,MTSW4McQd7CbVtyjqoe9mw,4.0,2018,3,5
2,2,qUfRCH5NUyRDsJfM6jA5PQ,MTSW4McQd7CbVtyjqoe9mw,4.0,2017,4,5
3,3,0q2W3-ieBUJWD5TTLKi3Ug,MTSW4McQd7CbVtyjqoe9mw,4.0,2016,4,5
4,4,z-yvbUGwFn8PAijEHdU_RA,MTSW4McQd7CbVtyjqoe9mw,5.0,2018,3,4
...,...,...,...,...,...,...,...
3903580,3903580,115681351373725195193,0x865ca94e2edeb237:0xdfc44b585273f4c5,5.0,2021,1,2
3903581,3903581,115300300663901754734,0x865ca94e2edeb237:0xdfc44b585273f4c5,5.0,2020,12,2
3903582,3903582,100925700082850645645,0x865ca94e2edeb237:0xdfc44b585273f4c5,5.0,2019,9,2
3903583,3903583,109106707670810407931,0x865ca94e2edeb237:0xdfc44b585273f4c5,5.0,2020,12,2


In [None]:
df_unified = pd.merge(df_reviews,df_rest,on='business_id',how='outer')

In [None]:
df_unified.isna().sum()

review_id                2599
user_id                  2599
business_id                 0
rating                   2599
year                     2599
month                    2599
sentiment_analysis       2599
business_name         1388490
category              1388490
avg_rating            1388490
review_count          1388490
address               1388490
state                 1388490
city                  1388490
postal_code           1388490
latitude              1388490
longitude             1388490
cluster               1388490
COUNTYNAME            1388490
%_Competencia         1388490
dtype: int64

In [None]:
df_unified.shape

(4312657, 20)

In [None]:
df_final = df_unified.dropna()

In [None]:
df_final.shape

(2921568, 20)

In [None]:
df_final.isna().sum()

review_id             0
user_id               0
business_id           0
rating                0
year                  0
month                 0
sentiment_analysis    0
business_name         0
category              0
avg_rating            0
review_count          0
address               0
state                 0
city                  0
postal_code           0
latitude              0
longitude             0
cluster               0
COUNTYNAME            0
%_Competencia         0
dtype: int64

In [None]:
df_final.head()

Unnamed: 0,review_id,user_id,business_id,rating,year,month,sentiment_analysis,business_name,category,avg_rating,review_count,address,state,city,postal_code,latitude,longitude,cluster,COUNTYNAME,%_Competencia
0,0.0,3-1va0IQfK-9tUMzfHWfTA,MTSW4McQd7CbVtyjqoe9mw,5.0,2018.0,5.0,5.0,St Honore Pastries,Cafe,4.0,80.0,935 Race St,California,Philadelphia,19107,39.955505,-75.155564,33.0,Philadelphia County,3.96668
1,1.0,KQSRUu4Aapl0hG6eu2v8iw,MTSW4McQd7CbVtyjqoe9mw,4.0,2018.0,3.0,5.0,St Honore Pastries,Cafe,4.0,80.0,935 Race St,California,Philadelphia,19107,39.955505,-75.155564,33.0,Philadelphia County,3.96668
2,2.0,qUfRCH5NUyRDsJfM6jA5PQ,MTSW4McQd7CbVtyjqoe9mw,4.0,2017.0,4.0,5.0,St Honore Pastries,Cafe,4.0,80.0,935 Race St,California,Philadelphia,19107,39.955505,-75.155564,33.0,Philadelphia County,3.96668
3,3.0,0q2W3-ieBUJWD5TTLKi3Ug,MTSW4McQd7CbVtyjqoe9mw,4.0,2016.0,4.0,5.0,St Honore Pastries,Cafe,4.0,80.0,935 Race St,California,Philadelphia,19107,39.955505,-75.155564,33.0,Philadelphia County,3.96668
4,4.0,z-yvbUGwFn8PAijEHdU_RA,MTSW4McQd7CbVtyjqoe9mw,5.0,2018.0,3.0,4.0,St Honore Pastries,Cafe,4.0,80.0,935 Race St,California,Philadelphia,19107,39.955505,-75.155564,33.0,Philadelphia County,3.96668


In [None]:
df_final.rename(columns={'COUNTYNAME':'county'},inplace=True)
df_final.columns

Index(['review_id', 'user_id', 'business_id', 'rating', 'year', 'month',
       'sentiment_analysis', 'business_name', 'category', 'avg_rating',
       'review_count', 'address', 'state', 'city', 'postal_code', 'latitude',
       'longitude', 'cluster', 'county', '%_Competencia'],
      dtype='object')

In [None]:
df_final.rename(columns={'%_Competencia':'%_competition'},inplace=True)
df_final.columns

Index(['review_id', 'user_id', 'business_id', 'rating', 'year', 'month',
       'sentiment_analysis', 'business_name', 'category', 'avg_rating',
       'review_count', 'address', 'state', 'city', 'postal_code', 'latitude',
       'longitude', 'cluster', 'county', '%_competition'],
      dtype='object')

In [None]:
clusters = {0:'Family/NoDetail',1:'American',2:'Asian',3:'Cafe/breakfast',4:'Latin America',5:'European',
            6:'Latin America',7:'American',8:'American',9:'Family/NoDetail',10:'Cafe/breakfast',11:'Asian',12:'European',
            13:'European',14:'Asian',15:'Family/NoDetail',16:'American',17:'Family',18:'Latin America',19:'European',
            20:'Cafe/breakfast',21:'American',22:'Central American',23:'Family/NoDetail',24:'Night',25:'Family/NoDetail',
            26:'Cafe/breakfast',27:'Cafe/breakfast',28:'Asian',29:'Latin American',30:'Cafe/breakfast',31:'American',
            32:'European',33:'Cafe/breakfast'}

df_rest['cluster_rating'] = df_rest.groupby('cluster')['avg_rating'].transform('mean')
df_rest['cluster_name'] = None

def cluster_name(row):
    row['cluster_name'] = str(round(row['cluster_rating'],1)) + ' ' + clusters[row['cluster']]
    return row

print(df_rest.shape[0])

df_rest = df_rest.apply(cluster_name,axis=1)

print(df_rest.shape[0])

47831
47831


In [None]:
print(df_rest.cluster_name.unique())

['4.0 Cafe/breakfast' '4.5 Family/NoDetail' '3.6 American' '3.0 European'
 '2.3 American' '4.1 American' '3.5 European' '4.1 Family/NoDetail'
 '3.6 Family/NoDetail' '4.0 European' '4.0 Asian' '1.4 American'
 '3.0 American' '2.3 European' '1.4 Cafe/breakfast' '3.5 Latin America'
 '4.4 Latin America' '4.6 American' '3.5 Asian' '4.5 Cafe/breakfast'
 '2.2 Family' '4.0 Latin America' '2.7 Central American'
 '3.0 Cafe/breakfast' '4.5 Asian' '3.0 Family/NoDetail'
 '4.8 Family/NoDetail' '4.5 European' '2.2 Cafe/breakfast' '2.8 Asian'
 '4.8 Cafe/breakfast' '4.7 Latin American' '3.5 Cafe/breakfast'
 '4.3 Night']


In [None]:
df_rest.columns

Index(['business_id', 'business_name', 'category', 'avg_rating',
       'review_count', 'address', 'state', 'city', 'postal_code', 'latitude',
       'longitude', 'cluster', 'COUNTYNAME', '%_Competencia', 'cluster_rating',
       'cluster_name'],
      dtype='object')

In [14]:
print(df_final.shape)
state_bounds = {
    'Florida': {'lat_min': 24.396308, 'lat_max': 31.000968, 'long_min': -87.634643, 'long_max': -80.031362},
    'California': {'lat_min': 32.534156, 'lat_max': 42.009518, 'long_min': -124.409591, 'long_max': -114.131211},
    'Pennsylvania': {'lat_min': 39.719800, 'lat_max': 42.269179, 'long_min': -80.519891, 'long_max': -74.689516},
    'New York': {'lat_min': 40.477399, 'lat_max': 45.015850, 'long_min': -79.762152, 'long_max': -71.856214},
    'Texas': {'lat_min': 25.837377, 'lat_max': 36.500704, 'long_min': -106.645646, 'long_max': -93.508292}
}

(2251808, 20)


In [26]:
def is_within_bounds(row):
    for bounds in state_bounds.values():
        if bounds['lat_min'] <= row['latitude'] <= bounds['lat_max'] and bounds['long_min'] <= row['longitude'] <= bounds['long_max']:
            return True
    return False

In [25]:
df_final.to_parquet('Unification/df_unified.parquet')

In [81]:
df_rest.rename(columns={'%_Competencia':'%_competition'},inplace=True)
df_rest.rename(columns={'COUNTYNAME':'county'},inplace=True)

In [27]:
print(df_rest.shape)
df_rest = df_rest[df_rest.apply(is_within_bounds,axis=1)]
print(df_rest.shape)

(34553, 16)
(34553, 16)


In [9]:
df_rest.to_parquet('Unification/df_restaurants.parquet')

In [85]:
keep = df_reviews.columns.tolist()

df_reviews = df_final[keep]

In [86]:
df_reviews.shape

(2921568, 7)

In [87]:
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,rating,year,month,sentiment_analysis
0,0.0,3-1va0IQfK-9tUMzfHWfTA,MTSW4McQd7CbVtyjqoe9mw,5.0,2018.0,5.0,5.0
1,1.0,KQSRUu4Aapl0hG6eu2v8iw,MTSW4McQd7CbVtyjqoe9mw,4.0,2018.0,3.0,5.0
2,2.0,qUfRCH5NUyRDsJfM6jA5PQ,MTSW4McQd7CbVtyjqoe9mw,4.0,2017.0,4.0,5.0
3,3.0,0q2W3-ieBUJWD5TTLKi3Ug,MTSW4McQd7CbVtyjqoe9mw,4.0,2016.0,4.0,5.0
4,4.0,z-yvbUGwFn8PAijEHdU_RA,MTSW4McQd7CbVtyjqoe9mw,5.0,2018.0,3.0,4.0


In [8]:
df_reviews.to_parquet('Unification/df_reviews.parquet')

In [None]:
['review_id','user_id','business_id','rating','year','month','sentiment_analysis']