In [1]:
import warnings
import pandas as pd
import numpy as np
import json
from textblob import TextBlob
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, r2_score
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import math
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas, numpy and
# scikit-learn; consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

In [3]:
# Load the sampled business and review tables. The review-level 'stars'
# column is renamed to 'rating' right away so that it cannot overlap with the
# business table's own 'stars' column when the two tables are joined.
df_business = pd.read_json("../../Samples/business_sample.json")
df_review = (
    pd.read_json('../../Samples/reviews_sample.json')
    .rename(columns={'stars': 'rating'})
)

In [4]:
# Left-join every review to its business row, matched on business_id, so each
# review carries the business-level columns (e.g. is_open, review_count).
df_join = df_review.join(df_business.set_index('business_id'), on='business_id')

In [13]:
# Average review_count for each is_open value.
# NOTE(review): df_join has one row per review, so every business is weighted
# by the number of its reviews in the sample rather than counted once.
s = df_join.groupby('is_open')['review_count'].mean()
print(s)

is_open
0    326.020034
1    329.091856
Name: review_count, dtype: float64


In [16]:
# Collapse each review timestamp to its calendar year.
# The column is overwritten in place, so the conversion is guarded on dtype:
# once 'date' holds integer years the `.dt` accessor no longer exists, and
# re-running this cell unguarded would raise instead of being a no-op.
if pd.api.types.is_datetime64_any_dtype(df_join['date']):
    df_join['date'] = df_join['date'].dt.year

In [59]:
# Count reviews per (business_id, year). After reset_index the 'review_id'
# column holds that count, not an id.
test = df_join.groupby(['business_id','date'])['review_id'].count().reset_index()

In [60]:
# Display the per-business, per-year review counts.
test

Unnamed: 0,business_id,date,review_id
0,--1UhMGODdWsrMastO9DZw,2016,11
1,--1UhMGODdWsrMastO9DZw,2017,11
2,--1UhMGODdWsrMastO9DZw,2018,4
3,--6MefnULPED_I942VcFNA,2008,1
4,--6MefnULPED_I942VcFNA,2010,3
...,...,...,...
86974,zzmIMvqiBJ_-wVKg_OnGpw,2017,28
86975,zzmIMvqiBJ_-wVKg_OnGpw,2018,30
86976,zzzaIBwimxVej4tY6qFOUQ,2012,6
86977,zzzaIBwimxVej4tY6qFOUQ,2013,20


In [61]:
# Largest single-year review count for each business. sort=False keeps the
# businesses in their order of first appearance instead of re-sorting keys.
test.groupby(['business_id'], sort=False)['review_id'].max()

business_id
--1UhMGODdWsrMastO9DZw    11
--6MefnULPED_I942VcFNA    13
--7zmmkVg-IMGaXbuVd0SQ    18
--8LPVSo5i0Oo61X01sV9A     1
--9QQLMTbFzLJ_oT-ON3Xw     5
                          ..
zzTM2KyJkKomLDkl6rM4dQ     2
zzf3RkMI1Y2E1QaZqeU8yA    15
zzkLY0npjBFX3gxGitmbUA     2
zzmIMvqiBJ_-wVKg_OnGpw    30
zzzaIBwimxVej4tY6qFOUQ    20
Name: review_id, Length: 19261, dtype: int64

In [63]:
# Boolean mask selecting, for every business, the year row(s) whose review
# count equals that business's maximum yearly count. Several years can tie,
# so a business may still have more than one row in dates_df at this point.
# The aggregation is given by name: passing the Python builtin `max` to
# transform is deprecated in recent pandas releases.
idx = test.groupby(['business_id'])['review_id'].transform('max') == test['review_id']
dates_df = test[idx]

In [67]:
# Resolve ties so each business keeps exactly one peak year. keep='last'
# retains the last remaining row per business; `test` comes out of a groupby
# on (business_id, date), so with default key sorting that is the latest year.
dates_df = dates_df.drop_duplicates(subset='business_id', keep='last')

In [68]:
# Display the single peak-year row retained for each business.
dates_df

Unnamed: 0,business_id,date,review_id
1,--1UhMGODdWsrMastO9DZw,2017,11
11,--6MefnULPED_I942VcFNA,2017,13
16,--7zmmkVg-IMGaXbuVd0SQ,2017,18
21,--8LPVSo5i0Oo61X01sV9A,2018,1
25,--9QQLMTbFzLJ_oT-ON3Xw,2016,5
...,...,...,...
86956,zzTM2KyJkKomLDkl6rM4dQ,2015,2
86960,zzf3RkMI1Y2E1QaZqeU8yA,2015,15
86966,zzkLY0npjBFX3gxGitmbUA,2018,2
86975,zzmIMvqiBJ_-wVKg_OnGpw,2018,30


In [69]:
# Keep only the (business_id, peak year) pairs, dropping the count column.
bus_date = dates_df[['business_id', 'date']]

In [70]:
# Display the business -> peak-year lookup table.
bus_date

Unnamed: 0,business_id,date
1,--1UhMGODdWsrMastO9DZw,2017
11,--6MefnULPED_I942VcFNA,2017
16,--7zmmkVg-IMGaXbuVd0SQ,2017
21,--8LPVSo5i0Oo61X01sV9A,2018
25,--9QQLMTbFzLJ_oT-ON3Xw,2016
...,...,...
86956,zzTM2KyJkKomLDkl6rM4dQ,2015
86960,zzf3RkMI1Y2E1QaZqeU8yA,2015
86966,zzkLY0npjBFX3gxGitmbUA,2018
86975,zzmIMvqiBJ_-wVKg_OnGpw,2018


In [71]:
# Collapse each review timestamp to its calendar year so reviews can be
# matched against the (business_id, year) pairs in bus_date.
# The column is overwritten in place, so the conversion is guarded on dtype:
# once 'date' holds integer years the `.dt` accessor no longer exists, and
# re-running this cell unguarded would raise instead of being a no-op.
if pd.api.types.is_datetime64_any_dtype(df_review['date']):
    df_review['date'] = df_review['date'].dt.year

In [72]:
# Display the reviews table; its 'date' column now holds the review year.
df_review

Unnamed: 0,review_id,user_id,business_id,rating,useful,funny,cool,text,date
0,Q1sbwvVQXV2734tPgoKj4Q,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1,6,1,0,Total bill for this horrible service? Over $8G...,2013
1,2TzJjDVDEuAW6MR5Vuc1ug,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5,3,0,0,I have to say that this office really has it t...,2016
2,1daGBpea0sleayFeeXuoYw,q3GeSW9dWN9r_ocqFkhrvg,9nTF596jDvBBia2EXXiOOg,1,1,0,0,"if i can give this place no stars i would, i o...",2014
3,0AsmPiAQduxh5jE_si8cLA,C_hUvw2z0R-Rv0yZb6QCZA,dm6sO_Y8JdKTE1ZM955yug,5,0,0,0,ended up here because Raku was closed and it r...,2014
4,E6B-2U2sGG3xgmnNWZAEew,DbccYu3OppWKl21OanZnTg,YSUcHqlKMPHHJ_cTrqtNrA,1,0,0,0,Came here on a Thursday night at 6:30 p.m. My ...,2017
...,...,...,...,...,...,...,...,...,...
658431,UFDA9yU07WeySBeAX8OAbQ,QbFTdA-NXmK3PJxgK-DVdg,SzK2HBcDTbIBKcoV_iivpw,5,2,1,2,My husband and I joined this gym specifically ...,2018
658432,_7taDBtB4SESrUrgLAACWw,4ULUWqda23gUl_VylJ2OAw,OfIBp5HCGt6SJUQT4HoO8g,5,7,2,5,By far THE BEST Walgreens I have ever gone to....,2012
658433,I_ilHf4CbAjPQ_7CSsi6RA,SXWvKA2rnjio9UjN1a3rJg,m0FxJP2atkE8mn-xO23fpQ,4,2,0,1,red tiger looks like a run of the mill pub on ...,2018
658434,vNUUOTSJh4R3dhDMrP-ASA,0Zswwlz4NzUJoG-skyWzIw,oXmFOThxSSRLYb2n75JliQ,3,4,1,1,Created my own tuna and salmon custom poke bow...,2018


In [110]:
# Narrow the reviews to the join key (business_id, date) plus the text,
# rating and 'useful' columns.
df_rev = df_review[['business_id', 'text', 'date', 'rating','useful']]

In [111]:
# Display the narrowed reviews table.
df_rev

Unnamed: 0,business_id,text,date,rating,useful
0,ujmEBvifdJM6h6RLv4wQIg,Total bill for this horrible service? Over $8G...,2013,1,6
1,WTqjgwHlXbSFevF32_DJVw,I have to say that this office really has it t...,2016,5,3
2,9nTF596jDvBBia2EXXiOOg,"if i can give this place no stars i would, i o...",2014,1,1
3,dm6sO_Y8JdKTE1ZM955yug,ended up here because Raku was closed and it r...,2014,5,0
4,YSUcHqlKMPHHJ_cTrqtNrA,Came here on a Thursday night at 6:30 p.m. My ...,2017,1,0
...,...,...,...,...,...
658431,SzK2HBcDTbIBKcoV_iivpw,My husband and I joined this gym specifically ...,2018,5,2
658432,OfIBp5HCGt6SJUQT4HoO8g,By far THE BEST Walgreens I have ever gone to....,2012,5,7
658433,m0FxJP2atkE8mn-xO23fpQ,red tiger looks like a run of the mill pub on ...,2018,4,2
658434,oXmFOThxSSRLYb2n75JliQ,Created my own tuna and salmon custom poke bow...,2018,3,4


In [112]:
# For each business, pull in the reviews written in its peak year by joining
# on the (business_id, date) pair. Each bus_date row fans out to one row per
# matching review, so bus_date's index labels repeat in the result.
df = bus_date.join(df_rev.set_index(['business_id', 'date']), on=['business_id', 'date'])

In [113]:
# Display the peak-year reviews, one row per review.
df

Unnamed: 0,business_id,date,text,rating,useful
1,--1UhMGODdWsrMastO9DZw,2017,If you are looking for authentic Mexican stree...,5,0
1,--1UhMGODdWsrMastO9DZw,2017,Fantastic spot for lunch with great value for ...,5,2
1,--1UhMGODdWsrMastO9DZw,2017,"Tasty, authentic Mexican street food that give...",4,0
1,--1UhMGODdWsrMastO9DZw,2017,This is our first time here with the renovatio...,5,0
1,--1UhMGODdWsrMastO9DZw,2017,"We were in the mood for tacos, and came across...",5,0
...,...,...,...,...,...
86977,zzzaIBwimxVej4tY6qFOUQ,2013,"I have to say, I do like this place a lot more...",4,1
86977,zzzaIBwimxVej4tY6qFOUQ,2013,This is a familiar place to me. I frequented t...,4,1
86977,zzzaIBwimxVej4tY6qFOUQ,2013,"OK, its fast food, was quick hot and fast... c...",5,0
86977,zzzaIBwimxVej4tY6qFOUQ,2013,This place is very good. At first i thought it...,4,0


In [83]:
# Attach the business-level is_open and stars values to every selected
# review (left join on business_id).
result = df.join(df_business[['business_id','is_open','stars']].set_index('business_id'), on='business_id')

In [84]:
# Display the peak-year reviews together with is_open and stars.
result

Unnamed: 0,business_id,date,text,rating,is_open,stars
1,--1UhMGODdWsrMastO9DZw,2017,If you are looking for authentic Mexican stree...,5,1,4.0
1,--1UhMGODdWsrMastO9DZw,2017,Fantastic spot for lunch with great value for ...,5,1,4.0
1,--1UhMGODdWsrMastO9DZw,2017,"Tasty, authentic Mexican street food that give...",4,1,4.0
1,--1UhMGODdWsrMastO9DZw,2017,This is our first time here with the renovatio...,5,1,4.0
1,--1UhMGODdWsrMastO9DZw,2017,"We were in the mood for tacos, and came across...",5,1,4.0
...,...,...,...,...,...,...
86977,zzzaIBwimxVej4tY6qFOUQ,2013,"I have to say, I do like this place a lot more...",4,0,3.5
86977,zzzaIBwimxVej4tY6qFOUQ,2013,This is a familiar place to me. I frequented t...,4,0,3.5
86977,zzzaIBwimxVej4tY6qFOUQ,2013,"OK, its fast food, was quick hot and fast... c...",5,0,3.5
86977,zzzaIBwimxVej4tY6qFOUQ,2013,This place is very good. At first i thought it...,4,0,3.5


In [85]:
text_length = np.vectorize(len)

def my_aggregator(data):
    """Summarise one business's reviews as a 10-element feature vector.

    Parameters
    ----------
    data : pandas.DataFrame
        The rows of a single group. Must provide the columns 'text',
        'rating', 'stars' and 'is_open'. 'rating' values are used as
        1-based positions into a 5-slot array, so they must be the
        integers 1..5.

    Returns
    -------
    numpy.ndarray
        ``[p1, p2, p3, p4, p5, mean_review, mean_polarity,
        mean_subjectivity, stars, is_open]`` where ``p_k`` is the fraction
        of the group's reviews with rating ``k``, ``mean_review`` is the
        mean text length in characters, and ``stars`` / ``is_open`` are
        read from the group's first row.
    """
    reviews = data['text']
    mean_review = np.mean(text_length(reviews))

    # Parse every review once and read both sentiment fields from the same
    # result instead of constructing a TextBlob twice per review.
    sentiments = [TextBlob(x).sentiment for x in reviews]
    mean_polarity = np.mean([s.polarity for s in sentiments])
    mean_subjectivity = np.mean([s.subjectivity for s in sentiments])

    # Business-level values are identical on every row of the group.
    first_row = data.iloc[0]
    stars = first_row['stars']
    is_open = first_row['is_open']

    # Share of reviews at each rating; ratings that never occur stay at 0.
    r = np.zeros(5)
    rating, rating_counts = np.unique(data['rating'].to_numpy(), return_counts=True)
    r[rating - 1] = rating_counts.astype(np.float64) / rating_counts.sum()

    return np.concatenate((r, np.array([mean_review, mean_polarity, mean_subjectivity, stars, is_open])))


# Build one feature row per business and persist the table as JSON records.
feature_columns = [
    "p1", "p2", "p3", "p4", "p5",
    "mean_review", "mean_polarity", "mean_subjectivity",
    "stars", "is_open",
]

# apply() yields one 10-element array per business; stack them into a
# 2-D (n_businesses, 10) matrix.
per_business = result.groupby('business_id').apply(my_aggregator)
h = np.stack(per_business.to_numpy())

df_all = pd.DataFrame(data=h, index=list(range(len(h))), columns=feature_columns)
df_all.to_json(r'sample_features_one_year.json', orient='records')

In [98]:
# Reload the cached feature table and split it into model inputs and labels:
# the first nine columns are the features, the tenth (is_open) is the target.
df_f = pd.read_json('sample_features_one_year.json')
h = df_f.to_numpy()
features, targets = h[:, :9], h[:, 9]

In [87]:
# Average of the per-business mean polarity, split by is_open.
df_all.groupby('is_open')['mean_polarity'].mean()

is_open
0.0    0.197965
1.0    0.209379
Name: mean_polarity, dtype: float64

In [88]:
# Average of the per-business mean subjectivity, split by is_open.
df_all.groupby('is_open')['mean_subjectivity'].mean()

is_open
0.0    0.542975
1.0    0.541253
Name: mean_subjectivity, dtype: float64

In [108]:
# 5-fold cross-validation of a class-balanced random forest on is_open.
# KFold is deliberately left unshuffled: its test folds are then consecutive
# row ranges, so the per-fold probabilities collected in probs_RF, taken in
# order, line up row-for-row with `targets`.
kf = KFold(n_splits=5)

accuracy_RF = []  # one accuracy value per fold
probs_RF = []     # one (n_test, n_classes) probability array per fold
for train_index, test_index in kf.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]
    # A fixed random_state makes the forest, and therefore the reported
    # score, identical on every run of the notebook.
    RF = RandomForestClassifier(
        n_estimators=200, max_depth=5, class_weight='balanced', random_state=0
    ).fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    probs_RF.append(RF.predict_proba(X_test))
    # accuracy_score takes (y_true, y_pred) in that order.
    accuracy_RF.append(accuracy_score(np.where(y_test>0,1,0), np.where(y_pred>0,1,0)))

print(f'RF Classification Accuracy Score: {np.mean(accuracy_RF)}')

RF Classification Accuracy Score: 0.6411401049848819


In [109]:
# Stitch the per-fold class probabilities back into one
# (n_samples, n_classes) array. The folds differ in length whenever
# len(features) is not a multiple of the fold count, so they are concatenated
# directly: packing them into a single ndarray first fails on ragged input
# with NumPy >= 1.24. This also works for any number of folds.
npa = probs_RF  # kept as the plain list of per-fold arrays
npac = np.concatenate(npa)

# Rows whose most probable class is column 0.
# NOTE(review): column 0 is the smallest class label, i.e. is_open == 0,
# assuming every training fold contained both classes. Confirm via RF.classes_.
pred_closed = npac.argmax(axis=1) == 0
actual_closed = targets == 0

# How many businesses are predicted as class 0 overall.
count1 = int(pred_closed.sum())
print(count1)

# Of the businesses whose target is 0, how many were predicted as class 0.
c = int((pred_closed & actual_closed).sum())
tot = int(actual_closed.sum())
print(f'{c} / {tot} = {c/tot}')

7628
2094 / 3472 = 0.603110599078341


In [105]:
# Re-label the targets into three classes:
#   0 = target left unchanged (rows whose target is not 1),
#   1 = target 1 with a star value of 4 or below,
#   2 = target 1 with a star value of 4.5 or 5.
feats_1, feats_2 = [], []

# Star value -> the index list that collects rows with that value.
d = {stars: feats_1 for stars in (1, 1.5, 2, 2.5, 3, 3.5, 4)}
d.update({4.5: feats_2, 5: feats_2})

# Column 8 of the feature matrix is the business's star value.
for i, (row, target) in enumerate(zip(features, targets)):
    if target == 1:
        d[row[8]].append(i)

targs = targets.copy()
targs[feats_1] = 1
targs[feats_2] = 2

In [97]:
# Display the per-business feature table.
df_all

Unnamed: 0,p1,p2,p3,p4,p5,mean_review,mean_polarity,mean_subjectivity,stars,is_open
0,0.000000,0.000000,0.000000,0.363636,0.636364,461.090909,0.359661,0.535929,4.0,1.0
1,0.230769,0.153846,0.153846,0.307692,0.153846,704.000000,0.198037,0.495824,3.0,1.0
2,0.111111,0.166667,0.055556,0.333333,0.333333,510.833333,0.238986,0.552165,4.0,1.0
3,1.000000,0.000000,0.000000,0.000000,0.000000,514.000000,-0.015000,0.095000,3.5,1.0
4,0.400000,0.200000,0.200000,0.000000,0.200000,518.600000,0.073729,0.478273,3.5,1.0
...,...,...,...,...,...,...,...,...,...,...
19256,0.500000,0.000000,0.000000,0.000000,0.500000,1024.000000,0.282130,0.529963,3.0,1.0
19257,0.000000,0.000000,0.000000,0.066667,0.933333,717.466667,0.315555,0.552567,4.5,1.0
19258,0.000000,0.000000,0.000000,0.000000,1.000000,675.500000,0.280215,0.574681,5.0,1.0
19259,0.133333,0.100000,0.133333,0.100000,0.533333,509.000000,0.248969,0.515621,4.0,1.0


In [114]:
# Display the business table (location, stars, categories, hours, ...).
df_business

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,VTGfwgT8TFErsCayY0wnUg,Hidden City Pest Control,"3855 Wiggins Bay St, Unit 203",Las Vegas,NV,89129,36.231066,-115.317048,5.0,4,1,{'BusinessAcceptsCreditCards': 'True'},"Pest Control, Local Services","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
1,a8_n_O5Hk2mWkNfz47NfMA,Verizon,30171 Detroit Rd,Westlake,OH,44145,41.461004,-81.953416,4.0,4,1,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Electronics, Home Services, Internet Service P...","{'Monday': '11:0-19:0', 'Tuesday': '11:0-19:0'..."
2,7DyeAf6eh5-b0UmeVuvxDw,American Air & Appliance,"515 E Carefree Hwy, Ste 492",Phoenix,AZ,85085,33.798348,-112.067059,5.0,15,1,"{'ByAppointmentOnly': 'True', 'BusinessAccepts...","Home Services, Appliances & Repair, Local Serv...","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ..."
3,yvkV-GRXNlHA3nEG05qo4A,Sugar and Lace Bakery,28th Ave and Bell rd,Phoenix,AZ,85053,33.644393,-112.120733,5.0,5,1,"{'RestaurantsDelivery': 'True', 'DogsAllowed':...","Custom Cakes, Cupcakes, Bakeries, Food","{'Monday': '8:0-19:0', 'Tuesday': '8:0-19:0', ..."
4,Zoowy70HwGUEsMD9qNlF4Q,Lush,7014 E Camelback Rd,Scottsdale,AZ,85250,33.503899,-111.928302,4.5,132,1,"{'BikeParking': 'True', 'RestaurantsPriceRange...","Skin Care, Beauty & Spas, Cosmetics & Beauty S...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19256,LOdeVw1kMh8CLx4mtWvX4Q,Jack Saloon,"9395 Boulevard Leduc Brossard, Suite 10",Brossard,QC,J4Y 0A5,45.442238,-73.436694,3.0,14,1,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Restaurants, Comfort Food","{'Monday': '3:0-11:30', 'Tuesday': '3:0-11:30'..."
19257,gCr_bfJG-c8Al3UADWBClg,Tora Sushi,"3160 Steeles Avenue E, Unit 6",Markham,ON,L3R 4G9,43.815450,-79.344597,4.0,7,1,"{'RestaurantsDelivery': 'False', 'RestaurantsG...","Imported Food, Restaurants, Japanese, Specialt...","{'Tuesday': '10:0-15:0', 'Wednesday': '10:0-15..."
19258,W1G-6NjK1hXsU2T5QEB-UA,Sky Vapors,8643 W Sahara Ave,Las Vegas,NV,89117,36.143981,-115.280432,4.0,8,0,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Vape Shops, Tobacco Shops, Shopping, Personal ...","{'Monday': '10:0-19:0', 'Tuesday': '10:0-19:0'..."
19259,dEmNOTm8Rmm9JYZdGX_Lhw,Ristorante Beatrice,1504 Rue Sherbrooke O,Montréal,QC,H3G 1L3,45.496992,-73.580924,4.0,61,1,"{'RestaurantsAttire': 'u'dressy'', 'GoodForKid...","Restaurants, Italian","{'Monday': '18:0-23:0', 'Tuesday': '18:0-23:0'..."


One degree of latitude ~= 69 miles
1 mile = 1/69 degrees latitude

One degree of longitude ~= [ math.cos(latitude * math.pi / 180.) * 69.172 ] miles
1 mile = 1 / [ math.cos(latitude * math.pi / 180.) * 69.172 ] degrees longitude

Radians = Degrees * math.pi / 180

** share at least one category AND within a mile radius

In [122]:
# Work on a copy so that columns added to df_bus_rad later (the 'range'
# bounding box) do not modify df_business.
df_bus_rad = df_business.copy()

In [127]:
# One-mile bounding box around every business, stored per row as
# [lat + d_lat, lat - d_lat, lon + d_lon, lon - d_lon].
# d_lat is 1/69 degree (one mile of latitude). d_lon is one mile of longitude
# at the business's own latitude, which shrinks with cos(latitude).
# NOTE(review): this is a square box about two miles on a side, not a
# circular one-mile radius.
# Computed on whole columns at once rather than with several .iloc row
# lookups per business.
lat = df_bus_rad['latitude'].to_numpy()
lon = df_bus_rad['longitude'].to_numpy()

lat_delta = 1/69
lon_delta = 1 / (np.cos(lat * math.pi/180.) * 69.172)

# .tolist() keeps `rad` a list of four-float lists, one per business.
rad = np.column_stack((lat + lat_delta, lat - lat_delta, lon + lon_delta, lon - lon_delta)).tolist()

df_bus_rad['range'] = rad

In [129]:
# Display each business's coordinates next to its bounding-box list.
df_bus_rad[['latitude','longitude','range']]

Unnamed: 0,latitude,longitude,range
0,36.231066,-115.317048,"[36.245558753623186, 36.21657324637681, -115.2..."
1,41.461004,-81.953416,"[41.475497216823186, 41.44651170957681, -81.93..."
2,33.798348,-112.067059,"[33.81284105362319, 33.78385554637681, -112.04..."
3,33.644393,-112.120733,"[33.658885316223184, 33.62989980897681, -112.1..."
4,33.503899,-111.928302,"[33.518391753623185, 33.48940624637681, -111.9..."
...,...,...,...
19256,45.442238,-73.436694,"[45.456730953623186, 45.42774544637681, -73.41..."
19257,43.815450,-79.344597,"[43.82994240352319, 43.800956896276816, -79.32..."
19258,36.143981,-115.280432,"[36.15847345362319, 36.12948794637681, -115.26..."
19259,45.496992,-73.580924,"[45.51148445362319, 45.482498946376815, -73.56..."


In [130]:
# First element of the first business's box: its latitude plus 1/69 degree.
df_bus_rad.iloc[0]['range'][0]

36.245558753623186

In [133]:
# Row count of the business table, then the number of distinct names. A
# smaller second number means some names occur on more than one row.
print(len(df_bus_rad))
print(len(df_bus_rad['name'].drop_duplicates()))

19261
16723


In [144]:
# Non-null counts of every column per business name. The 'business_id' column
# of the result therefore holds the number of rows sharing that name.
# NOTE: this rebinds `test`, which earlier held the per-year review counts.
test = df_bus_rad.groupby('name').count().reset_index()

# Names that occur on more than one row.
is_repeated = test['business_id'] > 1
test1 = test.loc[is_repeated]
print(test1[['name','business_id']])

                                        name  business_id
23                      180 Smoke Vape Store            2
50                                 241 Pizza            2
78     360 Physical Therapy & Aquatic Center            5
118                                 7-Eleven           13
144                     99 Cents Only Stores            5
...                                      ...          ...
16636                                  Zoup!            3
16648                                 b.good            2
16683                             iRepair.ca            2
16693                    kate spade new york            2
16714                             uBreakiFix            5

[826 rows x 2 columns]


In [155]:
# Load ../../yelp_dataset/business.json; lines=True parses the file as one
# JSON object per line.
# NOTE(review): despite the `il_` prefix, the frame is not filtered to a
# single state here.
il_df = pd.read_json('../../yelp_dataset/business.json', lines=True)

In [162]:
# Number of rows per state.
# NOTE(review): grouping by 'state' before calling value_counts on the same
# column repeats the key in a two-level index;
# il_df['state'].value_counts() yields the same counts with a flat index.
il_df.groupby('state')['state'].value_counts()

state  state
AB     AB        8012
AK     AK           2
AL     AL           3
AR     AR           1
AZ     AZ       56686
BAS    BAS          1
BC     BC           1
CA     CA          19
CON    CON          1
CT     CT           3
DOW    DOW          1
DUR    DUR          1
FL     FL           4
GA     GA           2
IL     IL        1932
NC     NC       14720
NE     NE           2
NJ     NJ           1
NM     NM           1
NV     NV       36312
NY     NY          22
OH     OH       14697
ON     ON       33412
PA     PA       11216
QC     QC        9219
SC     SC        1162
TN     TN           1
TX     TX           6
UT     UT           1
VA     VA           2
VT     VT           2
WA     WA           3
WI     WI        5154
XGL    XGL          1
XGM    XGM          4
XWY    XWY          2
Name: state, dtype: int64

In [175]:
# Restrict to the AZ rows once, keeping only the id and name columns.
names = il_df[il_df['state']=='AZ'][['business_id','name']]

# Distinct names among the AZ rows, then the total row count of the full table.
print(len(names['name'].drop_duplicates()))
print(len(il_df))

43832
192609


In [179]:
# For each name among the AZ rows, count how many rows carry it. After
# reset_index the 'business_id' column holds that count.
repeat_names = names.groupby('name').count().reset_index()

In [181]:
# Names that occur on more than 10 AZ rows.
repeat_names[repeat_names['business_id']>10]

Unnamed: 0,name,business_id
145,24 Hour Fitness,11
325,7-Eleven,22
374,99 Cents Only Stores,23
759,AAMCO Transmissions & Total Car Care,16
982,AT&T Store,35
...,...,...
42975,Wingstop,20
43326,Yogis Grill,14
43372,Youfit Health Clubs,15
43599,Zipps Sports Grill,11
