## Predict the Rating for Starbucks on Yelp by Demographics in the Nearby Neighborhood

In [31]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [12]:
df = pd.read_csv('cleaned_sb_only.csv')
df.head()

Unnamed: 0,Name,Avg_rating,Avg_review_count,Number,Zipcode,Boro,Total_Income,Median_Income,Mean_Income,Population,...,Unemployment_tot,Men_pct,Hispanic_pct,White_pct,Black_pct,Native_pct,Asian_pct,Poverty_pct,Unemployment_pct,Women_pct
0,1,2.585938,43.453125,8,10017,Manhattan,5952175,102524,149724,16231,...,1285,47.458003,6.455983,72.89029,1.788265,0.078822,16.577171,7.061924,3.165181,52.541997
1,1,2.618056,48.229167,8,10036,Manhattan,5718183,62393,89879,18660,...,2531,56.533284,22.541803,51.879798,6.680513,0.049953,16.594805,14.428436,6.654222,43.466716
2,1,2.910714,106.285714,7,10011,Manhattan,9207647,92359,160937,45899,...,4043,50.70698,10.394548,70.416171,5.008382,0.301256,11.062656,10.530599,4.911202,49.29302
3,1,2.480159,33.087302,7,10019,Manhattan,7306115,84786,133175,39048,...,3515,50.436119,17.62764,58.931952,4.806214,0.079294,15.99644,12.177361,5.688162,49.563881
4,1,2.586735,37.377551,7,10022,Manhattan,12169071,93107,158965,26460,...,2054,44.135946,5.792102,80.829136,2.116223,0.114276,9.954713,4.920218,4.346722,55.864054


In [14]:
from sklearn.model_selection import train_test_split
import random


y = df['Avg_rating']
x = df[['Avg_review_count', 'Total_Income', 'Population', 'Men_pct', 'White_pct', 'Black_pct', 'Asian_pct', 'Poverty_pct']]

x_train,x_test, y_train, y_test = train_test_split(x, y)

random.seed(12345)

In [32]:
# knn

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

knn = KNeighborsRegressor().fit(x_train, y_train)

print("Training set score: {:.3f}".format(knn.score(x_train, y_train)))
print("Test set score: {:.3f}".format(knn.score(x_test, y_test)))
print("Average cross validation score: {:.3f}".format(
    np.mean(cross_val_score(knn, x_train, y_train)))) 

Training set score: 0.063
Test set score: -0.228
Average cross validation score: -0.155


In [33]:
# OLS

from sklearn.linear_model import LinearRegression

linear = LinearRegression().fit(x_train, y_train)

print("LinearReg Coefficients: {}".format(linear.coef_))
print("Training set score: {:.3f}".format(linear.score(x_train, y_train)))
print("Test set score: {:.3f}".format(linear.score(x_test, y_test)))
print("Average cross validation score: {:.3f}".format(np.mean(cross_val_score(linear, x_train, y_train))))

LinearReg Coefficients: [ 1.34383129e-04 -6.21562130e-08  3.67297636e-06  5.89332706e-03
 -1.89511149e-03 -1.57181214e-03 -5.73618294e-03 -3.88829624e-02]
Training set score: 0.232
Test set score: 0.051
Average cross validation score: -0.707


In [34]:
# Ridge

from sklearn.linear_model import Ridge

ridge = Ridge().fit(x_train, y_train)

print("Ridge Coefficients: {}".format(ridge.coef_))
print("Training set score: {:.3f}".format(ridge.score(x_train, y_train)))
print("Test set score: {:.3f}".format(ridge.score(x_test, y_test)))
print("Average cross validation score: {:.3f}".format(
    np.mean(cross_val_score(ridge, x_train, y_train))))

Ridge Coefficients: [ 1.34145410e-04 -6.21487711e-08  3.67228437e-06  5.87807427e-03
 -1.89044496e-03 -1.57247633e-03 -5.73113630e-03 -3.88637242e-02]
Training set score: 0.232
Test set score: 0.051
Average cross validation score: -0.707


In [35]:
# Lasso

from sklearn.linear_model import Lasso

lasso = Lasso(max_iter=100000).fit(x_train, y_train)

print("Lasso Coefficients: {}".format(lasso.coef_))
print("Training set score: {:.3f}".format(lasso.score(x_train, y_train)))
print("Test set score: {:.3f}".format(lasso.score(x_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))
print("Average cross validation score: {:.3f}".format(
    np.mean(cross_val_score(lasso, x_train, y_train))))

Lasso Coefficients: [ 0.00000000e+00 -3.43126220e-08  2.21225271e-06  0.00000000e+00
  1.35537037e-03 -0.00000000e+00  0.00000000e+00 -1.42639514e-02]
Training set score: 0.158
Test set score: 0.112
Number of features used: 4
Average cross validation score: -0.497


### Scaling the data with the StandardScaler

In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), knn)
pipe.fit(x_train, y_train)
pipe.score(x_test, y_test) 

0.14937603567518387

In [38]:
pipe = make_pipeline(StandardScaler(), linear)
pipe.fit(x_train, y_train)
pipe.score(x_test, y_test)

0.05096365806789971

In [39]:
pipe = make_pipeline(StandardScaler(), ridge)
pipe.fit(x_train, y_train)
pipe.score(x_test, y_test)

0.05915593129896868

In [40]:
pipe = make_pipeline(StandardScaler(), lasso)
pipe.fit(x_train, y_train)
pipe.score(x_test, y_test)

-0.030878636261923772

### GridSearch CV

In [44]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': np.arange(1, 15, 2)}
grid = GridSearchCV(KNeighborsRegressor(), param_grid=param_grid)
grid.fit(x_train, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(x_test, y_test)))

best mean cross-validation score: -0.059
best parameters: {'n_neighbors': 9}
test-set score: -0.140


In [47]:
# Tune whether to calculate the intercept for this model ('fit_intercept'), 
# whether to normalize the regressors ('normalize'), and whether to overwrite X ('copy_X')
param_grid = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid = GridSearchCV(LinearRegression(), param_grid=param_grid)
grid.fit(x_train, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters for OLS: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(x_test, y_test)))

best mean cross-validation score: -0.731
best parameters for OLS: {'copy_X': True, 'fit_intercept': True, 'normalize': True}
test-set score: 0.051


In [49]:
# Tune the regularization strength 'alpha'
param_grid = {'alpha': np.arange(1, 10, 1)}
grid = GridSearchCV(Ridge(), param_grid=param_grid)
grid.fit(x_train, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters for Ridge: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(x_test, y_test)))

best mean cross-validation score: -0.726
best parameters for Ridge: {'alpha': 9}
test-set score: 0.052


In [51]:
# Tune the regularization strength 'alpha'
param_grid = {'alpha': np.arange(1, 10, 1)}
grid = GridSearchCV(Lasso(max_iter=10000000), param_grid=param_grid)
grid.fit(x_train, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters for Lasso: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(x_test, y_test)))

best mean cross-validation score: -0.103
best parameters for Lasso: {'alpha': 9}
test-set score: 0.002


In [52]:
print("Ridge Coefficients: {}".format(ridge.coef_))
print("Lasso Coefficients: {}".format(lasso.coef_))

Ridge Coefficients: [ 0.00342676 -0.17644286  0.09043076  0.01559346 -0.03299452 -0.01970553
 -0.06917628 -0.32888098]
Lasso Coefficients: [ 0. -0.  0.  0.  0. -0.  0. -0.]
