## Predict the Rating for the Dunkin' on Yelp by Demographics in the Nearby Neighborhood

In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
df = pd.read_csv('cleaned_dd_only.csv')
df.head()

Unnamed: 0,Name,Avg_rating,Avg_review_count,Number,Zipcode,Boro,Total_Income,Median_Income,Mean_Income,Population,...,Unemployment_tot,Men_pct,Hispanic_pct,White_pct,Black_pct,Native_pct,Asian_pct,Poverty_pct,Unemployment_pct,Women_pct
0,0,2.326923,15.653846,13,10001,Manhattan,2820411,71245,123113,17678,...,3719,49.76607,11.217709,59.960678,7.108918,0.3113,18.80359,14.887768,6.770312,50.23393
1,0,2.458333,13.75,12,10018,Manhattan,2452452,84799,112292,4255,...,1483,52.122819,20.697978,48.019626,7.830491,0.075179,21.346892,14.746963,5.867922,47.877181
2,0,2.35,19.7,10,10016,Manhattan,8419008,96760,144872,49904,...,3805,47.851902,10.220971,65.417577,4.294796,0.016804,17.41164,12.24304,5.328236,52.148098
3,0,2.416667,12.944444,9,11101,Queens,1950293,39955,51519,22556,...,5963,50.508276,32.060472,34.79897,9.539945,0.156393,20.324515,21.380164,9.714258,49.491724
4,0,2.875,15.125,8,10038,Manhattan,1788906,55937,89760,15435,...,3119,49.143701,10.347221,35.814547,5.272113,1.083525,44.769303,19.695986,6.982471,50.856299


In [6]:
from sklearn.model_selection import train_test_split
import random

y = df['Avg_rating']
x = df[['Avg_review_count', 'Total_Income', 'Population', 'Men_pct', 'White_pct', 'Black_pct', 'Asian_pct', 'Poverty_pct']]

x_train,x_test, y_train, y_test = train_test_split(x, y)

random.seed(12345)

In [10]:
# knn

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

knn = KNeighborsRegressor().fit(x_train, y_train)

print("Training set score: {:.3f}".format(knn.score(x_train, y_train)))
print("Test set score: {:.3f}".format(knn.score(x_test, y_test)))
print("Average cross validation score: {:.3f}".format(
    np.mean(cross_val_score(knn, x_train, y_train)))) 

Training set score: 0.135
Test set score: -0.301
Average cross validation score: -0.132


In [11]:
# OLS

from sklearn.linear_model import LinearRegression

linear = LinearRegression().fit(x_train, y_train)

print("LinearReg Coefficients: {}".format(linear.coef_))
print("Training set score: {:.3f}".format(linear.score(x_train, y_train)))
print("Test set score: {:.3f}".format(linear.score(x_test, y_test)))
print("Average cross validation score: {:.3f}".format(np.mean(cross_val_score(linear, x_train, y_train))))

LinearReg Coefficients: [-1.78018433e-02  3.70553568e-08 -3.82044252e-06  1.77622355e-02
 -1.84320401e-03 -5.45541231e-03 -1.43936370e-03  1.03073322e-02]
Training set score: 0.171
Test set score: -0.063
Average cross validation score: -0.056


In [12]:
# Ridge

from sklearn.linear_model import Ridge

ridge = Ridge().fit(x_train, y_train)

print("Ridge Coefficients: {}".format(ridge.coef_))
print("Training set score: {:.3f}".format(ridge.score(x_train, y_train)))
print("Test set score: {:.3f}".format(ridge.score(x_test, y_test)))
print("Average cross validation score: {:.3f}".format(
    np.mean(cross_val_score(ridge, x_train, y_train))))

Ridge Coefficients: [-1.77977412e-02  3.70489478e-08 -3.82025903e-06  1.77208512e-02
 -1.84519371e-03 -5.45854246e-03 -1.44120362e-03  1.03047120e-02]
Training set score: 0.171
Test set score: -0.063
Average cross validation score: -0.056


In [13]:
# Lasso

from sklearn.linear_model import Lasso

lasso = Lasso(max_iter=100000).fit(x_train, y_train)

print("Lasso Coefficients: {}".format(lasso.coef_))
print("Training set score: {:.3f}".format(lasso.score(x_train, y_train)))
print("Test set score: {:.3f}".format(lasso.score(x_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))
print("Average cross validation score: {:.3f}".format(
    np.mean(cross_val_score(lasso, x_train, y_train))))

Lasso Coefficients: [-1.51996837e-03  1.20274909e-08 -1.45604633e-06  0.00000000e+00
 -0.00000000e+00 -3.47753309e-04 -0.00000000e+00  4.74129533e-04]
Training set score: 0.026
Test set score: -0.049
Number of features used: 5
Average cross validation score: -0.088


### Scaling the data with the StandardScaler

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), knn)
pipe.fit(x_train, y_train)
pipe.score(x_test, y_test) 

0.017344877327694075

In [15]:
pipe = make_pipeline(StandardScaler(), linear)
pipe.fit(x_train, y_train)
pipe.score(x_test, y_test)

-0.06268978937442049

In [16]:
pipe = make_pipeline(StandardScaler(), ridge)
pipe.fit(x_train, y_train)
pipe.score(x_test, y_test)

-0.05930826489623109

In [17]:
pipe = make_pipeline(StandardScaler(), lasso)
pipe.fit(x_train, y_train)
pipe.score(x_test, y_test)

-2.8418810457342403e-05

### GridSearch CV

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': np.arange(1, 15, 2)}
grid = GridSearchCV(KNeighborsRegressor(), param_grid=param_grid)
grid.fit(x_train, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(x_test, y_test)))

best mean cross-validation score: -0.080
best parameters: {'n_neighbors': 9}
test-set score: -0.058


In [19]:
# Tune whether to calculate the intercept for this model ('fit_intercept'), 
# whether to normalize the regressors ('normalize'), and whether to overwrite X ('copy_X')
param_grid = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid = GridSearchCV(LinearRegression(), param_grid=param_grid)
grid.fit(x_train, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters for OLS: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(x_test, y_test)))

best mean cross-validation score: -0.056
best parameters for OLS: {'copy_X': True, 'fit_intercept': True, 'normalize': False}
test-set score: -0.063


In [20]:
# Tune the regularization strength 'alpha'
param_grid = {'alpha': np.arange(1, 10, 1)}
grid = GridSearchCV(Ridge(), param_grid=param_grid)
grid.fit(x_train, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters for Ridge: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(x_test, y_test)))

best mean cross-validation score: -0.056
best parameters for Ridge: {'alpha': 9}
test-set score: -0.062


In [21]:
# Tune the regularization strength 'alpha'
param_grid = {'alpha': np.arange(1, 10, 1)}
grid = GridSearchCV(Lasso(max_iter=10000000), param_grid=param_grid)
grid.fit(x_train, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters for Lasso: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(x_test, y_test)))

best mean cross-validation score: -0.086
best parameters for Lasso: {'alpha': 2}
test-set score: -0.058


In [22]:
print("Ridge Coefficients: {}".format(ridge.coef_))
print("Lasso Coefficients: {}".format(lasso.coef_))

Ridge Coefficients: [-0.15026981  0.0888584  -0.09489671  0.04084934 -0.04316488 -0.12274379
 -0.02015968  0.09858014]
Lasso Coefficients: [-0.  0. -0.  0. -0. -0. -0.  0.]
