## Building Quick Prediction Model for Cases per Capita

In [73]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import scale

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [74]:
case_per_neighborhood = pd.read_csv('data/case_per_neighborhood.csv', encoding='utf-8')

case_per_neighborhood.head()

Unnamed: 0,borough,uhf34_neigh,uhf42_neigh,population,positive_rate,case_per_capita,test_per_capita,obesity_percent,copd_rate,asthma_percent,heart_attack_percent,cluster
0,Queens,Bayside Little Neck-Fresh Meadows,Bayside - Little Neck,87423,35.44,0.014,0.04,53.4,11.7,1.9,11.7,1
1,Queens,Bayside Little Neck-Fresh Meadows,Fresh Meadows,95537,39.06,0.022,0.055,53.4,12.0,1.9,16.8,1
2,Brooklyn,Bedford Stuyvesant - Crown Heights,Bedford Stuyvesant - Crown Heights,316269,39.09,0.017,0.044,67.2,21.4,7.5,26.9,2
3,Brooklyn,Bensonhurst - Bay Ridge,Bensonhurst - Bay Ridge,201541,37.38,0.015,0.04,50.3,40.8,2.2,20.3,0
4,Brooklyn,Borough Park,Borough Park,322563,38.58,0.025,0.065,56.3,15.5,2.2,25.1,0


In [75]:
X = scale(case_per_neighborhood.iloc[:,7:-1])
y = scale(case_per_neighborhood.iloc[:,5])

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Linear Regression

In [77]:
reg = LinearRegression().fit(X_train, y_train)

In [78]:
reg.coef_

array([ 0.85887405,  0.14129685, -0.11570496, -0.05015685])

In [79]:
reg.score(X_train, y_train)

0.619418725417302

In [80]:
mean_squared_error(y_test, reg.predict(X_test))

0.10073608637488657

### Ridge Regression

In [81]:
ridge = RidgeCV(alphas=[0.1, 0.5, 1,10,20, 30, 100]).fit(X_train, y_train)

In [82]:
ridge.get_params

<bound method BaseEstimator.get_params of RidgeCV(alphas=array([  0.1,   0.5,   1. ,  10. ,  20. ,  30. , 100. ]),
    cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
    scoring=None, store_cv_values=False)>

In [83]:
ridge.coef_

array([ 0.79874235,  0.15002517, -0.09970111, -0.0237967 ])

In [84]:
ridge.score(X_train, y_train)

0.6179214787102545

In [85]:
mean_squared_error(y_test, ridge.predict(X_test))

0.08917140759773017

### Random Forest

In [94]:
regr = RandomForestRegressor(n_estimators=100,max_depth=2, random_state=0)
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [95]:
regr.score(X_train, y_train)

0.7248403558585208

In [96]:
mean_squared_error(y_test, regr.predict(X_test))

0.11595213705573311

### Random Forest with Randomized Search

In [89]:
# Number of trees in random forest
n_estimators = [1, 5, 10, 20, 30, 50, 100, 200]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [1,2,3,4,5,6,7,8,9,10, None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [90]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=0, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 126 out of 300 | elapsed:    1.7s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    2.6s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [1, 5, 10, 20, 30, 50, 100, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [91]:
rf_random.best_params_

{'n_estimators': 5,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 4,
 'bootstrap': True}

In [92]:
rf_random.score(X_train,y_train)

0.8671445049910735

In [93]:
mean_squared_error(y_test, rf_random.predict(X_test))

0.12570558352701044

The random forest regressor with randomized search yield the not the lowest MSE on the test set but its R-squared score on train set is the highest. This could however be due to overfitting.

Ridge regression model with the lowest MSE seems to be the right model.