## Building Quick Prediction Model for COVID-19 Positive Rate

In [57]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [3]:
case_per_neighborhood = pd.read_csv('data/case_per_neighborhood.csv', encoding='utf-8')

case_per_neighborhood.head()

Unnamed: 0,borough,uhf34_neigh,uhf42_neigh,population,positive_rate,test_per_capita,obesity_percent,excercise_percent,copd_rate,smoking_percent,drinking_percent,cluster
0,Queens,Bayside Little Neck-Fresh Meadows,Bayside - Little Neck,87423,35.44,0.04,53.4,73.4,14.4,11.2,11.7,0
1,Queens,Bayside Little Neck-Fresh Meadows,Fresh Meadows,95537,39.06,0.055,53.4,73.4,14.5,11.2,11.7,0
2,Brooklyn,Bedford Stuyvesant - Crown Heights,Bedford Stuyvesant - Crown Heights,316269,39.09,0.044,67.2,71.4,27.7,18.9,16.6,1
3,Brooklyn,Bensonhurst - Bay Ridge,Bensonhurst - Bay Ridge,201541,37.38,0.04,50.3,69.5,38.3,11.7,15.4,1
4,Brooklyn,Borough Park,Borough Park,322563,38.58,0.065,56.3,65.2,16.9,9.8,9.5,0


In [6]:
X = case_per_neighborhood.iloc[:,5:-1]
y = case_per_neighborhood.iloc[:,3]

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Linear Regression

In [59]:
reg = LinearRegression().fit(X_train, y_train)

In [63]:
reg.score(X_train, y_train)

0.21873054793069613

In [62]:
mean_squared_error(y_test, reg.predict(X_test))

5351462969.875393

### Ridge Regression

In [64]:
ridge = RidgeCV(alphas=[1,10,20, 30, 100]).fit(X_train, y_train)

In [65]:
ridge.score(X_train, y_train)

0.21102226463061868

In [66]:
mean_squared_error(y_test, ridge.predict(X_test))

4927282152.527488

### Random Forest

In [67]:
regr = RandomForestRegressor(n_estimators=100,max_depth=3, random_state=0)
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [68]:
regr.score(X_train, y_train)

0.6730696770035177

In [71]:
mean_squared_error(y_test, regr.predict(X_test))

5599287986.050116

### Random Forest with Randomized |Search

In [49]:
# Number of trees in random forest
n_estimators = [1, 5, 10, 20, 30, 50, 100, 200]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [1,2,3,4,5,6,7,8,9,10, None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [72]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    3.1s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [1, 5, 10, 20, 30, 50, 100, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [73]:
rf_random.best_params_

{'n_estimators': 5,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 2,
 'bootstrap': True}

In [74]:
rf_random.score(X_train,y_train)

0.3818559750599647

In [75]:
mean_squared_error(y_test, rf_random.predict(X_test))

3508620597.1829057

The random forest regressor with randomized search yield the lowest MSE on the test set. Therefore, it is the best model out of the above though the accuracy is still low.