### import statements 

In [18]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from pprint import pprint

### Question 3 Regression

#### 3.1 & 3.2

In [19]:
# Load the diabetes dataset
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

# Split the data into training/testing sets
# Split the targets into training/testing sets

diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(diabetes_X, diabetes_y, test_size=0.20, random_state=42)


#### 3.3

In [20]:
# Create linear regression object
linreg = linear_model.LinearRegression()

# Train the model using the training sets
linreg.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = linreg.predict(diabetes_X_test)

In [21]:
# The coefficients
print('Coefficients: \n', linreg.coef_)

# The coefficient of determination: 1 is perfect prediction
print('\nR-squared score: %.2f'
      % r2_score(diabetes_y_test, diabetes_y_pred))

Coefficients: 
 [  37.90031426 -241.96624835  542.42575342  347.70830529 -931.46126093
  518.04405547  163.40353476  275.31003837  736.18909839   48.67112488]

R-squared score: 0.45


#### 3.4

In [22]:
#Using 10-fold cross validation to fit and validate linear regression models on the whole data set. Printing the scores for each validation.

scores = cross_val_score(linreg, diabetes_X_train, diabetes_y_train, cv=10)
count = 1
for i in scores:
    print("Cross Validation Score for fold ", count, "is: ", i)
    count = count + 1

print("\nMean Accuracy: %0.2f with STD DEVIATION:  %0.2f" % (np.mean(np.abs(scores)), scores.std() * 2))

Cross Validation Score for fold  1 is:  0.5310818612097871
Cross Validation Score for fold  2 is:  0.48929200594262445
Cross Validation Score for fold  3 is:  0.6155148865078137
Cross Validation Score for fold  4 is:  -0.07802446434722854
Cross Validation Score for fold  5 is:  0.4650954815453411
Cross Validation Score for fold  6 is:  0.5338119174559128
Cross Validation Score for fold  7 is:  0.706563282930452
Cross Validation Score for fold  8 is:  0.5253790639706339
Cross Validation Score for fold  9 is:  -0.23181569661350765
Cross Validation Score for fold  10 is:  0.37529064060113176

Mean Accuracy: 0.46 with STD DEVIATION:  0.58


#### 3.5

In [26]:
# Use sklearn to create RandomForestRegressor model, and fit the training data into it.

rforest = RandomForestRegressor(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=1)
rforest.fit(diabetes_X_train, diabetes_y_train)

diabetes_y_pred = rforest.predict(diabetes_X_test)
#calculating the r square score
print('\nR-squared score: %.2f'% r2_score(diabetes_y_test, diabetes_y_pred))

#calculating root mean sq error
print("Root mean squared test error = {0}".format(np.sqrt(np.mean((rforest.predict(diabetes_X_test) - diabetes_y_test)**2))))



R-squared score: 0.44
Root mean squared test error = 54.465358204535065


#### 3.6

In [24]:
#Using Grid Search to find the optimal hyper-parameters

#setup randomforest
rforest = RandomForestRegressor(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=1)

#setup and train GridSearchCV
clf = GridSearchCV(rforest, {'max_depth': [None, 7, 4],'min_samples_split': [2, 10, 20]}, cv=10)
model = clf.fit(diabetes_X_train, diabetes_y_train)

# print optimal hyper-parameters
pprint(model.best_estimator_.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 20,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


In [25]:
#print R-squared score
diabetes_y_pred = model.predict(diabetes_X_test)
print('\nR-squared score: %.2f'% r2_score(diabetes_y_test, diabetes_y_pred))


R-squared score: 0.46
