In [3]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

## Defining the bucket 
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the csv file 
file_key = 'In_Class_Assignments/insurance.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
insurance = pd.read_csv(file_content_stream)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
## Changing labels to numbers 
insurance['sex'] = np.where(insurance['sex'] == 'female', 0, 1)
insurance['smoker'] = np.where(insurance['smoker'] == 'no', 0, 1)

## Extracting region dummies
region_dummies = pd.get_dummies(insurance['region']).iloc[:, 0:3]

## Appending dummies 
insurance = pd.concat([insurance, region_dummies], axis = 1)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast
0,19,0,27.9,0,1,southwest,16884.924,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1
2,28,1,33.0,3,0,southeast,4449.462,0,0,1
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0


In [5]:
## Computing interactions from chapter 4
insurance['interaction_1'] = np.where((insurance['smoker'] == 0) & (insurance['age'] <= 32.5), 1, 0)
insurance['interaction_2'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 32.5) & (insurance['age'] <= 44.5), 1, 0)
insurance['interaction_3'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 44.5) & (insurance['age'] < 51.5), 1, 0)
insurance['interaction_4'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 51.5), 1, 0)

insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast,interaction_1,interaction_2,interaction_3,interaction_4
0,19,0,27.9,0,1,southwest,16884.924,0,0,0,0,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1,1,0,0,0
2,28,1,33.0,3,0,southeast,4449.462,0,0,1,1,0,0,0
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0,0,1,0,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0,1,0,0,0


In [10]:
## Defining the input and target variables
X = insurance[['age', 'bmi', 'children', 'smoker', 'interaction_4']]
Y = insurance['charges']

## Splitting the data 
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2)
X_val, X_test, Y_val, Y_test = train_test_split(X_val, Y_val, test_size = 0.5)

## Changing the scale 
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.fit_transform(X_val)
X_test = scaler.fit_transform(X_test)

In [13]:
## Defining the hyper-parameters
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split' : [10, 15],
                 'min_samples_leaf' : [5, 7],
                 'max_depth' : [3, 5, 7]}

## Performing grid search
RF_grid_search = GridSearchCV(RandomForestRegressor(), RF_param_grid, cv = 3, scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_train, Y_train)

## Extraciting the best model 
RF_md = RF_grid_search.best_estimator_

## Predicting on validation and test
RF_val_pred = RF_md.predict(X_val)
RF_test_pred = RF_md.predict(X_test)

## Computing the mean square error 
val_mse = mean_squared_error(Y_val, RF_val_pred)
print('The RF mse on validation dataset is', val_mse)

test_mse = mean_squared_error(Y_test, RF_test_pred)
print('The RF mse on test dataset is', test_mse)

The RF mse on validation dataset is 28665448.13533988
The RF mse on test dataset is 23339933.99146837


In [14]:
## Defining the hyper-parameters
SVM_param_grid = {'kernel': ['rbf', 'poly', 'sigmoid'],
                  'C': [0.01, 0.1, 1, 10],
                  'gamma': [0.01, 0.1, 1]}

## Performing grid search
svm_grid_search = GridSearchCV(SVR(), SVM_param_grid, cv = 3, scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_train, Y_train)

## Extraciting the best model 
svm_md = svm_grid_search.best_estimator_

## Predicting on validation and test
svm_val_pred = svm_md.predict(X_val)
svm_test_pred = svm_md.predict(X_test)

## Computing the mean square error 
val_mse = mean_squared_error(Y_val, svm_val_pred)
print('The SVM mse on validation dataset is', val_mse)

test_mse = mean_squared_error(Y_test, svm_test_pred)
print('The SVM mse on test dataset is', test_mse)

The SVM mse on validation dataset is 71999034.82989629
The SVM mse on test dataset is 108618334.67117436


In [15]:
## Building the ensemble
X_ensemble = pd.concat([pd.DataFrame(RF_val_pred), pd.DataFrame(svm_val_pred)], axis = 1)

## Performing grid search with 3 folds 
RF_grid_search_ensemble = GridSearchCV(RandomForestRegressor(), RF_param_grid, cv = 3, scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_ensemble, Y_val)

## Extracting the best model 
RF_md_ensemble = RF_grid_search_ensemble.best_estimator_

## Consolidating prediction for the ensemble
X = pd.concat([pd.DataFrame(RF_test_pred), pd.DataFrame(svm_test_pred)], axis = 1)

## Predicting 
ensemble_pred = RF_md_ensemble.predict(X)

## Computing the mse of the ensemble
test_mse = mean_squared_error(Y_test, ensemble_pred)
print('The ensemble mse on test dataset is', test_mse)

The ensemble mse on test dataset is 24218431.59406269
