In [1]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [4]:
#-- Importing the dataset --#
med_df_raw = pd.read_csv("/Users/lindasegalini/Desktop/WGU/New Program/D209 Data Mining/medical_clean.csv")

#-- Dropping missing values --#
med_df_raw.dropna() 


#-- Dropping unessesary features --#
med_df = med_df_raw.drop(columns = ['Gender','Complication_risk','Initial_admin','Additional_charges','TotalCharge','Initial_days','VitD_levels','Age','CaseOrder','Allergic_rhinitis','Reflux_esophagitis','Doc_visits','Full_meals_eaten','vitD_supp','Soft_drink','Customer_id','Interaction','UID','City','State','County','Zip','Lat','Lng', 'Area','TimeZone','Job','Income','Services','Item1','Item2','Item3','Item4','Item5','Item6','Item7','Item8',                       
                                       'Arthritis','Hyperlipidemia','BackPain'])

#-- Changing categorical to binary with get_dummies() and dropping the first column. 
med_df = pd.get_dummies(med_df, drop_first =True)

#-- Dropping the previously combined columns to keep the number of columns down and create a tidier dataset --#
med_clean = med_df.drop(columns = ['Marital_Never Married','Marital_Separated','Marital_Widowed'])

In [5]:
#-- Saving a copy of the cleaned dataset --#
med_clean.to_csv('/Users/lindasegalini/Desktop/WGU/New Program/D209 Data Mining/RandomForests/Submissions.csv')

In [6]:
med_clean.head()

Unnamed: 0,Population,Children,Marital_Married,ReAdmis_Yes,HighBlood_Yes,Stroke_Yes,Overweight_Yes,Diabetes_Yes,Anxiety_Yes,Asthma_Yes
0,2951,1,0,0,1,0,0,1,1,1
1,11303,3,1,0,1,0,1,0,0,0
2,17125,3,0,0,1,0,1,1,0,0
3,2162,0,1,0,0,1,0,0,0,1
4,5287,1,0,0,0,0,0,0,0,0


In [7]:
# Split the data into X & y

X = med_clean.drop('ReAdmis_Yes', axis = 1).values
y = med_clean['ReAdmis_Yes']

y = y.astype(int)

# Make train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    shuffle=True, random_state=2)

In [8]:
#-- Instantiate RandomForrestClassifier  --#

rc = RandomForestClassifier(random_state = 1)
#-- print out the 'rc's hyperparameters --#

rc.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [10]:
#-- Define the grid of hyperparameters 'params_RC' --#

params_RC ={'max_depth': [3,4,5,6,8,10],
            'min_samples_leaf': [0.04, 0.06,0.08,0.10,0.12],
            'max_features': [0.1, 0.2, 0.3, 0.4, 0.6, 0.8]}

#-- Instanitate a 10-fold CV grid search object 'grid_RC' --#

grid_RC = GridSearchCV(estimator = rc,
                        param_grid = params_RC,
                        scoring = 'accuracy',
                        cv = 10,
                        n_jobs =-1)

#-- Fit 'grid_RC' to the training set --#

grid_RC.fit(X_train, y_train)

#-- Predict the test set labels 'y_pred' --#
y_pred = grid_RC.predict(X_test)

#-- Extracting the best hyperparameters from 'grid_RC' --#

best_hyperparams = grid_RC.best_params_
print('Best hyperparameters : \n', best_hyperparams)

#-- Extracting the best model from 'grid_RC' --#

best_model = grid_RC.best_estimator_

#-- Evaluate the accuracy --#

test_acc = best_model.score(X_test, y_test)

#-- Print the best_model accuracy --#
print("Test set accuracy of the best model: {:.3f}".format(test_acc))

#-- Evaluate test set Mean Squared Error --#
mse_test = MSE(y_test, y_pred)# **(1/2)

#-- Print 'mse_test' --#
print('Test set MSE: {:.2f}'.format(mse_test))


Best hyperparameters : 
 {'max_depth': 3, 'max_features': 0.1, 'min_samples_leaf': 0.04}
Test set accuracy of the best model: 0.634
Test set MSE: 0.37


In [11]:
#-- Feature importance is an average of how each variables effect in reducing the model's noise--#
importances = grid_RC.best_estimator_.feature_importances_
importances

array([0.27930629, 0.28463848, 0.03965782, 0.07506462, 0.05229334,
       0.05102822, 0.04435589, 0.07372725, 0.09992809])

In [12]:
#-- Displaying feature names matched with the amount of correlation to the target variable 
#-- 'Population' is the first in the array above with 'Children' next at 28.4% and so on. 

med_clean.drop('ReAdmis_Yes', axis = 1)

Unnamed: 0,Population,Children,Marital_Married,HighBlood_Yes,Stroke_Yes,Overweight_Yes,Diabetes_Yes,Anxiety_Yes,Asthma_Yes
0,2951,1,0,1,0,0,1,1,1
1,11303,3,1,1,0,1,0,0,0
2,17125,3,0,1,0,1,1,0,0
3,2162,0,1,0,1,0,0,0,1
4,5287,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
9995,4762,2,0,1,0,0,0,1,0
9996,1251,4,0,1,0,1,1,0,1
9997,532,3,0,1,0,1,0,1,0
9998,271,3,0,0,0,1,0,0,0


In [13]:
#-- Saving a copy of the training and testing datasets --#
X_train =pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train = pd.DataFrame(y_train)

y_test = pd.DataFrame(y_test)

X_train.to_csv('/Users/lindasegalini/Desktop/WGU/New Program/D209 Data Mining/RandomForests/Submissions.csv')

X_test.to_csv('/Users/lindasegalini/Desktop/WGU/New Program/D209 Data Mining/RandomForests/Submissions.csv')

y_train.to_csv('/Users/lindasegalini/Desktop/WGU/New Program/D209 Data Mining/RandomForests/Submissions.csv')

y_test.to_csv('/Users/lindasegalini/Desktop/WGU/New Program/D209 Data Mining/RandomForests/Submissions.csv')
