# Random Forest 

Loading Modules

In [1]:
# import modules
import pandas as pd
import numpy as np

#Import scikit-learn dataset library
from sklearn import datasets

# Import train_test_split function
from sklearn.model_selection import train_test_split

# import oversampling and undersampling packages
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler 

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report 

# Cross Validation packages
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# import required modules for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# import module for gridsearch (to find optimal hyper-parameters)
from sklearn.model_selection import GridSearchCV

# import module (to test execution time of a codeblock to run)
import time

import warnings 

from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC



In [2]:
# Packages for Receiver Operating Characteristic (ROC) with cross validation
# source: https://scikit-learn.org/0.18/auto_examples/model_selection/plot_roc_crossval.html
# Carried over from the scikit-learn example script; in this notebook it prints
# the interactive module's auto-generated docstring.
print(__doc__)

import numpy as np
# `scipy.interp` was only an alias of `numpy.interp` and is no longer shipped by
# recent SciPy releases; importing it from NumPy keeps the name `interp` usable.
from numpy import interp
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

# `plot_roc_curve` was removed in scikit-learn 1.2. Fall back to its replacement so
# the name stays callable as plot_roc_curve(estimator, X, y, ...).
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

Automatically created module for IPython interactive environment


<a id = ""> <h2> Target Variable 'Death Rate Categories' </h2> </a>
___

<b>Read the CSV file into a pandas DataFrame</b>

In [26]:
# Load the death-rate dataset from its CSV file into a pandas DataFrame
df_death = pd.read_csv(filepath_or_buffer="final_death_data.csv")
# Preview the first five rows
# NOTE(review): the rendered preview shows an 'Unnamed: 0' column, which looks like
# a saved row index read back as data — confirm before using it as a feature.
df_death.head(n=5)

Unnamed: 0,Total Population,Total Citizen Income,Hispanic or Latino,Total Citizen Educated in US,Total Households,White Race,Total Families,Hispanic or Latino_iqr,Citizen High School Graduate,Citizen Some College Education,...,Citizen Income over 75000,Citizen No Income,Citizen Income 1-9999,Pacific Islander Race,Native American Race,Citizen Graduate or Professional Degree,Citizen Less than High School Education,Citizen Income 15000-24999,Citizen Income 35000-49999,Death Rate Categories
0,54571,44109,53261,36757,20221,9643,14613,53261.0,12363,10697,...,5298,6377,6955,466,474,3860,4521,6000,5727,2
1,182265,166364,174273,143022,73180,17105,41898,140466.75,39771,45286,...,17954,20201,25080,3631,1348,14731,13997,26852,21502,1
2,27457,21627,26070,18434,9820,12875,6015,26070.0,6549,4707,...,1019,3968,5090,894,107,815,4960,3419,1820,1
3,22915,18743,22509,15859,7953,5047,5201,22509.0,6958,3971,...,953,4170,3268,185,22,808,2833,2926,1860,1
4,57322,46501,52696,39475,21578,761,14106,52696.0,12740,13583,...,2690,10165,6561,2347,117,1756,7980,6956,5639,1


<b>Show all columns in the dataset</b>

In [4]:
# Report how many columns are features (every column except the target),
# then display the full list of column names.
print('There are', df_death.shape[1] - 1, 'features, and the target `Death Rate Categories`:')
df_death.columns.tolist()
# we will not include state and county names or state abbr

There are 20 features, and the target `Death Rate Categories`:


['Total Population',
 'Total Citizen Income',
 'Hispanic or Latino',
 'Total Citizen Educated in US',
 'Total Households',
 'White Race',
 'Total Families',
 'Hispanic or Latino_iqr',
 'Citizen High School  Graduate',
 'Citizen Some College  Education',
 'Citizen College Degree',
 'Citizen Income over 75000',
 'Citizen No Income',
 'Citizen Income 1-9999',
 'Pacific Islander Race',
 'Native American Race',
 'Citizen Graduate or Professional Degree',
 'Citizen Less than High School  Education',
 'Citizen Income 15000-24999',
 'Citizen Income 35000-49999',
 'Death Rate Categories']

In [5]:
# Strip leading/trailing whitespace from the column names read from the CSV
df_death.columns = [column_name.strip() for column_name in df_death.columns]

In [8]:
# Store the target column using pandas' category dtype
df_death['Death Rate Categories'] = df_death['Death Rate Categories'].astype(dtype='category')

<a id = ""> <h2> 1.0 Basic Model Random Forest </h2> </a>

> <b>Declare Features and Target</b>

In [15]:
# Define Features and Target variables
# `features` is a hand-written list of predictor names; X below is selected by
# column position and does not use this list.
# NOTE(review): 'Citizen Income 15000-24999' is listed twice — confirm whether one
# entry was meant to be a different income band.
features = [
    'Total Population', 'Total Citizen Income', 'Hispanic or Latino',
    'Total Citizen Educated in US', 'Total Households', 'White Race',
    'Total Families', 'Citizen College Degree', 'Citizen Some College  Education',
    'Citizen Income over 75000', 'Citizen High School  Graduate',
    'Citizen No Income', 'Pacific Islander Race',
    'Citizen Graduate or Professional Degree', 'Citizen Income 1-9999',
    'Citizen Less than High School  Education',
    'Citizen Income 15000-24999', 'Citizen Income 15000-24999',
    'Citizen Income 35000-49999',
]

# Predictors: every column except the last one (the target).
X = df_death.iloc[:, :-1]
# Leave out any 'Unnamed: N' column: that is a row index that was saved to the CSV
# and read back as data, so it carries no information about the target.
X = X.loc[:, ~X.columns.astype(str).str.startswith('Unnamed')]
Y = df_death.iloc[:, -1]  # Target is the last column in the dataframe: 'Death Rate Categories'

> <b>Split the data</b>

In [16]:
# Hold out 30% of the rows as the test set and train on the remaining 70%.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2019,
)

> <b>Model Development and Prediction</b>

In [17]:
# Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest classifier; the fixed seed keeps results repeatable
clf = RandomForestClassifier(random_state=2019)

# Train the model on the training set
clf.fit(X_train, y_train)

# Predict the class of every row in the held-out test set
y_pred = clf.predict(X_test)

# Check the predictive performance using the actual and predicted values.
# zero_division=0 reports 0.00 for a class that is never predicted, without
# emitting an undefined-metric warning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.71      0.83      0.76       390
           1       0.39      0.49      0.43       203
           2       0.41      0.38      0.40       228
           3       0.29      0.06      0.09        89
           4       0.50      0.07      0.13        27
           5       0.00      0.00      0.00         5

    accuracy                           0.55       942
   macro avg       0.38      0.30      0.30       942
weighted avg       0.52      0.55      0.52       942



  _warn_prf(average, modifier, msg_start, len(result))


<a id = ""> <h2> Target Variable 'Case Rate Categories' </h2> </a>
___

In [28]:
# Load the case-rate dataset from its CSV file into a pandas DataFrame
df_cases = pd.read_csv(filepath_or_buffer="final_case_data.csv")
# Preview the first five rows
# NOTE(review): the rendered preview shows an 'Unnamed: 0' column, which looks like
# a saved row index read back as data — confirm before using it as a feature.
df_cases.head(n=5)

Unnamed: 0,Total Population,Total Citizen Income,Hispanic or Latino,Total Citizen Educated in US,Total Households,White Race,Total Families,Citizen College Degree,Citizen Some College Education,Citizen Income over 75000,Citizen High School Graduate,Citizen No Income,Pacific Islander Race,Citizen Graduate or Professional Degree,Citizen Income 1-9999,Citizen Less than High School Education,Citizen Income 15000-24999,Citizen Income 15000-24999.1,Citizen Income 35000-49999,Case Rate Categories
0,54571,44109,53261,36757,20221,9643,14613,5316,10697,5298,12363,6377,466,3860,6955,4521,6000,6000,5727,2
1,182265,166364,174273,143022,73180,17105,41898,29237,45286,17954,39771,20201,3631,14731,25080,13997,26852,26852,21502,2
2,27457,21627,26070,18434,9820,12875,6015,1403,4707,1019,6549,3968,894,815,5090,4960,3419,3419,1820,3
3,22915,18743,22509,15859,7953,5047,5201,1289,3971,953,6958,4170,185,808,3268,2833,2926,2926,1860,2
4,57322,46501,52696,39475,21578,761,14106,3416,13583,2690,12740,10165,2347,1756,6561,7980,6956,6956,5639,2


In [29]:
# Strip leading/trailing whitespace from the column names read from the CSV
df_cases.columns = [column_name.strip() for column_name in df_cases.columns]

In [30]:
# `features` is a hand-written list of predictor names; X below is selected by
# column position and does not use this list.
# NOTE(review): 'Citizen Income 15000-24999' is listed twice, and the loaded frame
# has a matching 'Citizen Income 15000-24999.1' column whose previewed values equal
# the original's — confirm whether the duplicate column should be dropped.
features = [
    'Total Population', 'Total Citizen Income', 'Hispanic or Latino',
    'Total Citizen Educated in US', 'Total Households', 'White Race',
    'Total Families', 'Citizen College Degree', 'Citizen Some College  Education',
    'Citizen Income over 75000', 'Citizen High School  Graduate',
    'Citizen No Income', 'Pacific Islander Race',
    'Citizen Graduate or Professional Degree', 'Citizen Income 1-9999',
    'Citizen Less than High School  Education',
    'Citizen Income 15000-24999', 'Citizen Income 15000-24999',
    'Citizen Income 35000-49999',
]

# Predictors: every column except the last one (the target).
X = df_cases.iloc[:, :-1]
# Leave out any 'Unnamed: N' column: that is a row index that was saved to the CSV
# and read back as data, so it carries no information about the target.
X = X.loc[:, ~X.columns.astype(str).str.startswith('Unnamed')]
Y = df_cases.iloc[:, -1]  # Target is the last column in the dataframe: 'Case Rate Categories'

> <b>Split the data</b>

In [31]:
# Hold out 30% of the rows as the test set and train on the remaining 70%.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2019,
)

> <b>Model Development and Prediction</b>

In [32]:
# Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest classifier; the fixed seed keeps results repeatable
clf = RandomForestClassifier(random_state=2019)

# Train the model on the training set
clf.fit(X_train, y_train)

# Predict the class of every row in the held-out test set
y_pred = clf.predict(X_test)

# Check the predictive performance using the actual and predicted values.
# zero_division=0 reports 0.00 for a class that is never predicted, without
# emitting an undefined-metric warning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.63      0.47      0.54        40
           1       0.64      0.79      0.70       431
           2       0.54      0.54      0.54       352
           3       0.29      0.08      0.12        75
           4       0.00      0.00      0.00        23
           5       0.50      0.05      0.09        21

    accuracy                           0.59       942
   macro avg       0.43      0.32      0.33       942
weighted avg       0.55      0.59      0.56       942

