# Random Forest 

Loading Modules

In [3]:
# import modules
import pandas as pd
import numpy as np

#Import scikit-learn dataset library
from sklearn import datasets

# Import train_test_split function
from sklearn.model_selection import train_test_split

# import oversampling and undersampling packages
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler 

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report 

# Cross Validation packages
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# import required modules for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# import module for gridsearch (to find optimal hyper-parameters)
from sklearn.model_selection import GridSearchCV

# import module (to test execution time of a codeblock to run)
import time

import warnings 

from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC



In [4]:
# packages for Receiver Operating Characteristic (ROC) with cross validation
# source: https://scikit-learn.org/0.18/auto_examples/model_selection/plot_roc_crossval.html
print(__doc__)

import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import plot_roc_curve

Automatically created module for IPython interactive environment


<a id = ""> <h2> Target Variable 'Death Rate Categories' </h2> </a>
___

<b> read csv file to a pandas dataframe

In [5]:
#Load dataset
# read csv file to a pandas dataframe
df_death = pd.read_csv("census_covid_cat_final.csv")

<b> show all columns in dataset

In [6]:
# show all columns in dataset
print('There are', len(df_death.columns)-1, 'features, and the target `Death Rate Categories`:')
list(df_death.columns)
# we will not include state and county names or state abbr

There are 42 features, and the target `Death Rate Categories`:


['Full Name',
 'Country Name',
 'State',
 'State Abbr',
 ' Total Population ',
 ' Households SNAP ',
 ' Estimated Individuals SNAP ',
 ' Total Citizen Educated in US ',
 ' Citizen Less than High School  Education ',
 ' Citizen High School  Graduate ',
 ' Citizen Some College  Education ',
 ' Citizen College Degree  ',
 ' Citizen Graduate or Professional Degree ',
 ' Total Citizen Income ',
 ' Citizen No Income ',
 'Citizen Income 1-9999',
 'Citizen Income 10000-14999',
 'Citizen Income 15000-24999',
 'Citizen Income 25000-34999',
 'Citizen Income 35000-49999',
 'Citizen Income 50000-64999',
 'Citizen Income 65000-74999',
 'Citizen Income over 75000',
 ' One Race Population ',
 ' White Race ',
 ' Black Race ',
 ' Native American Race ',
 ' Asian Race ',
 ' Pacific Islander Race ',
 ' Other Race Alone ',
 ' Hispanic or Latino ',
 'Median Age',
 'Male Median Age',
 'Female Median Age',
 ' Total Households ',
 ' Average Household Size ',
 ' Total Families ',
 'Total Confirmed Cases',
 'Tot

In [7]:
# trim the spaces of the cells from the csv file
df_death.columns = df_death.columns.to_series().apply(lambda x: x.strip())

In [8]:
# need to change int64 to cat type for target vars
df_death['Death Rate Categories'] = df_death['Death Rate Categories'].astype('category')

<a id = ""> <h2> 1.0 Basic Model Random Forest </h2> </a>

> <b> Declare Features and Target

In [9]:
# Define Features and Target variables
features = ['Total Population','Households SNAP','Estimated Individuals SNAP','Total Citizen Educated in US',
                       'Citizen Less than High School  Education','Citizen High School  Graduate',
                       'Citizen Some College  Education','Citizen College Degree','Citizen Graduate or Professional Degree',
                       'Total Citizen Income','Citizen No Income','Citizen Income 1-9999','Citizen Income 10000-14999',
                       'Citizen Income 15000-24999','Citizen Income 25000-34999','Citizen Income 35000-49999',
                       'Citizen Income 50000-64999','Citizen Income 65000-74999','Citizen Income over 75000',
                       'One Race Population','White Race','Black Race','Native American Race','Asian Race',
                       'Pacific Islander Race','Other Race Alone','Hispanic or Latino','Median Age', 
                        'Male Median Age','Female Median Age','Total Households',
                       'Average Household Size','Total Families']

X = df_death[features] # from the list above
Y = df_death.iloc[:, -1] # Target is the last column in the dataframe: 'Death Rate Categories'

> <b> Split the data

In [10]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=2019) # 70% training and 30% test

> <b> Model Development and Prediction

In [11]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Random Forest Classifier
clf = RandomForestClassifier(random_state=2019)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

#check the predictive performance using the actual and predicted values
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.82      0.75       390
           1       0.41      0.50      0.45       203
           2       0.40      0.35      0.37       228
           3       0.28      0.06      0.09        89
           4       0.38      0.11      0.17        27
           5       0.00      0.00      0.00         5

    accuracy                           0.54       942
   macro avg       0.36      0.31      0.31       942
weighted avg       0.50      0.54      0.51       942



  _warn_prf(average, modifier, msg_start, len(result))


## After norm data

In [12]:
df_death_norm = pd.read_csv("final_data.csv")

In [13]:
# trim the spaces of the cells from the csv file
df_death_norm.columns = df_death_norm.columns.to_series().apply(lambda x: x.strip())

In [14]:
features_norm = ['Total Population_iqr_bc',
                        'Total Citizen Educated in US_iqr_bc','Citizen Less than High School  Education_iqr_bc',
                        'Citizen High School  Graduate_iqr_bc','Citizen Some College  Education_iqr_bc',
                         'Citizen College Degree_iqr_bc','Total Citizen Income_iqr_bc','Citizen No Income_iqr_bc',
                          'Citizen Income 1-9999_iqr_bc','Citizen Income 10000-14999_iqr_bc','Citizen Income 15000-24999_iqr_bc',
                         'Citizen Income 25000-34999_iqr_bc','Citizen Income 35000-49999_iqr_bc',
                        'Citizen Income 50000-64999_iqr_bc','Citizen Income over 75000_iqr_bc',
                        'Hispanic or Latino_iqr_bc','Median Age_iqr_bc',
                         'Male Median Age_iqr_bc', 'Female Median Age_iqr_bc','Total Households_iqr_bc',
                       'Average Household Size_iqr_bc']


X = df_death_norm[features_norm] # from the list above
Y = df_death_norm['Death Rate Categories'] # Target is the last column in the dataframe: 'Death Rate Categories'

> <b> Split the data

In [15]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=2019) # 70% training and 30% test

> <b> Model Development and Prediction

In [16]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Random Forest Classifier
clf = RandomForestClassifier(random_state=2019)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

#check the predictive performance using the actual and predicted values
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.82      0.73       390
           1       0.41      0.48      0.44       203
           2       0.40      0.35      0.37       228
           3       0.00      0.00      0.00        89
           4       0.12      0.04      0.06        27
           5       0.00      0.00      0.00         5

    accuracy                           0.53       942
   macro avg       0.27      0.28      0.27       942
weighted avg       0.46      0.53      0.49       942



<a id = ""> <h2> Target Variable 'Case Rate Categories' </h2> </a>
___

In [20]:
df_cases = pd.read_csv("census_covid_cat_final.csv")

In [21]:
# trim the spaces of the cells from the csv file
df_cases.columns = df_cases.columns.to_series().apply(lambda x: x.strip())

In [22]:
# drop Death Rate Categories -- need Case Rate Categories to be the target variable
df_cases = df_cases.iloc[:, :-1]

In [23]:
X = df_cases.iloc[:, 4:-7] # Features is all columns in the dataframe except the last 6 columns
Y = df_cases.iloc[:, -1] # Target is the last column in the dataframe: 'Case Rate Categories'

In [24]:
features_norm = ['Total Population_iqr_bc',
                        'Total Citizen Educated in US_iqr_bc','Citizen Less than High School  Education_iqr_bc',
                        'Citizen High School  Graduate_iqr_bc','Citizen Some College  Education_iqr_bc',
                         'Citizen College Degree_iqr_bc','Total Citizen Income_iqr_bc','Citizen No Income_iqr_bc',
                          'Citizen Income 1-9999_iqr_bc','Citizen Income 10000-14999_iqr_bc','Citizen Income 15000-24999_iqr_bc',
                         'Citizen Income 25000-34999_iqr_bc','Citizen Income 35000-49999_iqr_bc',
                        'Citizen Income 50000-64999_iqr_bc','Citizen Income over 75000_iqr_bc',
                        'Hispanic or Latino_iqr_bc','Median Age_iqr_bc',
                         'Male Median Age_iqr_bc', 'Female Median Age_iqr_bc','Total Households_iqr_bc',
                       'Average Household Size_iqr_bc']


X = df_death_norm[features_norm] # from the list above
Y = df_death_norm['Death Rate Categories'] # Target is the last column in the dataframe: 'Death Rate Categories'

> <b> Split the data

In [25]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=2019) # 70% training and 30% test

> <b> Model Development and Prediction

In [26]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Random Forest Classifier
clf = RandomForestClassifier(random_state=2019)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

#check the predictive performance using the actual and predicted values
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.82      0.73       390
           1       0.41      0.48      0.44       203
           2       0.40      0.35      0.37       228
           3       0.00      0.00      0.00        89
           4       0.12      0.04      0.06        27
           5       0.00      0.00      0.00         5

    accuracy                           0.53       942
   macro avg       0.27      0.28      0.27       942
weighted avg       0.46      0.53      0.49       942

