# Decision Tree #

<b> Table of Contents: </b>
<br> [Import and Load the Dataset](#10)
<br> [Data Exploration and Analysis](#20)
<br> [Decision Tree Model: Target Variable - COVID Confirmed Cases](#30)
<br> [Decision Tree Model: Target Variable - COVID Deaths](#40)


<a id = "10"> <h2> Import and Load the Dataset </h2> </a>
___

_Import Modules_

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.preprocessing import Normalizer

_Loading the Dataset_

In [2]:
# load the dataset
df_tristate_cases = pd.read_csv("tristate_data_cases.csv")

In [3]:
df_tristate_cases.head()

Unnamed: 0,NAME,Population,Covid Case Rate (per 1000),Covid Death Rate (per 1000),Households SNAP,Estimated Individuals SNAP,Total Citizen Educated in US,Citizen Less than High School Education,Citizen High School Graduate,Citizen Some College Education,...,Pacific Islander Race,Other Race Alone,Hispanic or Latino,Median Age,Male Median Age,Female Median Age,Total Households,Average Household Size,Total Families,Covid Confirmed Cases
0,"Albany County, New York",304204,289.092188,13.112911,15087,34247,204833,15255,50589,55932,...,4744,7647,289287,38.5,36.8,40.0,126251,2,60631,87943
1,"Allegany County, New York",48946,53.140195,2.574266,2994,7305,30331,3219,11464,9645,...,206,557,48276,37.8,36.2,39.2,18208,2,10576,2601
2,"Bronx County, New York",1385108,1837.024983,168.955056,184934,512267,927374,264309,256066,227450,...,351011,73243,643695,32.8,30.6,34.9,483449,3,368196,2544478
3,"Broome County, New York",200600,120.727817,8.983051,13226,30684,129802,12429,40700,40265,...,1912,5087,193822,40.2,38.1,42.2,82167,2,40559,24218
4,"Cattaraugus County, New York",80317,48.246324,1.593685,5801,13980,53201,6188,21015,15930,...,305,1363,78972,40.7,39.8,41.6,32263,2,18801,3875


<a id = "20"> <h2> Data Exploration and Analysis </h2> </a>
___

In [4]:
df_tristate_cases.describe()

Unnamed: 0,Population,Covid Case Rate (per 1000),Covid Death Rate (per 1000),Households SNAP,Estimated Individuals SNAP,Total Citizen Educated in US,Citizen Less than High School Education,Citizen High School Graduate,Citizen Some College Education,Citizen College Degree,...,Pacific Islander Race,Other Race Alone,Hispanic or Latino,Median Age,Male Median Age,Female Median Age,Total Households,Average Household Size,Total Families,Covid Confirmed Cases
count,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,...,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0
mean,348836.2,494.782434,30.00964,18293.857143,47275.835165,245107.7,30821.10989,65796.32967,59239.230769,51549.593407,...,24173.087912,10097.010989,288933.4,40.156044,38.840659,41.445055,130804.417582,2.43956,79199.538462,345605.7
std,468867.0,569.68126,36.520683,35039.23259,93199.614618,339009.4,58806.714504,84838.264728,73146.413099,78263.212835,...,57081.322656,17237.898715,362236.9,3.098071,3.229894,3.023473,174684.685378,0.499083,106185.924341,706010.2
min,4836.0,16.823179,0.0,112.0,235.0,3604.0,441.0,1155.0,1286.0,313.0,...,4.0,53.0,4785.0,29.8,28.5,31.1,2262.0,2.0,795.0,304.0
25%,64956.0,88.358613,3.93298,3339.0,8134.5,44547.5,4884.5,15221.0,13450.5,5913.0,...,526.0,1006.5,62624.5,38.5,37.0,39.9,25598.0,2.0,13431.0,5696.0
50%,149265.0,230.123991,13.455905,6896.0,17693.0,95679.0,9121.0,31537.0,26080.0,18515.0,...,2211.0,3036.0,126664.0,40.4,39.2,41.9,54244.0,2.0,30884.0,36536.0
75%,467878.0,698.363567,48.039981,16277.0,41301.5,322509.5,29749.5,85696.0,78371.0,64578.5,...,18173.0,11900.0,377046.0,41.7,40.55,42.95,166551.5,3.0,108228.0,297332.0
max,2504700.0,2523.785079,168.955056,231794.0,623526.0,1789355.0,345445.0,461228.0,376485.0,397987.0,...,351011.0,100913.0,2008415.0,51.3,50.9,51.8,916856.0,3.0,594378.0,3587059.0


In [5]:
df_tristate_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 38 columns):
NAME                                          91 non-null object
Population                                    91 non-null int64
Covid Case Rate (per 1000)                    91 non-null float64
Covid Death Rate (per 1000)                   91 non-null float64
 Households SNAP                              91 non-null int64
 Estimated Individuals SNAP                   91 non-null int64
 Total Citizen Educated in US                 91 non-null int64
 Citizen Less than High School  Education     91 non-null int64
 Citizen High School  Graduate                91 non-null int64
 Citizen Some College  Education              91 non-null int64
 Citizen College Degree                       91 non-null int64
 Citizen Graduate or Professional Degree      91 non-null int64
 Total Citizen Income                         91 non-null int64
 Citizen No Income                            91 non-null in

In [6]:
df_tristate_cases.columns.values

array(['NAME', 'Population', 'Covid Case Rate (per 1000)',
       'Covid Death Rate (per 1000)', ' Households SNAP ',
       ' Estimated Individuals SNAP ', ' Total Citizen Educated in US ',
       ' Citizen Less than High School  Education ',
       ' Citizen High School  Graduate ',
       ' Citizen Some College  Education ', ' Citizen College Degree  ',
       ' Citizen Graduate or Professional Degree ',
       ' Total Citizen Income ', ' Citizen No Income ',
       'Citizen Income $1-$9,999', 'Citizen Income $10,000-$14,999',
       'Citizen Income $15,000-$24,999', 'Citizen Income $25,000-$34,999',
       'Citizen Income $35,000-$49,999', 'Citizen Income $50,000-$64,999',
       'Citizen Income $65,000-$74,999', 'Citizen Income $75,000 +',
       ' Total Population ', ' One Race Population ', ' White Race ',
       ' Black Race ', ' Native American Race ', ' Asian Race ',
       ' Pacific Islander Race ', ' Other Race Alone ',
       ' Hispanic or Latino ', 'Median Age', 'Male Med

__Take Away__

There are 37 features plus the 2 target features that will be tested while running different decision tree models: COVID Confirmed Cases and COVID Deaths. 

Testing features that have shown to impact COVID cases.
1. Income and 
2. Race (Specifically 'Black Race' and 'Hispantic or Latino')

In [None]:
# WORK ON THIS SECTION
confirmed_case = 'confirmed'
death = 'not survived'
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(10, 4))

low_income = df_tristate_cases[df_tristate_cases['Citizen Income $10,000-$14,999']=='female']

men = train_df[train_df['Sex']=='male']
ax = sns.distplot(women[women['Survived']==1].Age.dropna(), bins=18, label = survived, ax = axes[0], kde =False)
ax = sns.distplot(women[women['Survived']==0].Age.dropna(), bins=40, label = not_survived, ax = axes[0], kde =False)
ax.legend()
ax.set_title('Female')
ax = sns.distplot(men[men['Survived']==1].Age.dropna(), bins=18, label = survived, ax = axes[1], kde = False)
ax = sns.distplot(men[men['Survived']==0].Age.dropna(), bins=40, label = not_survived, ax = axes[1], kde = False)
ax.legend()
_ = ax.set_title('Male')

<a id = "30"> <h2> Decision Tree Model: Target Variable - COVID Confirmed Cases </h2> </a>
___

In [7]:
# ---------------------------------------------------------------------------------------------------
# Extract needed features as a list of columns 
# excluding target variable 'COVID Confirmed Cases' 
# ---------------------------------------------------------------------------------------------------
## create a list of all the columns
list_all_columns = df_tristate_cases.columns.tolist()

## remove features in list, which are not needed for testing for feature selection, namely:
list_remove_features = ['NAME','Households SNAP', 'Total Families','Covid Case Rate (per 1000)','Covid Death Rate (per 1000)','Covid Confirmed Cases'] 

# Create sets of a,b
setA = set(list_all_columns)
setB = set(list_remove_features)

# Get new set with elements that are only in a but not in b
setlist_X_columns = setA.difference(list_remove_features)

# convert set object to a list
list_X_columns = list(setlist_X_columns)

# Define dependent variables
X = df_tristate_cases[list_X_columns].to_numpy()

#show the columns that need to be included as features
## sort alphabetically
list_X_columns.sort()
## show list 
list_X_columns

[' Asian Race ',
 ' Average Household Size ',
 ' Black Race ',
 ' Citizen College Degree  ',
 ' Citizen Graduate or Professional Degree ',
 ' Citizen High School  Graduate ',
 ' Citizen Less than High School  Education ',
 ' Citizen No Income ',
 ' Citizen Some College  Education ',
 ' Estimated Individuals SNAP ',
 ' Hispanic or Latino ',
 ' Households SNAP ',
 ' Native American Race ',
 ' One Race Population ',
 ' Other Race Alone ',
 ' Pacific Islander Race ',
 ' Total Citizen Educated in US ',
 ' Total Citizen Income ',
 ' Total Families ',
 ' Total Households ',
 ' Total Population ',
 ' White Race ',
 'Citizen Income $1-$9,999',
 'Citizen Income $10,000-$14,999',
 'Citizen Income $15,000-$24,999',
 'Citizen Income $25,000-$34,999',
 'Citizen Income $35,000-$49,999',
 'Citizen Income $50,000-$64,999',
 'Citizen Income $65,000-$74,999',
 'Citizen Income $75,000 +',
 'Female Median Age',
 'Male Median Age',
 'Median Age',
 'Population']

In [8]:
# Define Features and Target variables
X = df_tristate_cases[list_X_columns]
Y = df_tristate_cases['Covid Confirmed Cases']

In [9]:
# Split dataset into training set and test set 
# 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,random_state=2019) 

In [10]:
# method 1 for decision tree
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, y_train)  
y_pred = decision_tree.predict(X_test)  
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)


In [None]:
# method 2 for decision tree 
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [11]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.0


In [12]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         304       0.00      0.00      0.00       1.0
         577       0.00      0.00      0.00       0.0
        2601       0.00      0.00      0.00       1.0
        2603       0.00      0.00      0.00       0.0
        3302       0.00      0.00      0.00       0.0
        3875       0.00      0.00      0.00       1.0
        3917       0.00      0.00      0.00       0.0
        4362       0.00      0.00      0.00       0.0
        4423       0.00      0.00      0.00       1.0
        5039       0.00      0.00      0.00       1.0
        5173       0.00      0.00      0.00       0.0
        5584       0.00      0.00      0.00       0.0
        7702       0.00      0.00      0.00       0.0
        8545       0.00      0.00      0.00       0.0
        8556       0.00      0.00      0.00       1.0
       10751       0.00      0.00      0.00       1.0
       10814       0.00      0.00      0.00       1.0
       13840       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
print(metrics.confusion_matrix(y_test, y_pred))

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


<a id = "40"> <h2> Decision Tree Model: Target Variable - COVID Deaths </h2> </a>
___

_Loading the Dataset_

In [13]:
# load the dataset
df_tristate_deaths = pd.read_csv("tristate_data_deaths.csv")

In [14]:
df_tristate_deaths.head()

Unnamed: 0,NAME,Population,Covid Case Rate (per 1000),Covid Death Rate (per 1000),Households SNAP,Estimated Individuals SNAP,Total Citizen Educated in US,Citizen Less than High School Education,Citizen High School Graduate,Citizen Some College Education,...,Pacific Islander Race,Other Race Alone,Hispanic or Latino,Median Age,Male Median Age,Female Median Age,Total Households,Average Household Size,Total Families,Covid Deaths
0,"Albany County, New York",304204,289.092188,13.112911,15087,34247,204833,15255,50589,55932,...,4744,7647,289287,38.5,36.8,40.0,126251,2,60631,3989
1,"Allegany County, New York",48946,53.140195,2.574266,2994,7305,30331,3219,11464,9645,...,206,557,48276,37.8,36.2,39.2,18208,2,10576,126
2,"Bronx County, New York",1385108,1837.024983,168.955056,184934,512267,927374,264309,256066,227450,...,351011,73243,643695,32.8,30.6,34.9,483449,3,368196,234021
3,"Broome County, New York",200600,120.727817,8.983051,13226,30684,129802,12429,40700,40265,...,1912,5087,193822,40.2,38.1,42.2,82167,2,40559,1802
4,"Cattaraugus County, New York",80317,48.246324,1.593685,5801,13980,53201,6188,21015,15930,...,305,1363,78972,40.7,39.8,41.6,32263,2,18801,128


In [15]:
df_tristate_deaths.describe()

Unnamed: 0,Population,Covid Case Rate (per 1000),Covid Death Rate (per 1000),Households SNAP,Estimated Individuals SNAP,Total Citizen Educated in US,Citizen Less than High School Education,Citizen High School Graduate,Citizen Some College Education,Citizen College Degree,...,Pacific Islander Race,Other Race Alone,Hispanic or Latino,Median Age,Male Median Age,Female Median Age,Total Households,Average Household Size,Total Families,Covid Deaths
count,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,...,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0
mean,348836.2,494.782434,30.00964,18293.857143,47275.835165,245107.7,30821.10989,65796.32967,59239.230769,51549.593407,...,24173.087912,10097.010989,288933.4,40.156044,38.840659,41.445055,130804.417582,2.43956,79199.538462,24234.989011
std,468867.0,569.68126,36.520683,35039.23259,93199.614618,339009.4,58806.714504,84838.264728,73146.413099,78263.212835,...,57081.322656,17237.898715,362236.9,3.098071,3.229894,3.023473,174684.685378,0.499083,106185.924341,59348.612086
min,4836.0,16.823179,0.0,112.0,235.0,3604.0,441.0,1155.0,1286.0,313.0,...,4.0,53.0,4785.0,29.8,28.5,31.1,2262.0,2.0,795.0,0.0
25%,64956.0,88.358613,3.93298,3339.0,8134.5,44547.5,4884.5,15221.0,13450.5,5913.0,...,526.0,1006.5,62624.5,38.5,37.0,39.9,25598.0,2.0,13431.0,222.5
50%,149265.0,230.123991,13.455905,6896.0,17693.0,95679.0,9121.0,31537.0,26080.0,18515.0,...,2211.0,3036.0,126664.0,40.4,39.2,41.9,54244.0,2.0,30884.0,1815.0
75%,467878.0,698.363567,48.039981,16277.0,41301.5,322509.5,29749.5,85696.0,78371.0,64578.5,...,18173.0,11900.0,377046.0,41.7,40.55,42.95,166551.5,3.0,108228.0,19315.0
max,2504700.0,2523.785079,168.955056,231794.0,623526.0,1789355.0,345445.0,461228.0,376485.0,397987.0,...,351011.0,100913.0,2008415.0,51.3,50.9,51.8,916856.0,3.0,594378.0,347696.0


In [24]:
df_tristate_deaths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 38 columns):
NAME                                          91 non-null object
Population                                    91 non-null int64
Covid Case Rate (per 1000)                    91 non-null float64
Covid Death Rate (per 1000)                   91 non-null float64
 Households SNAP                              91 non-null int64
 Estimated Individuals SNAP                   91 non-null int64
 Total Citizen Educated in US                 91 non-null int64
 Citizen Less than High School  Education     91 non-null int64
 Citizen High School  Graduate                91 non-null int64
 Citizen Some College  Education              91 non-null int64
 Citizen College Degree                       91 non-null int64
 Citizen Graduate or Professional Degree      91 non-null int64
 Total Citizen Income                         91 non-null int64
 Citizen No Income                            91 non-null in

In [25]:
# ---------------------------------------------------------------------------------------------------
# Extract needed features as a list of columns 
# excluding target variable 'COVID Deaths' 
# ---------------------------------------------------------------------------------------------------
## create a list of all the columns
list_all_columns = df_tristate_deaths.columns.tolist()

## remove features in list, which are not needed for testing for feature selection, namely:
list_remove_features = ['NAME','Households SNAP', 'Total Families', 'Covid Case Rate (per 1000)', 'Covid Death Rate (per 1000)', 'Covid Deaths'] 

# Create sets of a,b
setA = set(list_all_columns)
setB = set(list_remove_features)

# Get new set with elements that are only in a but not in b
setlist_X_columns = setA.difference(list_remove_features)

# convert set object to a list
list_X_columns = list(setlist_X_columns)

# Define dependent variables
X = df_tristate_deaths[list_X_columns].to_numpy()

#show the columns that need to be included as features
## sort alphabetically
list_X_columns.sort()
## show list 
list_X_columns

[' Asian Race ',
 ' Average Household Size ',
 ' Black Race ',
 ' Citizen College Degree  ',
 ' Citizen Graduate or Professional Degree ',
 ' Citizen High School  Graduate ',
 ' Citizen Less than High School  Education ',
 ' Citizen No Income ',
 ' Citizen Some College  Education ',
 ' Estimated Individuals SNAP ',
 ' Hispanic or Latino ',
 ' Households SNAP ',
 ' Native American Race ',
 ' One Race Population ',
 ' Other Race Alone ',
 ' Pacific Islander Race ',
 ' Total Citizen Educated in US ',
 ' Total Citizen Income ',
 ' Total Families ',
 ' Total Households ',
 ' Total Population ',
 ' White Race ',
 'Citizen Income $1-$9,999',
 'Citizen Income $10,000-$14,999',
 'Citizen Income $15,000-$24,999',
 'Citizen Income $25,000-$34,999',
 'Citizen Income $35,000-$49,999',
 'Citizen Income $50,000-$64,999',
 'Citizen Income $65,000-$74,999',
 'Citizen Income $75,000 +',
 'Covid Deaths ',
 'Female Median Age',
 'Male Median Age',
 'Median Age',
 'Population']

In [26]:
# Define Features and Target variables
X = df_tristate_deaths[list_X_columns]
Y = df_tristate_deaths['Covid Deaths']

KeyError: 'Covid Deaths'

In [22]:
# Define Features and Target variables
# does not work because I want to drop certain columns 'NAME' is an obj
#X = df_tristate_deaths.iloc[:, :-1] #features is all columns in the df except the last column
#Y = df_tristate_deaths.iloc[:, -1] # target is the last column of the df 'Covid Deaths'

In [23]:
# Split dataset into training set and test set 
# 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,random_state=2019) 

ValueError: Found input variables with inconsistent numbers of samples: [90, 91]

In [21]:
# dont run yet
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

ValueError: could not convert string to float: 'Schoharie County, New York'

In [None]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
print(metrics.classification_report(y_test, y_pred))

In [None]:
print(metrics.confusion_matrix(y_test, y_pred))