# ![](https://img.webmd.com/dtmcms/live/webmd/consumer_assets/site_images/articles/health_tools/did_you_know_this_could_lead_to_heart_disease_slideshow/493ss_thinkstock_rf_heart_illustration.jpg) EDA & Feature Engineering...

### Importing Libraries

In [1]:
# Import our libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

### Importing Dataset

datasource : https://chronicdata.cdc.gov/Heart-Disease-Stroke-Prevention/National-Vital-Statistics-System-NVSS-National-Car/kztq-p2jf

In [2]:
# Load the raw dataset; low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk.
raw_csv_path = './datasets/Stroke_data.csv'
df = pd.read_csv(raw_csv_path, low_memory=False)

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Year,LocationAbbr,LocationDesc,DataSource,PriorityArea1,PriorityArea2,PriorityArea3,PriorityArea4,Category,Topic,...,Break_Out_Category,Break_Out,CategoryId,TopicId,IndicatorID,Data_Value_TypeID,BreakOutCategoryId,BreakOutId,LocationID,GeoLocation
0,2013,IL,Illinois,NVSS,,,,,Cardiovascular Diseases,Stroke,...,Gender,Female,C1,T6,NV007,AgeStdz,BOC02,GEN02,17,POINT (-88.99771017799969 40.48501028300046)
1,2000,MT,Montana,NVSS,,,,,Cardiovascular Diseases,Major Cardiovascular Disease,...,Race,Other,C1,T1,NV001,AgeStdz,BOC04,RAC07,30,POINT (-109.42442064499971 47.06652897200047)
2,2000,VT,Vermont,NVSS,,,,,Cardiovascular Diseases,Major Cardiovascular Disease,...,Age,75+,C1,T1,NV001,Crude,BOC03,AGE08,50,POINT (-72.51764079099962 43.62538123900049)
3,2001,KS,Kansas,NVSS,,,,,Cardiovascular Diseases,Stroke,...,Race,Hispanic,C1,T6,NV007,Crude,BOC04,RAC04,20,POINT (-98.20078122699965 38.34774030000045)
4,2004,AL,Alabama,NVSS,,,,,Cardiovascular Diseases,Stroke,...,Age,25-44,C1,T6,NV007,Crude,BOC03,AGE04,1,POINT (-86.63186076199969 32.84057112200048)


### Looking for target variable

First, we will look at the unique values, and the number of unique values, in selected columns to find out how many categories and diseases we are dealing with.

In [4]:
# Distinct values of Break_Out_Category, in order of first appearance.
pd.unique(df['Break_Out_Category'])

array(['Gender', 'Race', 'Age', 'Overall'], dtype=object)

In [5]:
# Distinct values of Break_Out, in order of first appearance.
pd.unique(df['Break_Out'])

array(['Female', 'Other', '75+', 'Hispanic', '25-44', '45-64', '65+',
       'Non-Hispanic Black', '35+', 'Male', 'Non-Hispanic White',
       'Overall', '18-24'], dtype=object)

In [6]:
# Number of distinct (non-null) values in Break_Out.
df['Break_Out'].nunique(dropna=True)

13

In [7]:
# Distinct values of Topic, in order of first appearance.
pd.unique(df['Topic'])

array(['Stroke', 'Major Cardiovascular Disease',
       'Diseases of the Heart (Heart Disease)',
       'Acute Myocardial Infarction (Heart Attack)', 'Heart Failure',
       'Coronary Heart Disease'], dtype=object)

In [8]:
# Number of distinct (non-null) values in Topic.
df['Topic'].nunique(dropna=True)

6

In [9]:
# Number of distinct (non-null) values in Category.
df['Category'].nunique(dropna=True)

1

In [10]:
# Show the break-out category next to its specific break-out value.
breakout_columns = ['Break_Out_Category', 'Break_Out']
df[breakout_columns]

Unnamed: 0,Break_Out_Category,Break_Out
0,Gender,Female
1,Race,Other
2,Age,75+
3,Race,Hispanic
4,Age,25-44
...,...,...
158075,Race,Hispanic
158076,Gender,Male
158077,Overall,Overall
158078,Race,Other


In [11]:
# Show Data_Value next to Data_Value_Alt to see how the two relate.
value_columns = ['Data_Value', 'Data_Value_Alt']
df[value_columns]

Unnamed: 0,Data_Value,Data_Value_Alt
0,30.0,30
1,362.3,362.3
2,3637.0,3637
3,24.1,24.1
4,1.6,1.6
...,...,...
158075,,-2
158076,9.1,9.1
158077,11.0,11
158078,9.9,9.9


### Dummifying the variables

We will first dummify the categorical columns, converting each category into its own binary column so that we can look at the categories separately.

In [12]:
# One-hot encode the two break-out columns into a separate frame (`df2`).
# The dtype is pinned to uint8 so the indicators are 0/1 integers whatever pandas
# version is installed (pandas 2.0 changed the default dummy dtype to bool).
df2 = pd.get_dummies(df[['Break_Out_Category', 'Break_Out']], dtype=np.uint8)
df2

Unnamed: 0,Break_Out_Category_Age,Break_Out_Category_Gender,Break_Out_Category_Overall,Break_Out_Category_Race,Break_Out_18-24,Break_Out_25-44,Break_Out_35+,Break_Out_45-64,Break_Out_65+,Break_Out_75+,Break_Out_Female,Break_Out_Hispanic,Break_Out_Male,Break_Out_Non-Hispanic Black,Break_Out_Non-Hispanic White,Break_Out_Other,Break_Out_Overall
0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158075,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
158076,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
158077,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
158078,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


Now we will do the same thing with the "Topic" column to see the different diseases in separate columns.

In [13]:
# One-hot encode Topic into a separate frame (`df3`), one indicator column per disease.
# The dtype is pinned to uint8 so the indicators are 0/1 integers whatever pandas
# version is installed (pandas 2.0 changed the default dummy dtype to bool).
df3 = pd.get_dummies(df[['Topic']], dtype=np.uint8)
df3

Unnamed: 0,Topic_Acute Myocardial Infarction (Heart Attack),Topic_Coronary Heart Disease,Topic_Diseases of the Heart (Heart Disease),Topic_Heart Failure,Topic_Major Cardiovascular Disease,Topic_Stroke
0,0,0,0,0,0,1
1,0,0,0,0,1,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,0,0,1
...,...,...,...,...,...,...
158075,0,0,0,0,0,1
158076,0,0,0,0,0,1
158077,0,0,0,0,0,1
158078,0,0,0,0,0,1


Now that we have checked the things we needed to check, we will now look at the null and missing values in the dataset

In [14]:
# Per-column null counts of the raw frame, keeping only columns that have nulls,
# largest first.
df.isnull().sum().loc[lambda counts: counts != 0].sort_values(ascending=False)

Data_Value_Footnote           129093
Data_Value_Footnote_Symbol    129093
HighConfidenceLimit            28987
LowConfidenceLimit             28987
Data_Value                     28987
GeoLocation                     3040
dtype: int64

In [15]:
# (rows, columns) of the break-out dummy frame
(len(df2.index), len(df2.columns))

(158080, 17)

In [16]:
# (rows, columns) of the Topic dummy frame
(len(df3.index), len(df3.columns))

(158080, 6)

### Combining the three datasets

We combine the three datasets because we want to run models on the different diseases separately by using the categories. We want to know how well our model can predict for a particular disease.

In [17]:
# Place the raw frame and the two dummy frames side by side (column-wise) to
# form the working dataset `stroke`.
frames_to_join = [df, df2, df3]
stroke = pd.concat(frames_to_join, axis='columns')

In [18]:
# Preview the first five rows of the combined dataset.
stroke.head(n=5)

Unnamed: 0,Year,LocationAbbr,LocationDesc,DataSource,PriorityArea1,PriorityArea2,PriorityArea3,PriorityArea4,Category,Topic,...,Break_Out_Non-Hispanic Black,Break_Out_Non-Hispanic White,Break_Out_Other,Break_Out_Overall,Topic_Acute Myocardial Infarction (Heart Attack),Topic_Coronary Heart Disease,Topic_Diseases of the Heart (Heart Disease),Topic_Heart Failure,Topic_Major Cardiovascular Disease,Topic_Stroke
0,2013,IL,Illinois,NVSS,,,,,Cardiovascular Diseases,Stroke,...,0,0,0,0,0,0,0,0,0,1
1,2000,MT,Montana,NVSS,,,,,Cardiovascular Diseases,Major Cardiovascular Disease,...,0,0,1,0,0,0,0,0,1,0
2,2000,VT,Vermont,NVSS,,,,,Cardiovascular Diseases,Major Cardiovascular Disease,...,0,0,0,0,0,0,0,0,1,0
3,2001,KS,Kansas,NVSS,,,,,Cardiovascular Diseases,Stroke,...,0,0,0,0,0,0,0,0,0,1
4,2004,AL,Alabama,NVSS,,,,,Cardiovascular Diseases,Stroke,...,0,0,0,0,0,0,0,0,0,1


In [19]:
# Per-column null counts of the combined dataset, keeping only columns that
# have nulls, largest first.
stroke_null_counts = stroke.isnull().sum()
stroke_null_counts[stroke_null_counts != 0].sort_values(ascending=False)

Data_Value_Footnote           129093
Data_Value_Footnote_Symbol    129093
HighConfidenceLimit            28987
LowConfidenceLimit             28987
Data_Value                     28987
GeoLocation                     3040
dtype: int64

In [20]:
# Tabulate, for every column, the number of missing values and the share of
# rows they represent (in percent), sorted with the most incomplete columns first.
sorted_null_counts = stroke.isnull().sum().sort_values(ascending=False)
null_percentages = (sorted_null_counts / stroke.shape[0]) * 100
missing = pd.concat(
    [sorted_null_counts, null_percentages],
    axis=1,
    keys=['count', 'percentage'],
)
missing

Unnamed: 0,count,percentage
Data_Value_Footnote,129093,81.663082
Data_Value_Footnote_Symbol,129093,81.663082
HighConfidenceLimit,28987,18.336918
LowConfidenceLimit,28987,18.336918
Data_Value,28987,18.336918
GeoLocation,3040,1.923077
Topic_Stroke,0,0.0
Data_Value_TypeID,0,0.0
TopicId,0,0.0
CategoryId,0,0.0


In [28]:
# Print column names, non-null counts and dtypes of `stroke`.
# NOTE(review): this cell carries execution count 28 and its saved output reports
# 126055 rows, so it appears to have been executed after the column/row drops that
# follow it in the notebook. A fresh top-to-bottom run would describe the frame
# before those drops -- confirm the intended position of this cell.
stroke.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126055 entries, 0 to 158078
Data columns (total 49 columns):
 #   Column                                            Non-Null Count   Dtype 
---  ------                                            --------------   ----- 
 0   Year                                              126055 non-null  int64 
 1   LocationAbbr                                      126055 non-null  object
 2   LocationDesc                                      126055 non-null  object
 3   DataSource                                        126055 non-null  object
 4   PriorityArea1                                     126055 non-null  object
 5   PriorityArea2                                     126055 non-null  object
 6   PriorityArea3                                     126055 non-null  object
 7   PriorityArea4                                     126055 non-null  object
 8   Category                                          126055 non-null  object
 9   Topic          

We will drop the columns we won't be using in the future, either because they have too many missing values or because they are basically the same as another column.

In [22]:
# Drop the unneeded columns in a single call instead of three separate cells:
# the two footnote columns are about 82% missing in the table above, and
# Data_Value_Alt mirrors Data_Value in the side-by-side comparison shown earlier.
stroke = stroke.drop(
    columns=['Data_Value_Footnote_Symbol', 'Data_Value_Footnote', 'Data_Value_Alt']
)

In [25]:
# Remove every row that still contains a null in any column.
stroke = stroke.dropna(axis='index', how='any')

In [26]:
# Confirm which columns, if any, still contain nulls (largest first).
stroke.isnull().sum().loc[lambda counts: counts != 0].sort_values(ascending=False)

Series([], dtype: int64)

In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of `stroke`.
stroke.describe()

Unnamed: 0,Year,LocationID,Break_Out_Category_Age,Break_Out_Category_Gender,Break_Out_Category_Overall,Break_Out_Category_Race,Break_Out_18-24,Break_Out_25-44,Break_Out_35+,Break_Out_45-64,...,Break_Out_Non-Hispanic Black,Break_Out_Non-Hispanic White,Break_Out_Other,Break_Out_Overall,Topic_Acute Myocardial Infarction (Heart Attack),Topic_Coronary Heart Disease,Topic_Diseases of the Heart (Heart Disease),Topic_Heart Failure,Topic_Major Cardiovascular Disease,Topic_Stroke
count,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0,...,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0,126055.0
mean,2009.047019,28.717242,0.285042,0.245282,0.122994,0.346682,0.005593,0.038142,0.061497,0.056943,...,0.087732,0.122288,0.069652,0.122994,0.122407,0.133815,0.140423,0.106406,0.142763,0.354187
std,5.489337,15.588215,0.451436,0.430256,0.328432,0.475915,0.074576,0.19154,0.240241,0.231735,...,0.282905,0.327619,0.254561,0.328432,0.327756,0.340454,0.347427,0.308358,0.349832,0.478268
min,2000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2004.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2009.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2014.0,41.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,2018.0,56.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Distinct values found in PriorityArea1
pd.unique(stroke['PriorityArea1'])

array(['None', 'Million Hearts'], dtype=object)

In [30]:
# Distinct values found in PriorityArea2
pd.unique(stroke['PriorityArea2'])

array(['None'], dtype=object)

In [31]:
# Distinct values found in PriorityArea3
pd.unique(stroke['PriorityArea3'])

array(['None', 'Healthy People 2020'], dtype=object)

In [32]:
# Distinct values found in PriorityArea4
pd.unique(stroke['PriorityArea4'])

array(['None'], dtype=object)

In [33]:
# Count the rows whose PriorityArea1 contains the text "Million Hearts".
has_million_hearts = stroke['PriorityArea1'].str.contains('Million Hearts', regex=False)
has_million_hearts.sum()

31303

In [34]:
# Count the rows whose PriorityArea3 contains the text "Healthy People 2020".
has_healthy_people = stroke['PriorityArea3'].str.contains('Healthy People 2020', regex=False)
has_healthy_people.sum()

32741

Dropping another set of columns we won't be needing in the future

In [35]:
# Drop the remaining unneeded columns in a single call instead of four separate
# cells: the four PriorityArea columns, plus IndicatorID, CategoryId and DataSource.
stroke = stroke.drop(
    columns=[
        'PriorityArea1', 'PriorityArea2', 'PriorityArea3', 'PriorityArea4',
        'IndicatorID',
        'CategoryId',
        'DataSource',
    ]
)

In [39]:
# Preview the first five rows after the column drops.
stroke.head(n=5)

Unnamed: 0,Year,LocationAbbr,LocationDesc,Category,Topic,Indicator,Data_Value_Type,Data_Value_Unit,Data_Value,LowConfidenceLimit,...,Break_Out_Non-Hispanic Black,Break_Out_Non-Hispanic White,Break_Out_Other,Break_Out_Overall,Topic_Acute Myocardial Infarction (Heart Attack),Topic_Coronary Heart Disease,Topic_Diseases of the Heart (Heart Disease),Topic_Heart Failure,Topic_Major Cardiovascular Disease,Topic_Stroke
0,2013,IL,Illinois,Cardiovascular Diseases,Stroke,Rate of ischemic stroke mortality among US adu...,Age-Standardized,"Rate per 100,000",30.0,30.0,...,0,0,0,0,0,0,0,0,0,1
1,2000,MT,Montana,Cardiovascular Diseases,Major Cardiovascular Disease,Rate of major cardiovascular disease mortality...,Age-Standardized,"Rate per 100,000",362.3,358.6,...,0,0,1,0,0,0,0,0,1,0
2,2000,VT,Vermont,Cardiovascular Diseases,Major Cardiovascular Disease,Rate of major cardiovascular disease mortality...,Crude,"Rate per 100,000",3637.0,3636.9,...,0,0,0,0,0,0,0,0,1,0
3,2001,KS,Kansas,Cardiovascular Diseases,Stroke,Rate of ischemic stroke mortality among US adu...,Crude,"Rate per 100,000",24.1,23.5,...,0,0,0,0,0,0,0,0,0,1
4,2004,AL,Alabama,Cardiovascular Diseases,Stroke,Rate of ischemic stroke mortality among US adu...,Crude,"Rate per 100,000",1.6,1.6,...,0,0,0,0,0,0,0,0,0,1


In [40]:
# Parse Data_Value as numbers; entries that cannot be parsed become NaN.
# NOTE(review): the null check further down reports several thousand NaNs in this
# column after coercion -- inspect the raw values that fail to parse before
# accepting that those rows are dropped.
data_value_numeric = pd.to_numeric(stroke["Data_Value"], errors='coerce')
stroke["Data_Value"] = data_value_numeric

In [41]:
# Number of distinct (non-null) values in Data_Value_Type.
stroke["Data_Value_Type"].nunique(dropna=True)

2

In [42]:
# Distinct values of Data_Value_Type, in order of first appearance.
pd.unique(stroke["Data_Value_Type"])

array(['Age-Standardized', 'Crude'], dtype=object)

In [43]:
# Encode Data_Value_Type as a binary flag: 0 = Age-Standardized, 1 = Crude.
# Series.map yields NaN for any value that is not a key of the mapping, so
# re-running this cell on the already-encoded column would blank it out.
data_value_type_codes = {'Age-Standardized': 0, 'Crude': 1}
stroke['Data_Value_Type'] = stroke['Data_Value_Type'].map(data_value_type_codes)

In [44]:
# Parse both confidence-limit columns as numbers in one step (previously two
# copy-pasted cells); entries that cannot be parsed become NaN.
confidence_limit_columns = ["LowConfidenceLimit", "HighConfidenceLimit"]
stroke[confidence_limit_columns] = stroke[confidence_limit_columns].apply(
    pd.to_numeric, errors='coerce'
)

In [46]:
# Per-column null counts after the numeric coercion, keeping only columns that
# have nulls, largest first.
coerced_null_counts = stroke.isnull().sum()
coerced_null_counts[coerced_null_counts != 0].sort_values(ascending=False)

HighConfidenceLimit    4672
LowConfidenceLimit     4665
Data_Value             4665
dtype: int64

In [47]:
# Remove the rows left with a null in any column.
stroke = stroke.dropna(axis='index', how='any')

In [48]:
# Ratio of the lower to the upper confidence limit (Low / High), stored under
# 'AvgConfidenceLimit'. Despite the column name, this is a ratio, not an average.
stroke['AvgConfidenceLimit'] = stroke['LowConfidenceLimit'].div(stroke['HighConfidenceLimit'])

### Fixing Outliers

In [49]:
# Build a boolean row mask that is True where the value and both confidence
# limits all lie within Z_SCORE_CUTOFF standard deviations of their column mean.
# `stats` (scipy) is imported with the other libraries at the top of the notebook.
# NOTE(review): the z-scores are computed over all rows together, i.e. across
# every Topic and both Data_Value_Type encodings -- confirm pooled scaling is intended.
OUTLIER_COLUMNS = ['Data_Value','LowConfidenceLimit','HighConfidenceLimit']
Z_SCORE_CUTOFF = 3  # rows with |z| >= cutoff in any of the columns are flagged as outliers

z_scores = stats.zscore(stroke[OUTLIER_COLUMNS])
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < Z_SCORE_CUTOFF).all(axis=1)

In [50]:
# Keep only the rows marked True in `filtered_entries`. Selecting the rows directly
# removes the outliers here, rather than blanking three columns to NaN and depending
# on a later dropna to discard them. `.copy()` gives an independent frame so later
# in-place operations do not act on a slice of the old one.
stroke = stroke.loc[filtered_entries].copy()

In [51]:
# Remove any row that contains a null in any column.
stroke = stroke.dropna(axis='index', how='any')

### Saving the updated Dataset

In [52]:
# Write the cleaned frame to disk without the index column.
cleaned_csv_path = r'./datasets/stroke.csv'
stroke.to_csv(cleaned_csv_path, index=False)