# Stroke Prediction - Data Analysis

##  Import Python Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import statistics

## Import Dataset

In [None]:
# Load the stroke dataset, using the first CSV column as the row index,
# and preview the first five rows.
csv_path = 'healthcare-dataset-stroke-data.csv'
dataframe = pd.read_csv(csv_path, index_col=0)
print(dataframe.head(n=5))

Attribute Information
* 1) id: unique identifier
* 2) gender: "Male", "Female" or "Other"
* 3) age: age of the patient
* 4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
* 5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
* 6) ever_married: "No" or "Yes"
* 7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
* 8) Residence_type: "Rural" or "Urban"
* 9) avg_glucose_level: average glucose level in blood
* 10) bmi: body mass index
* 11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
* 12) stroke: 1 if the patient had a stroke or 0 if not
* Note: "Unknown" in smoking_status means that the information is unavailable for this patient

### Data Quality

In [None]:
# Print pandas' built-in summary of the DataFrame as a first data-quality check.
dataframe.info()

In [None]:
# Check whether the dataset contains any duplicated rows.
# `duplicated()` already returns a boolean mask, so it is used directly
# instead of being compared with True.
dataframe[dataframe.duplicated()]

In [None]:
# Count the missing values in each column of the dataset.
dataframe.isna().sum()

## Data Exploration

In [None]:
# Summary statistics, rounded to two decimals, for the continuous columns;
# one row per column, statistics in the order listed below.
cols = ['age', 'avg_glucose_level', 'bmi']
summary_stats = ['mean', 'std', 'min', 'max', "25%", "50%", "75%"]
dataframe[cols].describe().round(2).T[summary_stats]

In [None]:
# Box plots of the three continuous columns on a shared axis.
boxplot_1 = dataframe.boxplot(column=['age', 'avg_glucose_level', 'bmi'],
                              grid=False, rot=45, fontsize=15)
# Render the figure; printing `boxplot_1` would only show the Axes repr.
plt.show()

### Stroke

In [None]:
# Number of patients in each class of the target column.
stroke_flag = dataframe['stroke']
stroke_1 = stroke_flag[stroke_flag == 1].count()
stroke_0 = stroke_flag[stroke_flag == 0].count()

objects = ('Stroke 1', 'Stroke 0')
Count = [stroke_1, stroke_0]
y_pos = np.arange(len(objects))

# Bar chart of the two class sizes.
plt.bar(objects, Count)
plt.xticks(y_pos, objects)
plt.ylabel('Count')
plt.title('Stroke (1; 0) Count')

# Write each bar's value just above its top edge.
for bar_index, bar_count in enumerate(Count):
    plt.text(x=bar_index, y=bar_count + 1, s=str(bar_count),
             fontdict={'fontsize': 20})

plt.show()

In [None]:
# Share of stroke cases among all counted patients, as a percentage.
stroke_share = stroke_1 / (stroke_1 + stroke_0) * 100
print("Stroke Probability:", round(stroke_share, 2), "%")

### Gender

In [None]:
# Number of "Male" and "Female" patients (other gender values are not counted).
gender_col = dataframe['gender']
gender_m = gender_col[gender_col == "Male"].count()
gender_f = gender_col[gender_col == "Female"].count()

objects = ('Male', 'Female')
Count = [gender_m, gender_f]
y_pos = np.arange(len(objects))

# Bar chart of the two group sizes.
plt.bar(objects, Count)
plt.xticks(y_pos, objects)
plt.ylabel('Count')
plt.title('Male and Female')

# Write each bar's value just above its top edge.
for bar_index, bar_count in enumerate(Count):
    plt.text(x=bar_index, y=bar_count + 1, s=str(bar_count),
             fontdict={'fontsize': 20})

plt.show()

In [None]:
# Shares are computed over the Male + Female total only.
gender_total = gender_m + gender_f
print("Male Percentage:", round(gender_m / gender_total * 100, 2), "%")
print("Female Percentage:", round(gender_f / gender_total * 100, 2), "%")

### Gender - Stroke

In [None]:
# Per-gender totals and stroke cases ("Male" and "Female" rows only).
gender_col = dataframe['gender']
had_stroke = dataframe['stroke'] == 1
is_male = gender_col == "Male"
is_female = gender_col == "Female"

gender_m = gender_col[is_male].count()
gender_m_1 = gender_col[is_male & had_stroke].count()
gender_f = gender_col[is_female].count()
gender_f_1 = gender_col[is_female & had_stroke].count()

# Stroke rate within each gender, as a percentage.
p_male_stroke = round(gender_m_1 / gender_m * 100, 2)
p_female_stroke = round(gender_f_1 / gender_f * 100, 2)

# Summary table: one inner list per table column.
table_header = dict(values=['Gender', 'Total', 'Stroke', '% Stroke'],
                    line_color='darkslategray',
                    fill_color='lightskyblue',
                    align='left')
table_cells = dict(values=[['Male', 'Female'],  # 1st column
                           [gender_m, gender_f],  # 2nd column
                           [gender_m_1, gender_f_1],  # 3rd column
                           [str(pct) + " %" for pct in (p_male_stroke, p_female_stroke)],  # 4th column
                           ],
                   line_color='darkslategray',
                   fill_color='lightcyan',
                   align='left')
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.update_layout(width=500, height=300)
fig.show()

### Age - Stroke

In [None]:
def bpl(dataframe, col, y):
    """Draw side-by-side box plots of ``col`` for the two classes of ``y``.

    Args:
        dataframe: Table holding both the ``col`` and ``y`` columns.
        col: Name of the column whose distribution is plotted.
        y: Name of the 0/1 column used to split the rows into two groups.

    Returns:
        The Axes returned by ``sns.boxplot``; the first box is the
        ``y == 0`` group and the second box is the ``y == 1`` group.
    """
    var_0 = dataframe.loc[dataframe[y] == 0, col].to_numpy()
    var_1 = dataframe.loc[dataframe[y] == 1, col].to_numpy()
    # Pass one flat vector per group. Wrapping each array in an extra list
    # produced single-row 2-D inputs, which only plot correctly when the
    # plotting library happens to flatten them.
    ax = sns.boxplot(data=[var_0, var_1])
    # Label the axes so the figure can be read without the calling code.
    ax.set_xlabel(y)
    ax.set_ylabel(col)
    return ax


bpl(dataframe, "age", "stroke")

### Avg_glucose_level - Stroke

In [None]:
# Average glucose level distribution, split by stroke outcome.
bpl(dataframe, col="avg_glucose_level", y="stroke")

### Bmi - Stroke

In [None]:
# BMI distribution, split by stroke outcome.
bpl(dataframe, col="bmi", y="stroke")

### Hypertension

In [None]:
# Number of patients with (1) and without (0) hypertension.
hypertension_flag = dataframe['hypertension']
hypertension_1 = hypertension_flag[hypertension_flag == 1].count()
hypertension_0 = hypertension_flag[hypertension_flag == 0].count()

objects = ('1', '0')
Count = [hypertension_1, hypertension_0]
y_pos = np.arange(len(objects))

# Bar chart of the two group sizes.
plt.bar(objects, Count)
plt.xticks(y_pos, objects)
plt.xlabel('Hypertension')
plt.ylabel('Count')
plt.title('Hypertension')

# Write each bar's value just above its top edge.
for bar_index, bar_count in enumerate(Count):
    plt.text(x=bar_index, y=bar_count + 1, s=str(bar_count),
             fontdict={'fontsize': 20})

plt.show()



### Hypertension - Stroke

In [None]:
# Totals and stroke cases for patients with (`_m`) and without (`_f`)
# hypertension.
hypertension_col = dataframe['hypertension']
had_stroke = dataframe['stroke'] == 1
has_hypertension = hypertension_col == 1
no_hypertension = hypertension_col == 0

hypertension_m = hypertension_col[has_hypertension].count()
hypertension_m_1 = hypertension_col[has_hypertension & had_stroke].count()
hypertension_f = hypertension_col[no_hypertension].count()
hypertension_f_1 = hypertension_col[no_hypertension & had_stroke].count()

# Stroke rate within each group, as a percentage.
p_hypertension_stroke = round(hypertension_m_1 / hypertension_m * 100, 2)
p_hypertension_n_stroke = round(hypertension_f_1 / hypertension_f * 100, 2)

# Summary table: one inner list per table column.
table_header = dict(values=['Hypertension', 'Total', 'Stroke', '% Stroke'],
                    line_color='darkslategray',
                    fill_color='lightskyblue',
                    align='left')
table_cells = dict(values=[['1', '0'],  # 1st column
                           [hypertension_m, hypertension_f],  # 2nd column
                           [hypertension_m_1, hypertension_f_1],  # 3rd column
                           [str(pct) + " %"
                            for pct in (p_hypertension_stroke, p_hypertension_n_stroke)],  # 4th column
                           ],
                   line_color='darkslategray',
                   fill_color='lightcyan',
                   align='left')
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.update_layout(width=600, height=300)
fig.show()

### Heart_disease

In [None]:
# Number of patients with (1) and without (0) a heart disease.
heart_disease_flag = dataframe['heart_disease']
heart_disease_1 = heart_disease_flag[heart_disease_flag == 1].count()
heart_disease_0 = heart_disease_flag[heart_disease_flag == 0].count()

objects = ('1', '0')
Count = [heart_disease_1, heart_disease_0]
y_pos = np.arange(len(objects))

# Bar chart of the two group sizes.
plt.bar(objects, Count)
plt.xticks(y_pos, objects)
plt.xlabel('Heart Disease')
plt.ylabel('Count')
plt.title('Heart Disease')

# Write each bar's value just above its top edge.
for bar_index, bar_count in enumerate(Count):
    plt.text(x=bar_index, y=bar_count + 1, s=str(bar_count),
             fontdict={'fontsize': 20})

plt.show()

### Heart_disease - Stroke

In [None]:
# Totals and stroke cases for patients with (`_m`) and without (`_f`)
# a heart disease.
heart_disease_col = dataframe['heart_disease']
had_stroke = dataframe['stroke'] == 1
has_heart_disease = heart_disease_col == 1
no_heart_disease = heart_disease_col == 0

heart_disease_m = heart_disease_col[has_heart_disease].count()
heart_disease_m_1 = heart_disease_col[has_heart_disease & had_stroke].count()
heart_disease_f = heart_disease_col[no_heart_disease].count()
heart_disease_f_1 = heart_disease_col[no_heart_disease & had_stroke].count()

# Stroke rate within each group, as a percentage.
p_heart_disease_stroke = round(heart_disease_m_1 / heart_disease_m * 100, 2)
p_heart_disease_no_stroke = round(heart_disease_f_1 / heart_disease_f * 100, 2)

# Summary table: one inner list per table column.
table_header = dict(values=['Heart_disease', 'Total', 'Stroke', '% Stroke'],
                    line_color='darkslategray',
                    fill_color='lightskyblue',
                    align='left')
table_cells = dict(values=[['1', '0'],  # 1st column
                           [heart_disease_m, heart_disease_f],  # 2nd column
                           [heart_disease_m_1, heart_disease_f_1],  # 3rd column
                           [str(pct) + " %"
                            for pct in (p_heart_disease_stroke, p_heart_disease_no_stroke)],  # 4th column
                           ],
                   line_color='darkslategray',
                   fill_color='lightcyan',
                   align='left')
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.update_layout(width=600, height=300)
fig.show()

### Ever_married

In [None]:
# Number of patients who were ever married ("Yes") and who were not ("No").
ever_married_col = dataframe['ever_married']
ever_married_y = ever_married_col[ever_married_col == "Yes"].count()
ever_married_n = ever_married_col[ever_married_col == "No"].count()

objects = ('Yes', 'No')
Count = [ever_married_y, ever_married_n]
y_pos = np.arange(len(objects))

# Bar chart of the two group sizes.
plt.bar(objects, Count)
plt.xticks(y_pos, objects)
plt.xlabel('Ever Married')
plt.ylabel('Count')
plt.title('Ever Married')

# Write each bar's value just above its top edge.
for bar_index, bar_count in enumerate(Count):
    plt.text(x=bar_index, y=bar_count + 1, s=str(bar_count),
             fontdict={'fontsize': 20})

plt.show()

### Ever_married - Stroke

In [None]:
# Totals and stroke cases for ever-married "Yes" (`_m`) and "No" (`_f`) patients.
ever_married_col = dataframe['ever_married']
had_stroke = dataframe['stroke'] == 1
was_married = ever_married_col == "Yes"
never_married = ever_married_col == "No"

ever_married_m = ever_married_col[was_married].count()
ever_married_m_1 = ever_married_col[was_married & had_stroke].count()
ever_married_f = ever_married_col[never_married].count()
ever_married_f_1 = ever_married_col[never_married & had_stroke].count()

# Stroke rate within each group, as a percentage.
p_ever_married_stroke = round(ever_married_m_1 / ever_married_m * 100, 2)
p_ever_married_no_stroke = round(ever_married_f_1 / ever_married_f * 100, 2)

# Summary table: one inner list per table column.
table_header = dict(values=['Ever_married', 'Total', 'Stroke', '% Stroke'],
                    line_color='darkslategray',
                    fill_color='lightskyblue',
                    align='left')
table_cells = dict(values=[['Yes', 'No'],  # 1st column
                           [ever_married_m, ever_married_f],  # 2nd column
                           [ever_married_m_1, ever_married_f_1],  # 3rd column
                           [str(pct) + " %"
                            for pct in (p_ever_married_stroke, p_ever_married_no_stroke)],  # 4th column
                           ],
                   line_color='darkslategray',
                   fill_color='lightcyan',
                   align='left')
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.update_layout(width=600, height=300)
fig.show()

### Work_type

In [None]:
# Number of patients per work type, in the order:
# Private, Self-employed, Govt_job, children, Never_worked.
work_type_col = dataframe['work_type']
work_type_m, work_type_f, work_type_g, work_type_h, work_type_i = [
    work_type_col[work_type_col == category].count()
    for category in ("Private", "Self-employed", "Govt_job", "children", "Never_worked")
]

objects = ('Private', 'Self-employed', 'Govt_job', 'Children', 'Never_worked')
Count = [work_type_m, work_type_f, work_type_g, work_type_h, work_type_i]
y_pos = np.arange(len(objects))

# Bar chart of the group sizes.
plt.bar(objects, Count)
plt.xticks(y_pos, objects)
plt.xlabel('Work type')
plt.ylabel('Count')
plt.title('Work type')

# Write each bar's value just above its top edge.
for bar_index, bar_count in enumerate(Count):
    plt.text(x=bar_index, y=bar_count + 1, s=str(bar_count),
             fontdict={'fontsize': 20})

plt.show()

### Work_type - Stroke

In [None]:
# Totals and stroke cases per work type. Suffixes: _m Private,
# _f Self-employed, _g Govt_job, _h children, _i Never_worked.
work_type_col = dataframe['work_type']
had_stroke = dataframe['stroke'] == 1

is_private = work_type_col == "Private"
work_type_m = work_type_col[is_private].count()
work_type_m_1 = work_type_col[is_private & had_stroke].count()

is_self_employed = work_type_col == "Self-employed"
work_type_f = work_type_col[is_self_employed].count()
work_type_f_1 = work_type_col[is_self_employed & had_stroke].count()

is_govt_job = work_type_col == "Govt_job"
work_type_g = work_type_col[is_govt_job].count()
work_type_g_1 = work_type_col[is_govt_job & had_stroke].count()

is_children = work_type_col == "children"
work_type_h = work_type_col[is_children].count()
work_type_h_1 = work_type_col[is_children & had_stroke].count()

is_never_worked = work_type_col == "Never_worked"
work_type_i = work_type_col[is_never_worked].count()
work_type_i_1 = work_type_col[is_never_worked & had_stroke].count()

# Stroke rate within each work type, as a percentage.
p_work_type_stroke = round(work_type_m_1 / work_type_m * 100, 2)
p_work_type_f_stroke = round(work_type_f_1 / work_type_f * 100, 2)
p_work_type_g_stroke = round(work_type_g_1 / work_type_g * 100, 2)
p_work_type_h_stroke = round(work_type_h_1 / work_type_h * 100, 2)
p_work_type_i_stroke = round(work_type_i_1 / work_type_i * 100, 2)

# Summary table: one inner list per table column.
stroke_rates = (p_work_type_stroke, p_work_type_f_stroke, p_work_type_g_stroke,
                p_work_type_h_stroke, p_work_type_i_stroke)
table_header = dict(values=['Work Type', 'Total', 'Stroke', '% Stroke'],
                    line_color='darkslategray',
                    fill_color='lightskyblue',
                    align='left')
table_cells = dict(values=[['Private', 'Self-employed', 'Govt_job', 'Children', 'Never_worked'],  # 1st column
                           [work_type_m, work_type_f, work_type_g, work_type_h, work_type_i],  # 2nd column
                           [work_type_m_1, work_type_f_1, work_type_g_1, work_type_h_1, work_type_i_1],  # 3rd column
                           [str(pct) + " %" for pct in stroke_rates],  # 4th column
                           ],
                   line_color='darkslategray',
                   fill_color='lightcyan',
                   align='left')
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.update_layout(width=800, height=500)
fig.show()

### Residence_type

In [None]:
# Number of patients living in "Urban" (`_y`) and "Rural" (`_n`) areas.
residence_col = dataframe['Residence_type']
residence_type_y = residence_col[residence_col == "Urban"].count()
residence_type_n = residence_col[residence_col == "Rural"].count()

objects = ('Urban', 'Rural')
Count = [residence_type_y, residence_type_n]
y_pos = np.arange(len(objects))

# Bar chart of the two group sizes.
plt.bar(objects, Count)
plt.xticks(y_pos, objects)
plt.xlabel('Residence type')
plt.ylabel('Count')
plt.title('Residence type')

# Write each bar's value just above its top edge.
for bar_index, bar_count in enumerate(Count):
    plt.text(x=bar_index, y=bar_count + 1, s=str(bar_count),
             fontdict={'fontsize': 20})

plt.show()

### Residence_type - Stroke

In [None]:
# Totals and stroke cases for "Urban" (`_m`) and "Rural" (`_f`) residents.
residence_col = dataframe['Residence_type']
had_stroke = dataframe['stroke'] == 1
is_urban = residence_col == "Urban"
is_rural = residence_col == "Rural"

residence_type_m = residence_col[is_urban].count()
residence_type_m_1 = residence_col[is_urban & had_stroke].count()
residence_type_f = residence_col[is_rural].count()
residence_type_f_1 = residence_col[is_rural & had_stroke].count()

# Stroke rate within each residence type, as a percentage.
p_residence_type_stroke = round(residence_type_m_1 / residence_type_m * 100, 2)
p_residence_type_rural_stroke = round(residence_type_f_1 / residence_type_f * 100, 2)

# Summary table: one inner list per table column.
table_header = dict(values=['Residence Type', 'Total', 'Stroke', '% Stroke'],
                    line_color='darkslategray',
                    fill_color='lightskyblue',
                    align='left')
table_cells = dict(values=[['Urban', 'Rural'],  # 1st column
                           [residence_type_m, residence_type_f],  # 2nd column
                           [residence_type_m_1, residence_type_f_1],  # 3rd column
                           [str(pct) + " %"
                            for pct in (p_residence_type_stroke, p_residence_type_rural_stroke)],  # 4th column
                           ],
                   line_color='darkslategray',
                   fill_color='lightcyan',
                   align='left')
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])

fig.update_layout(width=600, height=300)
fig.show()

### Smoking_status

In [None]:
# Number of patients per smoking status, in the order:
# formerly smoked, never smoked, smokes, Unknown.
smoking_col = dataframe['smoking_status']
smoking_status_m, smoking_status_f, smoking_status_g, smoking_status_h = [
    smoking_col[smoking_col == status].count()
    for status in ("formerly smoked", "never smoked", "smokes", "Unknown")
]

objects = ('Formerly smoked', 'Never smoked', 'Smokes', 'Unknown')
Count = [smoking_status_m, smoking_status_f, smoking_status_g, smoking_status_h]
y_pos = np.arange(len(objects))

# Bar chart of the group sizes.
plt.bar(objects, Count)
plt.xticks(y_pos, objects)
plt.xlabel('Smoking Status')
plt.ylabel('Count')
plt.title('Smoking Status')

# Write each bar's value just above its top edge (smaller font than the
# other charts in this notebook).
for bar_index, bar_count in enumerate(Count):
    plt.text(x=bar_index, y=bar_count + 1, s=str(bar_count),
             fontdict={'fontsize': 10})

plt.show()

### Smoking_status - Stroke

In [None]:
# Totals and stroke cases per smoking status. Suffixes: _m formerly smoked,
# _f never smoked, _g smokes, _h Unknown.
smoking_col = dataframe['smoking_status']
had_stroke = dataframe['stroke'] == 1

is_former = smoking_col == "formerly smoked"
smoking_status_m = smoking_col[is_former].count()
smoking_status_m_1 = smoking_col[is_former & had_stroke].count()

is_never = smoking_col == "never smoked"
smoking_status_f = smoking_col[is_never].count()
smoking_status_f_1 = smoking_col[is_never & had_stroke].count()

is_smoker = smoking_col == "smokes"
smoking_status_g = smoking_col[is_smoker].count()
smoking_status_g_1 = smoking_col[is_smoker & had_stroke].count()

is_unknown = smoking_col == "Unknown"
smoking_status_h = smoking_col[is_unknown].count()
smoking_status_h_1 = smoking_col[is_unknown & had_stroke].count()

# Stroke rate within each smoking status, as a percentage.
p_smoking_status_stroke = round(smoking_status_m_1 / smoking_status_m * 100, 2)
p_smoking_status_f_stroke = round(smoking_status_f_1 / smoking_status_f * 100, 2)
p_smoking_status_g_stroke = round(smoking_status_g_1 / smoking_status_g * 100, 2)
p_smoking_status_h_stroke = round(smoking_status_h_1 / smoking_status_h * 100, 2)

# Summary table: one inner list per table column.
# The last row label is spelled "Unknown" (it was misspelled "Unkown").
stroke_rates = (p_smoking_status_stroke, p_smoking_status_f_stroke,
                p_smoking_status_g_stroke, p_smoking_status_h_stroke)
fig = go.Figure(data=[go.Table(
    header=dict(values=['Smoking_status', 'Total', 'Stroke', '% Stroke'],
                line_color='darkslategray',
                fill_color='lightskyblue',
                align='left'),
    cells=dict(values=[['Formerly Smoked', 'Never Smoked', 'Smokes', "Unknown"],  # 1st column
                       [smoking_status_m, smoking_status_f, smoking_status_g, smoking_status_h],  # 2nd column
                       [smoking_status_m_1, smoking_status_f_1, smoking_status_g_1, smoking_status_h_1],  # 3rd column
                       [str(pct) + " %" for pct in stroke_rates],  # 4th column
                       ],
               line_color='darkslategray',
               fill_color='lightcyan',
               align='left'))])

fig.update_layout(width=800, height=500)
fig.show()

### Correlation

In [None]:
# Pearson correlation between the three continuous features, shown as an
# annotated heatmap. `df` holds only these three columns, so no further
# column filtering is needed before computing the correlation.
df = dataframe[["age", "avg_glucose_level", "bmi"]]
corrMatrix = df.corr(method="pearson")
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
# Pairwise plots of the continuous features, coloured by the stroke column.
pair_columns = ["age", "avg_glucose_level", "bmi", "stroke"]
df = dataframe[pair_columns]
sns.pairplot(df, hue="stroke")