In [None]:
from google.colab import files
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.regressor import residuals_plot
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.tree import export_text
# Prompt for the dataset file through the Colab upload widget.
# The return value (a mapping of uploaded file names to their raw bytes) is
# bound to a name so that it is not echoed as this cell's output; the CSV is
# read by file name with pd.read_csv further down.
uploaded = files.upload()

In [None]:
from yellowbrick.regressor.residuals import ResidualsPlot
from yellowbrick.base import Visualizer
# ---------------------------------------------------------------------------
# Load and clean the raw data
# ---------------------------------------------------------------------------
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
sns.set(rc={'figure.figsize':(20,9)})

# Drop the `id` column and exclude rows whose gender is "Other".
# .copy() makes `df` an independent frame so the column assignments below (and
# in later cells) never write into a view of a temporary.
df = df.drop(columns='id')
df = df.loc[df["gender"] != "Other"].copy()

# Missing BMI values are replaced by the column mean, rounded to one decimal.
bmiAvg = df["bmi"].mean()
df["bmi"] = df["bmi"].fillna(round(bmiAvg,1))

# ---------------------------------------------------------------------------
# Subsets used by the exploratory plots in later cells
# ---------------------------------------------------------------------------
# Copies, not slices: a later cell adds an `AgeGroup` column to `strokeDf`,
# which would otherwise trigger pandas' SettingWithCopyWarning.
nonStrokeDf = df.loc[df["stroke"] == 0].copy()
strokeDf = df.loc[df["stroke"] == 1].copy()
nonStrokeDfCd = nonStrokeDf.head(200)  # first 200 non-stroke rows only

frames = [nonStrokeDfCd,strokeDf]
# NOTE(review): `le` is not used anywhere in this cell; kept only in case
# another cell relies on the name.
le = preprocessing.LabelEncoder()
df1 = pd.concat(frames)

# ---------------------------------------------------------------------------
# One-hot encode the categorical columns
# ---------------------------------------------------------------------------
# Each categorical column is replaced by its dummy columns, appended at the
# end of the frame. The dummy columns are named after the raw category values
# (e.g. `ever_married` becomes the two columns "Yes" and "No").
categoricalColumns = ["gender","ever_married","work_type","Residence_type","smoking_status"]
encodedDf = df
for column in categoricalColumns:
    dummies = pd.get_dummies(encodedDf[column])
    encodedDf = pd.concat([encodedDf.drop(columns=column), dummies], axis=1)

# NOTE(review): these three columns hold a single constant value per column,
# so they have zero variance and carry no information for the models below.
# Kept to preserve the existing feature set — confirm whether they are needed.
encodedDf["mean_bmi"] = encodedDf["bmi"].mean()
encodedDf["mean_age"] = encodedDf["age"].mean()
encodedDf["mean_glucose_level"] = encodedDf["avg_glucose_level"].mean()

# ---------------------------------------------------------------------------
# Features / target, train-test split, scaling
# ---------------------------------------------------------------------------
y_data = encodedDf.stroke
x_data = encodedDf.drop(columns = "stroke")

labels = ["non-Stroke","Stroke"]
feature_names = x_data.columns

# Split first, then fit the scaler on the training rows only, so that no
# statistics of the held-out rows leak into the training data.
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data,test_size=.10, random_state=42)

scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
# NOTE(review): LinearRegression is fitted on the 0/1 `stroke` target, so its
# score is an R^2 on a binary outcome — confirm this is intended.
reg = linear_model.LinearRegression()
reg = reg.fit(x_train, y_train)
print("LinearRegression R^2   train:", reg.score(x_train,y_train), " test:", reg.score(x_test,y_test))

clf = DecisionTreeClassifier(max_depth =3, random_state = 42)
clf = clf.fit(x_train,y_train)

print("DecisionTree accuracy  train:", clf.score(x_train,y_train), " test:", clf.score(x_test,y_test))

# Residuals of the already-fitted regression on the held-out rows.
visualizers = ResidualsPlot(reg)
visualizers.score(x_test,y_test)
visualizers.show()

In [None]:

# Draw the fitted decision tree, then print the same rules as plain text.
treePlotOptions = dict(
    feature_names=feature_names,
    class_names=labels,
    rounded=True,
    filled=True,
    fontsize=14,
)
a = tree.plot_tree(clf, **treePlotOptions)
plt.show()

# export_text wants a plain list of names rather than a pandas Index.
tree_rules = export_text(clf, feature_names=list(feature_names))
print(tree_rules)

In [None]:

def plotRegScores(dataframe, testValues=None, random_state=42):
  """Plot linear-regression R^2 against the hold-out fraction.

  For every test fraction the frame is split, a StandardScaler is fitted on
  the training rows only, a LinearRegression is fitted on the scaled training
  rows, and its R^2 is recorded on both the training and the held-out rows.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame with a ``stroke`` column used as the target; every other column
      is used as a feature and must be numeric for the scaler.
  testValues : sequence of float, optional
      Hold-out fractions to evaluate. Defaults to 0.1, 0.2, ..., 0.9.
  random_state : int, optional
      Seed passed to ``train_test_split`` (default 42).

  Returns
  -------
  pandas.DataFrame
      One row per hold-out fraction with the columns ``test_size``,
      ``train_score`` and ``test_score``.
  """
  if testValues is None:
    testValues = [.10,.20,.30,.40,.50,.60,.70,.80,.90]

  # Target and features do not depend on the hold-out fraction: extract once.
  y_data = dataframe.stroke
  x_data = dataframe.drop(columns = "stroke")

  records = []
  for i in testValues:
    x_train, x_test, y_train, y_test = train_test_split(x_data,y_data,test_size= i, random_state=random_state)

    # Fit the scaler on the training rows only so the held-out rows stay unseen.
    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    reg = linear_model.LinearRegression()
    reg = reg.fit(x_train, y_train)
    records.append({
      "test_size": i,
      "train_score": reg.score(x_train,y_train),
      "test_score": reg.score(x_test,y_test),
    })

  scores = pd.DataFrame(records, columns=["test_size","train_score","test_score"])

  # Training and held-out curves share one axes so they can be compared.
  ax = sns.lineplot(data=scores, x="test_size", y="train_score", label="train")
  sns.lineplot(data=scores, x="test_size", y="test_score", label="test", ax=ax)
  ax.set(xlabel="test_size", ylabel="R^2", title="LinearRegression score vs. hold-out fraction")
  return scores


# Bound to a name so the returned table is not echoed under the plot.
regScoreTable = plotRegScores(encodedDf)

In [None]:
def plotClfScores(dataframe, testValues=None, random_state=42):
  """Plot decision-tree accuracy against the hold-out fraction.

  For every test fraction the frame is split, a StandardScaler is fitted on
  the training rows only, a depth-3 DecisionTreeClassifier is fitted on the
  scaled training rows, and its accuracy is recorded on both the training and
  the held-out rows.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame with a ``stroke`` column used as the target; every other column
      is used as a feature and must be numeric for the scaler.
  testValues : sequence of float, optional
      Hold-out fractions to evaluate. Defaults to 0.1, 0.2, ..., 0.9.
  random_state : int, optional
      Seed passed to both ``train_test_split`` and the classifier
      (default 42).

  Returns
  -------
  pandas.DataFrame
      One row per hold-out fraction with the columns ``test_size``,
      ``train_score`` and ``test_score``.
  """
  if testValues is None:
    testValues = [.10,.20,.30,.40,.50,.60,.70,.80,.90]

  # Target and features do not depend on the hold-out fraction: extract once.
  y_data = dataframe.stroke
  x_data = dataframe.drop(columns = "stroke")

  records = []
  for i in testValues:
    x_train, x_test, y_train, y_test = train_test_split(x_data,y_data,test_size= i, random_state=random_state)

    # Fit the scaler on the training rows only so the held-out rows stay unseen.
    scaler = preprocessing.StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    clf = DecisionTreeClassifier(max_depth =3, random_state = random_state)
    clf = clf.fit(x_train,y_train)

    records.append({
      "test_size": i,
      "train_score": clf.score(x_train,y_train),
      "test_score": clf.score(x_test,y_test),
    })

  scores = pd.DataFrame(records, columns=["test_size","train_score","test_score"])

  # Training and held-out curves share one axes so they can be compared.
  ax = sns.lineplot(data=scores, x="test_size", y="train_score", label="train")
  sns.lineplot(data=scores, x="test_size", y="test_score", label="test", ax=ax)
  ax.set(xlabel="test_size", ylabel="accuracy", title="DecisionTreeClassifier score vs. hold-out fraction")
  return scores

# Bound to a name so the returned table is not echoed under the plot.
clfScoreTable = plotClfScores(encodedDf)


In [None]:
# Row counts per stroke outcome, with one bar per gender inside each outcome.
sns.countplot(
    x="stroke",
    hue="gender",
    data=df,
)

In [None]:
# Pairwise relationships between the columns of `df`, coloured by stroke outcome.
sns.pairplot(data=df, hue="stroke")

In [None]:
# Left-closed age bins: [0, 11), [11, 18), ..., [45, 65), [65, inf).
# The open-ended last bin gives "65+" only to ages of 65 and above; missing or
# negative ages stay NaN instead of being folded into "65+".
# Dedicated names are used so the global `labels` (the class names passed to
# the decision-tree plot) is not overwritten by this cell.
ageBins = [0,11,18,30,45,65,np.inf]
ageLabels = ["0-10","11-17","18-29","30-44","45-64","65+"]


def addAgeGroup(frame):
  """Return a copy of `frame` with an `AgeGroup` categorical built from `age`.

  `frame` itself is not modified; an existing `AgeGroup` column is replaced
  in the returned copy, so re-running the cell gives the same result.
  """
  ageGroup = pd.cut(frame["age"], bins=ageBins, labels=ageLabels, right=False)
  return frame.assign(AgeGroup=ageGroup)


strokeDf = addAgeGroup(strokeDf)
df1 = addAgeGroup(df1)
df = addAgeGroup(df)

# Number of stroke rows in each age group.
sns.countplot(data=strokeDf, x="AgeGroup")



In [None]:
# Glucose level against age on the reduced sample `df1`, coloured by stroke:
# a scatter plot with density contours drawn over it.
glucoseAgeArgs = dict(data=df1, x="avg_glucose_level", y="age", hue="stroke")
sns.scatterplot(**glucoseAgeArgs)
sns.kdeplot(**glucoseAgeArgs)


In [None]:
# Row counts per age group, split by stroke outcome.
# Relies on the `AgeGroup` column that the age-binning cell adds to `df`.
sns.countplot(
    x="AgeGroup",
    hue="stroke",
    data=df,
)

In [None]:
# Distribution of the target column, followed by the row count per class.
sns.displot(x="stroke", data=df)
strokeCounts = df.groupby("stroke")["stroke"].count()
strokeCounts

In [None]:
# Summary statistics for age, then its distribution.
ageSummary = df["age"].describe()
print(ageSummary)
sns.displot(x="age", data=df)

In [None]:
# Distribution of gender, then the row count per gender.
sns.displot(x='gender', data=df)
genderCounts = df.groupby("gender")["gender"].count()
print(genderCounts)

In [None]:
# Distribution of the average glucose level, then its summary statistics.
sns.displot(x="avg_glucose_level", data=df)
glucoseSummary = df["avg_glucose_level"].describe()
print(glucoseSummary)