# Stroke Prediction

In [None]:
# To make sure all of the correct libraries are installed, import each module and print the version number
# Check versions of the libraries
import sys
import numpy
import pandas

# Report the interpreter and library versions so the environment is documented.
for template, version in (
    ('Python:     {}', sys.version),
    ('numpy:      {}', numpy.__version__),
    ('pandas:     {}', pandas.__version__),
):
    print(template.format(version))

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the stroke dataset from the local data directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="data/healthcare-dataset-stroke-data.csv",
)

In [None]:
# Show the dataset dimensions as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# List the column names available in the dataset.
df.columns

In [None]:
# Remove the 'id' column before analysis
# (presumably a row identifier with no predictive value - confirm).
df = df.drop(columns='id')
df.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the numeric columns.
df.describe()

In [None]:
# Count missing values per column, listing the columns with the most gaps first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Handle missing values by imputing the column median for 'bmi'.
columns = ['bmi']
for i in columns:
    median = df[i].median()
    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on the df[i] selection is a chained in-place
    # operation: it is deprecated in recent pandas and leaves df unchanged
    # when Copy-on-Write is enabled.
    df[i] = df[i].fillna(median)

In [None]:
# Re-check the data: count the remaining missing values per column,
# largest first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Detect non-standard values: print the frequency of every distinct value
# in each categorical/binary column.
columns = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
    'stroke',
]

for i in columns:
    header = 'Nilai unik dari variabel {},'.format(i)
    print(header)
    counts = df[i].value_counts()
    print(counts)
    print('\n')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pie chart of the 'stroke' class distribution.
plt.figure(figsize=(5, 5))
# value_counts() orders classes by descending frequency, so the hard-coded
# labels assume class 0 is the more frequent one.
stroke_counts = df['stroke'].value_counts()
plt.pie(
    stroke_counts,
    labels=['0', '1'],
    colors=['khaki', 'brown'],
    autopct='%.2f%%',
)
plt.title('Persentase Stroke', fontsize=15, loc='left')
plt.show()

In [None]:
# Count plots of 'gender', 'ever_married' and 'work_type', split by stroke outcome.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
sns.countplot(data=df, x='gender', hue='stroke', ax=ax[0])
sns.countplot(data=df, x='ever_married', hue='stroke', ax=ax[1])
sns.countplot(data=df, x='work_type', hue='stroke', ax=ax[2])

In [None]:
# Count plots of 'hypertension' and 'heart_disease', split by stroke outcome.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
sns.countplot(data=df, x='hypertension', hue='stroke', ax=ax[0])
sns.countplot(data=df, x='heart_disease', hue='stroke', ax=ax[1])

In [None]:
# Count plots of 'Residence_type', 'smoking_status' and 'heart_disease',
# split by stroke outcome.
# NOTE(review): 'heart_disease' is also drawn in the hypertension/heart_disease
# figure; confirm whether a different column was intended here.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
sns.countplot(data=df, x='Residence_type', hue='stroke', ax=ax[0])
sns.countplot(data=df, x='smoking_status', hue='stroke', ax=ax[1])
sns.countplot(data=df, x='heart_disease', hue='stroke', ax=ax[2])

In [None]:
# Box plots of the continuous features, laid out on a 2x2 grid
# (the fourth cell stays empty).
sns.set()
features = ['age','avg_glucose_level','bmi']
# Start from an empty figure. plt.subplots() would add a full-size Axes that
# the plt.subplot(2,2,i) calls below overlap; recent matplotlib releases no
# longer remove such overlapped Axes, leaving a stray empty plot behind the grid.
fig = plt.figure(figsize=(8,6))
# Stretch the subplot area beyond the nominal figure size so the plots render large.
plt.subplots_adjust(right=2, top=2)
for i, n in enumerate(features,1):
    plt.subplot(2,2,i)
    sns.boxplot(x=df[n])
    plt.xlabel('{}'.format(n), labelpad=10, size=15)
    plt.tick_params(axis='x', labelsize=15, size=8)
    plt.tick_params(axis='y', labelsize=15, size=8)
    plt.title('Boxplot {} '.format(n), y=1, size=15)
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# Restrict to numeric columns explicitly: the frame still holds string
# columns at this point, and pandas >= 2.0 raises on them instead of
# silently dropping them.
corrMatrix = df.select_dtypes(include='number').corr()
sns.heatmap(corrMatrix, annot=True)
plt.title('Correlation Matrix', loc='center', fontsize=15)
plt.show()

# Encoding data

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label-encode every non-numeric column in place; numeric columns are kept as-is.
for i in df.columns:
    # Use pandas' dtype helper rather than `dtype == np.number`: comparing a
    # concrete dtype with the abstract np.number is deprecated in NumPy and
    # does not reliably match, in which case continuous float columns would
    # be label-encoded into category ranks.
    if pd.api.types.is_numeric_dtype(df[i]):
        continue
    # A fresh encoder per column; the fitted encoder is not kept.
    df[i]=LabelEncoder().fit_transform(df[i])
df.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target column ('stroke') from the feature columns.
y = df['stroke']
x = df.drop('stroke', axis=1)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified - consider stratify=y if the
# target classes are imbalanced.
split = train_test_split(x, y, test_size=0.3, random_state=3)
x_train, x_test, y_train, y_test = split

print("train set :", x_train.shape, y_train.shape)
print("test set : ", x_test.shape, y_test.shape)

# Data Modelling

In [None]:
from sklearn.tree import DecisionTreeClassifier #model 1
from sklearn.linear_model import LogisticRegression #model 2
from sklearn.ensemble import RandomForestClassifier #model 3

# metrics for computing accuracy and the classification report
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# confusion matrix for displaying prediction errors
from sklearn.metrics import confusion_matrix

# Decision Tree

In [None]:
# Model 1: decision tree fitted on the training split.
# A fixed random_state (same seed as the train/test split) makes the fitted
# tree, and therefore the reported metrics, reproducible between runs.
dt=DecisionTreeClassifier(random_state=3).fit(x_train,y_train)

In [None]:
# Predict on the test split and report per-class metrics plus overall accuracy (in %).
y_test_pred = dt.predict(x_test)
report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(report)
akurasi = 100 * accuracy_score(y_true=y_test, y_pred=y_test_pred)
print('Akurasi = %.2f' % akurasi)

In [None]:
# Confusion matrix of the decision-tree predictions as a labelled DataFrame
# (rows: true class, columns: predicted class).
confusion_matrix_df = pd.DataFrame(
    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent from
    # the test labels or the predictions, so it always matches the axis names.
    confusion_matrix(y_test, y_test_pred, labels=[0, 1]),
    index=['No Stroke', 'Stroke'],
    columns=['No Stroke', 'Stroke'],
)

In [None]:
# Plot the confusion matrix as an annotated heatmap.
plt.figure()
heatmap = sns.heatmap(
    confusion_matrix_df,
    cmap='YlGnBu',
    annot=True,
    fmt='d',
    annot_kws={'size': 14},
)
# Keep the tick labels horizontal on both axes (y first, then x).
for axis in (heatmap.yaxis, heatmap.xaxis):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title(
    'Confusion Matrix dari Model Data Testing\n(Decision Tree)',
    color='darkblue',
    fontsize=18,
)
plt.ylabel('True Label', fontsize=14)
plt.xlabel('Predicted Label', fontsize=14)
plt.show()

# Regresi Logistik

In [None]:
# Model 2: logistic regression fitted on the training split.
lr = LogisticRegression()
lr.fit(x_train, y_train)
# Display the fitted estimator as the cell output.
lr

In [None]:
# Predict on the test split and report per-class metrics plus overall accuracy (in %).
y_test_pred2 = lr.predict(x_test)
report = classification_report(y_true=y_test, y_pred=y_test_pred2)
print(report)
akurasi2 = 100 * accuracy_score(y_true=y_test, y_pred=y_test_pred2)
print('Akurasi = %.2f' % akurasi2)

In [None]:
# Confusion matrix of the logistic-regression predictions as a labelled
# DataFrame (rows: true class, columns: predicted class).
confusion_matrix_df = pd.DataFrame(
    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent from
    # the test labels or the predictions, so it always matches the axis names.
    confusion_matrix(y_test, y_test_pred2, labels=[0, 1]),
    index=['No Stroke', 'Stroke'],
    columns=['No Stroke', 'Stroke'],
)

In [None]:
# Plot the confusion matrix as an annotated heatmap.
plt.figure()
heatmap = sns.heatmap(
    confusion_matrix_df,
    cmap='YlGnBu',
    annot=True,
    fmt='d',
    annot_kws={'size': 14},
)
# Keep the tick labels horizontal on both axes (y first, then x).
for axis in (heatmap.yaxis, heatmap.xaxis):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title(
    'Confusion Matrix dari Model Data Testing\n(Regresi Logistik)',
    color='darkblue',
    fontsize=18,
)
plt.ylabel('True Label', fontsize=14)
plt.xlabel('Predicted Label', fontsize=14)
plt.show()

# Random Forest

In [None]:
# Model 3: random forest fitted on the training split.
# A fixed random_state (same seed as the train/test split) makes the
# bootstrap samples and feature subsets, and therefore the reported metrics,
# reproducible between runs.
rdf=RandomForestClassifier(random_state=3).fit(x_train, y_train)
# Display the fitted estimator as the cell output.
rdf

In [None]:
# Predict on the test split and report per-class metrics plus overall accuracy (in %).
y_test_pred3 = rdf.predict(x_test)
report = classification_report(y_true=y_test, y_pred=y_test_pred3)
print(report)
akurasi3 = 100 * accuracy_score(y_true=y_test, y_pred=y_test_pred3)
print('Akurasi = %.2f' % akurasi3)

In [None]:
# Confusion matrix of the random-forest predictions as a labelled DataFrame
# (rows: true class, columns: predicted class).
confusion_matrix_df = pd.DataFrame(
    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent from
    # the test labels or the predictions, so it always matches the axis names.
    confusion_matrix(y_test, y_test_pred3, labels=[0, 1]),
    index=['No Stroke', 'Stroke'],
    columns=['No Stroke', 'Stroke'],
)

In [None]:
# Plot the confusion matrix as an annotated heatmap.
plt.figure()
heatmap = sns.heatmap(
    confusion_matrix_df,
    cmap='YlGnBu',
    annot=True,
    fmt='d',
    annot_kws={'size': 14},
)
# Keep the tick labels horizontal on both axes (y first, then x).
for axis in (heatmap.yaxis, heatmap.xaxis):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=0, ha='right', fontsize=14)

plt.title(
    'Confusion Matrix dari Model Data Testing\n(Random Forest)',
    color='darkblue',
    fontsize=18,
)
plt.ylabel('True Label', fontsize=14)
plt.xlabel('Predicted Label', fontsize=14)
plt.show()