In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
#To predict anything, we do not need the id 
df.drop('id', axis = 1, inplace=True)

In [4]:
#df['gender'].unique()
#df['gender'] = df['gender'].replace(['Male', 'Female', 'Other'], [0,1,2])

In [5]:
df['ever_married'].unique()
df['ever_married'] = df['ever_married'].replace(['Yes', 'No'], [1,0])

In [6]:
#df['work_type'].unique()
#df['work_type'] = df['work_type'].replace(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'], [0,1,2,3,4])

In [7]:
#df['Residence_type'].unique()
#df['Residence_type'] = df['Residence_type'].replace(['Urban', 'Rural'], [1,0])

In [8]:
#df['smoking_status'].unique()
#df['smoking_status'] = df['smoking_status'].replace(['formerly smoked', 'never smoked', 'smokes', 'Unknown'], [0,1,2,3])

In [9]:
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [10]:
df['stroke'].unique()

array([1, 0], dtype=int64)

In [11]:
temp_df = pd.get_dummies(df[['gender', 'work_type','Residence_type','smoking_status']], drop_first= True)

df = pd.concat([df, temp_df], axis = 1)

df.drop(['gender', 'work_type','Residence_type','smoking_status'], axis = 1, inplace=True)


In [12]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,1,228.69,36.6,1,1,0,0,1,0,0,1,1,0,0
1,61.0,0,0,1,202.21,28.893237,1,0,0,0,0,1,0,0,0,1,0
2,80.0,0,1,1,105.92,32.5,1,1,0,0,1,0,0,0,0,1,0
3,49.0,0,0,1,171.23,34.4,1,0,0,0,1,0,0,1,0,0,1
4,79.0,1,0,1,174.12,24.0,1,0,0,0,0,1,0,0,0,1,0


In [13]:
#Now getting all X values(everything but the stroke column)
X = df.drop('stroke', axis = 1)

# the label is stroke
y = df['stroke']

In [14]:
sample = RandomOverSampler(sampling_strategy= 1)
X_sam, y_sam = sample.fit_resample(X,y)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_sam, y_sam, test_size= 0.33)

In [16]:
#First trying Logistic Regression (I don't think this is the best but still giving it a try)
logistic_model = LogisticRegression()

logistic_model.fit(X_train, y_train)
logistic_predictions = logistic_model.predict(X_test)

print(logistic_model.score(X_test,y_test))

print(classification_report(y_test, logistic_predictions))

# 76% isn't bad

0.7675288251791835
              precision    recall  f1-score   support

           0       0.79      0.73      0.76      1593
           1       0.75      0.81      0.78      1616

    accuracy                           0.77      3209
   macro avg       0.77      0.77      0.77      3209
weighted avg       0.77      0.77      0.77      3209

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
#Trying KNN
best_score = 0
best_neighbors = 0
best_predictions = 0
for num in range(1,10):
    knn = KNeighborsClassifier(n_neighbors= num)

    knn.fit(X_train,y_train)
    knn_predictions  = knn.predict(X_test)

    print('For ' + str(num) + ' neighbors, the accuracy is:')
    score = knn.score(X_test,y_test)
    print(score)

    if(score > best_score):
        best_score = score
        best_neighbors = num
        best_predictions = knn_predictions

print('\n\nThe best neighbors=' + str(best_neighbors) + ' with accuracy=' + str(best_score))

print(classification_report(y_test, knn_predictions))
#Not even better than logistic regression

For 1 neighbors, the accuracy is:
0.9738236210657526
For 2 neighbors, the accuracy is:
0.9738236210657526
For 3 neighbors, the accuracy is:
0.9488937363664693
For 4 neighbors, the accuracy is:
0.9488937363664693
For 5 neighbors, the accuracy is:
0.9286382050483016
For 6 neighbors, the accuracy is:
0.9286382050483016
For 7 neighbors, the accuracy is:
0.908382673730134
For 8 neighbors, the accuracy is:
0.908382673730134
For 9 neighbors, the accuracy is:
0.8896852602056715


The best neighbors=1 with accuracy=0.9738236210657526
              precision    recall  f1-score   support

           0       1.00      0.78      0.88      1593
           1       0.82      1.00      0.90      1616

    accuracy                           0.89      3209
   macro avg       0.91      0.89      0.89      3209
weighted avg       0.91      0.89      0.89      3209



In [18]:
#Finally, we try Random Forest, I think this one should be the best

rf = RandomForestClassifier()

rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)

print(rf.score(X_test, y_test))
print(classification_report(y_test, rf_predictions))

0.9900280461202867
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1593
           1       0.98      1.00      0.99      1616

    accuracy                           0.99      3209
   macro avg       0.99      0.99      0.99      3209
weighted avg       0.99      0.99      0.99      3209



#We Can see that The random forest is giving us the best accuracy, with 99% accuracy.