<a href="https://colab.research.google.com/github/mohmaed7777/DecisionTress-for-Brain-Strokes-Predictions/blob/main/Decision_Trees_classifier_to_predict_Brain_Strokes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Attribute Information:**

1) **gender**: 
"Male", "Female" or "Other"

2) **age**: age of the patient

3) **hypertension**: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

4) **heart_disease**: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

5) **ever_married**: "No" or "Yes"

6) **work_type**: "children", "Govt_jov", "Never_worked", "Private" or "Self-employed"

7)**Residence_type**: "Rural" or "Urban"

8) **avg_glucose_level**: average glucose level in blood

9) **bmi**: body mass index

10) **smoking_status**: "formerly smoked", "never smoked", "smokes" or "Unknown"*

11) **stroke**: 1 if the patient had a stroke or 0 if not
*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn  as sns 
import sklearn 
%matplotlib inline 
sns.set()
pd.plotting.register_matplotlib_converters()
print('Setup Complete!')

Setup Complete!


In [2]:
data_path = '/content/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
df.drop(['id', 'bmi'], axis=1, inplace=True)

In [4]:
df_uniques = df.nunique()
df_uniques

gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
smoking_status          4
stroke                  2
dtype: int64

In [5]:
binary_vals = list(df_uniques[df_uniques == 2].index)
binary_vals

['hypertension', 'heart_disease', 'ever_married', 'Residence_type', 'stroke']

In [6]:
df[binary_vals].dtypes

hypertension       int64
heart_disease      int64
ever_married      object
Residence_type    object
stroke             int64
dtype: object

In [7]:
categorical_vals  = list(df_uniques[(df_uniques > 2) & (df_uniques <=6)].index)
categorical_vals

['gender', 'work_type', 'smoking_status']

In [8]:
[[i, list(df[i].unique())] for i in categorical_vals]

[['gender', ['Male', 'Female', 'Other']],
 ['work_type',
  ['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked']],
 ['smoking_status', ['formerly smoked', 'never smoked', 'smokes', 'Unknown']]]

In [9]:
numaric_vals = list(set(df.columns) - set(categorical_vals)- set(binary_vals))
numaric_vals

['age', 'avg_glucose_level']

In [10]:
# Encoding processing : 
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder

In [11]:
lb, le, lo = LabelBinarizer(), LabelEncoder(), OrdinalEncoder()

In [12]:
for col in categorical_vals: 
  df[col] = le.fit_transform(df[col])

In [13]:
for col in binary_vals:
  df[col] = lb.fit_transform(df[col])

In [14]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,1,1
1,0,61.0,0,0,1,3,0,202.21,2,1
2,1,80.0,0,1,1,2,0,105.92,2,1
3,0,49.0,0,0,1,2,1,171.23,3,1
4,0,79.0,1,0,1,3,0,174.12,2,1


In [15]:
feature_col = [x for x in df.columns if x not in 'stroke']

In [16]:
df[feature_col].dtypes

gender                 int64
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int64
work_type              int64
Residence_type         int64
avg_glucose_level    float64
smoking_status         int64
dtype: object

In [17]:
from sklearn.model_selection import StratifiedShuffleSplit

In [18]:
strat_shuff_split = StratifiedShuffleSplit(n_splits=1, test_size=1000, random_state=42)

In [19]:
# Get the index values from the genetor: 
train_index , test_index = next(strat_shuff_split.split(df[feature_col], df['stroke']))

In [20]:
# Create the datasets : 
x_train = df.loc[train_index, feature_col]
y_train = df.loc[train_index, 'stroke']
x_test = df.loc[test_index, feature_col]
y_test = df.loc[test_index, 'stroke']

In [21]:
# Checking the perentage compistion of each level in the train and test sets: 
y_train.value_counts(normalize=True).sort_index()

0    0.951338
1    0.048662
Name: stroke, dtype: float64

In [22]:
y_test.value_counts(normalize=True).sort_index()

0    0.951
1    0.049
Name: stroke, dtype: float64

**1/Applying KNN algorithm:-**

In [23]:
# Estimate Knn model and report the outcome : 
from sklearn.neighbors import KNeighborsClassifier

In [24]:

knn = KNeighborsClassifier(n_neighbors=5)


In [25]:
# Train the model :
knn = knn.fit(x_train, y_train)

In [28]:
y_pred = knn.predict(x_test)
y_pred[:5]

array([0, 0, 0, 0, 0])

**Create the Model:**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier(random_state=42)
dt = dt.fit(x_train, y_train)

In [None]:
# Determine the number of nodes and the maximum depth : 
dt.tree_.node_count, dt.tree_.max_depth

(657, 18)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Function to return error metrics : 
def measure_error(y_true, y_pred, label):
  return pd.Series({'accuracy': accuracy_score(y_true, y_pred),
                    'precision':precision_score(y_true, y_pred),
                    'recall':recall_score(y_true, y_pred),
                    'f1':f1_score(y_true, y_pred)},
                   name=label)

In [None]:
y_train_pred = dt.predict(x_train)
y_test_pred = dt.predict(x_test)

In [None]:
train_test_full_error = pd.concat([measure_error(y_train, y_train_pred, 'train'),
                                   measure_error(y_test, y_test_pred, 'test')],
                                  axis=1)

In [None]:
train_test_full_error

Unnamed: 0,train,test
accuracy,1.0,0.915
precision,1.0,0.166667
recall,1.0,0.183673
f1,1.0,0.174757


**Using Grid Search with cross validation:**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {'max_depth': range(1, dt.tree_.max_depth + 1, 2),
              'max_features': range(1, len(dt.feature_importances_) + 1)}

GR = GridSearchCV(DecisionTreeClassifier(random_state=42),
                  param_grid=param_grid,
                  scoring='accuracy',
                  n_jobs=-1) 
GR = GR.fit(x_train, y_train)             

In [None]:
# Get the number of nodes and maximum depth : 
GR.best_estimator_.tree_.node_count, GR.best_estimator_.tree_.max_depth

(3, 1)

In [None]:
y_train_pred_gr = GR.predict(x_train)
y_test_pred_gr = GR.predict(x_test)

In [None]:
train_test_full_error_gr = pd.concat([measure_error(y_train, y_train_pred, 'train'),
                                   measure_error(y_test, y_test_pred, 'test')],
                                  axis=1)

In [None]:
train_test_full_error_gr

Unnamed: 0,train,test
accuracy,1.0,0.915
precision,1.0,0.166667
recall,1.0,0.183673
f1,1.0,0.174757
