In [23]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_csv("/Users/linhphung/Documents/Others/DS Projects/healthcare-dataset-stroke-data.csv")
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
data.dropna()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
# select features 
X = data.iloc[:,1:11]
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked


In [6]:
# select predicted column
y = data['stroke']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: stroke, dtype: int64

In [7]:
# split data
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state = 1)

# Without tuning

In [8]:
# Get a list of categorical columns
categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ])

In [16]:
# Define. Using default parameters
forest_model = RandomForestClassifier(random_state=1)

In [17]:
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', forest_model)
                             ])

In [18]:
# Fit the model using training data
my_pipeline.fit(train_X, train_y)

In [20]:
# Get prediction
preds = my_pipeline.predict(test_X)

In [21]:
# Calculate the accuracy score
accuracy = accuracy_score(test_y, preds)
print("Accuracy: ", accuracy)

Accuracy:  0.9413145539906104


# With hyperparameter tuning through pipeline

Reference for GridSearch + Pipeline:
https://amueller.github.io/aml/01-ml-workflow/12-pipelines-gridsearch.html#pipeline-and-gridsearchcv

In [30]:
# Hyperparameter tuning
# Creating grid with different parameters
params = {'model__n_estimators': [50, 100, 200, 300, 400],
          'model__max_depth': np.arange(1, 10)}

# Identifying best hyperparameters
grid = GridSearchCV(estimator = my_pipeline,
                    param_grid = params,
                    scoring = 'accuracy',
                    cv=10)

In [31]:
# Fit model to find the best hyperparameter
grid.fit(train_X, train_y)

In [32]:
# Show best combination of hyperparameters and accuracy
print(grid.best_params_)
print(grid.best_score_)

{'model__max_depth': 1, 'model__n_estimators': 50}
0.9545936684073109


In [33]:
# Final model with optimal hyperparameters
final_model = RandomForestClassifier(max_depth=50, n_estimators=1,random_state=1)

In [34]:
# Create new pipeline using best model
final_pipeline = Pipeline(steps=[('preprocess',preprocessor)
                                 , ('model',final_model)
                                 ])

In [35]:
# Fit best model
final_pipeline.fit(train_X,train_y)

In [37]:
pred2 = final_pipeline.predict(test_X)
accuracy2 = accuracy_score(test_y, pred2)
print("Accuracy: ", accuracy2)

Accuracy:  0.9389671361502347


# With hyperparameter tuning not through pipeline