In [1]:
#Import Dependencies

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

#Set pandas to show all columns
pd.set_option('display.max_columns', None)

In [2]:
#Read CSV
raw_data_df = pd.read_csv('../healthcare-dataset-stroke-data.csv')
raw_data_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
#Use Get_Dummies function to encode all variables (change from text to numeric)

dummies_df = pd.get_dummies(data = raw_data_df, columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])
dummies_df

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
1,51676,61.0,0,0,202.21,,1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
2,31112,80.0,0,1,105.92,32.5,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,80.0,1,0,83.75,,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0
5106,44873,81.0,0,0,125.20,40.0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0
5107,19723,35.0,0,0,82.99,30.6,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
5108,37544,51.0,0,0,166.29,25.6,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0


In [4]:
#drop ID Column

dummies_df = dummies_df.drop(columns = 'id')

In [5]:
#Drop all rows with null values in any column
clean_df = dummies_df.dropna(axis = 0, how ='any')
clean_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0


In [6]:
#Build X and y dataframes for training and testing machine learning model
#original code for specifying y:
y = clean_df['stroke']

#using ravel to specify y:
#y = clean_df['stroke'].ravel()

X = clean_df.drop(columns = 'stroke')


In [7]:
#Split the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# Scale the Data for Random forest model

# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

<h3>Random Forest Model</h3>

In [9]:
 # Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)

In [10]:
 # Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [11]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [12]:
 # Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [13]:
 # Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1169,1
Actual 1,58,0


Accuracy Score : 0.9519543973941368
Classification Report
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1170
           1       0.00      0.00      0.00        58

    accuracy                           0.95      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.91      0.95      0.93      1228



In [14]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.27303793291263223, 'avg_glucose_level'),
 (0.2297562549319405, 'age'),
 (0.22121047303983854, 'bmi'),
 (0.02973130200775081, 'hypertension'),
 (0.026018780410252105, 'heart_disease'),
 (0.020691982198009655, 'Residence_type_Rural'),
 (0.020245940404816865, 'smoking_status_never smoked'),
 (0.01959865343802406, 'Residence_type_Urban'),
 (0.019192532117477706, 'gender_Female'),
 (0.018888270133812186, 'work_type_Private'),
 (0.018714660816537082, 'gender_Male'),
 (0.018689960866098067, 'smoking_status_formerly smoked'),
 (0.017354835541913197, 'work_type_Self-employed'),
 (0.01644780968409576, 'smoking_status_smokes'),
 (0.01483286483113816, 'work_type_Govt_job'),
 (0.014326061092694985, 'smoking_status_Unknown'),
 (0.01069128521581811, 'ever_married_Yes'),
 (0.00973257791462618, 'ever_married_No'),
 (0.0008241941653865719, 'work_type_children'),
 (1.2532147389860333e-05, 'work_type_Never_worked'),
 (1.0961297474195939e-06, 'gender_Other')]

<h3>Logistic Regression Models</h3>

<h5>Model 1: All Columns Included</h5>

In [15]:
#Instantiate Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

LogisticRegression(max_iter=200, random_state=1)

In [16]:
#Fit the model using training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=200, random_state=1)

In [17]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9592502037489813
Testing Data Score: 0.9527687296416938


In [18]:
# Calculate Accurace Score for Model Using Testing Data
accuracy_score(y_test, predictions)

0.9519543973941368

<h5>Model 2: Optimize model by dropping marriage and work type variables from dataset</h5>

In [19]:
#display clean dataframe with all columns
clean_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0


In [20]:
#Build X and y dataframes for training and testing machine learning model. 
#Drop married and work type columns from dataset before splitting into training and test sets 

clean_df_2 = clean_df.drop(columns = ['ever_married_Yes', 
                                      'ever_married_No', 
                                      'work_type_Govt_job', 
                                      'work_type_Never_worked', 
                                      'work_type_Private', 
                                      'work_type_Self-employed', 
                                      'work_type_children'])
clean_df_2


y = clean_df_2['stroke']

X = clean_df_2.drop(columns = 'stroke')

In [21]:
#Split the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [22]:
#Instantiate Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=2)
classifier

LogisticRegression(max_iter=200, random_state=2)

In [23]:
#Fit the model using training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=200, random_state=2)

In [24]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9568052159739201
Testing Data Score: 0.9600977198697068


<h5>Model 3: Optimize model by dropping gender, work type, and marriage columns prior to splitting into testing and training datasets</h5>

In [25]:
#display clean dataframe with all columns
clean_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0


In [26]:
#Build X and y dataframes for training and testing machine learning model. 
#Drop gender, work type, and marriage columns from dataset before splitting into training and test sets 

clean_df_3 = clean_df.drop(columns = ['gender_Female', 
                                      'gender_Male',
                                      'gender_Other',
                                      'ever_married_No',
                                      'ever_married_Yes',
                                      'work_type_Govt_job', 
                                      'work_type_Never_worked', 
                                      'work_type_Private', 
                                      'work_type_Self-employed', 
                                      'work_type_children'])
clean_df_3


y = clean_df_3['stroke']

X = clean_df_3.drop(columns = 'stroke')

In [27]:
#Split the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [28]:
#Instantiate Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=2)
classifier

LogisticRegression(max_iter=200, random_state=2)

In [29]:
#Fit the model using training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=200, random_state=2)

In [30]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9568052159739201
Testing Data Score: 0.9600977198697068
