In [None]:
!pip install interpret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting interpret
  Downloading interpret-0.3.2-py3-none-any.whl (1.4 kB)
Collecting interpret-core[dash,debug,decisiontree,ebm,lime,linear,notebook,plotly,required,sensitivity,shap,skoperules,treeinterpreter]==0.3.2
  Downloading interpret_core-0.3.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
Collecting skope-rules>=1.0.1
  Downloading skope_rules-1.0.1-py3-none-any.whl (14 kB)
Collecting gevent>=1.3.6
  Downloading gevent-22.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dash>=1.0.0
  Downloading dash-2.9.3-py3-none-any.whl (10.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m17.1 MB/

In [None]:
import pandas as pd
# Makes sure we see all columns
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from interpret.glassbox import (LogisticRegression, ClassificationTree)
from interpret import show
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Colab-only: attach Google Drive so the dataset under MyDrive is readable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import os

# Folder inside the mounted Drive that is scanned for the dataset file.
path = "/content/drive/MyDrive"

# Keep only the directory entries whose name begins with "healthcare".
fnames = list(filter(lambda name: name.startswith("healthcare"), os.listdir(path)))

print(fnames)

['healthcare-dataset-stroke-data.csv']


In [None]:
class DataLoader():
  """Load a CSV dataset and prepare it for model training.

  Typical usage is ``load_dataset()`` -> ``preprocess_data()`` ->
  ``get_data_split()`` -> ``oversample()`` on the training portion.
  """

  def __init__(self):
    # Working DataFrame; stays None until load_dataset() has been called.
    self.data = None

  def _require_data(self):
    """Raise a clear error when a method is used before any data is loaded."""
    if self.data is None:
      raise RuntimeError("No data loaded; call load_dataset() first.")

  def load_dataset(self, path="/content/drive/MyDrive/healthcare-dataset-stroke-data.csv"):
    """Read the CSV file at ``path`` into ``self.data``.

    Args:
      path: Location of the CSV file; defaults to the copy on the mounted
        Google Drive.
    """
    self.data = pd.read_csv(path)

  def preprocess_data(self):
    """One-hot encode the categorical columns and clean up the frame.

    The dummy columns are placed first, followed by the remaining original
    columns in their original order. Missing ``bmi`` values are replaced by 0
    and the ``id`` column is removed.

    Not idempotent: a second call fails because the categorical columns and
    ``id`` no longer exist.

    Raises:
      RuntimeError: If no dataset has been loaded yet.
    """
    self._require_data()
    categorical_cols = ["gender",
                        "ever_married",
                        "work_type",
                        "Residence_type",
                        "smoking_status"]
    encoded = pd.get_dummies(self.data[categorical_cols],
                             prefix=categorical_cols)
    # Build a new frame instead of mutating in place; the column order
    # (dummies first, then the untouched columns) is preserved.
    processed = pd.concat(
        [encoded, self.data.drop(columns=categorical_cols)], axis=1)

    # NOTE(review): 0 is not a plausible BMI; kept to preserve existing
    # results, but a median/mean imputation may be more appropriate.
    processed["bmi"] = processed["bmi"].fillna(0)
    self.data = processed.drop(columns=["id"])

  def get_data_split(self):
    """Split the data into train and test parts (80/20, fixed seed).

    The last column of ``self.data`` is used as the target; every other
    column is a feature.

    Returns:
      The ``(x_train, x_test, y_train, y_test)`` result of
      ``train_test_split``.

    Raises:
      RuntimeError: If no dataset has been loaded yet.
    """
    self._require_data()
    x = self.data.iloc[:, :-1]
    y = self.data.iloc[:, -1]
    return train_test_split(x, y, test_size=0.20, random_state=2021)

  def oversample(self, x_train, y_train, random_state=None):
    """Randomly oversample the minority class of the given training data.

    Args:
      x_train: Feature DataFrame to resample.
      y_train: Target Series aligned with ``x_train``.
      random_state: Optional seed forwarded to ``RandomOverSampler`` so the
        resampling can be made reproducible. ``None`` keeps the sampler's
        default (unseeded) behaviour.

    Returns:
      ``(x_over, y_over)``: a DataFrame with the columns of ``x_train`` and a
      Series named like ``y_train``, both with a fresh 0..n-1 index.
    """
    sampler = RandomOverSampler(sampling_strategy='minority',
                                random_state=random_state)
    # Use the arguments that were passed in, not notebook-level globals.
    x_np = x_train.to_numpy()
    y_np = y_train.to_numpy()
    x_np, y_np = sampler.fit_resample(x_np, y_np)
    x_over = pd.DataFrame(x_np, columns=x_train.columns)
    y_over = pd.Series(y_np, name=y_train.name)
    return x_over, y_over

                             

In [None]:
# %% Load and preprocess data
# Order matters: the CSV has to be read (default path on the mounted Drive)
# before preprocess_data() can encode and clean it.
data_loader = DataLoader()
data_loader.load_dataset()
data_loader.preprocess_data()

In [None]:
# Split the data for evaluation and report the train / test shapes
X_train, X_test, y_train, y_test = data_loader.get_data_split()
print(X_train.shape, X_test.shape, sep="\n")

# Oversample the train data only; the test split is left untouched
X_train, y_train = data_loader.oversample(X_train, y_train)
print("After oversampling:", X_train.shape)


(4088, 21)
(1022, 21)
After oversampling: (7778, 21)


In [None]:
# %% Fit logistic regression model
# `LogisticRegression` is the glassbox wrapper imported from `interpret`,
# configured with an L1 penalty, the liblinear solver and a fixed seed.
lr = LogisticRegression(
    feature_names=X_train.columns,
    penalty='l1',
    solver='liblinear',
    random_state=20,
)
lr.fit(X_train, y_train)
print("Training finished.")


Training finished.


In [None]:
# %% Evaluate logistic regression model
# Score the held-out test split; macro-averaged F1 is printed first,
# then plain accuracy, one metric per line.
y_pred = lr.predict(X_test)
print(
    f"F1 score {f1_score(y_test, y_pred, average='macro')}",
    f"Accuracy {accuracy_score(y_test, y_pred)}",
    sep="\n",
)


F1 score 0.5138996389726957
Accuracy 0.7348336594911937


In [None]:
# %% Explain local prediction
# Build per-sample explanations for the first 100 test rows and render them.
lr_local = lr.explain_local(
    X_test.iloc[:100],
    y_test.iloc[:100],
    name='Logistic Regression',
)
show(lr_local)


In [None]:

# %% Fit decision tree model
# `ClassificationTree` is the glassbox tree imported from `interpret`;
# it is trained on the oversampled training data with default settings.
tree = ClassificationTree()
tree.fit(X_train, y_train)
print("Training Finished")

# Evaluate on the untouched test split with the same metrics and output
# format as the logistic regression cell (note the space after "Accuracy").
y_pred = tree.predict(X_test)
print(f"F1 score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")


Training Finished
F1 score 0.5240452498594523
Accuracy0.7544031311154599


In [None]:
# %% Explain local prediction
# Build per-sample explanations for the first 100 test rows and render them.
tree_local = tree.explain_local(
    X_test.iloc[:100],
    y_test.iloc[:100],
    name='Tree',
)
show(tree_local)
