In [118]:
import pandas as pd
import dataprep.eda as eda
import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import dataprep.eda as eda

In [49]:
import os

In [2]:
df = pd.read_csv("../data/raw/heart_cleveland_upload.csv")

In [3]:
target_features = ["condition"]
pred_features = [colname for colname in df.columns.tolist() if colname not in target_features]
num_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
cat_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [4]:
train_size = int(df.shape[0] * 0.75)
val_size = (df.shape[0] - train_size) // 2
test_size = (df.shape[0] - train_size - val_size)

In [5]:
X_train, X_test = train_test_split(df, test_size=test_size)
X_train, X_val = train_test_split(X_train, test_size=val_size)

In [6]:
train_pool = catboost.Pool(data=X_train[pred_features], label=X_train[target_features], cat_features=cat_features)
val_pool = catboost.Pool(data=X_val[pred_features], label=X_val[target_features], cat_features=cat_features)
test_pool = catboost.Pool(data=X_test[pred_features], label=X_test[target_features], cat_features=cat_features)

In [33]:
model_params = {
    "iterations": 400, 
    "learning_rate": 0.05,
    #"depth": 4,
    #"l2_leaf_reg": 3,
    "loss_function": "Logloss",
    "eval_metric": "F1",
    "random_seed": 42,
    "use_best_model": True, 
    "verbose": 50,
    "task_type": "CPU",
    #"early_stopping_rounds": 20,
#     "od_type": "IncToDec",
#     "od_pval": 1e-2,
    "metric_period": 50,
}

In [34]:
clf = CatBoostClassifier(**model_params)
clf.fit(train_pool, eval_set=val_pool, plot=True);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8195122	test: 0.8292683	best: 0.8292683 (0)	total: 2.79ms	remaining: 1.11s
50:	learn: 0.9300000	test: 0.8717949	best: 0.8717949 (50)	total: 84.9ms	remaining: 581ms
100:	learn: 0.9901961	test: 0.9230769	best: 0.9230769 (100)	total: 162ms	remaining: 481ms
150:	learn: 0.9950739	test: 0.9230769	best: 0.9230769 (100)	total: 240ms	remaining: 395ms
200:	learn: 1.0000000	test: 0.9000000	best: 0.9230769 (100)	total: 320ms	remaining: 317ms
250:	learn: 1.0000000	test: 0.9000000	best: 0.9230769 (100)	total: 393ms	remaining: 233ms
300:	learn: 1.0000000	test: 0.9000000	best: 0.9230769 (100)	total: 468ms	remaining: 154ms
350:	learn: 1.0000000	test: 0.9000000	best: 0.9230769 (100)	total: 546ms	remaining: 76.3ms
399:	learn: 1.0000000	test: 0.9230769	best: 0.9230769 (100)	total: 622ms	remaining: 0us

bestTest = 0.9230769231
bestIteration = 100

Shrink model to first 101 iterations.


In [35]:
f1_score(X_test[target_features], clf.predict(X_test))

0.8275862068965517

In [36]:
clf.save_model("../models/catboost_model")

In [37]:
clf1 = CatBoostClassifier(**model_params)

In [38]:
clf1.load_model("../models/catboost_model")

<catboost.core.CatBoostClassifier at 0x7fe22dfc7bb0>

In [39]:
f1_score(X_test[target_features], clf1.predict(X_test))

0.8275862068965517

In [83]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [85]:
df.mean(axis=0)

age           54.542088
sex            0.676768
cp             2.158249
trestbps     131.693603
chol         247.350168
fbs            0.144781
restecg        0.996633
thalach      149.599327
exang          0.326599
oldpeak        1.055556
slope          0.602694
ca             0.676768
thal           0.835017
condition      0.461279
dtype: float64

In [84]:
(df - df.mean(axis=0)) / df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,14.457912,0.323232,-2.158249,28.306397,-13.350168,0.855219,1.003367,-18.599327,-0.326599,-0.955556,0.397306,0.323232,-0.835017,-0.461279
1,14.457912,-0.676768,-2.158249,8.306397,-8.350168,-0.144781,-0.996633,1.400673,-0.326599,0.744444,-0.602694,1.323232,-0.835017,-0.461279
2,11.457912,-0.676768,-2.158249,18.306397,-21.350168,-0.144781,-0.996633,-35.599327,-0.326599,1.544444,1.397306,-0.676768,-0.835017,-0.461279
3,10.457912,0.323232,-2.158249,6.306397,34.649832,0.855219,1.003367,24.400673,-0.326599,0.344444,0.397306,0.323232,-0.835017,0.538721
4,9.457912,0.323232,-2.158249,-21.693603,-36.350168,-0.144781,1.003367,-5.599327,0.673401,0.744444,0.397306,-0.676768,-0.835017,-0.461279


In [89]:
pd.get_dummies(df.sex, prefix="sex", drop_first=True)

Unnamed: 0,sex_1
0,1
1,0
2,0
3,1
4,1
...,...
292,1
293,1
294,1
295,0


In [107]:
from sklearn.base import BaseEstimator, TransformerMixin


class NormalizeAndOHEncode(BaseEstimator, TransformerMixin):
    def __init__(self, num_features, cat_features):
        self.num_features = num_features
        self.cat_features = cat_features
    
    def fit(self, X, y=None):
        self.means = X[self.num_features].mean(axis=0)
        self.stds = X[self.num_features].std(axis=0) + 1e-10
        return self
    
    def transform(self, X_, y=None):
        X = X_.copy()
        X[self.num_features] -= self.means
        X[self.num_features] /= self.stds
        for colname in cat_features:
            colname_oh = pd.get_dummies(X[colname], prefix=colname, drop_first=True)
            X = X.drop(colname, axis=1).join(colname_oh)
        return X

In [108]:
trans = NormalizeAndOHEncode(num_features, cat_features)

In [110]:
trans_df = trans.fit_transform(df)

In [111]:
cat_features

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [112]:
trans_df.columns

Index(['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'condition', 'sex_1',
       'cp_1', 'cp_2', 'cp_3', 'fbs_1', 'restecg_1', 'restecg_2', 'exang_1',
       'slope_1', 'slope_2', 'ca_1', 'ca_2', 'ca_3', 'thal_1', 'thal_2'],
      dtype='object')

In [113]:
import pickle

In [114]:
with open("../models/trans.pkl", "wb") as f:
    pickle.dump(trans, f)
with open("../models/trans.pkl", "rb") as f:
    trans1 = pickle.load(f)

In [116]:
trans1

(NormalizeAndOHEncode(cat_features=['sex', 'cp', 'fbs', 'restecg', 'exang',
                                    'slope', 'ca', 'thal'],
                      num_features=['age', 'trestbps', 'chol', 'thalach',
                                    'oldpeak']),
 NormalizeAndOHEncode(cat_features=['sex', 'cp', 'fbs', 'restecg', 'exang',
                                    'slope', 'ca', 'thal'],
                      num_features=['age', 'trestbps', 'chol', 'thalach',
                                    'oldpeak']))

In [120]:
report = eda.create_report(df, title="Jopa")

  0%|          | 0/1789 [00:00<?, ?it/s]

  return func(*(_execute_task(a, cache) for a in args))


In [123]:
report.save("../reports/jopa")

Report has been saved to ../reports/jopa.html!
