In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
import joblib

In [20]:
import fastapi
fastapi.__version__

'0.78.0'

In [9]:
df = pd.read_csv("heart_cleveland_upload.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [10]:
train_df, valid_df = train_test_split(df, test_size=0.2, shuffle=True)

In [15]:
np.array(df.iloc[3, :]),

(array([ 65. ,   1. ,   0. , 138. , 282. ,   1. ,   2. , 174. ,   0. ,
          1.4,   1. ,   1. ,   0. ,   1. ]),)

In [None]:
curl -X 'GET' \
  'http://0.0.0.0:1234/predict' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "data":[[65.0, 1.0, 0.0, 138.0, 282.0, 1.0, 2.0, 174.0, 0.0, 1.4, 1.0, 1.0, 0.0]],
  "feature_names": ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal"]
}'

In [None]:
curl -X 'GET' \
  'http://0.0.0.0:1234/predict' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "data":[[65.0, 1.0, 0.0, 138.0, 282.0, 1.0, 2.0, 174.0, 0.0, 1.4, 1.0, 1.0, 0.0],
[65.0, 1.0, 0.0, 138.0, 282.0, 1.0, 2.0, 174.0, 0.0, 1.4, 1.0, 1.0, 0.0]],
  "feature_names": ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal"]
}'

In [None]:
curl -X 'GET' \
  'http://0.0.0.0:5000/predict' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "data":[[65., 1., 0., 138., 282., 1., 2., 174., 0., 1.4, 1., 1., 0.]],
  "feature_names": [age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal]
}'

In [19]:
X_train, y_train = train_df.drop(columns=["condition"]), np.array(train_df["condition"])

In [20]:
X_test, y_test = valid_df.drop(columns=["condition"]), np.array(valid_df["condition"])

In [21]:
cat_features = ["sex", "fbs", "restecg", "exang", "slope", "ca", "thal"]
num_features = ["age", "cp", "trestbps", "chol", "thalach", "oldpeak"]

In [46]:
len(cat_features) + len(num_features)

13

In [22]:
numeric_transformer = Pipeline(steps=[("MinMaxScaler",
                                               MinMaxScaler())])

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
])

In [23]:
train_df_proccesed = preprocessor.fit_transform(train_df)

In [38]:
joblib.dump(preprocessor, "transformer.pkl")

['transformer.pkl']

In [24]:
test_df_proccesed = preprocessor.transform(X_test)

In [25]:
test_df_proccesed

array([[0.64583333, 0.66666667, 0.0754717 , ..., 1.        , 0.        ,
        0.        ],
       [0.52083333, 0.66666667, 0.52830189, ..., 0.        , 0.        ,
        1.        ],
       [0.77083333, 1.        , 0.24528302, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.625     , 0.        , 0.62264151, ..., 1.        , 0.        ,
        0.        ],
       [0.4375    , 0.66666667, 0.43396226, ..., 0.        , 0.        ,
        1.        ],
       [0.33333333, 0.        , 0.1509434 , ..., 0.        , 0.        ,
        1.        ]])

In [26]:
from typing import NoReturn
from abc import ABC, abstractmethod

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


class Model(ABC):

    @abstractmethod
    def __init__(self) -> NoReturn:
        super().__init__()
        self.model = None
        self.name = type(self).__name__

    def fit(self, x, y) -> NoReturn:
        self.model.fit(x, y)

    def fit_predict(self, x_train, y_train, x) -> np.ndarray:
        self.model.fit(x_train, y_train)
        y_preds = self.model.predict(x)
        return y_preds

    def predict(self, x) -> np.ndarray:
        y_preds = self.model.predict(x)
        return y_preds


class LR(Model):

    def __init__(self, **kwargs) -> NoReturn:
        super().__init__()
        self.model = LogisticRegression(**kwargs)


class RF(Model):

    def __init__(self, **kwargs) -> NoReturn:
        super().__init__()
        self.model = RandomForestClassifier(**kwargs)

In [39]:
model = LogisticRegression()

In [40]:
model.fit(train_df_proccesed, y_train)

LogisticRegression()

In [41]:
y_preds_train = model.predict(train_df_proccesed)

In [42]:
roc_auc_score(y_train, y_preds_train), accuracy_score(y_train, y_preds_train), f1_score(y_train, y_preds_train)

(0.8670327304048234, 0.869198312236287, 0.8544600938967136)

In [43]:
y_preds = model.predict(test_df_proccesed)

In [44]:
roc_auc_score(y_test, y_preds), accuracy_score(y_test, y_preds), f1_score(y_test, y_preds)

(0.7608453837597331, 0.7666666666666667, 0.7083333333333333)

In [45]:
joblib.dump(model, "lr_model.pkl")

['lr_model.pkl']