In [25]:
import pandas as pd
import numpy as np
import sklearn
import yaml
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import os
import sys

In [26]:
# Put the parent directory on the import path so the `src` package can be
# imported from this notebook. Guarded so that re-running the cell does not
# keep appending duplicate entries to sys.path.
PROJECT_ROOT = os.path.abspath('../')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [27]:
from src.data.dataloader import DataLoader

In [28]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
# Location of the input CSV, given relative to the notebook's working directory.
DATA_PATH = "../data/heart_cleveland_upload.csv"

In [30]:
# YAML text describing the dataset for the project DataLoader: a `target`
# entry plus a `categorical` list of {name, values} items. It is parsed with
# yaml.safe_load and unpacked into DataLoader(...) in later cells.
# NOTE(review): `values` presumably enumerates the allowed category codes for
# each column — confirm against src.data.dataloader.
reader_conf = '''
target: condition
categorical:
    - name: sex
      values: [0,1]
    - name: cp
      values: [0, 1, 2, 3]
    - name: fbs
      values: [0, 1]
    - name: restecg
      values: [0, 1, 2]
    - name: exang
      values: [0, 1]
    - name: slope
      values: [0, 1, 2]
    - name: ca
      values: [0, 1, 2, 3]
    - name: thal
      values: [0, 1, 2]
'''

In [31]:
# Parse the YAML text into a dict. The name `reader_conf` is rebound from str
# to dict here, so the guard makes re-running this cell a no-op instead of an
# error (yaml.safe_load cannot read an already-parsed dict).
if isinstance(reader_conf, str):
    reader_conf = yaml.safe_load(reader_conf)

In [32]:
# Build the project DataLoader, passing each top-level key of the parsed
# config (`target`, `categorical`) as a keyword argument.
data_reader = DataLoader(**reader_conf)

In [33]:
# Read the CSV at DATA_PATH; read_data returns a pair that is unpacked into
# the feature set `X` and the label set `target`.
# NOTE(review): both are presumably pandas objects, since `.values` is taken
# from each in the next cell — confirm against DataLoader.read_data.
X, target = data_reader.read_data(DATA_PATH)

In [34]:
# Convert features and labels to NumPy arrays for scikit-learn.
# np.asarray is used instead of `.values` so the cell stays re-runnable:
# once X/target are already ndarrays, `.values` would raise AttributeError,
# whereas np.asarray returns the existing array unchanged.
X = np.asarray(X)
target = np.asarray(target)

In [35]:
# Keep 80% of the rows for training and hold out the rest for evaluation;
# the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target,
    train_size=0.8,
    random_state=42,
)

In [36]:
# Two-step model: standardise the features, then fit a logistic regression
# (liblinear solver) on the scaled values.
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(solver='liblinear')),
    ]
)

In [37]:
# Fit the scaler and the classifier on the training split only. As the last
# expression in the cell, the call's return value (the pipeline) is displayed.
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', LogisticRegression(solver='liblinear'))])

In [38]:
# Predicted labels for the training and held-out splits, in that order.
y_train_pred, y_test_pred = map(pipe.predict, (X_train, X_test))

In [39]:
# Accuracy on the training split.
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.8945147679324894

In [40]:
# Accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.7666666666666667

In [41]:
# F1 score on the training split.
f1_score(y_true=y_train, y_pred=y_train_pred)

0.8815165876777251

In [42]:
# F1 score on the held-out test split.
f1_score(y_true=y_test, y_pred=y_test_pred)

0.7586206896551724