In [1]:
import os

import numpy as np
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

print("Setup Complete")

Setup Complete


# Read data

In [2]:
datapath = os.path.join('..', 'data', 'raw', 'heart.csv')
print(datapath)

..\data\raw\heart.csv


In [3]:
df = pd.read_csv(datapath)

In [4]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# Data types
for col in df.columns:
    print(col, np.dtype(df[col]))

age int64
sex int64
cp int64
trestbps int64
chol int64
fbs int64
restecg int64
thalach int64
exang int64
oldpeak float64
slope int64
ca int64
thal int64
target int64


# Scaling

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
scaler = StandardScaler()

In [8]:
scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
scaled_df.target = df.target

In [9]:
scaled_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.952197,0.681005,1.973123,0.763956,-0.256334,2.394438,-1.005832,0.015443,-0.696631,1.087338,-2.274579,-0.714429,-2.148873,1
1,-1.915313,0.681005,1.002577,-0.092738,0.072199,-0.417635,0.898962,1.633471,-0.696631,2.122573,-2.274579,-0.714429,-0.512922,1
2,-1.474158,-1.468418,0.032031,-0.092738,-0.816773,-0.417635,-1.005832,0.977514,-0.696631,0.310912,0.976352,-0.714429,-0.512922,1
3,0.180175,0.681005,0.032031,-0.663867,-0.198357,-0.417635,0.898962,1.239897,-0.696631,-0.206705,0.976352,-0.714429,-0.512922,1
4,0.290464,-1.468418,-0.938515,-0.663867,2.08205,-0.417635,0.898962,0.583939,1.435481,-0.379244,0.976352,-0.714429,-0.512922,1


# Prepare target and features, split on train and test

In [10]:
from sklearn.model_selection import train_test_split 

### Dataset from usual data

In [11]:
data = df.copy()

In [12]:
drop_cols = ['target', 'fbs', ]
y = data.target
X = data.drop(drop_cols, axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,0,150,0,2.3,0,0,1
1,37,1,2,130,250,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,172,0,1.4,2,0,2
3,56,1,1,120,236,1,178,0,0.8,2,0,2
4,57,0,0,120,354,1,163,1,0.6,2,0,2


In [13]:
# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.25)

# Random forest simple model

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
clf = RandomForestClassifier(n_estimators = 150)

In [16]:
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=150)

0

In [17]:
y_preds = clf.predict(X_test)

In [18]:
from sklearn.metrics import roc_auc_score, f1_score

In [19]:
f1_score(y_test, y_preds)

0.8780487804878048

In [30]:
roc_auc_score(y_test, y_preds)

0.8666666666666666

# LogReg model

In [151]:
from sklearn.linear_model import LogisticRegression

In [152]:
clf_2 = LogisticRegression(penalty='l1', solver='liblinear')

In [153]:
clf_2.fit(X_train, y_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [154]:
y_preds = clf_2.predict(X_test)

In [155]:
f1_score(y_test, y_preds)

0.8723404255319149

# LogReg with scaled data

In [156]:
scaled_data = scaled_df.copy()

In [157]:
drop_cols = ['target', 'fbs', ]
y = scaled_data.target
X = scaled_data.drop(drop_cols, axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,restecg,thalach,exang,oldpeak,slope,ca,thal
0,0.952197,0.681005,1.973123,0.763956,-0.256334,-1.005832,0.015443,-0.696631,1.087338,-2.274579,-0.714429,-2.148873
1,-1.915313,0.681005,1.002577,-0.092738,0.072199,0.898962,1.633471,-0.696631,2.122573,-2.274579,-0.714429,-0.512922
2,-1.474158,-1.468418,0.032031,-0.092738,-0.816773,-1.005832,0.977514,-0.696631,0.310912,0.976352,-0.714429,-0.512922
3,0.180175,0.681005,0.032031,-0.663867,-0.198357,0.898962,1.239897,-0.696631,-0.206705,0.976352,-0.714429,-0.512922
4,0.290464,-1.468418,-0.938515,-0.663867,2.08205,0.898962,0.583939,1.435481,-0.379244,0.976352,-0.714429,-0.512922


In [158]:
clf_3 = LogisticRegression(penalty='l1', solver='saga')

In [159]:
clf_3.fit(X_train, y_train)



LogisticRegression(penalty='l1', solver='saga')

In [160]:
y_preds = clf_3.predict(X_test)

In [161]:
f1_score(y_test, y_preds)

0.7524752475247525

# One pipeline model

In [162]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [163]:
pipe = Pipeline(
    [
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('logreg', LogisticRegression(penalty='l1', solver='liblinear'))
        
    ]
)

In [164]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('logreg',
                 LogisticRegression(penalty='l1', solver='liblinear'))])

In [165]:
y_preds = pipe.predict(X_test)

In [166]:
f1_score(y_test, y_preds)

0.8723404255319149

### Export model

In [168]:
import joblib

with open('clf_hw2.pkl', 'wb') as output_file:
    joblib.dump(pipe, output_file)


In [169]:
X_test.to_csv('test_for_predict.csv', index=True)