In [45]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from math import sqrt
import numpy as np

# Define constants
data_path = "../Data"
seed = 1
submission_path = "../Submissions"

In [46]:
# Read initial training data
train_y = pd.read_csv(data_path + "/train_labels.csv")
train_y.set_index("patient_id", inplace=True)
val = train_y['heart_disease_present']
all_features = pd.read_csv(data_path + "/train_values.csv")
all_features.set_index("patient_id", inplace=True)

In [47]:
# Read test data
submit_X = pd.read_csv(data_path + "/test_values.csv")
submit_X.set_index("patient_id", inplace=True)

In [48]:
# Define utility functions
def coefficients(model, features):
    return pd.DataFrame(np.transpose(model.coef_), index=features, columns=["coef"])

def create_submission(file_name, train_X, train_y, pred_X, pred_index):
    model = LogisticRegression(random_state = 1)
    model.fit(train_X, train_y)
    predictions = model.predict_proba(pred_X)
    submission = pd.DataFrame(predictions[:,0], index = pred_index, columns=['heart_disease_present'])
    submission.reset_index(inplace=True)
    submission.to_csv(submission_path + "/" + file_name, index=False)
    
    return model, submission

In [49]:
# Model iterations

features1 = [
    "max_heart_rate_achieved", "age", "oldpeak_eq_st_depression"
]

X1 = all_features[features1]

train_X1, test_X1, train_y1, test_y1 = train_test_split(
    X1, val, test_size = 0.6, random_state=1
)

model = LogisticRegression(random_state = 1)
model.fit(train_X1, train_y1)
pred1 = model.predict(test_X1)
logloss = log_loss(test_y1, pred1)
logloss

11.193158998231711

In [50]:
coefficients(model, features1)

Unnamed: 0,coef
max_heart_rate_achieved,-0.023543
age,0.024027
oldpeak_eq_st_depression,0.686174


In [51]:
features2 = features1 + [
    'resting_ekg_results', 'serum_cholesterol_mg_per_dl', 'num_major_vessels', 'resting_blood_pressure'
]

X2 = all_features[features2]

train_X2, test_X2, train_y2, test_y2 = train_test_split(
    X2, val, test_size = 0.6, random_state=1
)

model = LogisticRegression(random_state = 1)
model.fit(train_X2, train_y2)
pred2 = model.predict(test_X2)
logloss = log_loss(test_y2, pred2)
logloss

10.87337032045347

In [52]:
coefficients(model, features2)

Unnamed: 0,coef
max_heart_rate_achieved,-0.041904
age,-0.055654
oldpeak_eq_st_depression,0.520261
resting_ekg_results,0.232879
serum_cholesterol_mg_per_dl,-0.00173
num_major_vessels,1.030687
resting_blood_pressure,0.054217


In [53]:
# features not included: thal, chest_pain_type
features3 = features2 + [
    'sex', "slope_of_peak_exercise_st_segment", "fasting_blood_sugar_gt_120_mg_per_dl", "exercise_induced_angina"
]

X3 = all_features[features3]

train_X3, test_X3, train_y3, test_y3 = train_test_split(
    X3, val, test_size = 0.6, random_state=1
)

model = LogisticRegression(random_state = 1)
model.fit(train_X3, train_y3)
pred3 = model.predict(test_X3)
logloss = log_loss(test_y3, pred3)
logloss

9.913996883438829

In [54]:
coefficients(model, features3).sort_values("coef")

Unnamed: 0,coef
fasting_blood_sugar_gt_120_mg_per_dl,-0.614819
age,-0.060972
max_heart_rate_achieved,-0.045725
serum_cholesterol_mg_per_dl,0.003565
resting_ekg_results,0.02839
resting_blood_pressure,0.043251
oldpeak_eq_st_depression,0.098412
slope_of_peak_exercise_st_segment,0.487813
num_major_vessels,1.185497
sex,1.461254


In [55]:
features3

['max_heart_rate_achieved',
 'age',
 'oldpeak_eq_st_depression',
 'resting_ekg_results',
 'serum_cholesterol_mg_per_dl',
 'num_major_vessels',
 'resting_blood_pressure',
 'sex',
 'slope_of_peak_exercise_st_segment',
 'fasting_blood_sugar_gt_120_mg_per_dl',
 'exercise_induced_angina']

In [56]:
create_submission(
    "submission4.csv",
    all_features[features3],
    val,
    test_X[features3],
    test_X.index
)

(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),     patient_id  heart_disease_present
 0       bv01fp               0.618963
 1       6r9x2j               0.002281
 2       bthqr4               0.952715
 3       a2kf1z               0.224608
 4       pjgqa3               0.896387
 5       27oevk               0.954863
 6       mcwqgs               0.049414
 7       30v796               0.708710
 8       0g192k               0.119381
 9       w1wgrq               0.657947
 10      s8dx1q               0.744653
 11      7kf275               0.095256
 12      xc17yq               0.873443
 13      e68djo               0.118476
 14      oyt4ek               0.169919
 15      tbo0wx               0.006146
 16      f06u72               0.208862
 17      ju1wdc               0.

In [57]:
# features not included: thal, chest_pain_type, resting_ekg_results
features4 = [feature for feature in features3 if feature != "resting_ekg_results"]

X4 = all_features[features4]

train_X4, test_X4, train_y4, test_y4 = train_test_split(
    X4, val, test_size = 0.6, random_state=1
)

model = LogisticRegression(random_state = 1)
model.fit(train_X4, train_y4)
pred4 = model.predict(test_X4)
logloss = log_loss(test_y4, pred4)
logloss

9.913996883438829

In [58]:
features = [
    "sex", "exercise_induced_angina", "num_major_vessels", "fasting_blood_sugar_gt_120_mg_per_dl"
]

X = all_features[features]

train_X, test_X, train_y, test_y = train_test_split(
    X, val, test_size = 0.6, random_state=1
)

model = LogisticRegression(random_state = 1)
model.fit(train_X, train_y)
pred = model.predict(test_X)
logloss = log_loss(test_y, pred)
logloss

10.873392531493197

In [59]:
coefficients(model, features)

Unnamed: 0,coef
sex,1.129217
exercise_induced_angina,2.051349
num_major_vessels,0.67432
fasting_blood_sugar_gt_120_mg_per_dl,-0.677821


In [60]:
X = pd.get_dummies(all_features, columns=["thal", "chest_pain_type"], drop_first=True)

train_X, test_X, train_y, test_y = train_test_split(
    X, val, test_size = 0.6, random_state=1
)

model = LogisticRegression(random_state = 1)
model.fit(train_X, train_y)
pred = model.predict(test_X)
logloss = log_loss(test_y, pred)
logloss

8.954571620664824

In [61]:
coefficients(model, X.columns.tolist())

Unnamed: 0,coef
slope_of_peak_exercise_st_segment,0.457783
resting_blood_pressure,0.048374
num_major_vessels,1.091505
fasting_blood_sugar_gt_120_mg_per_dl,-0.393244
resting_ekg_results,0.313188
serum_cholesterol_mg_per_dl,0.001332
oldpeak_eq_st_depression,-0.0084
sex,0.715752
age,-0.058416
max_heart_rate_achieved,-0.047832


In [68]:
submit_X_transform = pd.get_dummies(submit_X, columns=["thal", "chest_pain_type"], drop_first=True)

m,s = create_submission(
    "submission5.csv",
    X,
    val,
    submit_X_transform,
    submit_X_transform.index
)