# Logistic Regression
In this Jupyter Notebook a Logistic Regression model is implemented. The data cleaning from the previous notebook is reused, and the data is then split into training/validation/test sets for fitting the Logistic Regression model. The following steps are developed in this work:

1. Load and clean data.
2. Split data into training/validation/test data
3. Use a `DictVectorizer` to turn data into matrix for training
4. Develop Logistic Regression Model <br>
    4.1 Simple Logistic Regression with default parameters <br>
    4.2 Logistic Regression with Regularization <br>
    4.3 Logistic Regression with K-Fold Cross Validation <br>
    4.4 K-fold and cross validation on logistic regression, C-value tuning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# 1. Load and clean data

In [3]:
# Read the cleaned heart-disease CSV and lower-case the column names.
df = pd.read_csv('../datasets/heart_disease/heart_2020_cleaned.csv')
df.columns = df.columns.str.lower()

# Split the column names by dtype: float columns are treated as numerical,
# object columns as categorical. The target is taken out of the categorical list.
column_dtypes = df.dtypes
numerical = column_dtypes[column_dtypes == 'float'].index.tolist()
categorical = column_dtypes[column_dtypes == 'object'].index.tolist()
categorical.remove('heartdisease')

# Lower-case every string value: the categorical features first, then the target.
for column in categorical + ['heartdisease']:
    df[column] = df[column].str.lower()

# 2. Split data into training/validation/test data

In [4]:
# Hold out 20% of the rows for testing, then 25% of the remainder for validation
# (i.e. a 60/20/20 train/validation/test split of the original frame).
df_full_train, df_test = train_test_split(df, test_size=0.20, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Binary targets: 1 where heartdisease == 'yes', 0 otherwise.
y_train, y_val, y_test = [
    (part['heartdisease'] == 'yes').astype('int').values
    for part in (df_train, df_val, df_test)
]

# Feature frames: target column removed and the row index renumbered from 0.
df_train, df_val, df_test = [
    part.drop(columns='heartdisease').reset_index(drop=True)
    for part in (df_train, df_val, df_test)
]

# df_full_train keeps its (string) target column; only the index is renumbered.
df_full_train = df_full_train.reset_index(drop=True)

# 3. Dict Vectorizer

In [5]:
dv = DictVectorizer(sparse=False)

# The vectorizer is fitted on the training records only; validation and test
# records are transformed with that already-fitted vectorizer.
train_records = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_records)
X_val, X_test = [
    dv.transform(part.to_dict(orient='records'))
    for part in (df_val, df_test)
]

# 4. Develop Logistic Regression Model

## 4.1 Simple Logistic Regression with default parameters

In [10]:
# Baseline model: default regularization, generous iteration budget.
lr = LogisticRegression(random_state=1, max_iter=10000).fit(X_train, y_train)
y_pred = lr.predict_proba(X_val)

# Validation ROC AUC, scored on the second probability column.
roc_auc_score(y_val, y_pred[:, 1])

0.8405325039916198

In [11]:
# Pair every vectorizer feature name with its fitted coefficient.
df_logistic_regression = pd.DataFrame({
    'feature': dv.feature_names_,
    'coefficient': lr.coef_[0],
})

# Largest positive coefficients first.
df_logistic_regression.sort_values(by='coefficient', ascending=False)

Unnamed: 0,feature,coefficient
12,agecategory=80 or older,1.577492
11,agecategory=75-79,1.316811
10,agecategory=70-74,1.138805
9,agecategory=65-69,0.83223
27,genhealth=poor,0.82963
8,agecategory=60-64,0.596847
25,genhealth=fair,0.461774
49,stroke=yes,0.372306
7,agecategory=55-59,0.325866
20,diabetic=yes,0.187098


## 4.2 Logistic Regression with Regularization

In [15]:
def train_lr(X_train, y_train, C=1.0):
    """Fit a logistic regression on the given training data.

    Parameters
    ----------
    X_train : feature matrix passed to ``LogisticRegression.fit``.
    y_train : target values passed to ``LogisticRegression.fit``.
    C : value forwarded to ``LogisticRegression`` as its ``C`` argument.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    model = LogisticRegression(C=C, max_iter=10000, random_state=1)
    model.fit(X_train, y_train)
    return model

def predict_lr(X_val, lr):
    """Return the second column of ``lr.predict_proba(X_val)``.

    Parameters
    ----------
    X_val : feature matrix to score.
    lr : fitted model exposing ``predict_proba``.

    Returns
    -------
    One score per row of ``X_val`` (column index 1 of the probability matrix).
    """
    return lr.predict_proba(X_val)[:, 1]

def process_training_data(df_train):
    """Fit a dense ``DictVectorizer`` on a training frame and encode it.

    Parameters
    ----------
    df_train : DataFrame of features; each row is turned into a dict record.

    Returns
    -------
    (dv, X_train) : the fitted vectorizer and the encoded training matrix.
    """
    records = df_train.to_dict(orient='records')
    vectorizer = DictVectorizer(sparse=False)
    encoded = vectorizer.fit_transform(records)
    return vectorizer, encoded

def process_inference_data(df_val, dv):
    """Encode a feature frame with an already-fitted vectorizer.

    Parameters
    ----------
    df_val : DataFrame of features; each row is turned into a dict record.
    dv : fitted vectorizer exposing ``transform``.

    Returns
    -------
    Whatever ``dv.transform`` returns for the row records of ``df_val``.
    """
    records = df_val.to_dict(orient='records')
    return dv.transform(records)

In [16]:
# Same train/validate pipeline as section 4.1, now routed through the helpers.
dv, X_train = process_training_data(df_train=df_train)
X_val = process_inference_data(df_val=df_val, dv=dv)
lr = train_lr(X_train=X_train, y_train=y_train)
y_pred = predict_lr(X_val=X_val, lr=lr)

In [17]:
# Validation ROC AUC of the helper-based pipeline.
roc_auc_score(y_true=y_val, y_score=y_pred)

0.8405325039916198

## 4.3 Logistic Regression with K-Fold Cross Validation

In [18]:
# Ten shuffled folds with a fixed seed.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

In [19]:
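# The 0/1 conversion of the full-train target is left disabled: the K-fold cells
# below read the 'yes'/'no' strings straight from df_full_train['heartdisease'].
#df_full_train['heartdisease'] = (df_full_train['heartdisease'] == 'yes').astype('int')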

In [20]:
# Encode the target once as 0/1 (1 = 'yes'), matching the hold-out split in
# section 2, instead of handing raw 'yes'/'no' strings to the model and metric.
# Splitting the target off here also avoids repeating the drop twice per fold.
y_full_train = (df_full_train['heartdisease'] == 'yes').astype('int').values
df_full_train_features = df_full_train.drop(columns='heartdisease')

aucs = []

# kfold.split() is a generator with no length, so tqdm is given the fold count
# explicitly; otherwise the bar only shows "0it" with no total.
fold_iterator = tqdm(kfold.split(df_full_train_features), total=kfold.get_n_splits())

for train_idx, val_idx in fold_iterator:
    # Positional row selection for this fold's training and validation parts.
    df_train_kfold = df_full_train_features.iloc[train_idx]
    y_train_kfold = y_full_train[train_idx]

    df_val_kfold = df_full_train_features.iloc[val_idx]
    y_val_kfold = y_full_train[val_idx]

    # The vectorizer is re-fitted inside each fold on that fold's training rows.
    dv_kfold, X_train_kfold = process_training_data(df_train_kfold)
    X_val_kfold = process_inference_data(df_val_kfold, dv_kfold)
    lr_kfold = train_lr(X_train_kfold, y_train_kfold)
    y_pred_kfold = predict_lr(X_val_kfold, lr_kfold)

    rocauc = roc_auc_score(y_val_kfold, y_pred_kfold)
    aucs.append(rocauc)

0it [00:00, ?it/s]

In [21]:
# Turn the per-fold scores into an array so .mean() / .std() are available.
aucs = np.array(
    aucs,
)

In [22]:
# Summarise the cross-validated AUC: average and spread over the folds.
mean_auc, std_auc = aucs.mean(), aucs.std()
print('mean auc = {:.3f}'.format(mean_auc))
print('std auc = {:.3f}'.format(std_auc))

mean auc = 0.840
std auc = 0.003



## 4.4 K-fold and cross validation on logistic regression, C-value tuning

In [38]:
# The splitter does not depend on C, so it is built once outside the loop.
# With an integer random_state each split() call yields the same folds, so
# every C value is scored on identical train/validation partitions.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Encode the target once as 0/1 (1 = 'yes'), matching the hold-out split in
# section 2, and split it off the feature frame a single time.
y_full_train = (df_full_train['heartdisease'] == 'yes').astype('int').values
df_full_train_features = df_full_train.drop(columns='heartdisease')

for C in tqdm([0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000]):

    aucs = []

    # kfold.split() is a generator with no length, so tqdm is given the fold
    # count explicitly; otherwise the bar only shows "0it" with no total.
    fold_iterator = tqdm(kfold.split(df_full_train_features), total=kfold.get_n_splits())

    for train_idx, val_idx in fold_iterator:
        # Positional row selection for this fold's training and validation parts.
        df_train_kfold = df_full_train_features.iloc[train_idx]
        y_train_kfold = y_full_train[train_idx]

        df_val_kfold = df_full_train_features.iloc[val_idx]
        y_val_kfold = y_full_train[val_idx]

        # The vectorizer is re-fitted inside each fold on that fold's training rows.
        dv_kfold, X_train_kfold = process_training_data(df_train_kfold)
        X_val_kfold = process_inference_data(df_val_kfold, dv_kfold)
        lr_kfold = train_lr(X_train_kfold, y_train_kfold, C)
        y_pred_kfold = predict_lr(X_val_kfold, lr_kfold)

        rocauc = roc_auc_score(y_val_kfold, y_pred_kfold)
        aucs.append(rocauc)
    # One summary line per C: mean and standard deviation of the fold AUCs.
    print('C=%6s, auc = %f ± %f' % (C, np.mean(aucs), np.std(aucs)))

  0%|          | 0/8 [00:00<?, ?it/s]

0it [00:00, ?it/s]

C= 0.001, auc = 0.830919 ± 0.002894


0it [00:00, ?it/s]

C=  0.01, auc = 0.839898 ± 0.002247


0it [00:00, ?it/s]

C=   0.1, auc = 0.840014 ± 0.002187


0it [00:00, ?it/s]

C=   0.5, auc = 0.839926 ± 0.002188


0it [00:00, ?it/s]

C=     1, auc = 0.839915 ± 0.002182


0it [00:00, ?it/s]

C=    10, auc = 0.839907 ± 0.002188


0it [00:00, ?it/s]

C=   100, auc = 0.839903 ± 0.002186


0it [00:00, ?it/s]

C=  1000, auc = 0.839908 ± 0.002186
