In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from pprint import pprint

In [3]:
# Load the dataset
columns = ["age", "sex", "cp", "restbp", "chol", "fbs", "restecg", 
           "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"]
df0     = pd.read_table("data/heart_disease_all14.csv", sep=',', header=None, names=columns)


In [4]:

def get_clean_and_standardize_data_frame(df0):
    # Convert categorical variables with more than two values into dummy variables.
    # Note that variable ca is discrete but not categorical, so we don't convert it.
    df      = df0.copy()
    dummies = pd.get_dummies(df["cp"],prefix="cp")
    df      = df.join(dummies)
    del df["cp"]
    del df["cp_4.0"]
    df      = df.rename(columns = {"cp_1.0":"cp_1","cp_2.0":"cp_2","cp_3.0":"cp_3"})

    dummies = pd.get_dummies(df["restecg"],prefix="recg")
    df      = df.join(dummies)
    del df["restecg"]
    del df["recg_0.0"]
    df      = df.rename(columns = {"recg_1.0":"recg_1","recg_2.0":"recg_2"})

    dummies = pd.get_dummies(df["slope"],prefix="slope")
    df      = df.join(dummies)
    del df["slope"]
    del df["slope_2.0"]
    df      = df.rename(columns = {"slope_1.0":"slope_1","slope_3.0":"slope_3"})

    dummies = pd.get_dummies(df["thal"],prefix="thal")
    df      = df.join(dummies)
    del df["thal"]
    del df["thal_3.0"]
    df      = df.rename(columns = {"thal_6.0":"thal_6","thal_7.0":"thal_7"})

    # Replace response variable values and rename
    df["num"].replace(to_replace=[1,2,3,4],value=1,inplace=True)
    df      = df.rename(columns = {"num":"hd"})

    # New list of column labels after the above operations
    new_columns_1 = ["age", "sex", "restbp", "chol", "fbs", "thalach", 
                     "exang", "oldpeak", "ca", "hd", "cp_1", "cp_2",
                     "cp_3", "recg_1", "recg_2", "slope_1", "slope_3",
                     "thal_6", "thal_7"]

    print('\nNumber of patients in dataframe: %i, with disease: %i, without disease: %i\n' \
          % (len(df.index),len(df[df.hd==1].index),len(df[df.hd==0].index)))
    print(df.head())
    print(df.describe())

    # Standardize the dataframe
    stdcols = ["age","restbp","chol","thalach","oldpeak"]
    nrmcols = ["ca"]
    stddf   = df.copy()
    stddf[stdcols] = stddf[stdcols].apply(lambda x: (x-x.mean())/x.std())
    stddf[nrmcols] = stddf[nrmcols].apply(lambda x: (x-x.mean())/(x.max()-x.min()))

    new_columns_2 = new_columns_1[:9] + new_columns_1[10:]
    new_columns_2.insert(0,new_columns_1[9])
    stddf = stddf.reindex(columns=new_columns_2)
    
    return stddf, new_columns_2


    return (Xall, yall, new_columns_2)
stddf, columns = get_clean_and_standardize_data_frame(df0)


# Convert dataframe into lists for use by classifiers
yall = stddf["hd"]
Xall = stddf[columns[1:]].values



Number of patients in dataframe: 299, with disease: 139, without disease: 160

    age  sex  restbp   chol  fbs  thalach  exang  oldpeak   ca   hd  cp_1  \
0  63.0  1.0   145.0  233.0  1.0    150.0    0.0      2.3  0.0  0.0     1   
1  67.0  1.0   160.0  286.0  0.0    108.0    1.0      1.5  3.0  1.0     0   
2  67.0  1.0   120.0  229.0  0.0    129.0    1.0      2.6  2.0  1.0     0   
3  37.0  1.0   130.0  250.0  0.0    187.0    0.0      3.5  0.0  0.0     0   
4  41.0  0.0   130.0  204.0  0.0    172.0    0.0      1.4  0.0  0.0     0   

   cp_2  cp_3  recg_1  recg_2  slope_1  slope_3  thal_6  thal_7  
0     0     0       0       1        0        1       1       0  
1     0     0       0       1        0        0       0       0  
2     0     0       0       1        0        0       0       1  
3     0     1       0       0        0        1       0       0  
4     1     0       0       1        1        0       0       0  
              age        sex      restbp        chol         

# Fit on the entire dataset

In [5]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics

def fit_logistic_model(Xall, yall):
    model = LogisticRegression(fit_intercept=True)
    print(model)
    lrfit = model.fit(Xall,yall)
    print('\nLogisticRegression score on full data set: %f\n' % lrfit.score(Xall,yall))
    ypred = model.predict(Xall)
    print('\nClassification report on full data set:')
    print(metrics.classification_report(yall,ypred))
    print('\nConfusion matrix:')
    print(metrics.confusion_matrix(yall,ypred))
    print('\nLogisticRegression coefficients:')
    coeff = model.coef_.tolist()[0]
    for index in range(len(coeff)):
        print('%s : %8.5f' % (columns[index+1].rjust(9),coeff[index]))
    print('Intercept : %f' %model.intercept_)
    
fit_logistic_model(Xall, yall)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression score on full data set: 0.876254


Classification report on full data set:
             precision    recall  f1-score   support

        0.0       0.86      0.93      0.89       160
        1.0       0.90      0.82      0.86       139

avg / total       0.88      0.88      0.88       299


Confusion matrix:
[[148  12]
 [ 25 114]]

LogisticRegression coefficients:
      age : -0.03697
      sex :  1.07496
   restbp :  0.31269
     chol :  0.16268
      fbs : -0.34533
  thalach : -0.43907
    exang :  0.65986
  oldpeak :  0.44011
       ca :  2.42265
     cp_1 : -1.33059
     cp_2 : -0.65864
     cp_3 : -1.37985
   recg_1 :  0.11286
   recg_2 :  0.37125
  slope_1 : -0.76523
  slope_3 : -0.42006
   thal_6 :  

In [6]:
'''
Fit using only the age and thalach features, separately for ca=0 and ca>0.
'''
import pandas as pd
import statsmodels.api as sm

dropped_columns  = [2,3,4,5,7,8,9,10,11,12,13,14,15,16,17,18] # Drop all features except 1 (age) and 6 (thalach).

df1        = stddf.copy()
df1        = df1[df1.ca<0]
df1        = df1.drop(df1.columns[dropped_columns],axis=1) 
X_all_2_cols = df1[df1.columns[1:]]
yall_2_cols   = df1[df1.columns[0]]

fit_logistic_model(X_all_2_cols, yall_2_cols)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

LogisticRegression score on full data set: 0.767045


Classification report on full data set:
             precision    recall  f1-score   support

        0.0       0.79      0.92      0.85       129
        1.0       0.62      0.34      0.44        47

avg / total       0.75      0.77      0.74       176


Confusion matrix:
[[119  10]
 [ 31  16]]

LogisticRegression coefficients:
      age : -0.20579
      sex : -1.08938
Intercept : -0.948295


  from pandas.core import datetools
