In [1]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
train_values = pd.read_csv('train_values.csv')
train_labels = pd.read_csv('train_labels.csv')
test_values = pd.read_csv('test_values.csv')

In [3]:
train_values.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0
1,ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0
2,yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1
3,l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0
4,oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0


In [4]:
X = train_values.drop(labels='patient_id',axis=1)
X_test = test_values.drop(labels='patient_id',axis=1)
X.head()
X_test.head()

Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,1,normal,128,2,0,0,2,308,0.0,1,45,170,0
1,2,normal,110,3,0,0,0,214,1.6,0,54,158,0
2,1,normal,125,4,3,0,2,304,0.0,1,77,162,1
3,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0
4,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0


Unnamed: 0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,2,reversible_defect,170,1,0,0,2,288,0.2,1,59,159,0
1,1,normal,138,4,0,0,0,183,1.4,0,35,182,0
2,2,reversible_defect,120,4,0,0,2,177,2.5,1,43,120,1
3,1,normal,102,3,1,0,0,318,0.0,0,60,160,0
4,2,normal,138,4,1,0,2,166,3.6,1,61,125,1


In [5]:
train_labels.head()

Unnamed: 0,patient_id,heart_disease_present
0,0z64un,0
1,ryoo3j,0
2,yt1s1x,1
3,l2xjde,1
4,oyt4ek,0


In [6]:
y = train_labels.drop(labels='patient_id',axis=1)
y = np.array(y).flatten()
y

array([0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0], dtype=int64)

#  Create Cross-Validation split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_CV, y_train, y_CV = train_test_split(X, y, test_size=0, random_state=2)

In [9]:
X_train.shape
y_train.shape

(180, 13)

(180,)

#  Encode with pandas get_dummies method
---

In [10]:
# Get dummies
X_train = pd.get_dummies(X_train, prefix_sep='_', drop_first=True)
X_test = pd.get_dummies(X_test, prefix_sep='_', drop_first=True)
try:
    X_CV = pd.get_dummies(X_CV, prefix_sep='_', drop_first=True)
except:
    pass  # when test_size=0

In [11]:
X_train.head()
X_test.head()
X_CV.head()

Unnamed: 0,slope_of_peak_exercise_st_segment,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,thal_normal,thal_reversible_defect
12,1,140,3,0,0,0,335,0.0,1,64,158,0,1,0
23,1,160,4,0,0,2,228,2.3,1,66,138,0,0,0
179,1,160,3,1,0,0,201,0.0,0,54,163,0,1,0
25,2,120,3,0,0,2,211,1.5,0,68,115,0,1,0
35,1,122,4,0,0,2,222,0.0,1,48,186,0,1,0


Unnamed: 0,slope_of_peak_exercise_st_segment,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,thal_normal,thal_reversible_defect
0,2,170,1,0,0,2,288,0.2,1,59,159,0,0,1
1,1,138,4,0,0,0,183,1.4,0,35,182,0,1,0
2,2,120,4,0,0,2,177,2.5,1,43,120,1,0,1
3,1,102,3,1,0,0,318,0.0,0,60,160,0,1,0
4,2,138,4,1,0,2,166,3.6,1,61,125,1,1,0


Unnamed: 0,slope_of_peak_exercise_st_segment,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina


In [12]:
X_train.dtypes

slope_of_peak_exercise_st_segment         int64
resting_blood_pressure                    int64
chest_pain_type                           int64
num_major_vessels                         int64
fasting_blood_sugar_gt_120_mg_per_dl      int64
resting_ekg_results                       int64
serum_cholesterol_mg_per_dl               int64
oldpeak_eq_st_depression                float64
sex                                       int64
age                                       int64
max_heart_rate_achieved                   int64
exercise_induced_angina                   int64
thal_normal                               uint8
thal_reversible_defect                    uint8
dtype: object

In [13]:
# #  Remove the lowest importance feature, doesn't do much here...
# X_train.drop(columns=['sex','fasting_blood_sugar_gt_120_mg_per_dl'],inplace=True)
# X_CV.drop(columns=['sex','fasting_blood_sugar_gt_120_mg_per_dl'],inplace=True)
# X_test.drop(columns=['sex','fasting_blood_sugar_gt_120_mg_per_dl'],inplace=True)

#  Feature Scaling 1 - scale

In [14]:
from sklearn import preprocessing
X_train_scaled = preprocessing.scale(X_train)
try:
    X_CV_scaled = preprocessing.scale(X_CV)
except:
    pass  #  when test_size = 0
X_test_scaled = preprocessing.scale(X_test)

# Feature Scaling 2 - fit/transform

In [15]:
# from sklearn.preprocessing import StandardScaler

# ss_train = StandardScaler().fit(X_train)
# X_train_scaled = ss_train.transform(X_train)

# try:
#     ss_CV = StandardScaler().fit(X_CV)
#     X_CV_scaled = ss_CV.transform(X_CV)
# except:
#     pass  #  when test_size = 0

# ss_test = StandardScaler().fit(X_test)
# X_test_scaled = ss_test.transform(X_test)

#  Feature Importance

In [16]:
# Feature Importance with Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier
# load data
# feature extraction
model = ExtraTreesClassifier()
model.fit(X_train, y_train)
print(model.feature_importances_)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

[ 0.0626381   0.07346254  0.1003064   0.09506403  0.02441465  0.03531336
  0.05924447  0.07128409  0.0376175   0.05967326  0.08949315  0.07130074
  0.15095552  0.06923219]


In [17]:
importance_df = pd.DataFrame()
importance_df['Feature']=X_train.columns
importance_df['Importance']=model.feature_importances_
importance_df.sort_values(by='Importance',ascending=False)

Unnamed: 0,Feature,Importance
12,thal_normal,0.150956
2,chest_pain_type,0.100306
3,num_major_vessels,0.095064
10,max_heart_rate_achieved,0.089493
1,resting_blood_pressure,0.073463
11,exercise_induced_angina,0.071301
7,oldpeak_eq_st_depression,0.071284
13,thal_reversible_defect,0.069232
0,slope_of_peak_exercise_st_segment,0.062638
9,age,0.059673


#  Hyper Parameter Optimization
---

# Method 1

In [18]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestClassifier

# # Create logistic regression
# logistic = LogisticRegression()
# RFC = RandomForestClassifier()

# #  Logistic Regression
# # Create regularization penalty space
# penalty = ['l1', 'l2','elasticnet']

# # Create regularization hyperparameter space
# C = np.linspace(0.00001,100, 10000)

# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# max_iter = np.linspace(10,1e4, 10000)

# # Create hyperparameter options
# hyperparameters = dict(C=C, penalty=penalty,solver=solver,max_iter=max_iter)

# # Fit grid search
# best_model = logistic.fit(X_CV_scaled, y_CV)

# # Fit grid search
# best_model = logistic.fit(X_CV_scaled, y_CV)

# # View best Logistic Regression hyperparameters
# print('Best Penalty:', best_model.get_params()['penalty'])
# print('Best C:', best_model.get_params()['C'])
# print('Best solver:', best_model.get_params()['solver'])
# print('max_iter:', best_model.get_params()['max_iter'])

#  Random Forest
# # Create regularization hyperparameter space
# n_estimators = [1,2,3,4,5]
# criterion = ['gini', 'entropy']
# max_depth = np.linspace(1,100)

# # Create hyperparameter options
# hyperparameters = dict(n_estimators=n_estimators, criterion=criterion,max_depth=max_depth)

# # Fit grid search
# best_model = RFC.fit(X_CV_scaled, y_CV)

# # View best Logistic Regression hyperparameters
# print('n_estimators:', best_model.get_params()['n_estimators'])
# print('criterion:', best_model.get_params()['criterion'])
# print('max_depth:', best_model.get_params()['max_depth'])

#  Method 2 - GridSearch

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Create logistic regression
logistic = LogisticRegression()
RFC = RandomForestClassifier()

#rf = RFC

# Logistic Regression
rf_params_LR = [{ 
    'penalty': ['l2'],
    'solver': ['newton-cg','lbfgs', 'liblinear','sag', 'saga'],  
    'C': [0.05,.075,0.1,0.125,0.15,0.2,0.3]},
    #'tol':[0.0001,1e-5]}
    {'penalty': ['l1'],
    'solver': ['liblinear','saga'],  
    'C': [0.05,.075,0.1,0.125,0.15,0.2,0.3]#,
    #'tol':[0.0001,1e-5,1e-6]
    }
]

# Random Forest
rf_params_RF = [{ 
    #'n_estimators': [100,200],
    'max_depth': [1,2,3,4,5,6,7,8,9,10,11,12,13],  
    'criterion': ['gini'],#'entropy'],
    'max_features':[1,2,4,5,6,7,8,9,10]
}]

gs_random = GridSearchCV(estimator=logistic, param_grid=rf_params_LR,scoring='neg_log_loss',cv= 5)
gs_random.fit(X_train_scaled, y_train)
print(gs_random.best_params_)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'C': [0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.3]}, {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': [0.05, 0.075, 0.1, 0.125, 0.15, 0.2, 0.3]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}


#  Cross Validation to find optional Hyperparameters
---

In [20]:
gs_random.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}

In [21]:
try:
    gs_random.best_params_['criterion']
    gs_random.best_params_['max_depth']
    gs_random.best_params_['max_features']
except:
    pass

# Fitting using Machine Learning Models
---
*  Logistic Regression
*  Random Forest

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

#  Fit with Logistic Regression
#clf = LogisticRegression(random_state=1,tol=1e-6,penalty='l2',solver='liblinear',C=0.1,max_iter=100,n_jobs=-1).fit(X_train_scaled, y_train)
#clf = RandomForestClassifier(n_estimators=100,random_state=0,criterion='gini',max_depth=12,max_features=4).fit(X_train_scaled, y_train)

#  GridSearchCV Solutions

# clf = RandomForestClassifier(n_estimators=500,random_state=0,
#                              criterion=gs_random.best_params_['criterion'],
#                              max_depth=gs_random.best_params_['max_depth'],
#                              max_features=gs_random.best_params_['max_features']).fit(X_train_scaled, y_train)

clf = LogisticRegression(random_state=1,
                         penalty=gs_random.best_params_['penalty'],
                         solver=gs_random.best_params_['solver'],
                         C=gs_random.best_params_['C'],
                         max_iter=100).fit(X_train_scaled, y_train)

clf.predict(X_test_scaled)


#  probability of heart disease
probs = clf.predict_proba(X_test_scaled)
probs[:5]

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0], dtype=int64)

array([[ 0.62667537,  0.37332463],
       [ 0.90279265,  0.09720735],
       [ 0.09812816,  0.90187184],
       [ 0.89006393,  0.10993607],
       [ 0.15302052,  0.84697948]])

In [23]:
# probability of heart disease (y=1)
probs[:,1]

array([ 0.37332463,  0.09720735,  0.90187184,  0.10993607,  0.84697948,
        0.07754288,  0.13201298,  0.94224372,  0.20566627,  0.17904115,
        0.40631166,  0.63629164,  0.35837056,  0.9673208 ,  0.13960439,
        0.04393036,  0.04158402,  0.05974882,  0.90901041,  0.10148332,
        0.93983639,  0.21220125,  0.16100084,  0.02745936,  0.41751018,
        0.89199187,  0.46754253,  0.1793906 ,  0.62133956,  0.06847906,
        0.84231331,  0.27341058,  0.69054281,  0.38685667,  0.13741821,
        0.08739883,  0.26829327,  0.37234404,  0.18954841,  0.08151173,
        0.94911577,  0.1212183 ,  0.85935042,  0.06191929,  0.90019558,
        0.14528152,  0.11666699,  0.22048805,  0.24176366,  0.7663812 ,
        0.68622651,  0.10499743,  0.97617897,  0.13611764,  0.48080851,
        0.07718964,  0.91908236,  0.17279843,  0.35536305,  0.65972109,
        0.08813295,  0.90519429,  0.20153447,  0.93590548,  0.09229203,
        0.79339676,  0.70774799,  0.58857031,  0.74341828,  0.71

#  Accuracy

In [24]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_train_scaled)
y_true = y_train
accuracy_score(y_true, y_pred)

0.84444444444444444

In [25]:
try:
    y_pred = clf.predict(X_CV_scaled)
    y_true = y_CV
    accuracy_score(y_true, y_pred)
except:
    pass # when test_size=0

# Log Loss

In [26]:
from sklearn.metrics import log_loss

try:
    print('CV Log-Loss: ',log_loss(y_CV, clf.predict_proba(X_CV_scaled)[:,1]))
except:
    pass
print('Train Log-Loss: ',log_loss(y_train, clf.predict_proba(X_train_scaled)[:,1]))

Train Log-Loss:  0.363882746654


#  Output final probability data

In [27]:
output = pd.DataFrame()

In [28]:
output['patient_id']=test_values['patient_id']
output['heart_disease_present']=probs[:,1]
output

Unnamed: 0,patient_id,heart_disease_present
0,olalu7,0.373325
1,z9n6mx,0.097207
2,5k4413,0.901872
3,mrg7q5,0.109936
4,uki4do,0.846979
5,kev1sk,0.077543
6,9n6let,0.132013
7,jxmtyg,0.942244
8,51s2ff,0.205666
9,wi9mcs,0.179041


In [29]:
output.to_csv('prediction_result.csv',index=False)

#  Ensemble with Random States
---
* Doesn't seem to add value here

In [30]:
def Rand_State_Ensemble(num):
    '''
    Output to multiple prediction files, each with a unique random state
    '''
    for i in range(num):
        X_train, X_CV, y_train, y_CV = train_test_split(X, y, test_size=0.25, random_state=i)
        
        X_test = test_values.drop(labels='patient_id',axis=1)
        
        # Get dummies Encoding
        X_train = pd.get_dummies(X_train, prefix_sep='_', drop_first=True)
        X_test = pd.get_dummies(X_test, prefix_sep='_', drop_first=True)
        X_CV = pd.get_dummies(X_CV, prefix_sep='_', drop_first=True)
        
        #  Feature Scaling
        X_train_scaled = preprocessing.scale(X_train)
        X_CV_scaled = preprocessing.scale(X_CV)
        X_test_scaled = preprocessing.scale(X_test)
        
        #clf = LogisticRegression(random_state=i,penalty='l2',solver='liblinear',C=0.1,max_iter=100).fit(X_train_scaled, y_train)
        #clf = RandomForestClassifier(n_estimators=100,random_state=i,criterion='gini',max_depth=11,max_features=1).fit(X_train_scaled, y_train)
        clf = RandomForestClassifier(n_estimators=1000,random_state=i,
                             criterion=gs_random.best_params_['criterion'],
                             max_depth=gs_random.best_params_['max_depth'],
                             max_features=gs_random.best_params_['max_features']).fit(X_train_scaled, y_train)
        probs = clf.predict_proba(X_test_scaled)
        output = pd.DataFrame()
        output['patient_id']=test_values['patient_id']
        output['heart_disease_present']=probs[:,1]
        output
        
        output.to_csv('Random_States\prediction_result_{}.csv'.format(i),index=False)

In [31]:
num = 3
Rand_State_Ensemble(num)

KeyError: 'criterion'

In [None]:
Rand_State_DF = pd.DataFrame()
filenames = []

In [None]:
for i in range(num):
    file = pd.read_csv('Random_States\prediction_result_{}.csv'.format(i))
    filenames.append(file)
df_rand_avg = pd.concat(filenames,ignore_index=True)

In [None]:
avg_df = df_rand_avg.groupby('patient_id').mean()
avg_df.reset_index(inplace=True)

In [None]:
avg_df_sorted =pd.DataFrame()
avg_df_sorted['patient_id']=output['patient_id']
avg_df_sorted.head()

In [None]:
float(avg_df[avg_df_sorted['patient_id'][0]==avg_df['patient_id']]['heart_disease_present'])

In [None]:
li = []
for i in range(len(avg_df)):
    val = float(avg_df[avg_df_sorted['patient_id'][i]==avg_df['patient_id']]['heart_disease_present'])
    li.append(val)
avg_df_sorted['heart_disease_present']=li

In [None]:
avg_df_sorted

In [None]:
avg_df_sorted.to_csv('Random_States\prediction_result.csv',index=False)