# Modelling

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
from sklearn.metrics import accuracy_score, log_loss,f1_score,roc_curve,roc_auc_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [232]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [151]:
# Read the companion "*_pca" tables (presumably PCA-transformed features,
# judging by the file names -- confirm against the notebook that wrote them).
train_pca, test_pca = (
    pd.read_csv(csv_name) for csv_name in ('train_pca.csv', 'test_pca.csv')
)

In [212]:
# The last column of `train` is taken as the target; every column before it
# becomes a feature.
X, y = train.iloc[:, :-1], train.iloc[:, -1]
X.columns

Index(['slope_of_peak_exercise_st_segment', 'resting_blood_pressure',
       'chest_pain_type', 'num_major_vessels',
       'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results',
       'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'sex', 'age',
       'max_heart_rate_achieved', 'exercise_induced_angina', 'thal_normal',
       'thal_reversible_defect'],
      dtype='object')

In [161]:
# Keep an independent (deep) copy of the test frame; `test` itself is untouched.
X_t = test.copy(deep=True)
X_t.columns

Index(['slope_of_peak_exercise_st_segment', 'resting_blood_pressure',
       'chest_pain_type', 'num_major_vessels',
       'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results',
       'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'sex', 'age',
       'max_heart_rate_achieved', 'exercise_induced_angina', 'thal_normal',
       'thal_reversible_defect'],
      dtype='object')

In [222]:
# One-hot encode the categorical predictors, dropping the first level of each
# to avoid perfectly collinear dummy columns.
# The frame is rebuilt from `train` (all columns but the last, exactly how `X`
# was first derived) instead of from `X` itself: the old form overwrote its own
# input, so running the cell a second time failed because `patient_id` and the
# categorical columns no longer existed.
X = pd.get_dummies(
    train.iloc[:, :-1].drop(columns=['patient_id']),
    columns=[
        'thal',
        'slope_of_peak_exercise_st_segment',
        'resting_ekg_results',
        'chest_pain_type',
        'num_major_vessels',
    ],
    drop_first=True,
)

In [234]:
# Number of feature columns in the encoded design matrix.
X.shape[1]

20

## Baseline model

(Considering all the features)

In [126]:
# Predicted, Actual
def metrics_summary(train_pred, y, train_proba=None):
    """Print log loss, accuracy and a confusion matrix for a set of predictions.

    Parameters
    ----------
    train_pred : array-like
        Predicted class labels, one per sample.
    y : array-like
        True class labels, in the same order as ``train_pred``.
    train_proba : array-like, optional
        Predicted probabilities (e.g. the output of ``predict_proba``). When
        given, the log loss is computed from these. When omitted, the log loss
        falls back to the hard labels in ``train_pred`` exactly as before; that
        number is not a meaningful probabilistic score, because log loss is
        defined on probabilities rather than on 0/1 decisions.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Prefer real probabilities for the log loss; keep the old behaviour otherwise.
    loss_input = train_pred if train_proba is None else train_proba
    print("Log Loss:", log_loss(y, loss_input))
    print()
    print("Accuracy:", accuracy_score(y, train_pred))
    print()

    # Confusion matrix: rows are predicted labels, columns are actual labels.
    cross = pd.crosstab(train_pred, y, rownames=['Predicted'], colnames=['Actual'])
    print(cross)

#### Logistic regression

In [229]:
# Baseline: L2-penalised logistic regression on all encoded features.
# (Author's note: highest with normal features.)
clf = LogisticRegression(
    penalty='l2',
    solver="lbfgs",
    max_iter=1500,
    random_state=0,
)
clf.fit(X, y)

# Predictions are made on the same rows the model was fitted on, so the
# figures printed below are training-set metrics.
train_pred = clf.predict(X)
metrics_summary(train_pred, y)

Log Loss: 4.41333251698471

Accuracy: 0.8722222222222222

Actual      0   1
Predicted        
0          90  13
1          10  67


In [156]:
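# With PCA features (experiment disabled; the output shown below is presumably left over from an earlier run of this cell)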
# clf = LogisticRegression(random_state=0).fit(X, y)
# train_pred = clf.predict(X)

# metrics_summary(train_pred,y)

Log Loss: 11.321149986878083

Accuracy: 0.6722222222222223

Actual      0   1
Predicted        
0          76  35
1          24  45




## Cross validation strategy

In [225]:
# Stratified 5-fold cross-validation of the logistic regression model.
# A fresh model is fitted on each training split and scored (accuracy) on the
# held-out split; the per-fold scores are collected in `scores`.
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
scores = []
for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    # kf.split yields *positional* row indices, so rows are selected with
    # .iloc. Label-based selection (.loc / y[...]) only gives the same rows
    # when the frame happens to carry a default 0..n-1 index.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]
    model = LogisticRegression(random_state=0, solver="lbfgs", max_iter=2500)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    scores.append(score)
    print('accuracy_score', score)

print('Mean score', np.mean(scores))


1 of kfold 5
accuracy_score 0.75

2 of kfold 5
accuracy_score 0.75

3 of kfold 5
accuracy_score 0.8333333333333334

4 of kfold 5
accuracy_score 0.8333333333333334

5 of kfold 5
accuracy_score 0.7777777777777778
Mean score 0.788888888888889


## Hyperparameter tuning

In [226]:
from sklearn.model_selection import GridSearchCV

# Grid-search the inverse regularisation strength C of an L2-penalised
# logistic regression, scored by accuracy over 5 cross-validation folds.
# Every other entry in the grid is a single fixed value.
gsc = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid={
        'C': [0.001, 0.01, 0.1, 0.2, 0.3, 0.35, 0.36, 0.37, 0.4, 0.5,
              0.8, 0.9, 1, 2, 10, 100, 1000],
        'random_state': [0],
        'penalty': ['l2'],
        'solver': ['lbfgs'],
        # 'solver': ["lbfgs", "newton-cg", "liblinear", "sag", "saga"],
        "max_iter": [5000],
        "class_weight": ["balanced"],
    },
    scoring='accuracy',
    cv=5,
)

grid_result = gsc.fit(X, y)
print("Best parameters : %s" % grid_result.best_params_)

# With the default refit=True the search has already refitted the winning
# configuration on all of X, y, so that estimator is reused instead of
# building and fitting an identical model a second time.
clf = grid_result.best_estimator_

# Scored on the data the model was fitted on: these are training-set metrics,
# not an estimate of generalisation performance.
train_pred = clf.predict(X)

metrics_summary(train_pred, y)

Best parameters : {'C': 100, 'class_weight': 'balanced', 'max_iter': 5000, 'penalty': 'l2', 'random_state': 0, 'solver': 'lbfgs'}
Log Loss: 4.41333251698471

Accuracy: 0.8722222222222222

Actual      0   1
Predicted        
0          90  13
1          10  67


## Submission

In [233]:
# Preview the one-hot encoded test frame; the result is only displayed, it is
# not assigned to anything.
pd.get_dummies(
    data=test,
    columns=[
        'slope_of_peak_exercise_st_segment',
        'resting_ekg_results',
        'chest_pain_type',
        'num_major_vessels',
    ],
    drop_first=True,
)
# test_preds = clf.predict(test)

Unnamed: 0,resting_blood_pressure,fasting_blood_sugar_gt_120_mg_per_dl,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,thal_normal,thal_reversible_defect,slope_of_peak_exercise_st_segment_2,slope_of_peak_exercise_st_segment_3,resting_ekg_results_1,resting_ekg_results_2,chest_pain_type_2,chest_pain_type_3,chest_pain_type_4,num_major_vessels_1,num_major_vessels_2,num_major_vessels_3
0,170,0,288,0.2,1,59,159,0,0,1,1,0,0,1,0,0,0,0,0,0
1,138,0,183,1.4,0,35,182,0,1,0,0,0,0,0,0,0,1,0,0,0
2,120,0,177,2.5,1,43,120,1,0,1,1,0,0,1,0,0,1,0,0,0
3,102,0,318,0.0,0,60,160,0,1,0,0,0,0,0,0,1,0,1,0,0
4,138,0,166,3.6,1,61,125,1,1,0,1,0,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,120,0,340,0.0,0,58,172,0,1,0,0,0,0,0,0,1,0,0,0,0
86,136,0,315,1.8,1,42,125,1,0,0,1,0,0,0,0,0,1,0,0,0
87,148,0,244,0.8,1,42,178,0,1,0,0,0,0,1,0,0,0,0,1,0
88,138,0,243,0.0,0,46,152,1,1,0,1,0,0,1,0,0,1,0,0,0


In [235]:
submission = pd.read_csv('./data/submission_format.csv')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line.
submission.info()

print(test.head())

# Encode into a new frame so `test` keeps its raw columns and this cell can be
# re-run (encoding an already-encoded frame raises, because the listed columns
# are gone).
test_encoded = pd.get_dummies(
    test,
    columns=[
        'slope_of_peak_exercise_st_segment',
        'resting_ekg_results',
        'chest_pain_type',
        'num_major_vessels',
    ],
    drop_first=True,
)

# get_dummies only creates columns for levels that occur in `test`, and in the
# order they occur there. Align to the training design matrix `X` (what `clf`
# was fitted on) so the columns match in both presence and order.
missing_cols = X.columns.difference(test_encoded.columns)
if len(missing_cols):
    # Made visible on purpose: a non-empty list means a level never occurs in
    # `test`, or the column names of the two frames disagree.
    print('Training columns absent from test, filled with 0:', list(missing_cols))
test_encoded = test_encoded.reindex(columns=X.columns, fill_value=0)

test_preds = clf.predict(test_encoded)

# NOTE(review): assumes the rows of `submission` are in the same order as the
# rows of `test` -- confirm against the submission format file.
submission['heart_disease_present'] = test_preds.astype('float')

submission.info()

submission.to_csv('logistic_out.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 2 columns):
patient_id               90 non-null object
heart_disease_present    90 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.1+ KB
None
   slope_of_peak_exercise_st_segment  resting_blood_pressure  chest_pain_type  \
0                                  2                     170                1   
1                                  1                     138                4   
2                                  2                     120                4   
3                                  1                     102                3   
4                                  2                     138                4   

   num_major_vessels  fasting_blood_sugar_gt_120_mg_per_dl  \
0                  0                                     0   
1                  0                                     0   
2                  0                                     0   
3                