# Modelling

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
from sklearn.metrics import accuracy_score, log_loss,f1_score,roc_curve,roc_auc_score
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [150]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [151]:
train_pca = pd.read_csv('train_pca.csv')
test_pca = pd.read_csv('test_pca.csv')

In [159]:
y = train.iloc[:,-1]
X = train.iloc[:,:-1]
X.columns

Index(['slope_of_peak_exercise_st_segment', 'resting_blood_pressure',
       'chest_pain_type', 'num_major_vessels',
       'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results',
       'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'sex', 'age',
       'max_heart_rate_achieved', 'exercise_induced_angina', 'thal_normal',
       'thal_reversible_defect'],
      dtype='object')

In [161]:
X_t = test.copy()
X_t.columns

Index(['slope_of_peak_exercise_st_segment', 'resting_blood_pressure',
       'chest_pain_type', 'num_major_vessels',
       'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results',
       'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'sex', 'age',
       'max_heart_rate_achieved', 'exercise_induced_angina', 'thal_normal',
       'thal_reversible_defect'],
      dtype='object')

## Baseline model

(Considering all the features)

In [126]:
# Predicted, Actual
def metrics_summary(train_pred,y):
    print("Log Loss:",log_loss(y, train_pred));print()
    print("Accuracy:",accuracy_score(y, train_pred));print()

    #confusion matrix
    cross = pd.crosstab(train_pred,y
                        ,rownames=['Predicted'],colnames=['Actual'])
    print(cross)

#### Logistics regression

In [127]:
# Highest with normal features
clf = LogisticRegression(random_state=0,penalty='l2',class_weight="balanced",solver="lbfgs",max_iter=1500).fit(X, y)
train_pred = clf.predict(X)

metrics_summary(train_pred,y)

Log Loss: 4.9889876746491115

Accuracy: 0.8555555555555555

Actual      0   1
Predicted        
0          88  14
1          12  66


In [156]:
# with pca features
# clf = LogisticRegression(random_state=0).fit(X, y)
# train_pred = clf.predict(X)

# metrics_summary(train_pred,y)

Log Loss: 11.321149986878083

Accuracy: 0.6722222222222223

Actual      0   1
Predicted        
0          76  35
1          24  45




## Cross validation strategy

In [165]:
i = 1
kf = StratifiedKFold(n_splits=5,random_state=1,shuffle=True) 
scores = []

for train_index,test_index in kf.split(X.loc[:,['slope_of_peak_exercise_st_segment', 'resting_blood_pressure','chest_pain_type']],y):
    print('\n{} of kfold {}'.format(i,kf.n_splits))
    xtr,xvl = X.loc[train_index],X.loc[test_index]
    ytr,yvl = y[train_index],y[test_index]
    model = LogisticRegression(random_state=0,class_weight="balanced",solver="lbfgs",max_iter=2500)
    model.fit(xtr, ytr)     
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl,pred_test)
    scores.append(score)
    print('accuracy_score',score)
    metrics_summary(yva)
    i+=1 
print('Mean score',np.mean(scores))


1 of kfold 5
accuracy_score 0.8055555555555556

2 of kfold 5
accuracy_score 0.8333333333333334

3 of kfold 5
accuracy_score 0.8611111111111112

4 of kfold 5
accuracy_score 0.8333333333333334

5 of kfold 5
accuracy_score 0.7777777777777778
Mean score 0.8222222222222223


## Hyperparameter tuning

In [158]:
from sklearn.model_selection import GridSearchCV
weights = np.linspace(0.05, 0.95, 20)
gsc = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid = {'C': [0.001, 0.01, 0.1,0.2,0.3,0.35,0.36,0.37,0.4,0.5, 0.8,0.9,1,2, 10, 100, 1000],
                  'random_state': [0],
                  'penalty':['l2'],
                  'solver':['lbfgs'],
#                  'solver':["lbfgs","newton-cg","liblinear","sag","saga"],
                  "max_iter": [5000],
                  "class_weight":["balanced"]
                 },
    scoring='accuracy',
    cv=5
)

grid_result = gsc.fit(X, y)
print("Best parameters : %s" % grid_result.best_params_)

clf = LogisticRegression(**grid_result.best_params_).fit(X, y)
train_pred = clf.predict(X)

metrics_summary(train_pred,y)

Best parameters : {'C': 0.001, 'class_weight': 'balanced', 'max_iter': 5000, 'penalty': 'l2', 'random_state': 0, 'solver': 'lbfgs'}
Log Loss: 12.280600422163777

Accuracy: 0.6444444444444445

Actual      0   1
Predicted        
0          67  31
1          33  49


## Submission

In [None]:
submission = pd.read_csv('./data/submission_format.csv')
print(submission.info())

submission.head()

test_X.head()

ids = test_X['patient_id']
test_preds = clf.predict(test_X.drop(columns=['patient_id']))
test_preds

ids = test_X['patient_id']
test_preds1 = clf.predict(test_X.drop(columns=['patient_id']))
test_preds1

submission['patient_id'] = ids
submission['heart_disease_present'] = test_preds.astype('float')

submission.info()

submission.to_csv('logistic_out.csv',index=False)

sum(test_preds != test_preds1) 