### Predict Heart Diseases:

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

#### Read training data and do data exploration:

In [2]:
# Load the labelled training set; the "ID" column becomes the row index.
input_data = pd.read_csv(
    "train.csv",
    index_col="ID",
)
# Preview the first five rows.
input_data.head(5)

Unnamed: 0_level_0,age,sex,chest,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_electrocardiographic_results,maximum_heart_rate_achieved,exercise_induced_angina,oldpeak,slope,number_of_major_vessels,thal,class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,49.207124,0,4.0,162.996167,181.108682,0,0,148.227858,1,0.944547,2,0,3,1
1,53.628425,1,1.741596,130.23373,276.47463,0,2,152.917139,0,0.11907,2,0,3,0
2,49.591426,1,4.0,146.999012,223.300517,1,2,102.35209,1,1.616747,2,2,7,1
3,58.991445,1,4.0,112.369143,187.245501,0,0,158.16475,1,0.0,1,1,7,1
4,51.053602,1,1.954609,138.032047,238.482868,0,0,172.540828,0,1.150464,1,1,3,0


In [3]:
# Load the test set (to be predicted); the "ID" column becomes the row index.
test_samples = pd.read_csv(
    "test.csv",
    index_col="ID",
)
# Preview the first five rows.
test_samples.head(5)

Unnamed: 0_level_0,age,sex,chest,resting_blood_pressure,serum_cholestoral,fasting_blood_sugar,resting_electrocardiographic_results,maximum_heart_rate_achieved,exercise_induced_angina,oldpeak,slope,number_of_major_vessels,thal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
600000,53.963191,1,4.0,136.940829,236.862814,0,2,143.961525,1,0.7111,1,2,7
600001,49.621479,1,4.0,122.024651,226.593019,0,2,112.220874,1,0.0,1,1,7
600002,36.933893,1,1.300924,127.585643,351.061046,0,0,177.945922,0,0.313078,1,0,7
600003,54.884588,1,3.0,124.876745,250.37053,0,0,153.176526,0,0.308277,1,1,7
600004,71.016392,0,1.746972,120.9226,210.97971,0,0,164.573122,0,1.02137,1,1,3


#### Divide data into training and validation sets in stratified fashion:

In [4]:
# Every column except the last is treated as a feature; the last column holds
# the label. Both are taken as plain NumPy arrays.
input_features = input_data.iloc[:, :-1].values
input_classes = input_data.iloc[:, -1].values

# Seeded 80/20 train/validation split, stratified on the label.
train_samples, val_samples, train_classes, val_classes = train_test_split(
    input_features,
    input_classes,
    test_size=0.2,
    stratify=input_classes,
    random_state=0,
)


In [5]:
# Sanity-check the stratification: count the samples per class in the
# validation, training and full label arrays ...
uni_v, counts_v = np.unique(val_classes, return_counts=True)
uni_t, counts_t = np.unique(train_classes, return_counts=True)
uni_i, counts_i = np.unique(input_classes, return_counts=True)

# ... and compare the ratio (count of first class / count of second class)
# across the three sets.
class_ratios = tuple(
    counts[0] / counts[1] for counts in (counts_v, counts_t, counts_i)
)
print("class ratios in sets: %0.2f, %0.2f, %0.2f" % class_ratios)

class ratios in sets: 1.25, 1.25, 1.25


#### Predict classes with Gradient Boost Classifier (default parameters) :

In [41]:
# Baseline: gradient boosting with default hyper-parameters (seeded so the
# result is repeatable), scored on the held-out validation split.
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(train_samples, train_classes)
predict_classes = gbc.predict(val_samples)
print("validation score %0.4f " % accuracy_score(val_classes, predict_classes))

# The model was fitted on plain ndarrays (taken via `.values`), so predict on
# the test frame's underlying array as well. Passing the DataFrame itself mixes
# named and unnamed feature inputs, which current scikit-learn flags with a
# feature-name mismatch warning.
test_predicted_classes = gbc.predict(test_samples.values)
final_output = pd.DataFrame({'ID' : test_samples.index.values, 'class' : test_predicted_classes})
final_output.to_csv("submission_default.csv", index=False)

validation score 0.9002 


#### Tune GBM parameters by grid search on 10% of the training data set:

In [6]:
# Carve a stratified 10% slice out of the training split; only this slice
# (grid_search_*) is used for the grid searches, the other 90% is left out.
grid_leftout_samples, grid_search_samples, grid_leftout_classes, grid_search_classes = train_test_split(
    train_samples,
    train_classes,
    test_size=0.1,
    stratify=train_classes,
    random_state=0,
)
# Show the size of the tuning subset.
grid_search_samples.shape

(48000, 13)

Tune n_estimators first and take the optimal value

In [7]:
# First tuning stage: only n_estimators is searched (80, 90, ..., 140); every
# other GradientBoostingClassifier setting keeps its default value.
grid_parameters = {'n_estimators': range(80, 150, 10)}

# 3-fold cross-validated search, run in parallel on all cores.
grid_gbc = GridSearchCV(
    GradientBoostingClassifier(random_state=0),
    grid_parameters,
    cv=3,
    n_jobs=-1,
)
grid_gbc.fit(grid_search_samples, grid_search_classes)

GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': range(80, 150, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
# Best setting found by the search together with its cross-validation score.
print(grid_gbc.best_params_, grid_gbc.best_score_)

# Accuracy of the search's refitted best estimator on the held-out validation split.
predict_classes = grid_gbc.predict(val_samples)
val_accuracy = accuracy_score(val_classes, predict_classes)
print("Accuracy : %0.4f" % val_accuracy)

{'n_estimators': 130}
Accuracy : 0.8998


Tune Tree parameters

In [13]:
# Tree-parameter stage: search min_samples_split (1500, 2000, ..., 9500) with
# n_estimators fixed at 130 and max_depth fixed at 4 (max_depth is not searched).
grid_parameters = {'min_samples_split': range(1500, 10000, 500)}
base_gbc = GradientBoostingClassifier(
    random_state=0,
    n_estimators=130,
    max_depth=4,
)
grid_gbc = GridSearchCV(base_gbc, grid_parameters, cv=3, n_jobs=-1)
grid_gbc.fit(grid_search_samples, grid_search_classes)

GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=130,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'min_samples_split': range(1500, 10000, 500)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
# Best setting found by the search together with its cross-validation score.
print(grid_gbc.best_params_, grid_gbc.best_score_)

# Accuracy of the search's refitted best estimator on the held-out validation split.
predict_classes = grid_gbc.predict(val_samples)
val_accuracy = accuracy_score(val_classes, predict_classes)
print("Accuracy : %0.4f" % val_accuracy)

{'min_samples_split': 2500} 0.8976041666666666
Accuracy : 0.8995


In [17]:
# Search min_samples_leaf (10, 20, ..., 100) with the previously chosen
# n_estimators=130, max_depth=4 and min_samples_split=2500 held fixed.
grid_parameters = {'min_samples_leaf': range(10, 101, 10)}
base_gbc = GradientBoostingClassifier(
    random_state=0,
    n_estimators=130,
    max_depth=4,
    min_samples_split=2500,
)
grid_gbc = GridSearchCV(base_gbc, grid_parameters, cv=3, n_jobs=-1)
grid_gbc.fit(grid_search_samples, grid_search_classes)

GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2500,
              min_weight_fraction_leaf=0.0, n_estimators=130,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'min_samples_leaf': range(10, 101, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
# Best setting found by the search together with its cross-validation score.
print(grid_gbc.best_params_, grid_gbc.best_score_)

# Accuracy of the search's refitted best estimator on the held-out validation split.
predict_classes = grid_gbc.predict(val_samples)
val_accuracy = accuracy_score(val_classes, predict_classes)
print("Accuracy : %0.4f" % val_accuracy)

{'min_samples_leaf': 90} 0.8978958333333333
Accuracy : 0.8994


Tune subsample size and learning rate

In [23]:
# Search the row-subsampling fraction over 0.1 .. 0.6 (step 0.1) with the tree
# parameters chosen so far held fixed.
grid_parameters = {'subsample': np.arange(0.1, 0.61, 0.1)}
base_gbc = GradientBoostingClassifier(
    random_state=0,
    n_estimators=130,
    min_samples_leaf=90,
    max_depth=4,
    min_samples_split=2500,
)
grid_gbc = GridSearchCV(base_gbc, grid_parameters, cv=3, n_jobs=-1)
grid_gbc.fit(grid_search_samples, grid_search_classes)

GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=90, min_samples_split=2500,
              min_weight_fraction_leaf=0.0, n_estimators=130,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
# Best setting found by the search together with its cross-validation score.
print(grid_gbc.best_params_, grid_gbc.best_score_)

# Accuracy of the search's refitted best estimator on the held-out validation split.
predict_classes = grid_gbc.predict(val_samples)
val_accuracy = accuracy_score(val_classes, predict_classes)
print("Accuracy : %0.4f" % val_accuracy)

{'subsample': 0.5} 0.8982708333333334
Accuracy : 0.9003


In [26]:
# Last tuning stage: search the learning rate with all previously chosen
# settings (n_estimators, subsample, leaf/split sizes, depth) held fixed.
grid_parameters = {"learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2]}
base_gbc = GradientBoostingClassifier(
    n_estimators=130,
    subsample=0.5,
    min_samples_leaf=90,
    max_depth=4,
    min_samples_split=2500,
    random_state=0,
)
grid_gbc = GridSearchCV(base_gbc, grid_parameters, cv=3, n_jobs=-1)
grid_gbc.fit(grid_search_samples, grid_search_classes)

GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=90, min_samples_split=2500,
              min_weight_fraction_leaf=0.0, n_estimators=130,
              presort='auto', random_state=0, subsample=0.5, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
# Best setting found by the search together with its cross-validation score.
print(grid_gbc.best_params_, grid_gbc.best_score_)

# Accuracy of the search's refitted best estimator on the held-out validation split.
predict_classes = grid_gbc.predict(val_samples)
val_accuracy = accuracy_score(val_classes, predict_classes)
print("Accuracy : %0.4f" % val_accuracy)

{'learning_rate': 0.1} 0.8982708333333334
Accuracy : 0.9003


In [28]:
# Final model: the hyper-parameters selected by the staged grid searches,
# fitted on the full training split (not just the 10% tuning subset).
final_gbc = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=130,
    subsample=0.5,
    min_samples_leaf=90,
    max_depth=4,
    min_samples_split=2500,
    random_state=0,
)
final_gbc.fit(train_samples, train_classes)

# Score the final model on the held-out validation split.
predict_classes = final_gbc.predict(val_samples)
val_accuracy = accuracy_score(val_classes, predict_classes)
print("Accuracy : %0.4f" % val_accuracy)

Accuracy : 0.9019


#### Now predict on test samples:

In [38]:
# final_gbc was fitted on plain ndarrays, so predict on the test frame's
# underlying array rather than the DataFrame. Mixing named and unnamed feature
# inputs is flagged by current scikit-learn with a feature-name mismatch warning.
test_predicted_classes = final_gbc.predict(test_samples.values)

# Write the predictions keyed by the test set's ID index.
final_output = pd.DataFrame({'ID' : test_samples.index.values, 'class' : test_predicted_classes})
final_output.to_csv("submission1.csv", index=False)

### Try XGBoost

In [6]:
import xgboost as xgb



In [11]:
# Fit an XGBoost classifier on the same train/validation split used for the
# gradient boosting models.
xgb_tree = xgb.XGBClassifier(max_depth=4, n_estimators=140, learning_rate=0.3)

# train_samples / val_samples are already 2-D ndarrays, so they are passed as
# plain arrays. The previous np.asmatrix wrapper produced np.matrix objects,
# a class NumPy no longer recommends, without changing the data.
xgb_tree.fit(np.asarray(train_samples), train_classes)
predict_classes = xgb_tree.predict(np.asarray(val_samples))
print("Accuracy : %0.4f" % accuracy_score(val_classes, predict_classes))

Accuracy : 0.9032


In [77]:
# Convert the test DataFrame to a plain ndarray (the model was fitted on
# arrays) instead of an np.matrix, which NumPy no longer recommends.
test_predicted_classes = xgb_tree.predict(np.asarray(test_samples))

# Write the predictions keyed by the test set's ID index.
final_output = pd.DataFrame({'ID' : test_samples.index.values, 'class' : test_predicted_classes})
final_output.to_csv("submission2.csv", index=False)

  if diff:
