In [2]:
# Importing Libraries

import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('realheart.csv')

In [4]:
df.head()

Unnamed: 0,age,sex,cp,tresbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0,6,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3,3,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2,7,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0,3,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0,3,0


In [5]:
# Splitting Data
X = df.drop(columns = 'target')
y = df['target']

In [6]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Creating Model
gb = GradientBoostingClassifier(loss='log_loss', learning_rate= 0.1, n_estimators=25,
                                    criterion='friedman_mse',
                                    min_samples_split=8, min_samples_leaf=10,
                                    max_depth=3,max_leaf_nodes = 8
                                    )
    
# Fitting Model
gb.fit(X_train, y_train)
    
y_pred = gb.predict(X_test)
y_train_pred = gb.predict(X_train)
    

print('Accuracy on test data is ', accuracy_score(y_pred, y_test))
print('Accuracy on training data is ', accuracy_score(y_train_pred, y_train))

Accuracy on test data is  0.9016393442622951
Accuracy on training data is  0.9090909090909091


# With GSV

In [9]:
gb = GradientBoostingClassifier(random_state = 42)


param_grid = {
    'n_estimators':[15, 25, 30, 45],
    'loss': ['log_loss'],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8, 10]
}

grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, 
                           cv=5, scoring='accuracy', n_jobs=-1, verbose = 1)

start_time = time.time()

grid_search.fit(X_train, y_train)

end_time = time.time()

training_time = start_time - end_time

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best Cross-Validation Score: {best_score:.4f}')

# Evaluating the model on the test data
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Accuracy: {test_accuracy:.4f}')
print('Training Ended In ', training_time)

Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Best Parameters: {'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 45}
Best Cross-Validation Score: 0.8223
Test Accuracy: 0.8852
Training Ended In  -122.90931797027588
