### MACHINE LEARNING 2025 ZOOMCAMP MIDTERM PROJECT.
* Using heart.csv dataset to predict heart failure in patients.
* Dataset is available [here](https://www.kaggle.com/datasets/tan5577/heart-failure-dataset)

In [48]:
import pandas as pd

In [49]:
df = pd.read_csv('heart.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


### 0. EXPLORATORY DATA ANALYSIS.
* The target variable is heartdisease.
* 0 => normal heart.
* 1 => heart disease.

In [50]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
df

Unnamed: 0,age,sex,chestpaintype,restingbp,cholesterol,fastingbs,restingecg,maxhr,exerciseangina,oldpeak,st_slope,heartdisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [51]:
# 0: 'no' => fastingbs < 120mg/dl, 1: 'yes' =>  fastingbs >= 120mg/dl
df['fastingbs'] = df['fastingbs'].map({0: 'no', 1: 'yes'})
df['exerciseangina'] = df['exerciseangina'].map({'N': 'no', 'Y': 'yes'})

categorical_variables = ['chestpaintype', 'sex', 'fastingbs', 'restingecg', 'exerciseangina', 'st_slope']
numerical_variables = ['age', 'restingbp', 'cholesterol', 'maxhr', 'oldpeak']

for categorical_variable in categorical_variables:
    df[categorical_variable] = df[categorical_variable].str.lower().str.replace(' ', '_')

df

Unnamed: 0,age,sex,chestpaintype,restingbp,cholesterol,fastingbs,restingecg,maxhr,exerciseangina,oldpeak,st_slope,heartdisease
0,40,m,ata,140,289,no,normal,172,no,0.0,up,0
1,49,f,nap,160,180,no,normal,156,no,1.0,flat,1
2,37,m,ata,130,283,no,st,98,no,0.0,up,0
3,48,f,asy,138,214,no,normal,108,yes,1.5,flat,1
4,54,m,nap,150,195,no,normal,122,no,0.0,up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,m,ta,110,264,no,normal,132,no,1.2,flat,1
914,68,m,asy,144,193,yes,normal,141,no,3.4,flat,1
915,57,m,asy,130,131,no,normal,115,yes,1.2,flat,1
916,57,f,ata,130,236,no,lvh,174,no,0.0,flat,1


In [52]:
df.isnull().sum()

age               0
sex               0
chestpaintype     0
restingbp         0
cholesterol       0
fastingbs         0
restingecg        0
maxhr             0
exerciseangina    0
oldpeak           0
st_slope          0
heartdisease      0
dtype: int64

In [53]:
df[categorical_variables + numerical_variables].describe()

Unnamed: 0,age,restingbp,cholesterol,maxhr,oldpeak
count,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,136.809368,0.887364
std,9.432617,18.514154,109.384145,25.460334,1.06657
min,28.0,0.0,0.0,60.0,-2.6
25%,47.0,120.0,173.25,120.0,0.0
50%,54.0,130.0,223.0,138.0,0.6
75%,60.0,140.0,267.0,156.0,1.5
max,77.0,200.0,603.0,202.0,6.2


### 1. SPLITTING THE DATASET.
* 60/20/20 split using train_test_split() from sklearn.model_selection module.
* 60% => training set.
* 20% => validation set.
* 20% => testing set.

In [54]:
from sklearn.model_selection import train_test_split

In [55]:
train_full_df, test_df = train_test_split(df, test_size = 0.2, random_state = 42)
train_df, validation_df = train_test_split(train_full_df, test_size = 0.25, random_state = 42)

len(train_df), len(validation_df), len(test_df)

(550, 184, 184)

In [56]:
train_df = train_df.reset_index(drop = True)
validation_df = validation_df.reset_index(drop = True)
test_df = test_df.reset_index(drop = True)

### 2. FEATURE IMPORTANCE.
* mutual information => to measure importance of categorical variables.
* correlation => to measure importance of numerical variables.

In [57]:
heart_failure_rate = round(train_df.heartdisease.value_counts(normalize = True), 2)
heart_failure_rate

heartdisease
1    0.54
0    0.46
Name: proportion, dtype: float64

#### 2.1. MUTUAL INFORMATION.
* more important (higher) to less important (lower).

In [58]:
from sklearn.metrics import mutual_info_score

In [59]:
def mutual_information_heart(series):
    return mutual_info_score(series, train_full_df.heartdisease)

mi_score = train_full_df[categorical_variables].apply(mutual_information_heart)
sorted_scores = mi_score.sort_values(ascending = False)

In [60]:
cols = ['mi_scores']
pd.DataFrame(sorted_scores, columns = cols)

Unnamed: 0,mi_scores
st_slope,0.214106
exerciseangina,0.149062
chestpaintype,0.144451
sex,0.051851
fastingbs,0.035691
restingecg,0.005695


#### 2.2. CORRELATION.
* (value < 0) => negative correlation.
* (value > 0) => positive correlation.

In [61]:
correlation_score = train_full_df[numerical_variables].corrwith(train_full_df.heartdisease)
sorted_correlation = correlation_score.sort_values(ascending = False)

cols = ['correlation_scores']
pd.DataFrame(sorted_correlation, columns = cols)

Unnamed: 0,correlation_scores
oldpeak,0.42274
age,0.29941
restingbp,0.138611
cholesterol,-0.251893
maxhr,-0.414676


### 3. FEATURE MATRIX. 
* using a DictVectorizer.
* one-hot encoding.

In [62]:
from sklearn.feature_extraction import DictVectorizer

In [63]:
y_train = train_df.heartdisease.values
y_validation = validation_df.heartdisease.values
y_test = test_df.heartdisease.values

del train_df['heartdisease']
del validation_df['heartdisease']
del test_df['heartdisease']

In [64]:
train_dict = train_df.to_dict(orient = 'records')

dictVectorizer = DictVectorizer(sparse = False)
dictVectorizer.fit(train_dict)
X_train = dictVectorizer.transform(train_dict)

feature_matrix = dictVectorizer.feature_names_

validation_dict = validation_df.to_dict(orient = 'records')
X_validation = dictVectorizer.transform(validation_dict)

test_dict = test_df.to_dict(orient = 'records')
X_test = dictVectorizer.transform(test_dict)

### 4. LOGISTIC REGRESSION TRAINING.
* baseline model.

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [66]:
logistic_model = LogisticRegression(max_iter = 1000, random_state = 42)
logistic_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [67]:
y_prediction = logistic_model.predict_proba(X_validation)[:, 1]

roc_auc_score(y_validation, y_prediction)

0.9433962264150944

#### 4.1. LOGISTIC REGRESSION: PARAMETER TUNING.
* C.
* penalty = l1.

In [68]:
c_params = [0.01, 0.1, 1, 10, 100]
auc_scores = []

for c_param in c_params:
    logistic_model = LogisticRegression(penalty = 'l1', solver = 'liblinear', C = c_param, max_iter = 1000, random_state = 42)
    logistic_model.fit(X_train, y_train)

    y_prediction = logistic_model.predict_proba(X_validation)[:, 1]
    auc = round(roc_auc_score(y_validation, y_prediction), 4)
    auc_scores.append((c_param, auc))

cols = ['c_parameters', 'auc_scores']
logistic_df = pd.DataFrame(auc_scores, columns = cols).sort_values(by = "auc_scores", ascending = False)
logistic_df.reset_index(drop = True)

Unnamed: 0,c_parameters,auc_scores
0,1.0,0.9428
1,10.0,0.9428
2,100.0,0.9427
3,0.1,0.9355
4,0.01,0.8148


#### 4.2. LOGISTIC REGRESSION: PARAMETER TUNING.
* C.
* penalty = l2.

In [69]:
c_params = [0.01, 0.1, 1, 10, 100]
auc_scores = []

for c_param in c_params:
    logistic_model = LogisticRegression(penalty = 'l2', solver = 'liblinear', C = c_param, max_iter = 1000, random_state = 42)
    logistic_model.fit(X_train, y_train)

    y_prediction = logistic_model.predict_proba(X_validation)[:, 1]
    auc = round(roc_auc_score(y_validation, y_prediction), 4)
    auc_scores.append((c_param, auc))

cols = ['c_parameters', 'auc_scores']
logistic_df = pd.DataFrame(auc_scores, columns = cols).sort_values(by = "auc_scores", ascending = False)
logistic_df.reset_index(drop = True)

Unnamed: 0,c_parameters,auc_scores
0,0.1,0.9448
1,1.0,0.9434
2,10.0,0.943
3,100.0,0.9429
4,0.01,0.9363


#### 4.3. LOGISTIC REGRESSION: FINAL MODEL.
* C = {0.1}.
* penalty = l2.

In [70]:
logistic_model = LogisticRegression(penalty = 'l2', solver = 'liblinear', C = 0.1, max_iter = 1000, random_state = 42)
logistic_model.fit(X_train, y_train)

y_prediction = logistic_model.predict_proba(X_validation)[:, 1]
round(roc_auc_score(y_validation, y_prediction), 4)

0.9448

### 5. DECISION TREE TRAINING.
* baseline model.

In [71]:
from sklearn.tree import DecisionTreeClassifier

In [72]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [73]:
dt_prediction = tree_model.predict_proba(X_validation)[:, 1]

roc_auc_score(y_validation, dt_prediction)

0.8557087566521528

#### 5.1. DECISION TREE: PARAMETER TUNING.
* max_depth.
* min_samples_leaf.

In [74]:
max_depths = [1, 2, 3, 4, 5, 6, 10, 15, 20, 100, 200, 500]

for max_depth in max_depths:
    tree_model = DecisionTreeClassifier(max_depth = max_depth)
    tree_model.fit(X_train, y_train)

    dt_prediction = tree_model.predict_proba(X_validation)[:, 1]
    auc_score = roc_auc_score(y_validation, dt_prediction)

    print('%4s => %.3f' % (max_depth, auc_score))

   1 => 0.852
   2 => 0.905
   3 => 0.938
   4 => 0.922
   5 => 0.917
   6 => 0.911
  10 => 0.841
  15 => 0.849
  20 => 0.860
 100 => 0.848
 200 => 0.856
 500 => 0.856


#### 5.2. DECISION TREE: PARAMETER TUNING.
* max_depth = {3, 4, 5}.
* min_samples_leaf.

In [75]:
max_depths = [3, 4, 5]
min_samples = [1, 2, 3, 4, 5, 6, 10, 15, 20, 100]
auc_scores = []

for max_depth in max_depths:
    for min_sample in min_samples:
        tree_model = DecisionTreeClassifier(max_depth = max_depth, min_samples_leaf = min_sample)
        tree_model.fit(X_train, y_train)

        ydt_prediction = tree_model.predict_proba(X_validation)[:, 1]
        auc_score = roc_auc_score(y_validation, ydt_prediction)
        auc_scores.append((max_depth, min_sample, auc_score))
   
cols = ['max_depth', 'min_sample', 'auc_score']
decision_tree_df = pd.DataFrame(auc_scores, columns = cols)
decision_tree_df

Unnamed: 0,max_depth,min_sample,auc_score
0,3,1,0.93753
1,3,2,0.93753
2,3,3,0.93753
3,3,4,0.93753
4,3,5,0.93753
5,3,6,0.93753
6,3,10,0.93753
7,3,15,0.930273
8,3,20,0.930273
9,3,100,0.902758


#### 5.3. FINAL DECISION TREE.
* max_depth = 3
* min_samples_leaf = 3

In [76]:
tree_model = DecisionTreeClassifier(max_depth = 3, min_samples_leaf = 3)
tree_model.fit(X_train, y_train)

dt_prediction = tree_model.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation, dt_prediction)

0.9375302370585389

### 6. RANDOM FOREST TREE TRAINING.
* baseline model.

In [77]:
from sklearn.ensemble import RandomForestClassifier

In [78]:
forest_model = RandomForestClassifier(random_state = 42)
forest_model.fit(X_train, y_train)

rf_prediction = forest_model.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation, rf_prediction)

0.9504716981132075

#### 6.1. RANDOM FOREST TREE: PARAMETER TUNING.
* n_estimators.
* max_depth.

In [79]:
n_estimators = [n for n in range(10, 201, 10)]
auc_scores = []

for n_estimator in n_estimators:
    forest_model = RandomForestClassifier(n_estimators = n_estimator, random_state = 42)
    forest_model.fit(X_train, y_train)

    rf_prediction = forest_model.predict_proba(X_validation)[:, 1]
    auc_score = round(roc_auc_score(y_validation, rf_prediction), 3)
    auc_scores.append((n_estimator, auc_score))

cols = ['n_estimators', 'auc_scores']
forest_df = pd.DataFrame(auc_scores, columns = cols)
forest_df

Unnamed: 0,n_estimators,auc_scores
0,10,0.932
1,20,0.942
2,30,0.942
3,40,0.95
4,50,0.949
5,60,0.951
6,70,0.951
7,80,0.952
8,90,0.951
9,100,0.95


In [80]:
n_estimators = [170, 190, 200] 
max_depths = [5, 10, 15, 20]
auc_scores = []

for max_depth in max_depths:
    for n_estimator in n_estimators:
        forest_model = RandomForestClassifier(n_estimators = n_estimator, max_depth = max_depth, random_state = 42)
        forest_model.fit(X_train, y_train)
    
        rf_prediction = forest_model.predict_proba(X_validation)[:, 1]
        auc_score = round(roc_auc_score(y_validation, rf_prediction), 3)
        auc_scores.append((max_depth, n_estimator, auc_score))

cols = ['max_depths', 'n_estimators', 'auc_scores']
forest_df = pd.DataFrame(auc_scores, columns = cols)
forest_df

Unnamed: 0,max_depths,n_estimators,auc_scores
0,5,170,0.951
1,5,190,0.951
2,5,200,0.951
3,10,170,0.955
4,10,190,0.955
5,10,200,0.956
6,15,170,0.952
7,15,190,0.952
8,15,200,0.951
9,20,170,0.953


#### 6.2. FINAL RANDOM FOREST TREE
* best performance is with:
* max_depth = {10} and n_estimator = {200}.

In [81]:
forest_model = RandomForestClassifier(n_estimators = 200, max_depth = 10, random_state = 42)
forest_model.fit(X_train, y_train)

rf_prediction = forest_model.predict_proba(X_validation)[:, 1]
roc_auc_score(y_validation, rf_prediction)

0.9556119980648283

### 7. XGBOOST MODEL TRAINING.
* baseline model.

In [82]:
import xgboost as xgb

In [83]:
dTrain = xgb.DMatrix(X_train, label = y_train, feature_names = feature_matrix)
dValidation = xgb.DMatrix(X_validation, label = y_validation, feature_names = feature_matrix)

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

watchlist = [(dTrain, 'train'), (dValidation, 'validation')]

In [84]:
xgb_model = xgb.train(xgb_params, dTrain, evals = watchlist, num_boost_round = 20, verbose_eval = 5)
xgb_prediction = xgb_model.predict(dValidation)

roc_auc_score(y_validation, xgb_prediction)

[0]	train-auc:0.94338	validation-auc:0.92852
[5]	train-auc:0.98911	validation-auc:0.93928
[10]	train-auc:0.99704	validation-auc:0.94642
[15]	train-auc:0.99916	validation-auc:0.95513
[19]	train-auc:0.99973	validation-auc:0.95537


0.9553701015965167

#### 7.1. XGBOOST: PARAMETER TUNING.
* eta.
* max_depth.
* min_child_weight.

In [85]:
xgb_params = [{'eta': 0.01, 'eval_metric': 'auc', 'objective': 'binary:logistic'},
              {'eta': 0.1, 'eval_metric': 'auc', 'objective': 'binary:logistic'},
              {'eta': 0.2, 'eval_metric': 'auc', 'objective': 'binary:logistic'},
              {'eta': 0.3, 'eval_metric': 'auc', 'objective': 'binary:logistic'}]

for xgb_param in xgb_params:
    xgb_model = xgb.train(xgb_param, dTrain, evals = watchlist, num_boost_round = 20, verbose_eval = 5)

[0]	train-auc:0.94338	validation-auc:0.92852
[5]	train-auc:0.94957	validation-auc:0.93203
[10]	train-auc:0.95071	validation-auc:0.93312
[15]	train-auc:0.95977	validation-auc:0.92761
[19]	train-auc:0.96209	validation-auc:0.92737
[0]	train-auc:0.94338	validation-auc:0.92852
[5]	train-auc:0.96833	validation-auc:0.93239
[10]	train-auc:0.97866	validation-auc:0.93971
[15]	train-auc:0.98624	validation-auc:0.94533
[19]	train-auc:0.98987	validation-auc:0.94539
[0]	train-auc:0.94338	validation-auc:0.92852
[5]	train-auc:0.97945	validation-auc:0.93330
[10]	train-auc:0.99180	validation-auc:0.94424
[15]	train-auc:0.99622	validation-auc:0.95090
[19]	train-auc:0.99797	validation-auc:0.95416
[0]	train-auc:0.94338	validation-auc:0.92852
[5]	train-auc:0.98911	validation-auc:0.93928
[10]	train-auc:0.99704	validation-auc:0.94642
[15]	train-auc:0.99916	validation-auc:0.95513
[19]	train-auc:0.99973	validation-auc:0.95537


#### 7.2. XGBOOST: PARAMETER TUNING.
* eta = {0.3}.
* max_depth.
* min_child_weight.

In [86]:
xgb_params = [{'eta': 0.3, 'max_depth': 1, 'eval_metric': 'auc', 'objective': 'binary:logistic'},
              {'eta': 0.3, 'max_depth': 2, 'eval_metric': 'auc', 'objective': 'binary:logistic'},
              {'eta': 0.3, 'max_depth': 3, 'eval_metric': 'auc', 'objective': 'binary:logistic'},
              {'eta': 0.3, 'max_depth': 6, 'eval_metric': 'auc', 'objective': 'binary:logistic'}]

for xgb_param in xgb_params:
    xgb_model = xgb.train(xgb_param, dTrain, evals = watchlist, num_boost_round = 20, verbose_eval = 5)

[0]	train-auc:0.79877	validation-auc:0.85160
[5]	train-auc:0.89875	validation-auc:0.93209
[10]	train-auc:0.92029	validation-auc:0.94557
[15]	train-auc:0.92683	validation-auc:0.94467
[19]	train-auc:0.93029	validation-auc:0.94872
[0]	train-auc:0.86558	validation-auc:0.90487
[5]	train-auc:0.92502	validation-auc:0.93656
[10]	train-auc:0.93965	validation-auc:0.94297
[15]	train-auc:0.94435	validation-auc:0.94376
[19]	train-auc:0.95032	validation-auc:0.94963
[0]	train-auc:0.89682	validation-auc:0.93572
[5]	train-auc:0.94865	validation-auc:0.95271
[10]	train-auc:0.96104	validation-auc:0.95997
[15]	train-auc:0.97087	validation-auc:0.96299
[19]	train-auc:0.97670	validation-auc:0.96021
[0]	train-auc:0.94338	validation-auc:0.92852
[5]	train-auc:0.98911	validation-auc:0.93928
[10]	train-auc:0.99704	validation-auc:0.94642
[15]	train-auc:0.99916	validation-auc:0.95513
[19]	train-auc:0.99973	validation-auc:0.95537


#### 7.3. XGBOOST: PARAMETER TUNING.
* eta = {0.3}.
* max_depth = {3}.
* min_child_weight.

In [87]:
xgb_params = [{'eta': 0.3, 'max_depth': 3, 'min_child_weight': 1, 'eval_metric': 'auc', 'objective': 'binary:logistic'},
              {'eta': 0.3, 'max_depth': 3, 'min_child_weight': 2, 'eval_metric': 'auc', 'objective': 'binary:logistic'},
              {'eta': 0.3, 'max_depth': 3, 'min_child_weight': 3, 'eval_metric': 'auc', 'objective': 'binary:logistic'},
              {'eta': 0.3, 'max_depth': 3, 'min_child_weight': 6, 'eval_metric': 'auc', 'objective': 'binary:logistic'}]

for xgb_param in xgb_params:
    xgb_model = xgb.train(xgb_param, dTrain, evals = watchlist, num_boost_round = 20, verbose_eval = 5)

[0]	train-auc:0.89682	validation-auc:0.93572
[5]	train-auc:0.94865	validation-auc:0.95271
[10]	train-auc:0.96104	validation-auc:0.95997
[15]	train-auc:0.97087	validation-auc:0.96299
[19]	train-auc:0.97670	validation-auc:0.96021
[0]	train-auc:0.89682	validation-auc:0.93572
[5]	train-auc:0.94693	validation-auc:0.95277
[10]	train-auc:0.95737	validation-auc:0.95924
[15]	train-auc:0.96639	validation-auc:0.95658
[19]	train-auc:0.97172	validation-auc:0.95138
[0]	train-auc:0.89333	validation-auc:0.93070
[5]	train-auc:0.94484	validation-auc:0.94999
[10]	train-auc:0.95998	validation-auc:0.95404
[15]	train-auc:0.96766	validation-auc:0.95652
[19]	train-auc:0.97121	validation-auc:0.95767
[0]	train-auc:0.89317	validation-auc:0.92749
[5]	train-auc:0.94009	validation-auc:0.94594
[10]	train-auc:0.95050	validation-auc:0.94721
[15]	train-auc:0.96006	validation-auc:0.94612
[19]	train-auc:0.96493	validation-auc:0.95138


#### 7.4. XGBOOST: FINAL MODEL.
* eta = {0.3}.
* max_depth = {3}.
* min_child_weight = {1}.

In [88]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 3,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

xgb_model = xgb.train(xgb_params, dTrain, evals = watchlist, num_boost_round = 16, verbose_eval = 5)
xgb_prediction = xgb_model.predict(dValidation)

roc_auc_score(y_validation, xgb_prediction)

[0]	train-auc:0.89682	validation-auc:0.93572
[5]	train-auc:0.94865	validation-auc:0.95271
[10]	train-auc:0.96104	validation-auc:0.95997
[15]	train-auc:0.97087	validation-auc:0.96299


0.9629898403483309

### 8. FINAL MODEL
* XGBoost.
* On the test set.

In [89]:
dTest = xgb.DMatrix(X_test, label = y_test, feature_names = feature_matrix)

xgb_params = {
    'eta': 0.3,
    'max_depth': 3,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}
watchlist = [(dTrain, 'train'), (dTest, 'test')]

model = xgb.train(xgb_params, dTrain, evals = watchlist, num_boost_round = 16, verbose_eval = 5)
model_prediction = model.predict(dTest)

roc_auc_score(y_test, model_prediction)

[0]	train-auc:0.89682	test-auc:0.91103
[5]	train-auc:0.94865	test-auc:0.92887
[10]	train-auc:0.96104	test-auc:0.93482
[15]	train-auc:0.97087	test-auc:0.93834


0.9383420317999757

### 9. TESTING THE MODEL.

In [90]:
patient_dict = train_dict[0]
patient_dict

{'age': 56,
 'sex': 'm',
 'chestpaintype': 'nap',
 'restingbp': 120,
 'cholesterol': 0,
 'fastingbs': 'no',
 'restingecg': 'normal',
 'maxhr': 97,
 'exerciseangina': 'no',
 'oldpeak': 0.0,
 'st_slope': 'flat'}

In [91]:
X_patient = dictVectorizer.transform(patient_dict)
dPatient = xgb.DMatrix(X_patient, feature_names = feature_matrix)

heart_failure = model.predict(dPatient)
print(f"The probability of the patient having a heart failure is: {heart_failure}.")

if heart_failure >= 0.5:
   print("The patient is likely to have a heart failure.")
else:
   print("The patient has a healthy heart.")

The probability of the patient having a heart failure is: [0.88630563].
The patient is likely to have a heart failure.


### 10. SAVING THE MODEL USING PICKLE.

In [92]:
import pickle

In [93]:
output_file = f'heartfailure_model.bin'

with open(output_file, 'wb') as f_out:
    pickle.dump((dictVectorizer, model, feature_matrix), f_out)

### 11. LOADING THE MODEL USING PICKLE.

In [94]:
with open(output_file, 'rb') as f_in:
    dictVectorizer, model, feature_matrix = pickle.load(f_in)