In [1]:
import pandas as pd

heart_disease = pd.read_csv("../data/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
# Obtain the X and y features
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

In [3]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [4]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate the classification model
model = RandomForestClassifier(n_estimators=100)

# Print the default parameters used by the classification model
model.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y)

model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [6]:
y_pred = model.predict(X_test)

In [7]:
# Model evaluation

print(f'Training score: {model.score(X_train, y_train)}')
print(f'Testing score: {model.score(X_test, y_test)}')

Training score: 1.0
Testing score: 0.7912087912087912


In [8]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score

print(classification_report(y_test, y_pred))
print()
print(confusion_matrix(y_test, y_pred))
print()
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77        41
           1       0.80      0.82      0.81        50

    accuracy                           0.79        91
   macro avg       0.79      0.79      0.79        91
weighted avg       0.79      0.79      0.79        91


[[31 10]
 [ 9 41]]

0.7912087912087912


In [9]:
# Hyperparameter tuning process for Random Forest classifier
# The best parameter seems to be 20
for i in range(1000, 1500, 10):
    model = RandomForestClassifier(n_estimators=i)
    model.fit(X_train, y_train)
    print(f"Estimators is {i}, and model accuracy is {model.score(X_test, y_test)}")

Estimators is 1000, and model accuracy is 0.8131868131868132
Estimators is 1010, and model accuracy is 0.8131868131868132
Estimators is 1020, and model accuracy is 0.8241758241758241
Estimators is 1030, and model accuracy is 0.8021978021978022
Estimators is 1040, and model accuracy is 0.8131868131868132
Estimators is 1050, and model accuracy is 0.8351648351648352
Estimators is 1060, and model accuracy is 0.8241758241758241
Estimators is 1070, and model accuracy is 0.8131868131868132
Estimators is 1080, and model accuracy is 0.8131868131868132
Estimators is 1090, and model accuracy is 0.8241758241758241
Estimators is 1100, and model accuracy is 0.8241758241758241
Estimators is 1110, and model accuracy is 0.8241758241758241
Estimators is 1120, and model accuracy is 0.8131868131868132
Estimators is 1130, and model accuracy is 0.8241758241758241
Estimators is 1140, and model accuracy is 0.8131868131868132
Estimators is 1150, and model accuracy is 0.8021978021978022
Estimators is 1160, and 

In [10]:
improved_model = RandomForestClassifier(n_estimators=20)
improved_model.fit(X_train, y_train)

y_pred = improved_model.predict(X_test)

# Evaluate the improved model
print(f'Training score: {improved_model.score(X_train, y_train)}')
print(f'Testing score: {improved_model.score(X_test, y_test)}')

print(classification_report(y_test, y_pred))
print()
print(confusion_matrix(y_test, y_pred))
print()
print(accuracy_score(y_test, y_pred))

Training score: 1.0
Testing score: 0.7912087912087912
              precision    recall  f1-score   support

           0       0.78      0.76      0.77        41
           1       0.80      0.82      0.81        50

    accuracy                           0.79        91
   macro avg       0.79      0.79      0.79        91
weighted avg       0.79      0.79      0.79        91


[[31 10]
 [ 9 41]]

0.7912087912087912


In [13]:
import pickle

pickle.dump(improved_model, open("save_model.pkl", "wb"))
loaded_model = pickle.load(open("save_model.pkl", "rb"))
improved_model.score(X_test, y_test)

0.7912087912087912