In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

In [2]:
models = {
  'Logistic Regression': LogisticRegression(random_state=42),
  'Random Forest': RandomForestClassifier(random_state=42),
  'Decision Tree': DecisionTreeClassifier(random_state=42),
  'Support Vector Machine': SVC(random_state=42)
}

In [3]:
params = {
	'Logistic Regression': {
		'C': [1,5,10,20],
    'max_iter': [1000, 1500]
	},

	'Random Forest': {
		'n_estimators': [100, 200, 300],
		'criterion': ['gini', 'entropy'],
		'max_depth': [None, 10, 20, 30],
	},

	'Decision Tree': {
		'max_depth': [2, 3, 5, 10, 20],
		'criterion': ["gini", "entropy"]
	},

	'Support Vector Machine': {
		'C': [0.1, 1, 10, 100, 1000],
		'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
		'kernel': ['rbf','linear']
	}
}

In [4]:
df = pd.read_csv('../data/cleaned_heart_disease.csv')

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,thal,slope_downsloping,slope_flat,slope_upsloping,num
0,0.708333,1,1,0.481132,0.244292,1,2,0.603053,0,2.3,0.0,6.0,1.0,0.0,0.0,0
1,0.791667,1,4,0.622642,0.365297,0,2,0.282443,1,1.5,3.0,3.0,0.0,1.0,0.0,2
2,0.791667,1,4,0.245283,0.23516,0,2,0.442748,1,2.6,2.0,7.0,0.0,1.0,0.0,1
3,0.166667,1,3,0.339623,0.283105,0,0,0.885496,0,3.5,0.0,3.0,1.0,0.0,0.0,0
4,0.25,0,2,0.339623,0.178082,0,2,0.770992,0,1.4,0.0,3.0,0.0,0.0,1.0,0


In [5]:
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [6]:
results_gridcv = []

def compare_model_gridcv(models, params):
	for i in range(len(models)):
		key = list(params.keys())[i]
		hyperparameter = params[key]
		classifier = GridSearchCV(list(models.values())[i], hyperparameter, cv=5)
		classifier.fit(x_train, y_train)
		model_name = list(models.keys())[i]
		best_score = classifier.best_score_
		best_params = classifier.best_params_
		print(f'Model Selected: {model_name}')
		print(f'Model Best Accuracy: {best_score}')
		print(f'Model Best Parameters: {best_params}')
		print('==========================================================')
		results_gridcv.append([model_name, best_score, best_params])

	return results_gridcv

In [7]:
start_time = time.time()

final_result_gridcv = compare_model_gridcv(models, params)

end_time = time.time()

print(f'Time Taken: {round((end_time - start_time) / 60, 1)} Minute')

Model Selected: Logistic Regression
Model Best Accuracy: 0.528125
Model Best Parameters: {'C': 20, 'max_iter': 1000}
Model Selected: Random Forest
Model Best Accuracy: 0.8453125
Model Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'n_estimators': 200}
Model Selected: Decision Tree
Model Best Accuracy: 0.7046875
Model Best Parameters: {'criterion': 'entropy', 'max_depth': 20}
Model Selected: Support Vector Machine
Model Best Accuracy: 0.8296875
Model Best Parameters: {'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}
Time Taken: 1.7 Minute


In [8]:
results_randomcv = []

def compare_model_randomcv(models, params):
	for i in range(len(models)):
		key = list(params.keys())[i]
		hyperparameter = params[key]
		classifier = RandomizedSearchCV(list(models.values())[i], hyperparameter, cv=5)
		classifier.fit(x_train, y_train)
		model_name = list(models.keys())[i]
		best_score = classifier.best_score_
		best_params = classifier.best_params_
		print(f'Model Selected: {model_name}')
		print(f'Model Best Accuracy: {best_score}')
		print(f'Model Best Parameters: {best_params}')
		print('==========================================================')
		results_randomcv.append([model_name, best_score, best_params])

	return results_randomcv

In [9]:
start_time = time.time()

final_result_gridcv = compare_model_gridcv(models, params)

end_time = time.time()

print(f'Time Taken: {round((end_time - start_time) / 60, 1)} Minute')

Model Selected: Logistic Regression
Model Best Accuracy: 0.528125
Model Best Parameters: {'C': 20, 'max_iter': 1000}
Model Selected: Random Forest
Model Best Accuracy: 0.8453125
Model Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'n_estimators': 200}
Model Selected: Decision Tree
Model Best Accuracy: 0.7046875
Model Best Parameters: {'criterion': 'entropy', 'max_depth': 20}
Model Selected: Support Vector Machine
Model Best Accuracy: 0.8296875
Model Best Parameters: {'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}
Time Taken: 1.6 Minute


#### **Saving Model (pkl format)**

In [10]:
import joblib
from sklearn.metrics import r2_score

##### **Since Random Forest Classifier Was The Best Model, I Will Used It**

In [12]:
model = RandomForestClassifier(criterion='entropy', max_depth=None, n_estimators=200)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(f'Accuracy: {r2_score(y_test, y_pred)}')

joblib.dump(model, "../models/final_model.pkl")

Accuracy: 0.8255606308389255


['../models/final_model.pkl']