## Install libraries

In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import time
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot') 
from matplotlib.pyplot import figure
%matplotlib inline 
matplotlib.rcParams['figure.figsize'] = (12,8)
import seaborn as sns

# Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Training
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Exporting
import pickle

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
%cd gdrive

## Load dataset

In [None]:
%cd MyDrive

In [None]:
%cd HeartDisease/

In [None]:
# Read the CSV file's data
df = pd.read_csv('heart_disease_dataset.csv')

## EDA

In [None]:
# Quick look at the data
df.head()

In [None]:
# Show the dataset's shape
df.shape

In [None]:
# Show the non null count and datatypes of the variables
df.info()

In [None]:
df.describe()

In [None]:
df.hist(bins=20, figsize=(20,15))
plt.show()

In [None]:
# Look better at the target variable
df.target.value_counts()

In [None]:
# Visualise the target variable
sns.countplot(x='target', data=df)

In [None]:
# Calculate the number of samples in each class
num_class_0 = np.sum(df.values == 0)
num_class_1 = np.sum(df.values == 1)


# Calculate the class imbalance ratio
class_imbalance_ratio = num_class_0 / (num_class_0 + num_class_1)
print(f"Class imbalance ratio: {class_imbalance_ratio}")

In [None]:
# Check for duplicate values
df.duplicated().sum()

In [None]:
# Each duplicate
duplicate_rows = df[df.duplicated(keep=False)]

# Print the duplicate rows
print(duplicate_rows)

In [None]:
# Correlation heatmap
sns.set(rc={'figure.figsize':(20,12)})
sns.heatmap(df.corr().abs(),annot=True)

In [None]:
sns.set_context('notebook',font_scale = 2.3)
df.drop('target', axis=1).corrwith(df.target).plot(kind='bar', grid=True, figsize=(20, 10), 
                                                        title="Correlation with the target feature")
plt.tight_layout()

In [None]:
df.columns

In [None]:
# Visualise the outliers of the numerical columns
plt.figure(figsize=(20, 10), facecolor='w')

sns.boxplot(data=df[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']])

plt.show()

In [None]:
# find the extreme value of chol column
df['chol'].max()

In [None]:
# find the extreme value of trestbps column
print("max: ", df['trestbps'].max())

print("min: ", df['trestbps'].min())

## Data Cleaning and Preprocessing

### Verify that there is no null value

In [None]:
# Prin number of missing values in each column
print(df.isnull().sum())

### Fixing the outliers

In [None]:
# Show the columns which need outlier treatment
# Exclude the target var
dfIQR=df[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']]


dfIQR.describe()

In [None]:
Q1 = dfIQR.quantile(0.25) # Calculate 1st quantile
Q3 = dfIQR.quantile(0.75) # Calculate 3rd quantile


IQR = Q3-Q1 

In [None]:
# Check for values that fall below the lower bound and above the upper bound 
df = df[~((df < (Q1-1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

In [None]:
# Visualise the outliers of the numerical columns again
plt.figure(figsize=(20,10), facecolor='w')
sns.boxplot(data=df[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']])
plt.show()

In [None]:
# Check for duplicate values
df.duplicated().sum()

### Feature Selection

In [None]:
# Separate independent and dependent variables
X = df.iloc[:, :-1]  # independent columns
y = df.iloc[:, -1]    # target column


# We have 13 features
bestfeatures = SelectKBest(score_func=chi2, k=10) # Apply SelectKBest class to extract top 10 best features
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)


# Concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # Naming the dataframe columns


print(featureScores.nlargest(9, 'Score'))  # Print 10 most important features

In [None]:
# Drop the least important features
drop_columns = ['restecg', 'trestbps', 'slope']
df = df.drop(drop_columns, axis=1) # axis=1 as we want to drop columns, not rows

## Split the dataset to 80%-20%

In [None]:
# Split the dataset
x = df.iloc[:, :-1] # Independent variables
y = df.iloc[:, -1] # Dependent variable


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .20, random_state = 0)

### Check balance class

In [None]:
# View the percentage of each class

print('\nBalance of positive and negative classes (%):')
y_train.value_counts(normalize=True)*100

## Implementation and Evaluation of the algorithms

In [None]:
def fit_and_evaluate_the_model(model, x_train, y_train, x_test, y_test):
    # Fit the training set to the model
    model.fit(x_train, y_train)
    
    # The model predicts the labels of the features in the test set
    y_pred = model.predict(x_test)
    
    
    print(classification_report(y_test, y_pred))
    print("Recall: ", recall_score(y_test, y_pred))
    
    
    # Calculate the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    ax = sns.heatmap(conf_matrix, annot=True, fmt = "g")
    
    
    # Plot the confusion matrix
    ax.set_title('Confusion Matrix\n\n')
    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')
    
    
    # Display the visualisation of the Confusion Matrix
    plt.rcParams["figure.figsize"] = (5, 3)
    plt.show()

In [None]:
def find_best_parameters(model, param_grid, x_train, y_train, x_test, y_test):
    
    # Create a grid search object
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring="recall")

    # Fit the grid search object to the training data
    grid_search.fit(x_train, y_train)

    # Print the best parameters
    print("Best Parameters:", grid_search.best_params_)
    
    # Get the result in a tabular format
    result = pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),
           pd.DataFrame(grid_search.cv_results_["mean_test_score"], columns=["Recall"])],axis=1)
    
    
    return result.sort_values(by="Recall", ascending=False)

## Logistic Regression

In [None]:
lr = LogisticRegression()


fit_and_evaluate_the_model(lr, x_train, y_train, x_test, y_test)

## Tuned Logistic Regression using GridSearchCV

In [None]:
lr0 = LogisticRegression()

# Create a parameter grid
param_grid = {'C': [100, 10, 1.0, 0.1, 0.01], ## Regularization --- high C => stronger regularization
              'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear']
             }


best_params = find_best_parameters(lr0, param_grid, x_train, y_train, x_test, y_test)
best_params

In [None]:
# Get first key-value pair
first_pair = next(iter((best_params.loc[:, ~best_params.columns.isin(['Recall'])].to_dict(orient='index').items())) )
first_pair[1]

In [None]:
lr1 = LogisticRegression(**first_pair[1])


fit_and_evaluate_the_model(lr1, x_train, y_train, x_test, y_test)

## Decision Tree

In [None]:
decision_tree = DecisionTreeClassifier()


fit_and_evaluate_the_model(decision_tree, x_train, y_train, x_test, y_test)

## Tuned Decision Tree using GridSearchCV

In [None]:
decision_tree0 = DecisionTreeClassifier()


# Create a parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}


best_params = find_best_parameters(decision_tree0, param_grid, x_train, y_train, x_test, y_test)
best_params

In [None]:
# Exclude Recall column
# Replace nan value with None
# Convert dataframe to a dictionary


# Get first key-value pair
first_pair = next(iter((best_params.loc[:, ~best_params.columns.isin(['Recall'])].replace({np.nan: None}).to_dict(orient='index').items())) )
first_pair[1]

In [None]:
decision_tree1 = DecisionTreeClassifier(**first_pair[1])


fit_and_evaluate_the_model(decision_tree1, x_train, y_train, x_test, y_test)

## Random Forest

In [None]:
rfc = RandomForestClassifier()


fit_and_evaluate_the_model(rfc, x_train, y_train, x_test, y_test)

## Tuned Random Forest using GridSearchCV

In [None]:
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}


rfc0 = RandomForestClassifier()


best_params = find_best_parameters(rfc0, param_grid, x_train, y_train, x_test, y_test)
best_params

In [None]:
# Exclude the Recall column
# Convert the dataframe to a dictionary


# Get first key-value pair
first_pair = next(iter((best_params.loc[:, ~best_params.columns.isin(['Recall'])].to_dict(orient='index').items())) )
first_pair[1]

In [None]:
rfc1 = RandomForestClassifier(**first_pair[1])


fit_and_evaluate_the_model(rfc1, x_train, y_train, x_test, y_test)

## XGBoost

In [None]:
xgb = XGBClassifier()


fit_and_evaluate_the_model(xgb, x_train, y_train, x_test, y_test)

## Tuned XGBoost using GridSearchCV

In [None]:
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1],
    'gamma': [0, 1, 2]
}


xgb0 = XGBClassifier()


best_params = find_best_parameters(xgb0, param_grid, x_train, y_train, x_test, y_test)
best_params

In [None]:
# Exclude the Recall column
# Convert the dataframe to a dictionary


# Get first key-value pair
first_pair = next(iter((best_params.loc[:, ~best_params.columns.isin(['Recall'])].to_dict(orient='index').items())) )
first_pair[1]

In [None]:
xgb1 = XGBClassifier(**first_pair[1])


fit_and_evaluate_the_model(xgb1, x_train, y_train, x_test, y_test)

## Cross Validation to check model stability

In [None]:
def cross_validation(model, x_train, y_train):
    scores = cross_val_score(model, x_train, y_train, cv=5)
    
    print("Cross Validation Scores:\n")

    print("Mean score: ", scores.mean()) # Print mean score
    print("Standard deviation: ", scores.std()) # Print Standard Deviation
    
    
    return scores

In [None]:
# cross validate logistic regression
lr_scores = cross_validation(decision_tree1, x_train, y_train)

In [None]:
# cross validate decision tree model
dtree_scores = cross_validation(decision_tree0, x_train, y_train)

In [None]:
# cross validate random forest model
rf_scores = cross_validation(rfc1, x_train, y_train)

In [None]:
# cross validate xgboost model
xgboost_scores = cross_validation(xgb, x_train, y_train)

## Plot Cross-Validation Scores using Box Plot

In [None]:
results = []

results.append(lr_scores)
results.append(dtree_scores)
results.append(xgboost_scores)


fig = plt.figure()
fig.suptitle('Algorithm Comparison on Cross Validation Scores', size=20)
plt.boxplot(results)
plt.xticks([1, 2, 3], ['Logistic Regression', 'Decision Tree', 'XGBoost'], Fontsize=8)

## Export the best performing model

In [None]:
with open('best_model', 'wb') as file:
  pickle.dump(xgb, file)