In [1]:
from google.colab import files
uploaded = files.upload()

Saving cleanheart_dataset.csv to cleanheart_dataset.csv


In [33]:
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
import pandas as pd
import numpy as np

In [15]:
# Load the dataset
df = pd.read_csv("cleanheart_dataset.csv")

In [16]:
# Separate the features and target variable
X = df.drop("HeartDiseaseorAttack", axis=1)
y = df["HeartDiseaseorAttack"]

In [17]:
# Perform oversampling using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X, y)

In [18]:
X_train_res.shape[0] #new row count

412128

In [19]:
y_train_res.shape[0] #new row count

412128

In [20]:
y_train_res.value_counts() #new balance ratio in target variable

0.0    206064
1.0    206064
Name: HeartDiseaseorAttack, dtype: int64

In [21]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_res, y_train_res, test_size=0.3, random_state=42)

In [37]:
# Feature Selection technique : Mutual Info Classifier
selector = SelectKBest(mutual_info_classif, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

In [38]:
#To understand which features were selected

selected_mask = selector.get_support()

# get the names of the selected features
feature_names = np.array(X_train.columns)
selected_features = feature_names[selected_mask]

print("Selected features:", selected_features)

Selected features: ['HighBP' 'HighChol' 'BMI' 'Smoker' 'Fruits' 'GenHlth' 'Sex' 'Age'
 'Education' 'Income']


In [27]:
# Define the KNN classifier
knn = KNeighborsClassifier()

# Define the hyperparameter grid for grid search
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}

# Perform grid search using cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train_selected, y_train)

# Print the best hyperparameters and score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Best hyperparameters: {'n_neighbors': 1}
Best score: 0.837387209821226


In [28]:
# Evaluate the model on the test set
y_pred = grid_search.predict(X_test_selected)

In [29]:
# Print the confusion matrix
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

# Print the accuracy
print('Accuracy:', accuracy_score(y_test, y_pred))

# Print the precision
print('Precision:', precision_score(y_test, y_pred))

# Print the recall
print('Recall:', recall_score(y_test, y_pred))

# Print the F1 score
print('F1 score:', f1_score(y_test, y_pred))

# Print the mean squared error
print('Mean squared error:', mean_squared_error(y_test, y_pred))

Confusion matrix:
 [[48441 13565]
 [ 5273 56360]]
Accuracy: 0.8476370724447787
Precision: 0.8060064354665714
Recall: 0.9144451835867149
F1 score: 0.8568084038979006
Mean squared error: 0.15236292755522124
