In [1]:
# Import pandas to load and manipulate the dataset
import pandas as pd

# Import train_test_split to split the data into training and test sets
from sklearn.model_selection import train_test_split

# Import KNeighborsClassifier to create and train the k-NN classification model
from sklearn.neighbors import KNeighborsClassifier

# Import metrics to evaluate the model's performance (accuracy, precision, recall, f1, confusion matrix, classification report)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Import StandardScaler to scale features (k-NN is sensitive to feature scales)
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the heart.csv dataset from its local path into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
heart_csv_path = r"C:\Users\ADMIN\Documents\heart.csv"
data = pd.read_csv(heart_csv_path)

# Preview the first rows to sanity-check the columns that were loaded.
data.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
# Predictor columns: everything in the preview except the unnamed leading
# column ("Unnamed: 0") and the label column ("target").
feature_columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]

# X is the feature matrix (13 columns); y is the class label (0 or 1).
X = data[feature_columns]
y = data["target"]

In [5]:
# Hold out 20% of the rows as a test set and train on the remaining 80%.
# random_state=42 makes the shuffle reproducible; stratify=y keeps the class
# proportions similar in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# k-NN is distance-based, so the features are standardized first.
# The scaler learns its mean/std from the training rows only, so no
# information about the test rows leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same training-derived scaling to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# k-nearest-neighbours settings: each prediction is a vote among the 5
# closest training points, with closeness measured by Euclidean distance.
knn_settings = {"n_neighbors": 5, "metric": 'euclidean'}
model = KNeighborsClassifier(**knn_settings)

# k-NN has no traditional training phase: fitting stores the scaled training
# data so distances can be computed at prediction time.
model.fit(X_train_scaled, y_train)

In [7]:
# Predicted class label (0 or 1) for every scaled test row, decided by
# majority vote among the 5 nearest training neighbours.
y_pred = model.predict(X_test_scaled)

# predict_proba returns one column per class; keep column 1, i.e. the share
# of neighbours that voted for target=1.
test_class_probabilities = model.predict_proba(X_test_scaled)
y_pred_proba = test_class_probabilities[:, 1]


In [11]:
# Headline metrics on the held-out test set. precision/recall/F1 use the
# scorers' default positive class, which is label 1.
accuracy = accuracy_score(y_test, y_pred)    # share of correct predictions
precision = precision_score(y_test, y_pred)  # of predicted 1s, share that are truly 1
recall = recall_score(y_test, y_pred)        # of actual 1s, share predicted as 1
f1 = f1_score(y_test, y_pred)                # harmonic mean of precision and recall

# Print each metric on its own line as "<label> <value>".
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
):
    print(metric_label, metric_value)

# Confusion matrix: rows are the actual classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix (rows: actual, columns: predicted):", conf_matrix, sep="\n")

# Per-class precision/recall/F1/support plus macro (unweighted) and
# weighted (by support) averages.
# NOTE(review): the display names map class 0 to "have disease" and class 1
# to "dont have disease" — confirm this matches the dataset's target encoding.
class_display_names = ['have disease (0)', 'dont have disease (1)']
report_text = classification_report(y_test, y_pred, target_names=class_display_names)
print("\nClassification Report:", report_text, sep="\n")

Accuracy: 0.8634146341463415
Precision: 0.8737864077669902
Recall: 0.8571428571428571
F1-Score: 0.8653846153846154

Confusion Matrix (rows: actual, columns: predicted):
[[87 13]
 [15 90]]

Classification Report:
                       precision    recall  f1-score   support

     have disease (0)       0.85      0.87      0.86       100
dont have disease (1)       0.87      0.86      0.87       105

             accuracy                           0.86       205
            macro avg       0.86      0.86      0.86       205
         weighted avg       0.86      0.86      0.86       205



In [13]:
# Classify a single new patient record.
# The values must follow the same column order as X:
# age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal
new_patient = [[60,1,0,145,203,1,1,125,0,2.6,0,0,2]]

# The scaler was fitted on a DataFrame, so it recorded the feature names.
# Passing a bare list triggers scikit-learn's "X does not have valid feature
# names" warning and skips name checking; wrapping the record in a DataFrame
# with X's columns keeps the input consistent with what the scaler was fitted on.
new_patient_frame = pd.DataFrame(new_patient, columns=X.columns)

# Scale with the statistics learned from the training data (no refitting).
new_patient_scaled = scaler.transform(new_patient_frame)

# Predicted class label (0 or 1) for the new patient. The variable name
# `predicted_pass` is kept for compatibility with the rest of the notebook.
predicted_pass = model.predict(new_patient_scaled)
predicted_pass




array([0])

In [14]:
# Class-membership estimates for the new patient; column 1 is the share of
# the nearest neighbours that voted for target=1.
new_patient_probabilities = model.predict_proba(new_patient_scaled)
predicted_proba = new_patient_probabilities[:, 1]
predicted_proba

array([0.])