https://colab.research.google.com/drive/1y8lGFPTdfmfgPKBHfdEljcREJ85YtLAe#scrollTo=OaZihPuhpKO8


In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter
import matplotlib.pyplot as plt

In [4]:
file_id = '1w_4xTCDYcY-lu3p1zB3pT2EjM7CuXIaa'
url = f'https://drive.google.com/uc?id={file_id}'

In [5]:
data = pd.read_csv(url)
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [6]:
data.shape

(43400, 12)

In [7]:
le = LabelEncoder()
values = ['gender', 'ever_married', 'work_type' , 'Residence_type', 'smoking_status']
for col in values:
  data[col] = le.fit_transform(data[col])
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,1,3.0,0,0,0,4,0,95.12,18.0,3,0
1,30468,1,58.0,1,0,1,2,1,87.96,39.2,1,0
2,16523,0,8.0,0,0,0,2,1,110.89,17.6,3,0
3,56543,0,70.0,0,0,1,2,0,69.04,35.9,0,0
4,46136,1,14.0,0,0,0,1,0,161.28,19.1,3,0


In [8]:
from sklearn.impute import SimpleImputer
x = data.drop('stroke', axis=1)
imputer = SimpleImputer(strategy='mean')
x = pd.DataFrame(imputer.fit_transform(x), columns=x.columns, index=x.index) #convert back to DataFrame
y = data['stroke']

In [9]:
smote = SMOTE(random_state=42)
x_balanced, y_balanced = smote.fit_resample(x, y)
print("Balanced Data Shape:", x_balanced.shape)
print("Class Distribution After SMOTE:", Counter(y_balanced))

Balanced Data Shape: (85234, 11)
Class Distribution After SMOTE: Counter({0: 42617, 1: 42617})


In [10]:
x_train, x_test, y_train, y_test = train_test_split(x_balanced, y_balanced, test_size=0.33, random_state=42)

In [11]:
print("Train Set Shape:", x_train.shape)
print("Test Set Shape:", x_test.shape)

Train Set Shape: (57106, 11)
Test Set Shape: (28128, 11)


In [12]:
def knn(x_train, y_train, x_test, k=9):

    x_train_np = x_train.values
    x_test_np = x_test.values
    y_train_np = y_train.values

    max_labels = []

    for test_row in x_test_np:
        distances = np.linalg.norm(x_train_np - test_row, axis=1)
        k_indices = np.argsort(distances)[:k]
        k_nearest_labels = y_train_np[k_indices]

        most_common_label = Counter(k_nearest_labels).most_common(1)[0][0]
        max_labels.append(most_common_label)

    return np.array(max_labels)


In [13]:
prediction = knn(x_train, y_train, x_test)

In [15]:
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.94      0.77      0.85     14074
           1       0.81      0.95      0.87     14054

    accuracy                           0.86     28128
   macro avg       0.87      0.86      0.86     28128
weighted avg       0.87      0.86      0.86     28128



In [16]:
accuracy = accuracy_score(y_test, prediction)
accuracy

0.860494880546075

In [34]:
from sklearn.neighbors import KNeighborsClassifier

metrics = ['euclidean', 'manhattan']
best_k = None
best_score = 0
best_metric = None

for metric in metrics:
    for k in range(1,15):
        knn_model = KNeighborsClassifier(n_neighbors=k, metric=metric)
        knn_model.fit(x_train, y_train)
        prediction = knn_model.predict(x_test)
        score = accuracy_score(y_test, prediction)

        if score > best_score:
            best_score = score
            best_k = k
            best_metric = metric

        print(f"Metric: {metric}, k: {k}, Accuracy: {score:.4f}")

Metric: euclidean, k: 1, Accuracy: 0.9824
Metric: euclidean, k: 2, Accuracy: 0.9817
Metric: euclidean, k: 3, Accuracy: 0.9688
Metric: euclidean, k: 4, Accuracy: 0.9695
Metric: euclidean, k: 5, Accuracy: 0.9585
Metric: euclidean, k: 6, Accuracy: 0.9599
Metric: euclidean, k: 7, Accuracy: 0.9487
Metric: euclidean, k: 8, Accuracy: 0.9512
Metric: euclidean, k: 9, Accuracy: 0.9423
Metric: euclidean, k: 10, Accuracy: 0.9453
Metric: euclidean, k: 11, Accuracy: 0.9373
Metric: euclidean, k: 12, Accuracy: 0.9396
Metric: euclidean, k: 13, Accuracy: 0.9334
Metric: euclidean, k: 14, Accuracy: 0.9346
Metric: manhattan, k: 1, Accuracy: 0.9856
Metric: manhattan, k: 2, Accuracy: 0.9863
Metric: manhattan, k: 3, Accuracy: 0.9771
Metric: manhattan, k: 4, Accuracy: 0.9785
Metric: manhattan, k: 5, Accuracy: 0.9709
Metric: manhattan, k: 6, Accuracy: 0.9728
Metric: manhattan, k: 7, Accuracy: 0.9667
Metric: manhattan, k: 8, Accuracy: 0.9693
Metric: manhattan, k: 9, Accuracy: 0.9639
Metric: manhattan, k: 10, Acc