In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [None]:
# Load the source CSV and report its dimensions and column summary.
md = pd.read_csv('medical_clean.csv')
print(md.shape)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
md.info()

In [None]:
# Show the first five records transposed, so each column reads as a row.
md.iloc[:5].transpose()

In [None]:
# Count fully duplicated rows and missing cells across the whole frame.
duplicate_row_count = md.duplicated().sum()
missing_cell_count = md.isna().sum().sum()
print(duplicate_row_count)
print(missing_cell_count)

In [None]:
# Recode every 'No'/'Yes' cell in the frame as 0/1.
# replace() has historically downcast the affected object columns to integers
# implicitly; pandas 2.2 deprecates that silent downcast. infer_objects() makes
# the conversion explicit so the recoded columns are numeric either way.
md = md.replace({'No': 0, 'Yes': 1}).infer_objects()

In [None]:
# Summary statistics for the Age column.
md['Age'].describe()

In [None]:
# Columns carried forward for modelling; .copy() decouples mdreg from md so
# later edits to mdreg do not touch the loaded frame.
model_columns = [
    'Age', 'Gender', 'HighBlood', 'Stroke', 'Overweight',
    'Diabetes', 'Hyperlipidemia', 'BackPain', 'Anxiety', 'Asthma',
]
mdreg = md.loc[:, model_columns].copy()

In [None]:
# Frequency of each Gender value, ordered by label.
mdreg['Gender'].value_counts().sort_index()

In [None]:
# Display the (rows, columns) dimensions of the modelling subset.
mdreg.shape

In [None]:
# Frequency of each Stroke value, ordered by label.
mdreg['Stroke'].value_counts().sort_index()

In [None]:
# Print the column dtypes and non-null counts of the modelling subset.
mdreg.info()

In [None]:
# One-hot encode Gender and swap the original column for 0/1 indicators.
mdgender = pd.get_dummies(data=mdreg['Gender'])
mdreg.drop(columns='Gender', inplace=True)
# Each indicator is inserted at position 2, so the last one inserted ends up
# first: gender_nonbinary, gender_female, gender_male.
for gender_label in ('Male', 'Female', 'Nonbinary'):
    indicator = mdgender[gender_label].astype(int)
    mdreg.insert(2, 'gender_' + gender_label.lower(), indicator)

In [None]:
# Write the prepared frame to disk without the index column.
mdreg.to_csv(path_or_buf='medical_209T1_data.csv', index=False)

In [None]:
# Target: the Stroke column.
y = mdreg.Stroke

# Predictors. No constant/intercept column is added: StandardScaler maps a
# constant column to all zeros, so it could never influence the distances the
# KNN classifier works with and would only add a dead column to the outputs.
feature_columns = [
    'Age', 'gender_male', 'gender_female', 'gender_nonbinary',
    'HighBlood', 'Overweight', 'Diabetes', 'Hyperlipidemia',
    'BackPain', 'Anxiety', 'Asthma',
]
X = mdreg[feature_columns]

# 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=62)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows so no test-set statistics leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Persist each split as a comma-delimited text file.
split_files = (
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
    ('X_test.csv', X_test),
    ('y_test.csv', y_test),
)
for csv_name, split_values in split_files:
    np.savetxt(csv_name, split_values, delimiter=',')

In [None]:
# Initial model: k-nearest neighbours with k=3, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X_train, y=y_train)

In [None]:
# Class predictions for the held-out test split.
y_pred = knn.predict(X=X_test)

In [None]:
# Share of test rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

In [None]:
# predict_proba returns one column per class; the second column (index 1) is
# used as the score for the ROC AUC.
class_probabilities = knn.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"AUC Score: {auc_score}")

In [None]:
# Candidate neighbourhood sizes and their mean 5-fold cross-validated accuracy.
k_values = list(range(1, 35))
scores = []

# Model selection uses the training split only. Cross-validating on the full
# data set would let the hold-out rows influence the choice of k, and the
# final test-set metrics would then be optimistically biased. X_train was
# already standardised with a scaler fitted on the training rows, so X and
# scaler are deliberately left untouched here.
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, X_train, y_train, cv=5)
    scores.append(np.mean(score))

In [None]:
# Mean cross-validated score for each candidate k.
ax = sns.lineplot(x=k_values, y=scores, marker='*', markersize=15)
ax.set_xlabel('K Values')
ax.set_ylabel('Accuracy Score')

In [None]:
# Pick the k with the highest mean score (the first one on ties) and refit on
# the training split.
best_index = np.asarray(scores).argmax()
best_k = k_values[best_index]
print('Best K:', best_k)
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X=X_train, y=y_train)

In [None]:
# Test-set accuracy of the refitted model.
y_pred = knn.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

In [None]:
# Test-set ROC AUC of the refitted model, scored on the second predict_proba
# column (index 1).
class_probabilities = knn.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"AUC Score: {auc_score}")