In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from io import StringIO
import clean as c
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [15]:
df = c.df_b
df1 = c.df1
df2 = c.df2

In [16]:
feats = c.feats
feats

['Symptom Severity (1-10)',
 'Sleep Quality (1-10)',
 'Mood Score (1-10)',
 'Stress Level (1-10)',
 'Treatment Progress (1-10)',
 'Med_n',
 'Therapy_n',
 'Emot_n',
 'Diag_n']

# Using KNN as a Classifier

In [17]:
X = df1[feats]
y = df1['Out_b']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [18]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

train_accuracy = knn.score(X_train, y_train)
test_accuracy = knn.score(X_test, y_test)

print(f"KNN Training Accuracy: {train_accuracy:.3f}")
print(f"KNN Testing Accuracy: {test_accuracy:.3f}")

KNN Training Accuracy: 0.753
KNN Testing Accuracy: 0.455


Overfitting. Model performs well on training data but struggles to generalize the test set.

# Tune Hyperparameters

In [19]:
param_grid = {
    'n_neighbors': range(1, 20),  # Test k values from 1 to 20
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(X_scaled, y)

# Best parameters and accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.3f}")

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 18}
Best Cross-Validation Accuracy: 0.544


Best distance metric is Eclidiean, and the optimal k = 10, meaning that the classifier considers the 10 nearest neighbors for predictions.

Cross validation score, the average accuracy across data subset indicates moderate improvement over the untuned model.

In [20]:
best_knn = grid_search.best_estimator_
test_accuracy = best_knn.score(X_test, y_test)
print(f"Best KNN Testing Accuracy: {test_accuracy:.3f}")

Best KNN Testing Accuracy: 0.591


After tuning the arrucary improves, indicates that the model generalizes better with optimal hyperparameters. 

Overall, the tuned model reduced overfitting by increasing k, which smooths the deciion boundary and makes the model less sensitive to noise in the training data. 


## Testing Binary measure 2: no change and deter

In [21]:
X = df2[feats]
y = df2['Outcome']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [22]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

train_accuracy = knn.score(X_train, y_train)
test_accuracy = knn.score(X_test, y_test)

print(f"KNN Training Accuracy: {train_accuracy:.3f}")
print(f"KNN Testing Accuracy: {test_accuracy:.3f}")

KNN Training Accuracy: 0.716
KNN Testing Accuracy: 0.621


In [23]:
param_grid = {
    'n_neighbors': range(1, 20),  # Test k values from 1 to 20
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(X_scaled, y)

# Best parameters and accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.3f}")

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 16}
Best Cross-Validation Accuracy: 0.567


In [24]:
best_knn = grid_search.best_estimator_
test_accuracy = best_knn.score(X_test, y_test)
print(f"Best KNN Testing Accuracy: {test_accuracy:.3f}")

Best KNN Testing Accuracy: 0.545


Overfitting in Initial Model:

The initial model (k=3) overfits slightly but performs better on the test set compared to the tuned model.
Impact of Hyperparameter Tuning:

Increasing 
𝑘
k from 3 to 16 and switching to Manhattan distance reduces the model's variance (overfitting) but also reduces its overall accuracy on the test set.
Cross-Validation vs Test Set Accuracy:

The cross-validation accuracy (56.7%) is consistent with the best test accuracy (54.5%). However, these accuracies indicate the model has room for improvement.
