In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [2]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric")
        return distances

    def predict(self, X):
        
        y_pred = []
        for x_test in X:
            distances = self.compute_distance(self.X_train, x_test)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            prob = np.mean(k_nearest_labels)
            y_pred.append(prob)
        return np.array(y_pred)


In [3]:
def preprocess_data(train_path, test_path):
    
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    
    train_data = train_data.drop(['CustomerId', 'Surname'], axis=1)
    test_data = test_data.drop(['CustomerId', 'Surname'], axis=1)

    
    test_data['Exited'] = -1  

    
    combined_data = pd.concat([train_data, test_data], ignore_index=True)

    
    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'], drop_first=True)

    
    train_data = combined_data[combined_data['Exited'] != -1]
    test_data = combined_data[combined_data['Exited'] == -1].drop('Exited', axis=1)

   
    y = train_data['Exited'].values
    X = train_data.drop('Exited', axis=1).values
    X_test = test_data.values

    
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_test = scaler.transform(X_test)

    return X, y, X_test


In [4]:
def cross_validate(X, y, knn, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, val_index in skf.split(X, y):
        X_train_cv, X_val_cv = X[train_index], X[val_index]
        y_train_cv, y_val_cv = y[train_index], y[val_index]

        knn.fit(X_train_cv, y_train_cv)
        y_pred = knn.predict(X_val_cv)
        auc = roc_auc_score(y_val_cv, y_pred)
        auc_scores.append(auc)

    return auc_scores


In [5]:
def hyperparameter_tuning(X, y):
    k_values = range(1, 11)  
    distance_metrics = ['euclidean', 'manhattan']
    best_auc = 0
    best_params = {}

    for k in k_values:
        for metric in distance_metrics:
            knn = KNN(k=k, distance_metric=metric)
            cv_scores = cross_validate(X, y, knn, n_splits=5)
            mean_auc = np.mean(cv_scores)
            print(f"k={k}, metric={metric}, AUC={mean_auc:.4f}")
            if mean_auc > best_auc:
                best_auc = mean_auc
                best_params = {'k': k, 'distance_metric': metric}
    print(f"\nBest parameters: k={best_params['k']}, metric={best_params['distance_metric']}, AUC={best_auc:.4f}")
    return best_params


In [6]:
# Main script
if __name__ == "__main__":
    
    X, y, X_test = preprocess_data('train.csv', 'test.csv')

    
    print("Starting hyperparameter tuning...\n")
    best_params = hyperparameter_tuning(X, y)

    
    print("\nTraining the final model...")
    knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'])
    knn.fit(X, y)

    
    print("Making predictions on the test set...")
    test_predictions = knn.predict(X_test)

    
    test_predictions = np.clip(test_predictions, 0, 1)

    
    test_ids = pd.read_csv('test.csv')['id']

    submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
    submission.to_csv('submissions.csv', index=False)
    print("Submission file 'submissions.csv' has been created.")


Starting hyperparameter tuning...

k=1, metric=euclidean, AUC=0.7492
k=1, metric=manhattan, AUC=0.7456
k=2, metric=euclidean, AUC=0.8113
k=2, metric=manhattan, AUC=0.8123
k=3, metric=euclidean, AUC=0.8380
k=3, metric=manhattan, AUC=0.8390
k=4, metric=euclidean, AUC=0.8561
k=4, metric=manhattan, AUC=0.8561
k=5, metric=euclidean, AUC=0.8668
k=5, metric=manhattan, AUC=0.8663
k=6, metric=euclidean, AUC=0.8731
k=6, metric=manhattan, AUC=0.8752
k=7, metric=euclidean, AUC=0.8789
k=7, metric=manhattan, AUC=0.8788
k=8, metric=euclidean, AUC=0.8827
k=8, metric=manhattan, AUC=0.8833
k=9, metric=euclidean, AUC=0.8863
k=9, metric=manhattan, AUC=0.8882
k=10, metric=euclidean, AUC=0.8899
k=10, metric=manhattan, AUC=0.8903

Best parameters: k=10, metric=manhattan, AUC=0.8903

Training the final model...
Making predictions on the test set...
Submission file 'submissions.csv' has been created.
