In [1]:
import numpy as np
import pandas as pd

class KNN:
    """Brute-force k-nearest-neighbours scorer for numeric features.

    ``predict`` returns, for every query row, the mean label of its nearest
    training rows. With 0/1 labels this is the fraction of positive
    neighbours, usable directly as a ranking score (e.g. for ROC AUC).

    Parameters
    ----------
    k : int, default 3
        Number of neighbours consulted per query row. Must be >= 1.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank the training rows.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer or ``distance_metric`` is unknown.
    """

    SUPPORTED_METRICS = ('euclidean', 'manhattan')

    def __init__(self, k=3, distance_metric='euclidean'):
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        if distance_metric not in self.SUPPORTED_METRICS:
            raise ValueError(
                f"Unsupported distance_metric {distance_metric!r}; "
                f"expected one of {self.SUPPORTED_METRICS}"
            )
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Memorise the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            DataFrame, ndarray or nested list; stored as a float ndarray.
        y : array-like of shape (n_samples,)
            Series, ndarray or list of numeric labels.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or disagrees with ``y`` in length.
        """
        # Casting to float keeps the distance ufuncs off object-dtype arrays
        # (np.sqrt has no object loop and would raise).
        features = np.asarray(X, dtype=float)
        labels = np.asarray(y).ravel()
        if features.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {features.shape}")
        if features.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                f"X and y disagree on sample count: "
                f"{features.shape[0]} vs {labels.shape[0]}"
            )
        self.X_train = features
        self.y_train = labels

    def compute_distance(self, X1, X2):
        """Return the distance from ``X1`` to every row of ``X2``.

        ``X1`` is broadcast against the 2-D array ``X2`` and the result is
        reduced over axis 1, giving one distance per row of ``X2``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric (it may have
            been reassigned after construction).
        """
        diff = X1 - X2
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        # Falling through would return None, which np.argsort silently turns
        # into index 0 -- fail loudly instead.
        raise ValueError(
            f"Unsupported distance_metric {self.distance_metric!r}; "
            f"expected one of {self.SUPPORTED_METRICS}"
        )

    def predict(self, X):
        """Score each row of ``X`` by the mean label of its nearest neighbours.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Sum of the neighbours' labels divided by the number of neighbours
            actually used (``k``, or the training-set size when that is
            smaller).
        """
        queries = np.asarray(X, dtype=float)
        # Never divide by more neighbours than exist in the training set.
        n_neighbors = min(self.k, len(self.y_train))
        scores = np.empty(len(queries), dtype=float)
        for row, sample in enumerate(queries):
            distances = self.compute_distance(sample, self.X_train)
            nearest = np.argsort(distances)[:n_neighbors]
            scores[row] = np.sum(self.y_train[nearest]) / n_neighbors
        return scores

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import stats

# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into model-ready float frames.

    Pipeline:
      1. Drop the identifier columns ('CustomerId', 'Surname', 'id').
      2. Mark the binary flags 'HasCrCard' and 'IsActiveMember' as
         non-numeric so they are neither z-scored nor standardised.
      3. Select the numeric columns *before* one-hot encoding, so the
         selection does not depend on the dtype pandas gives dummy columns.
      4. One-hot encode 'Geography' and 'Gender' (first level dropped on the
         training frame) and align the test columns to the training columns.
      5. Drop training rows with |z| > 3 in any selected numeric column.
      6. Standardise the numeric columns with statistics fitted on the
         remaining training rows only.

    Parameters
    ----------
    train_path : str or path-like
        CSV containing the feature columns plus the 'Exited' target.
    test_path : str or path-like
        CSV containing the same feature columns.

    Returns
    -------
    X : pandas.DataFrame
        Float training features (outlier rows removed).
    y : pandas.Series
        'Exited' values for the rows of ``X``.
    X_test : pandas.DataFrame
        Float test features with the same columns, in the same order, as
        ``X``. No test rows are dropped or reordered.
    """
    id_columns = ['CustomerId', 'Surname', 'id']
    flag_columns = ['HasCrCard', 'IsActiveMember']
    categorical_columns = ['Geography', 'Gender']
    target = 'Exited'

    flags_as_object = {column: 'object' for column in flag_columns}
    train_data = pd.read_csv(train_path).drop(columns=id_columns).astype(flags_as_object)
    test_data = pd.read_csv(test_path).drop(columns=id_columns).astype(flags_as_object)

    numerical_features = [
        column
        for column in train_data.select_dtypes(include=[np.number]).columns
        if column != target
    ]

    columns_before_encoding = set(train_data.columns)
    train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)
    dummy_columns = [c for c in train_data.columns if c not in columns_before_encoding]
    feature_columns = [c for c in train_data.columns if c != target]

    # Encode every test level, then keep exactly the training columns: this
    # drops the training baseline level (and any level unseen in training)
    # rather than whichever level happens to sort first in the test file.
    test_data = pd.get_dummies(test_data, columns=categorical_columns)
    for column in dummy_columns:
        if column not in test_data.columns:
            # Level present in training but absent from the test file.
            test_data[column] = 0
    # Plain column selection raises KeyError if a real feature is missing.
    test_data = test_data[feature_columns].copy()

    def detect_outliers(df, features, threshold=3):
        """Boolean mask of rows whose |z-score| exceeds ``threshold`` in any feature."""
        if not features:
            return np.zeros(len(df), dtype=bool)
        z_scores = np.abs(stats.zscore(df[features].to_numpy(dtype=float), axis=0))
        return (z_scores > threshold).any(axis=1)

    outliers = detect_outliers(train_data, numerical_features)
    # Copy so the scaled values are written to an independent frame, not to a
    # slice of the unfiltered one.
    train_data = train_data.loc[~outliers].copy()

    if numerical_features:
        scaler = StandardScaler()
        train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])
        test_data[numerical_features] = scaler.transform(test_data[numerical_features])

    X = train_data.drop(columns=target).astype('float')
    y = train_data[target]
    # Cast the test frame as well; it otherwise keeps object/bool columns.
    X_test = test_data.astype('float')

    return X, y, X_test

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Evaluate ``knn`` with shuffled, stratified k-fold cross-validation.

    For each fold the model is refit on the training part and scored on the
    held-out part with ROC AUC. The mean score is printed; the per-fold
    scores are returned.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc`` positional indexing.
    knn : object exposing ``fit(X, y)`` and ``predict(X)``.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    list of float
        One ROC AUC value per fold, in fold order.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def fold_auc(fit_rows, holdout_rows):
        # Refit on this fold's training rows, then score the held-out rows.
        knn.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        holdout_scores = knn.predict(X.iloc[holdout_rows])
        return roc_auc_score(y.iloc[holdout_rows], holdout_scores)

    scores = [fold_auc(fit_rows, holdout_rows) for fit_rows, holdout_rows in splitter.split(X, y)]

    mean_score = np.mean(scores)
    print(f"Mean ROC AUC: {mean_score}")
    return scores

In [4]:
# Load and preprocess data.
# preprocess_data returns the training features, the 'Exited' target and the
# test features, in that order.
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create the baseline model to evaluate: k=20 neighbours, Euclidean distance.
knn = KNN(k=20, distance_metric='euclidean')

# Perform cross-validation. cross_validate prints the mean ROC AUC itself and
# returns the list of per-fold scores (n_splits defaults to 5).
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

Mean ROC AUC: 0.9140720881211382
Cross-validation scores: [0.9047709484634938, 0.9128112769566723, 0.9144948179111314, 0.9209717132386623, 0.9173116840357307]


In [5]:
# Hyperparameter tuning: exhaustive search over k and the distance metric,
# ranked by the mean cross-validated ROC AUC.
k_values = list(range(10, 101, 10))
distance_metrics = ['euclidean', 'manhattan']

# Running best. A candidate replaces it only on a strictly higher mean score,
# so ties keep the combination that was tried first.
best_k, best_metric, best_score = 0, None, 0

for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        scores = cross_validate(X, y, knn)
        mean_score = np.mean(scores)

        if not mean_score > best_score:
            continue
        best_k, best_metric, best_score = k, metric, mean_score

print(f"Best hyperparameters: k={best_k}, distance_metric={best_metric}")
print(f"Best mean ROC AUC: {best_score:}")

Mean ROC AUC: 0.9007196349458138
Mean ROC AUC: 0.9006050983467088
Mean ROC AUC: 0.9140720881211382
Mean ROC AUC: 0.9106210450438198
Mean ROC AUC: 0.9179929735449992
Mean ROC AUC: 0.9153873727072653
Mean ROC AUC: 0.9181740216285267
Mean ROC AUC: 0.9168007085247103
Mean ROC AUC: 0.9179001374650386
Mean ROC AUC: 0.9181502644759393
Mean ROC AUC: 0.9178381571911857
Mean ROC AUC: 0.918390008104532
Mean ROC AUC: 0.9175086121734124
Mean ROC AUC: 0.9181492745534632
Mean ROC AUC: 0.9172424256174667
Mean ROC AUC: 0.918240054281136
Mean ROC AUC: 0.9168523947467395
Mean ROC AUC: 0.9175092733548581
Mean ROC AUC: 0.9167060363536421
Mean ROC AUC: 0.9178428727043402
Best hyperparameters: k=60, distance_metric=manhattan
Best mean ROC AUC: 0.918390008104532


In [6]:
# Train on full dataset with optimal hyperparameters and make predictions on test set.
# best_k / best_metric are set by the hyperparameter-tuning cell, so that cell
# must have been run before this one.
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
# KNN.predict returns one score per test row, averaged from the labels of the
# nearest training rows -- a score, not a hard 0/1 class.
test_predictions = knn.predict(X_test)

In [7]:
# Save test predictions.
# preprocess_data drops the 'id' column from the test frame, so the ids are
# re-read from the raw file here. This relies on the test rows keeping their
# original order through preprocessing.
submission = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)