In [1]:
import numpy as np
import pandas as pd

In [2]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        return np.array([self._predict_single(x) for x in X])

    def _predict_single(self, dist):
        distances = self.compute_distances(self.X_train, dist)
        indices = np.argsort(distances)[:self.k]
        labels = self.y_train[indices]

        return np.mean(labels == 1)

    def compute_distances(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")
        return distances

In [3]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing

    # unhelpful columns
    train_data = train_data.drop(columns=['CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['CustomerId', 'Surname'])

    # dummy variables
    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True)

    # reformat to have same columns
    for column in train_data.columns:
        if column not in test_data.columns:
            test_data[column] = 0
    # reorder to match / drop exited
    test_data = test_data[train_data.columns.drop('Exited')]


    y = train_data['Exited']
    X = train_data.drop(columns=['Exited'])
    X_test = test_data 

    # scale by mean normalization
    mean = X.mean()
    std = X.std()

    X = (X - mean) / std
    X_test = (X_test - mean) / std 

    return X.values, y.values, X_test.values

In [4]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    size = len(X) // n_splits
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    ROC_AUC_scores = []

    for i in range(n_splits):
        # split the data into training and validation sets
        X_val = X[i * size:(i + 1) * size]
        y_val = y[i * size:(i + 1) * size]

        X_train = np.concatenate([X[:i * size], X[(i + 1) * size:]])
        y_train = np.concatenate([y[:i * size], y[(i + 1) * size:]])

        # fit and make predictions
        knn.fit(X_train, y_train)
        prediction = knn.predict(X_val)

        # ROC-AUC score 
        score = roc_auc(y_val, prediction)
        ROC_AUC_scores.append(score)

    return np.mean(ROC_AUC_scores)

def roc_auc(real, predicted):
    sorted = np.argsort(predicted)[::-1]
    real = real[sorted]

    # Compute true positive rate and false positive rate
    positives = np.cumsum(real)
    negatives = np.cumsum(1 - real)
    total_pos = np.sum(real)
    total_neg = len(real) - total_pos
    tpr = positives / total_pos
    fpr = negatives / total_neg

    return np.trapz(tpr, fpr)

In [5]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
scores = cross_validate(X, y, knn)

print("Cross-validation scores:", scores)


best_k = 37
best_metric = 'manhattan'

knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

submission_df = pd.DataFrame({
    'id': pd.read_csv('test.csv')['id'],
    'Exited': test_predictions
})
submission_df.to_csv('submissions.csv', index=False)
print('Test predictions saved to submissions.csv')

Cross-validation scores: 0.8678493268229162
Test predictions saved to submissions.csv
