In [2]:
import numpy as np
import pandas as pd

In [17]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X.astype(float)
        self.y_train = y.astype(int)

    def predict(self, X):
        X = X.astype(float)
        predictions = []
        for x in X:
            distances = self.compute_distance(x, self.X_train)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            prediction = np.bincount(k_nearest_labels).argmax()
            predictions.append(prediction)
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        X1 = np.atleast_2d(X1).astype(float)
        X2 = np.atleast_2d(X2).astype(float)

        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X2 - X1)**2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X2 - X1), axis=1)
        else:
            raise ValueError("Unsupported distance metric")

        return distances.flatten()

In [13]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    combined_data = pd.concat([train_data, test_data], axis=0)
    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'])

    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    combined_data[numerical_features] = (combined_data[numerical_features] - combined_data[numerical_features].mean()) / combined_data[numerical_features].std()

    X = combined_data.drop(['Exited', 'CustomerId', 'Surname', 'id'], axis=1)
    y = combined_data['Exited']

    X_train = X[:len(train_data)].astype(float)
    y_train = y[:len(train_data)].astype(int)
    X_test = X[len(train_data):].astype(float)

    return X_train.values, y_train.values, X_test.values

In [5]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    fold_size = len(X) // n_splits
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    scores = []
    for i in range(n_splits):
        test_indices = indices[i*fold_size:(i+1)*fold_size]
        train_indices = np.concatenate([indices[:i*fold_size], indices[(i+1)*fold_size:]])

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        scores.append(compute_auc(y_test, y_pred))

    return np.array(scores)

In [21]:
# Define AUC computation function
def compute_auc(y_true, y_pred):
    n_pos = np.sum(y_true)
    n_neg = len(y_true) - n_pos

    pos_ranks = np.sum(np.argsort(y_pred)[y_true == 1])
    auc = (pos_ranks - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)

    return auc

# Load and preprocess data
X, y, X_test = preprocess_data('/content/train.csv', '/content/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)
print("Mean AUC:", np.mean(cv_scores))

k_values = [3, 5, 7, 9, 11, 13, 15, 17, 19]
distance_metrics = ['euclidean', 'manhattan']

best_k = 0
best_metric = ''
best_score = 0

for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        scores = cross_validate(X, y, knn)
        mean_score = np.mean(scores)

        if mean_score > best_score:
            best_score = mean_score
            best_k = k
            best_metric = metric

print(f"Best hyperparameters: k={best_k}, distance_metric={best_metric}")
print(f"Best mean AUC: {best_score}")

# Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('/content/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)
print("Successfuly saved to submissions.csv")

Cross-validation scores: [0.50795405 0.4929044  0.49640002 0.49714863 0.5057462 ]
Mean AUC: 0.500030657979862
Best hyperparameters: k=3, distance_metric=euclidean
Best mean AUC: 0.5074551794160483
Successfuly saved to submissions.csv
