In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [7]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for x in X:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            nearest_neighbors = np.argsort(distances)[:self.k]
            nearest_labels = self.y_train[nearest_neighbors].astype(int)
            prediction = np.bincount(nearest_labels).argmax()
            predictions.append(prediction)
        return np.array(predictions)


    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(X2 - X1, axis=1) 

In [8]:
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # separate target from features
    y = train_data['Exited'].values
    X = train_data.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])
    X_test = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    # preprocessing
    categorical_cols = ['Geography', 'Gender']
    numerical_cols = [col for col in X.columns if col not in categorical_cols]

    # preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(), categorical_cols)
        ]
    )

    X = preprocessor.fit_transform(X)
    X_test = preprocessor.transform(X_test)

    return X, y, X_test

In [9]:
def cross_validate(X, y, knn, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        knn.fit(X_train, y_train)
        y_val_pred = knn.predict(X_val)
        auc = roc_auc_score(y_val, y_val_pred)
        auc_scores.append(auc)

    return auc_scores

In [10]:
X, y, X_test = preprocess_data('train.csv', 'test.csv')

knn = KNN(k=5, distance_metric='euclidean')

cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)
print("Average AUC score:", np.mean(cv_scores))

knn.fit(X, y)
test_predictions = knn.predict(X_test)

pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [np.float64(0.7712108355441939), np.float64(0.7804350772567091), np.float64(0.7771226641640085), np.float64(0.7642299729256252), np.float64(0.7718441430848281)]
Average AUC score: 0.772968538595073
