In [1]:
import numpy as np
import pandas as pd

class KNN:
    """Brute-force k-nearest-neighbours scorer.

    ``predict`` does not return hard class labels: for every query row it
    returns the sum of the labels of the nearest training rows divided by
    the number of neighbours used (for 0/1 labels, the fraction of
    neighbours labelled 1).

    Parameters
    ----------
    k : int
        Number of neighbours consulted per query row.
    distance_metric : {'euclidean', 'manhattan'}
        Distance used to rank training rows.
    """

    _SUPPORTED_METRICS = ('euclidean', 'manhattan')

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Memorise the training set.

        ``X`` and ``y`` may be DataFrames/Series, arrays or nested lists.
        The feature matrix is converted to float64 once here so that it does
        not have to be re-copied for every query row in ``predict``.
        """
        self.X_train = np.asarray(X, dtype=np.float64)
        self.y_train = np.asarray(y)

    def compute_distance(self, X1, X2):
        """Return the row-wise distance between ``X1`` and ``X2``.

        The inputs are broadcast against each other and reduced over axis 1,
        so the broadcast difference must be 2-D (e.g. one sample of shape
        ``(d,)`` against a matrix of shape ``(n, d)`` gives ``n`` distances).

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric.
        """
        # asarray avoids a copy when the input is already float64.
        X1 = np.asarray(X1, dtype=np.float64)
        X2 = np.asarray(X2, dtype=np.float64)
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2)**2, axis=1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        # Previously this fell through and returned None, which only failed
        # (or misbehaved) later inside predict.
        raise ValueError(
            f"Unsupported distance_metric {self.distance_metric!r}; "
            f"expected one of {self._SUPPORTED_METRICS}"
        )

    def predict(self, X):
        """Score each row of ``X`` against the fitted training set.

        Parameters
        ----------
        X : DataFrame, array or nested list
            Query rows; a 1-D input is treated as a single row.

        Returns
        -------
        numpy.ndarray
            One float per query row.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the training set is empty.
        """
        X = np.asarray(X, dtype=np.float64)
        if X.ndim == 1:
            X = X.reshape(1, -1)

        # Never ask for more neighbours than exist; dividing by self.k in that
        # case would understate the score.
        n_neighbors = min(self.k, len(self.X_train))
        if n_neighbors < 1:
            raise ValueError("k must be >= 1 and the training set must be non-empty")

        probas = np.empty(len(X), dtype=np.float64)
        for row, sample in enumerate(X):
            distances = self.compute_distance(sample, self.X_train)
            k_indices = np.argsort(distances)[:n_neighbors]
            probas[row] = np.sum(self.y_train[k_indices]) / n_neighbors
        return probas

In [2]:
class CustomStandardScaler:
    """Column-wise standardisation: ``(X - mean) / std``.

    The standard deviation is the sample estimate (``ddof=1``). Columns whose
    standard deviation is exactly zero are divided by 1 instead, so a constant
    feature is mapped to zeros rather than NaN/inf.

    Attributes set by ``fit``:
        mean_  -- per-column mean of the fitted data
        scale_ -- per-column divisor used by ``transform``
    """

    def __init__(self):
        self.mean_ = None
        self.scale_ = None

    def fit(self, X):
        """Learn per-column mean and scale from ``X`` and return ``self``."""
        X = self._check_input(X)
        self.mean_ = np.mean(X, axis=0)
        scale = np.std(X, axis=0, ddof=1)
        # A zero spread would make transform divide by zero; use a neutral
        # divisor for those columns instead.
        self.scale_ = np.where(scale == 0, 1.0, scale)
        return self

    def transform(self, X):
        """Standardise ``X`` with the statistics learned by ``fit``."""
        X = self._check_input(X)
        return (X - self.mean_) / self.scale_

    def fit_transform(self, X):
        """Fit on ``X`` and return its standardised values."""
        return self.fit(X).transform(X)

    def _check_input(self, X):
        """Return the underlying NumPy array for pandas objects, else ``np.array(X)``."""
        if isinstance(X, (pd.DataFrame, pd.Series)):
            return X.values
        return np.array(X)

In [3]:
import pandas as pd
import numpy as np
from scipy import stats

# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into model-ready frames.

    Steps, in order:
      1. Drop the ``CustomerId``, ``Surname`` and ``id`` columns.
      2. Cast ``HasCrCard`` / ``IsActiveMember`` to object so they are not
         picked up as numeric features (i.e. not z-scored or scaled).
      3. One-hot encode ``Geography`` and ``Gender`` (first level dropped),
         using the category levels seen in the training data for both frames.
      4. Remove training rows where any numeric feature has |z-score| > 3.
      5. Standardise the numeric features with statistics fitted on the
         remaining training rows.

    Parameters
    ----------
    train_path, test_path : str or path-like
        CSV files; the training file must contain an ``Exited`` column.

    Returns
    -------
    X : pandas.DataFrame
        Float feature matrix for training.
    y : pandas.Series
        The ``Exited`` column of the retained training rows.
    X_test : pandas.DataFrame
        Float feature matrix for the test set, with exactly the columns of
        ``X`` in the same order.
    """
    id_columns = ['CustomerId', 'Surname', 'id']
    flag_columns = ['HasCrCard', 'IsActiveMember']
    categorical_columns = ['Geography', 'Gender']

    train_data = pd.read_csv(train_path).drop(columns=id_columns)
    test_data = pd.read_csv(test_path).drop(columns=id_columns)

    # Object dtype keeps the flags out of select_dtypes(np.number) below.
    flag_dtypes = {column: 'object' for column in flag_columns}
    train_data = train_data.astype(flag_dtypes)
    test_data = test_data.astype(flag_dtypes)

    # Encode both frames against the *training* levels. Encoding each frame
    # independently can yield different dummy columns (or drop a different
    # baseline level) when a level is missing from one of the files.
    for column in categorical_columns:
        levels = sorted(train_data[column].dropna().unique())
        train_data[column] = pd.Categorical(train_data[column], categories=levels)
        test_data[column] = pd.Categorical(test_data[column], categories=levels)

    train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True)

    numerical_features = train_data.select_dtypes(include=[np.number]).columns.tolist()
    if 'Exited' in numerical_features:
        numerical_features.remove('Exited')

    def detect_outliers(df, features, threshold=3):
        """Return a boolean mask of rows where any feature has |z| > threshold."""
        flagged = np.zeros(df.shape[0], dtype=bool)
        for feature in features:
            z_scores = np.abs(np.asarray(stats.zscore(df[feature])))
            flagged |= z_scores > threshold
        return flagged

    outliers = detect_outliers(train_data, numerical_features)
    # .copy() so the column assignment below writes to an independent frame
    # instead of a slice of the unfiltered one.
    train_data = train_data[~outliers].copy()

    scaler = CustomStandardScaler()
    train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])
    test_data[numerical_features] = scaler.transform(test_data[numerical_features])

    X = train_data.drop(columns='Exited').astype('float')
    y = train_data['Exited']
    # Same columns, same order, same dtype as X; a column missing from the
    # test file raises KeyError here rather than being silently misaligned.
    X_test = test_data[X.columns].astype('float')

    return X, y, X_test

In [4]:
# Cross-validation utilities: stratified K-fold splitter, ROC AUC metric, and CV driver
class StratifiedKFold:
    """K-fold splitter that spreads every class evenly across the folds.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    shuffle : bool
        Whether to shuffle the indices of each class before chunking.
    random_state : int or None
        Seed for the shuffle. The splitter uses its own generator, so calling
        ``split`` does not reseed or advance NumPy's global random state.
    """

    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def split(self, X, y):
        """Yield ``(train_indices, val_indices)`` integer arrays, one pair per fold.

        ``X`` is accepted for interface symmetry and is not used; the folds
        are derived from ``y`` only. For each class, the first
        ``n_class % n_splits`` folds receive one extra sample.
        """
        # Local legacy generator: same stream as np.random.seed(seed) followed
        # by np.random.shuffle, but without touching global state.
        rng = np.random.RandomState(self.random_state)
        y = np.asarray(y)
        unique_classes, y_indices = np.unique(y, return_inverse=True)

        fold_parts = [[] for _ in range(self.n_splits)]
        for cls in range(len(unique_classes)):
            cls_indices = np.flatnonzero(y_indices == cls)
            if self.shuffle:
                rng.shuffle(cls_indices)
            # array_split hands the remainder to the leading chunks.
            for fold, chunk in enumerate(np.array_split(cls_indices, self.n_splits)):
                fold_parts[fold].append(chunk)

        # Explicit integer dtype so that empty folds remain usable as indices.
        empty = np.empty(0, dtype=np.intp)
        fold_indices = [
            np.concatenate(parts) if parts else empty for parts in fold_parts
        ]

        for fold in range(self.n_splits):
            others = [fold_indices[f] for f in range(self.n_splits) if f != fold]
            train_indices = np.concatenate(others) if others else empty
            yield train_indices, fold_indices[fold]

# Compute ROC AUC scores
def roc_auc_score(y_true, y_scores):
    """Area under the ROC curve for binary labels.

    Parameters
    ----------
    y_true : 1-D array-like
        Ground-truth labels; the computation treats them as 0/1 counts
        (they are cumulatively summed to obtain true positives).
    y_scores : 1-D array-like
        Scores, higher meaning "more likely positive". Tied scores share a
        single threshold.

    Returns
    -------
    float
        The trapezoidal area under the (FPR, TPR) curve, or NaN when
        ``y_true`` contains only one class (the curve is undefined).

    Raises
    ------
    ValueError
        If the inputs are empty, not 1-D, or differ in shape.
    """
    # asarray makes lists and label-indexed Series behave positionally.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores, dtype=float)
    if y_true.ndim != 1 or y_true.shape != y_scores.shape or y_true.size == 0:
        raise ValueError(
            "y_true and y_scores must be non-empty 1-D arrays of equal length"
        )

    desc_score_indices = np.argsort(-y_scores)
    y_true = y_true[desc_score_indices]
    y_scores = y_scores[desc_score_indices]

    # One threshold at the last position of every run of equal scores.
    distinct_value_indices = np.where(np.diff(y_scores))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    tps = np.cumsum(y_true)[threshold_idxs]
    fps = 1 + threshold_idxs - tps

    # With a single class one of the rates would be 0/0; report NaN directly
    # instead of going through a divide-by-zero warning.
    if tps[-1] == 0 or fps[-1] == 0:
        return np.nan

    # Anchor the curve at the origin.
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]

    fpr = fps / fps[-1]
    tpr = tps / tps[-1]

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz
    return trapezoid(tpr, fpr)

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Evaluate ``knn`` by stratified K-fold cross-validation.

    For every fold the model is refitted on the training part and its
    predictions on the held-out part are scored with ``roc_auc_score``.
    Each fold score and the mean are printed.

    Parameters
    ----------
    X, y : pandas objects
        Features and labels; rows are selected positionally via ``.iloc``.
    knn : object with ``fit(X, y)`` and ``predict(X)``
    n_splits : int
        Number of folds (shuffled with a fixed seed of 42).

    Returns
    -------
    list
        The per-fold ROC AUC values, in fold order.
    """
    def take(frame, positions):
        # Positional row selection with a fresh 0..n-1 index.
        return frame.iloc[positions].reset_index(drop=True)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []
    fold_no = 0

    for train_pos, val_pos in splitter.split(X, y):
        fold_no += 1
        features_fit, features_val = take(X, train_pos), take(X, val_pos)
        labels_fit, labels_val = take(y, train_pos), take(y, val_pos)

        knn.fit(features_fit, labels_fit)
        val_scores = knn.predict(features_val)

        auc = roc_auc_score(labels_val.values, val_scores)
        fold_scores.append(auc)
        print(f"Fold {fold_no} ROC AUC: {auc}")

    print(f"Mean ROC AUC: {np.mean(fold_scores)}")
    return fold_scores

In [5]:
# Read both CSVs and build the training features, labels and test features
X, y, X_test = preprocess_data(train_path='train.csv', test_path='test.csv')

# Baseline model: 20 neighbours, Euclidean distance
knn = KNN(distance_metric='euclidean', k=20)

# Score the baseline with the default number of folds, then show the per-fold values
cv_scores = cross_validate(X, y, knn=knn)
print("Cross-validation scores:", cv_scores)

Fold 1 ROC AUC: 0.9198138383498723
Fold 2 ROC AUC: 0.9010084352261915
Fold 3 ROC AUC: 0.9159531751030693
Fold 4 ROC AUC: 0.9177291016610838
Fold 5 ROC AUC: 0.9165611520901207
Mean ROC AUC: 0.9142131404860676
Cross-validation scores: [0.9198138383498723, 0.9010084352261915, 0.9159531751030693, 0.9177291016610838, 0.9165611520901207]


In [6]:
# Grid search over neighbour count and distance metric, keeping the
# combination with the highest mean cross-validated ROC AUC.
k_values = list(range(10, 101, 10))
distance_metrics = ['euclidean', 'manhattan']

best_k, best_metric, best_score = 0, None, 0

for k in k_values:
    for metric in distance_metrics:
        knn = KNN(distance_metric=metric, k=k)
        scores = cross_validate(X, y, knn)
        mean_score = np.mean(scores)

        # Only a strictly better mean replaces the incumbent.
        if not mean_score > best_score:
            continue
        best_k, best_metric, best_score = k, metric, mean_score

print(f"Best hyperparameters: k={best_k}, distance_metric={best_metric}")
print(f"Best mean ROC AUC: {best_score:}")

Fold 1 ROC AUC: 0.9110656001602622
Fold 2 ROC AUC: 0.8961619363101073
Fold 3 ROC AUC: 0.9088528544434265
Fold 4 ROC AUC: 0.9035825006882944
Fold 5 ROC AUC: 0.8999904642545771
Mean ROC AUC: 0.9039306711713335
Fold 1 ROC AUC: 0.9016859720542889
Fold 2 ROC AUC: 0.8911126056191914
Fold 3 ROC AUC: 0.908948050274851
Fold 4 ROC AUC: 0.8982887280319368
Fold 5 ROC AUC: 0.9012125453127151
Mean ROC AUC: 0.9002495802585967
Fold 1 ROC AUC: 0.9198138383498723
Fold 2 ROC AUC: 0.9010084352261915
Fold 3 ROC AUC: 0.9159531751030693
Fold 4 ROC AUC: 0.9177291016610838
Fold 5 ROC AUC: 0.9165611520901207
Mean ROC AUC: 0.9142131404860676
Fold 1 ROC AUC: 0.9160923939873078
Fold 2 ROC AUC: 0.8999906990720536
Fold 3 ROC AUC: 0.9157166170407696
Fold 4 ROC AUC: 0.9160983457991099
Fold 5 ROC AUC: 0.9104224263524985
Mean ROC AUC: 0.911664096450348
Fold 1 ROC AUC: 0.9247451188014681
Fold 2 ROC AUC: 0.9036456060270013
Fold 3 ROC AUC: 0.9188498482592762
Fold 4 ROC AUC: 0.9217882462258524
Fold 5 ROC AUC: 0.916808866379

In [7]:
# Refit on every training row with the selected hyperparameters, then score the test set
knn = KNN(distance_metric=best_metric, k=best_k)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

In [8]:
# Pair each test `id` with its predicted `Exited` score and write the result to CSV
submission = pd.read_csv('test.csv')[['id']].assign(Exited=test_predictions)
submission.to_csv('submissions.csv', index=False)