In [1]:
import numpy as np
import pandas as pd

In [2]:
def most_common_label(arr):
    length = len(arr)
    prob = np.sum(arr == 1) / length
    return round(prob, 2), int(prob >= 0.5)

# Define the KNN class
class KNN:
    def __init__(self, k, distance_metric):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
      predictions, probabilities = [], []
      for x in X:
          distances = self.compute_distance(self.X_train, x)
          k_nearest_indices = np.argsort(distances)[:self.k]
          k_nearest_labels = self.y_train[k_nearest_indices]
          prob, common = most_common_label(k_nearest_labels)
          predictions.append(common)
          probabilities.append(prob)
      return predictions, probabilities


    def compute_distance(self, X1, X2):
        X1_without_index = X1[:, 1:]  # Remove index column from X1
        X2_without_index = X2[1:]  # Remove index from X2
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X1_without_index - X2_without_index) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X1_without_index - X2_without_index), axis=1)
        else:
            raise ValueError("Unsupported distance metric")
        return distances

In [3]:
def preprocess_data(train_path, test_path):
    # Load data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Handle missing values by dropping rows with any missing values
    train_data = train_data.dropna()
    test_data = test_data.dropna()

    X = train_data.drop(columns=['CustomerId', 'Surname', 'Exited','Geography', 'Gender'])
    y = train_data['Exited'].values
    X_test = test_data.drop(columns=['CustomerId', 'Surname','Geography', 'Gender'])

    numerical_cols = train_data.select_dtypes(include=[np.number]).columns
    X = (X - X.mean()) / X.std()
    X_test = (X_test - X_test.mean()) / X_test.std()

    X_test = X_test.reindex(columns=X.columns, fill_value=0)

    return X.values, y, X_test.values


In [4]:
def cross_validate(X, y, knn, n_splits):
    n_samples = len(X)
    fold_size = n_samples // n_splits
    scores = {'roc_auc': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': []}

    for i in range(n_splits):
        start, end = i * fold_size, (i + 1) * fold_size

        # val and train sets created
        X_val = X[start:end]
        y_val = y[start:end]
        X_train = np.concatenate([X[:start], X[end:]])
        y_train = np.concatenate([y[:start], y[end:]])

        knn.fit(X_train, y_train)
        y_pred, probs = knn.predict(X_val)

        auc_score, accuracy, precision, recall, f1_score = calculate_metrics(y_val, y_pred)
        print(f"Fold {i + 1}: auc-score:{auc_score}, accuracy:{accuracy}, precision:{precision}, recall:{recall}, f1-score:{f1_score}")

        scores['roc_auc'].append(auc_score)
        scores['accuracy'].append(accuracy)
        scores['precision'].append(precision)
        scores['recall'].append(recall)
        scores['f1_score'].append(f1_score)

    return {metric: np.mean(scores[metric]) for metric in scores}

def calculate_metrics(y_true, y_scores):
    # sort the instances by the predicted score in descending order
    sorted_indices = np.argsort(y_scores)[::-1]
    y_true = y_true[sorted_indices]
    y_scores = np.array(y_scores)
    y_scores = y_scores[sorted_indices]

    # total number of positive and negative samples
    P = np.sum(y_true)
    N = len(y_true) - P

    tp = np.sum((y_true == 1.0) & (y_scores == 1.0))
    tn = np.sum((y_true == 0.0) & (y_scores == 0.0))
    fp = np.sum((y_true == 0.0) & (y_scores == 1.0))
    fn = np.sum((y_true == 1.0) & (y_scores == 0.0))

    accuracy = (tp + tn) / len(y_true)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # true pos rate, false pos rate
    tpr, fpr = [], []
    tp, fp = 0, 0

    for i in range(len(y_true)):
        if y_true[i] == 1:
            tp += 1
        else:
            fp += 1

        tpr.append(tp / P if P > 0 else 0)
        fpr.append(fp / N if N > 0 else 0)

    # Calculate the area under the curve using the trapezoidal rule
    auc = 0.0
    for i in range(1, len(tpr)):
        auc += (fpr[i] - fpr[i - 1]) * (tpr[i] + tpr[i - 1]) / 2

    return auc, accuracy, precision, recall, f1_score


In [5]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# KNN with Euclidean distance
knn_euclidean = KNN(k=4, distance_metric='euclidean')
cv_scores_euclidean = cross_validate(X, y, knn_euclidean, n_splits=5)
print("Cross-validation results (Euclidean):", cv_scores_euclidean)

# KNN with Manhattan distance
knn_manhattan = KNN(k=4, distance_metric='manhattan')
cv_scores_manhattan = cross_validate(X, y, knn_manhattan, n_splits=5)
print("Cross-validation results (Manhattan):", cv_scores_manhattan)

best_k = 4
knn_best = KNN(k=best_k, distance_metric='euclidean')

# Train on the full dataset with optimal hyperparameters and make predictions on the test set
knn_best.fit(X, y)
test_predictions, probabilities = knn_best.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': probabilities}).to_csv('submissions.csv', index=False)

Fold 1: auc-score:0.7855358777128921, accuracy:0.8453333333333334, precision:0.6124260355029586, recall:0.672077922077922, f1-score:0.6408668730650154
Fold 2: auc-score:0.7938791334028867, accuracy:0.8566666666666667, precision:0.6180981595092024, recall:0.690068493150685, f1-score:0.6521035598705502
Fold 3: auc-score:0.7814762579004052, accuracy:0.856, precision:0.6299694189602446, recall:0.6843853820598007, f1-score:0.6560509554140128
Fold 4: auc-score:0.807074290484139, accuracy:0.864, precision:0.653125, recall:0.6920529801324503, f1-score:0.6720257234726686
Fold 5: auc-score:0.7718101905339951, accuracy:0.854, precision:0.6497622820919176, recall:0.6539074960127592, f1-score:0.6518282988871223
Cross-validation results (Euclidean): {'roc_auc': 0.7879551500068637, 'accuracy': 0.8552, 'precision': 0.6326761792128647, 'recall': 0.6784984546867234, 'f1_score': 0.6545750821418739}
Fold 1: auc-score:0.7762688758389269, accuracy:0.841, precision:0.6023564064801178, recall:0.66396103896103