In [46]:
import numpy as np
import pandas as pd

In [47]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        # Store the training data
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        return np.array([self._predict_single(dist) for dist in X])

    def _predict_single(self, dist):
        # Compute distances between X and all examples in the training set
        distances = self.compute_distances(self.X_train, dist)
        # Get the k nearest samples
        k_indices = np.argsort(distances)[:self.k]
        # Get the labels of the k nearest samples
        k_labels = self.y_train[k_indices]

        # Return the most common class label with mean 
        most_common = np.mean(k_labels == 1)
        return most_common

    def compute_distances(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            # Euclidean distance calculation
            distances = np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            # Manhattan distance calculation
            distances = np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")
        return distances

In [48]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    # pass

    # Drop columns that aren't useful for training
    train_data = train_data.drop(columns=['CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['CustomerId', 'Surname'])

    # # Handle missing values
    # numeric_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
    # train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].mean())
    # test_data[numeric_cols] = test_data[numeric_cols].fillna(test_data[numeric_cols].mean())

    # Convert categorical variables to dummy variables
    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True)

    # # Ensure that both train and test datasets have the same columns after get_dummies
    for col in train_data.columns:
        if col not in test_data.columns:
            test_data[col] = 0
    # reorder columns to match and drop the 'Exited' column from test data
    test_data = test_data[train_data.columns.drop('Exited')]

    # Separate features and labels
    y = train_data['Exited']
    X = train_data.drop(columns=['Exited'])
    X_test = test_data # test data has no 'Exited' column

    # Feature scaling (min-max normalization)
    # for col in X.columns:
    #     min_val = X[col].min()
    #     max_val = X[col].max()
    #     X[col] = (X[col] - min_val) / (max_val - min_val)
    #     X_test[col] = (X_test[col] - min_val) / (max_val - min_val)

    # min-max normalization did nit work well, reverted to mean normalization

    # Scale features manually (mean normalization)
    # mean normalization (standardization) with the mean and standard deviation calculated from the training set, and applies these values to both the train and test sets. 
    # This ensures that the test set is scaled in the same way as the train set, which is important for consistency and was the problem with min-max normalization.
    X_train_mean = X.mean()
    X_train_std = X.std()

    X = (X - X_train_mean) / X_train_std
    X_test = (X_test - X_train_mean) / X_train_std  # Use train mean and std for test data

    # return the values
    return X.values, y.values, X_test.values

In [49]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    # pass
    fold_size = len(X) // n_splits
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    ROC_AUC_scores = []

    for fold in range(n_splits):
        # split the data into training and validation sets
        X_val = X[fold * fold_size:(fold + 1) * fold_size]
        y_val = y[fold * fold_size:(fold + 1) * fold_size]

        X_train = np.concatenate([X[:fold * fold_size], X[(fold + 1) * fold_size:]])
        y_train = np.concatenate([y[:fold * fold_size], y[(fold + 1) * fold_size:]])

        # Fit the KNN model and make predictions
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        # Calculate ROC-AUC score from function and store it
        ROC_AUC = compute_roc_auc(y_val, y_pred)
        ROC_AUC_scores.append(ROC_AUC)

    # return the average ROC AUC score
    return np.mean(ROC_AUC_scores)

def compute_roc_auc(y_true, y_pred):
    # Compute ROC AUC score
    # sort by predicted probabilities
    sorted = np.argsort(y_pred)[::-1]
    # sort the true labels according to the sorted predicted probabilities
    y_true = y_true[sorted]

    # Compute true positive rate and false positive rate
    cumulative_positives = np.cumsum(y_true)
    cumulative_negatives = np.cumsum(1 - y_true)
    total_positives = np.sum(y_true)
    total_negatives = len(y_true) - total_positives
    tpr = cumulative_positives / total_positives
    fpr = cumulative_negatives / total_negatives

    # Compute ROC AUC by trapezoidal rule
    return np.trapz(tpr, fpr)

In [50]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# Hyperparameter tuning using grid search
def hyperparameter_tuning(X, y, k_values, distance_metrics):
    best_k = 0
    best_metric = None
    best_score = 0

    for k in k_values:
        for metric in distance_metrics:
            knn = KNN(k=k, distance_metric=metric)
            score = cross_validate(X, y, knn)

            if score > best_score:
                best_score = score
                best_k = k
                best_metric = metric

    print(f"Best hyperparameters: k={best_k}, distance_metric={best_metric}")
    return best_k, best_metric

# TODO: hyperparamters tuning
# Tune hyperparameters
# Try k-values from 1 to 20 and both euclidean and manhattan distance metrics
# already tried ranges:
# k_values = range(1, 21)
    #  best was k = 20 with manhattan
# k_values = range(10, 26)
    # best was k = 25 with manhattan
# k_values = range(20, 34)
    # best was k = 33 with manhattan
k_values = range(32, 40)
distance_metrics = ['euclidean', 'manhattan']

# starter
best_k = 1
best_metric = 'euclidean'
best_score = 0

# go through the k_values and distance_metrics to find the best hyperparameters
for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        score = cross_validate(X, y, knn)
        print(f"k={k}, distance_metric={metric}, ROC AUC score={score}")

        # Find the best hyperparameters
        if score > best_score:
            best_k = k
            best_metric = metric
            best_score = score

print(f"Best hyperparameters with score {best_score:.4f}: k={best_k}, distance_metric={best_metric}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)
print('Predictions saved to submissions.csv')

  return np.trapz(tpr, fpr)


Cross-validation scores: 0.8678493268229162
k=32, distance_metric=euclidean, ROC AUC score=0.902680096738502
k=32, distance_metric=manhattan, ROC AUC score=0.905546709140215
k=33, distance_metric=euclidean, ROC AUC score=0.9029775093753122
k=33, distance_metric=manhattan, ROC AUC score=0.9059691608080627
k=34, distance_metric=euclidean, ROC AUC score=0.9036890887482828
k=34, distance_metric=manhattan, ROC AUC score=0.9061357130758371
k=35, distance_metric=euclidean, ROC AUC score=0.9037854866389712
k=35, distance_metric=manhattan, ROC AUC score=0.9064278305969399
k=36, distance_metric=euclidean, ROC AUC score=0.9039242858710897
k=36, distance_metric=manhattan, ROC AUC score=0.9059750542725518
k=37, distance_metric=euclidean, ROC AUC score=0.9037316111880266
k=37, distance_metric=manhattan, ROC AUC score=0.9067002474252848
k=38, distance_metric=euclidean, ROC AUC score=0.9038087982950304
k=38, distance_metric=manhattan, ROC AUC score=0.9065212162135738
k=39, distance_metric=euclidean, R