In [48]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from collections import Counter

In [54]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # Store the training data
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        predictions = []

        for x in X:
            # Compute distances from the test point to all training points
            distances = self.compute_distance(self.X_train, x)
            
            # Get the indices of the k nearest neighbors
            k_nearest_indices = np.argsort(distances)[:self.k]
            
            # Get the labels of the k nearest neighbors
            k_nearest_labels = [self.y_train[i] for i in k_nearest_indices]
            
            # Majority vote for classification
            most_common_label = Counter(k_nearest_labels).most_common(1)[0][0]
            
            predictions.append(most_common_label)
        
        return np.array(predictions)

    def compute_distance(self, X_train, x_test):
        if self.distance_metric == 'euclidean':
            # Euclidean distance: sqrt(sum((x1 - x2)^2))
            distances = np.sqrt(np.sum((X_train - x_test) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            # Manhattan distance: sum(abs(x1 - x2))
            distances = np.sum(np.abs(X_train - x_test), axis=1)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")
        
        return distances


In [60]:
def preprocess_data(train_path, test_path):
    # Load the training and testing data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Combine train and test for consistent preprocessing if needed
    combined_data = pd.concat([train_data, test_data], axis=0)
    
    # Drop unnecessary columns
    combined_data.drop(['CustomerId', 'Surname'], axis=1, inplace=True)

    # Handle categorical variables (e.g., Geography and Gender)
    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'], drop_first=True)
    
    # Split combined data back into train and test sets
    train_data = combined_data.iloc[:train_data.shape[0], :]
    test_data = combined_data.iloc[train_data.shape[0]:, :]
    
    # Separate features and target variable from training data
    X_train = train_data.drop('Exited', axis=1)
    y_train = train_data['Exited']

    # For the test set, remove the 'Exited' column if it exists
    if 'Exited' in test_data.columns:
        X_test = test_data.drop('Exited', axis=1)
    else:
        X_test = test_data
    
    # Scale the features using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train

In [62]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # Initialize KFold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # List to store the ROC AUC scores for each fold
    roc_auc_scores = []
    
    # Loop over each split
    for train_index, val_index in kf.split(X):
        # Split the data into training and validation sets
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        # Train the KNN classifier
        knn.fit(X_train, y_train)
        
        # Make predictions on the validation set
        y_val_pred = knn.predict(X_val)
        
        # Compute the ROC AUC score for the current fold
        roc_auc = roc_auc_score(y_val, y_val_pred)
        roc_auc_scores.append(roc_auc)
    
    # Return the average ROC AUC score across all folds
    return np.mean(roc_auc_scores)

In [64]:
# Load and preprocess data
X, X_test, y = preprocess_data('train.csv', 'test.csv')

# Hyperparameter tuning: Explore different values of k
best_k = None
best_score = 0
k_values = [3, 5, 7, 9, 11, 20, 30, 50]  # You can expand this range

for k in k_values:
    knn = KNN(k=k, distance_metric='euclidean')
    cv_score = cross_validate(X, y, knn)
    
    print(f"K: {k}, Cross-validation ROC AUC Score: {cv_score}")
    
    if cv_score > best_score:
        best_score = cv_score
        best_k = k

print(f"Best K: {best_k} with ROC AUC Score: {best_score}")



# Train on the full dataset with the optimal hyperparameters
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)

# Make predictions on the test set
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)
print("Predictions saved to submissions.csv")

K: 3, Cross-validation ROC AUC Score: 0.7548341452965064
K: 5, Cross-validation ROC AUC Score: 0.7553408465522782
K: 7, Cross-validation ROC AUC Score: 0.7571202790216982
K: 9, Cross-validation ROC AUC Score: 0.7595691393906827
K: 11, Cross-validation ROC AUC Score: 0.7564602216082232
K: 20, Cross-validation ROC AUC Score: 0.7503640500489164
K: 30, Cross-validation ROC AUC Score: 0.7423405503965391
K: 50, Cross-validation ROC AUC Score: 0.7317815212744053
Best K: 9 with ROC AUC Score: 0.7595691393906827
Predictions saved to submissions.csv
