In [7]:
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from collections import Counter

In [26]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # Convert X and y to numpy arrays and reset index to avoid mismatches
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        # Convert X to numpy array if it's not already
        X = np.array(X)
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions)

    def _predict_single(self, x):
        # Compute the distances from the current test point to all training points
        distances = self.compute_distance(self.X_train, x)
        # Get indices of the k nearest neighbors
        k_indices = np.argsort(distances)[:self.k]
        # Retrieve the labels of the k nearest neighbors (use numpy arrays, no need for pandas indexing)
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Return the most common label among the neighbors
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2)**2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)


In [27]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop columns that are not useful for prediction
    train_data = train_data.drop(columns=['id', 'CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    # Handle categorical variables (Geography and Gender)
    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True)

    # Split features and target
    X_train = train_data.drop(columns=['Exited'])
    y_train = train_data['Exited']

    # Scale features using StandardScaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(test_data)

    return X_train, y_train, X_test

In [28]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits)
    roc_auc_scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)
        
        roc_auc = roc_auc_score(y_val, y_pred)
        roc_auc_scores.append(roc_auc)

    return np.mean(roc_auc_scores)



In [None]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=8, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparameters tuning (optimize k, distance metric, etc.)
# For now, we are using k=5 and euclidean distance

# Train on the full dataset with optimal hyperparameters and make predictions on the test set
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)


Cross-validation scores: 0.7688646985430012
