In [5]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [20]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):
        distances = [self.compute_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train.iloc[k_indices]  # Use iloc to safely index by position
        most_common = self._most_common_label(k_nearest_labels)
        return most_common

    def compute_distance(self, a, b):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((a - b) ** 2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(a - b))
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

    def _most_common_label(self, labels):
        return pd.Series(labels).mode()[0]


In [16]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop columns that won't be used
    train_data.drop(['CustomerId', 'Surname'], axis=1, inplace=True)
    test_data.drop(['CustomerId', 'Surname'], axis=1, inplace=True)

    # Define feature columns and target column
    feature_cols = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
                    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    target_col = 'Exited'

    # Separate features and target in training data
    X_train = train_data[feature_cols]
    y_train = train_data[target_col]

    # Features in the test data
    X_test = test_data[feature_cols]

    # Preprocessing for numerical data: scaling
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                          'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical data: one-hot encoding
    categorical_features = ['Geography', 'Gender']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit and transform the train data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    return X_train_preprocessed, y_train, X_test_preprocessed


In [17]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold


def cross_validate(X, y, knn, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    roc_auc_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        roc_auc = roc_auc_score(y_val, y_pred)
        roc_auc_scores.append(roc_auc)

    return np.mean(roc_auc_scores), np.std(roc_auc_scores)


In [21]:
def custom_grid_search(X, y, param_grid, n_splits=5):
    best_score = 0
    best_params = None
    
    for k in param_grid['k']:
        for distance_metric in param_grid['distance_metric']:
            knn = KNN(k=k, distance_metric=distance_metric)
            mean_score, std_score = cross_validate(X, y, knn, n_splits)
            print(f"Params: k={k}, distance_metric={distance_metric}, Mean ROC AUC: {mean_score}, Std: {std_score}")

            if mean_score > best_score:
                best_score = mean_score
                best_params = {'k': k, 'distance_metric': distance_metric}
    
    return best_params, best_score


# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn, n_splits=5)

print("Cross-validation scores:", cv_scores)

# Hyperparameters tuning
param_grid = {
    'k': [3, 5, 7],
    'distance_metric': ['euclidean', 'manhattan']
}
best_params, best_score = custom_grid_search(X, y, param_grid, n_splits=5)
print(f"Best Params: {best_params}, Best ROC AUC: {best_score}")

# Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'])
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: (0.772968538595073, 0.0055447682380054646)
Params: k=3, distance_metric=euclidean, Mean ROC AUC: 0.7681268484255398, Std: 0.008625182321144568
Params: k=3, distance_metric=manhattan, Mean ROC AUC: 0.7645737636794798, Std: 0.00997101111467224
Params: k=5, distance_metric=euclidean, Mean ROC AUC: 0.772968538595073, Std: 0.0055447682380054646
Params: k=5, distance_metric=manhattan, Mean ROC AUC: 0.7690510478821524, Std: 0.00693751241209164
Params: k=7, distance_metric=euclidean, Mean ROC AUC: 0.7744699150850485, Std: 0.006366057741985525
Params: k=7, distance_metric=manhattan, Mean ROC AUC: 0.7682608728642004, Std: 0.006169267860072259
Best Params: {'k': 7, 'distance_metric': 'euclidean'}, Best ROC AUC: 0.7744699150850485
