In [7]:
import numpy as np
import pandas as pd
from collections import Counter
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
      predictions = []
      for x in X:
        distances = self.compute_distance(x)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)
        predictions.append(most_common[0][0])
      return np.array(predictions)

    def compute_distance(self, x):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((self.X_train - x)**2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(self.X_train - x), axis=1)
        else:
            raise ValueError("Unsupported distance metric")

    def probabilities(self, X):
        probas = []
        for x in X:
            distances = self.compute_distance(x)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            class_counts = Counter(k_nearest_labels)
            total = sum(class_counts.values())
            proba = {class_label: count / total for class_label, count in class_counts.items()}
            probas.append([proba.get(0, 0), proba.get(1, 0)])  # Assuming binary classification (0 and 1)
        return np.array(probas)


In [8]:

def preprocess_data(train_path, test_path):
    # Load the data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Combine train and test for preprocessing
    all_data = pd.concat([train_data, test_data], axis=0, sort=False)

    # Handle missing values
    numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_features = ['Geography', 'Gender']

    for feature in numeric_features:
        median_value = all_data[feature].median()
        all_data[feature].fillna(median_value, inplace=True)

    # Handle categorical variables
    all_data = pd.get_dummies(all_data, columns=categorical_features, drop_first=True)

    # Transforming numeric featues to have mean 0 and sd 1
    for feature in numeric_features:
        mean_value = all_data[feature].mean()
        std_value = all_data[feature].std()
        all_data[feature] = (all_data[feature] - mean_value) / std_value

    # Split data back into train and test just like it was split before
    train_preprocessed = all_data[:len(train_data)]
    test_preprocessed = all_data[len(train_data):]

    # Prepare features and target for train data
    X_train = train_preprocessed.drop(['Exited', 'id', 'CustomerId', 'Surname'], axis=1)
    y_train = train_preprocessed['Exited'] #target column

    # Prepare features for test data
    X_test = test_preprocessed.drop(['id', 'CustomerId', 'Surname'], axis=1)
    if 'Exited' in X_test.columns:
        X_test = X_test.drop('Exited', axis=1)

    return X_train.values, y_train.values, X_test.values

In [16]:
def k_fold_split(X, y, n_splits=5):
    """
    Custom implementation of k-fold split (not stratified).
    Splits data into `n_splits` folds for cross-validation.
    """
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    fold_sizes = np.full(n_splits, len(X) // n_splits, dtype=int)
    fold_sizes[:len(X) % n_splits] += 1  # Distribute remainder to some folds

    current = 0
    folds = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        folds.append(indices[start:stop])
        current = stop
    return folds

def custom_roc_auc_score(y_true, y_pred_proba):
    # Sort by predicted probabilities (in descending order)
    desc_order = np.argsort(-y_pred_proba)
    y_true = y_true[desc_order]
    y_pred_proba = y_pred_proba[desc_order]

    # Get the number of positives and negatives
    pos_count = np.sum(y_true)
    neg_count = len(y_true) - pos_count

    if pos_count == 0 or neg_count == 0:
        return 0.5  # Handle edge case where all data points belong to one class

    # Calculate true positive rate (TPR) and false positive rate (FPR)
    tpr = np.cumsum(y_true) / pos_count
    fpr = np.cumsum(1 - y_true) / neg_count

    # Calculate the area under the curve using the trapezoidal rule
    auc = np.trapz(tpr, fpr)

    return auc

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):

    # Generate custom k-fold indices
    folds = k_fold_split(X, y, n_splits)  # Call your k_fold_split function here

    # Initialize list to store AUC scores
    auc_scores = []

    for fold_num, val_index in enumerate(folds, 1):
        # Split data
        train_index = np.hstack([folds[i] for i in range(n_splits) if i != fold_num - 1])
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Fit the model
        knn.fit(X_train, y_train)

        # Predict probabilities
        y_pred_proba = knn.probabilities(X_val)[:, 1]

        # Compute AUC score
        auc = custom_roc_auc_score(y_val, y_pred_proba)
        auc_scores.append(auc)

        print(f"Fold {fold_num} AUC: {auc:.4f}")

    # Compute and print mean AUC
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    print(f"\nMean AUC: {mean_auc:.4f} (+/- {std_auc:.4f})")

    return auc_scores, mean_auc


In [18]:


# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Ensure X, y, and X_test are numpy arrays
X = np.array(X, dtype=float)
y = np.array(y, dtype=int)
X_test = np.array(X_test, dtype=float)

# Hyperparameter tuning
k_values = [22, 23, 24, 25, 26]
distance_metrics = ['euclidean', 'manhattan']
best_k = None
best_metric = None
best_auc = 0

for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        auc_scores, mean_auc = cross_validate(X, y, knn)

        print(f"k={k}, metric={metric}: Mean AUC = {mean_auc:.4f}")

        if mean_auc > best_auc:
            best_auc = mean_auc
            best_k = k
            best_metric = metric

print(f"\nBest hyperparameters: k={best_k}, metric={best_metric}")

# Train on full dataset with optimal hyperparameters
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)

# Make predictions on test set
test_predictions_proba = knn.probabilities(X_test)[:, 1]

# Saving
output_df = pd.DataFrame({
    'id': pd.read_csv('test.csv')['id'],
    'Exited': test_predictions_proba
})
output_path = 'submission_1.csv'
output_df.to_csv(output_path, index=False)

print(f"Predictions saved to {output_path}")



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_data[feature].fillna(median_value, inplace=True)


Fold 1 AUC: 0.9054
Fold 2 AUC: 0.9152
Fold 3 AUC: 0.9212
Fold 4 AUC: 0.9152
Fold 5 AUC: 0.9202

Mean AUC: 0.9155 (+/- 0.0056)
k=22, metric=euclidean: Mean AUC = 0.9155
Fold 1 AUC: 0.9020
Fold 2 AUC: 0.9192
Fold 3 AUC: 0.9111
Fold 4 AUC: 0.9026
Fold 5 AUC: 0.9202

Mean AUC: 0.9110 (+/- 0.0078)
k=22, metric=manhattan: Mean AUC = 0.9110
Fold 1 AUC: 0.9052
Fold 2 AUC: 0.9192
Fold 3 AUC: 0.9284
Fold 4 AUC: 0.9025
Fold 5 AUC: 0.9163

Mean AUC: 0.9143 (+/- 0.0095)
k=23, metric=euclidean: Mean AUC = 0.9143
Fold 1 AUC: 0.9034
Fold 2 AUC: 0.9216
Fold 3 AUC: 0.9168
Fold 4 AUC: 0.9136
Fold 5 AUC: 0.9082

Mean AUC: 0.9127 (+/- 0.0064)
k=23, metric=manhattan: Mean AUC = 0.9127
Fold 1 AUC: 0.9070
Fold 2 AUC: 0.9198
Fold 3 AUC: 0.9137
Fold 4 AUC: 0.9283
Fold 5 AUC: 0.9101

Mean AUC: 0.9158 (+/- 0.0076)
k=24, metric=euclidean: Mean AUC = 0.9158
Fold 1 AUC: 0.9101
Fold 2 AUC: 0.9133
Fold 3 AUC: 0.9173
Fold 4 AUC: 0.9122
Fold 5 AUC: 0.9093

Mean AUC: 0.9124 (+/- 0.0028)
k=24, metric=manhattan: Mean AUC =