In [24]:
import numpy as np
import pandas as pd

In [25]:
# Adjusting the KNN class to handle NumPy arrays directly without needing the `.values` attribute

class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Labels are cast to ``int`` in ``fit`` and counted with ``np.bincount`` in
    ``predict``, so they must be non-negative integers. ``predict_proba``
    averages the neighbours' labels, which is only a probability when the
    labels are binary 0/1.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours consulted per query point.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used by ``compute_distance``.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training data.

        Inputs are converted with ``np.asarray`` so that DataFrames, Series and
        lists work as well as NumPy arrays: later lookups are purely positional
        and never depend on a pandas index.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y).astype(int)  # bincount needs integer labels

    def compute_distance(self, X1, X2):
        """Return the distance from every row of ``X1`` to the point ``X2``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not 'euclidean' or 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

    def _nearest_labels(self, x):
        """Return the training labels of the ``k`` points closest to ``x``."""
        distances = self.compute_distance(self.X_train, x)
        nearest_neighbors = np.argsort(distances)[:self.k]
        return self.y_train[nearest_neighbors]

    def predict(self, X):
        """Predict a label for each row of ``X`` by majority vote.

        Ties are resolved towards the smallest label, because ``np.argmax``
        returns the first maximum of the bincount.
        """
        X = np.asarray(X)
        predictions = []
        for i in range(X.shape[0]):
            nearest_labels = self._nearest_labels(X[i, :])
            predictions.append(np.argmax(np.bincount(nearest_labels)))
        return np.array(predictions)

    def predict_proba(self, X):
        """Return, for each row of ``X``, the mean label of its ``k`` nearest neighbours.

        With binary 0/1 labels this is the fraction of neighbours in class 1.
        """
        X = np.asarray(X)
        probabilities = []
        for i in range(X.shape[0]):
            probabilities.append(np.mean(self._nearest_labels(X[i, :])))
        return np.array(probabilities)


In [26]:
# Implementing a custom standard scaler function to replace StandardScaler from sklearn
def custom_standard_scaler(X):
    """Standardize each feature (column) to zero mean and unit variance.

    The mean and the population standard deviation are computed along axis 0.
    A constant feature has a standard deviation of 0; dividing by it would
    produce NaN/inf, so such features are divided by 1 instead and therefore
    come out as all zeros.

    Parameters
    ----------
    X : array-like
        Data whose columns are the features to standardize.

    Returns
    -------
    The centred and scaled data, ``(X - mean) / std``.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # Guard against division by zero for constant features.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

def custom_label_encode(column):
    """Encode a pandas column as integers.

    Each distinct value is replaced by its position in ``column.unique()``,
    i.e. codes are handed out in order of first appearance.
    """
    codes = {}
    for position, label in enumerate(column.unique()):
        codes[label] = position
    return column.map(codes)

# Updating the preprocessing function to use the custom standard scaler
def preprocess_data_custom(train_data, test_data):
    """Jointly preprocess the train and test frames and split them back.

    Steps: tag test rows with the sentinel target ``Exited == -1``, concatenate
    with the training rows, fill missing numeric values with the column median,
    integer-encode 'Geography' and 'Gender', standardize the numerical
    features, then separate train from test again using the sentinel.

    Neither input frame is modified.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows, including the 'Exited' target column.
    test_data : pandas.DataFrame
        Test rows; an existing 'Exited' column, if any, is ignored.

    Returns
    -------
    X : pandas.DataFrame
        Training features (identifier columns and target removed).
    y : pandas.Series
        Training target, the 'Exited' column.
    X_test : pandas.DataFrame
        Test features (identifier columns removed).
    """
    # Tag the test rows on a copy: assigning into `test_data` directly would
    # leave a stray 'Exited' column in the caller's DataFrame.
    test_tagged = test_data.assign(Exited=-1)
    combined_data = pd.concat([train_data, test_tagged], axis=0)

    # Handle missing values (if any) by filling numeric columns with their median.
    combined_data = combined_data.fillna(combined_data.median(numeric_only=True))

    # Custom integer encoding for the categorical variables.
    for categorical in ('Geography', 'Gender'):
        combined_data[categorical] = custom_label_encode(combined_data[categorical])

    # Standardize numerical features. Note that the scaling statistics are
    # computed over train and test rows together.
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    combined_data[numerical_features] = custom_standard_scaler(combined_data[numerical_features].values)

    # Split back into train and test using the sentinel target value.
    is_test = combined_data['Exited'] == -1
    train_part = combined_data[~is_test]
    test_part = combined_data[is_test].drop('Exited', axis=1)

    # Separate features and target, dropping identifier columns.
    X = train_part.drop(columns=['CustomerId', 'Surname', 'Exited', 'id'])
    y = train_part['Exited']
    X_test = test_part.drop(columns=['CustomerId', 'Surname', 'id'])

    return X, y, X_test



In [27]:
# Re-implementing metrics and cross-validation from scratch without sklearn

def accuracy_score_custom(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Inputs are coerced to flat NumPy arrays so that plain lists are compared
    element by element (``list == list`` would yield a single bool and a
    silently wrong score).

    Returns 0.0 for empty input, following the zero-denominator convention of
    the other metrics in this file.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        return 0.0
    return np.sum(y_true == y_pred) / len(y_true)

def precision_score_custom(y_true, y_pred):
    """Return the precision for the positive class (label 1).

    Precision is true positives divided by predicted positives. Inputs are
    coerced to NumPy arrays so that plain lists are compared element-wise
    instead of silently yielding 0. Returns 0 when nothing was predicted
    positive.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    predicted_mask = y_pred == 1
    true_positive = np.sum((y_true == 1) & predicted_mask)
    predicted_positive = np.sum(predicted_mask)
    return true_positive / predicted_positive if predicted_positive > 0 else 0

def recall_score_custom(y_true, y_pred):
    """Return the recall for the positive class (label 1).

    Recall is true positives divided by actual positives. Inputs are coerced
    to NumPy arrays so that plain lists are compared element-wise instead of
    silently yielding 0. Returns 0 when there are no actual positives.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    actual_mask = y_true == 1
    true_positive = np.sum(actual_mask & (y_pred == 1))
    actual_positive = np.sum(actual_mask)
    return true_positive / actual_positive if actual_positive > 0 else 0

def f1_score_custom(y_true, y_pred):
    """Return the F1 score: the harmonic mean of precision and recall.

    Yields 0 when precision and recall are both 0.
    """
    p = precision_score_custom(y_true, y_pred)
    r = recall_score_custom(y_true, y_pred)
    denominator = p + r
    return 0 if denominator == 0 else 2 * (p * r) / denominator

def roc_auc_score_custom(y_true, y_pred_proba):
    """Compute the area under the ROC curve for binary labels (1 = positive, 0 = negative).

    Samples are ranked by decreasing score. All samples sharing the same score
    form a single point of the ROC curve, so the result does not depend on the
    arbitrary order of tied samples (KNN vote fractions are heavily tied). The
    curve is anchored at (0, 0) and integrated with the trapezoidal rule.

    Parameters
    ----------
    y_true : array-like
        True labels; entries equal to 1 are positives, entries equal to 0 negatives.
    y_pred_proba : array-like
        Predicted scores; a higher score means "more likely positive".

    Returns
    -------
    float
        The AUC, or 0.0 when either class is absent (the AUC is undefined there).

    Raises
    ------
    ValueError
        If the inputs do not contain the same number of elements.
    """
    pos_label = 1
    neg_label = 0
    y_true = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred_proba, dtype=float).ravel()
    if y_true.shape != scores.shape:
        raise ValueError(
            f"y_true and y_pred_proba must have the same length, got {y_true.size} and {scores.size}"
        )

    pos_count = np.sum(y_true == pos_label)
    neg_count = np.sum(y_true == neg_label)
    if pos_count == 0 or neg_count == 0:
        return 0.0

    # Rank samples from highest to lowest score.
    order = np.argsort(scores)[::-1]
    sorted_scores = scores[order]
    sorted_true = y_true[order]

    # Running counts of true / false positives as the threshold is lowered.
    tps = np.cumsum(sorted_true == pos_label)
    fps = np.cumsum(sorted_true == neg_label)

    # Keep only the last sample of each run of equal scores: a threshold
    # cannot separate tied samples, so they move the curve together.
    group_ends = np.r_[np.flatnonzero(np.diff(sorted_scores)), sorted_scores.size - 1]

    tpr = np.r_[0.0, tps[group_ends] / pos_count]
    fpr = np.r_[0.0, fps[group_ends] / neg_count]

    # Trapezoidal rule, written out directly rather than via np.trapz, which
    # newer NumPy releases deprecate in favour of np.trapezoid.
    return float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))

def cross_validate_custom(X, y, knn, n_splits=5):
    """Run contiguous (unshuffled) k-fold cross-validation and average the metrics.

    The rows are cut into ``n_splits`` consecutive folds of ``len(X) // n_splits``
    rows; the last fold also takes any remainder. For each fold the model is
    refit on the remaining rows and scored on the held-out fold.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels, one per row of ``X``.
    knn : object
        Model exposing ``fit(X, y)``, ``predict(X)`` and ``predict_proba(X)``.
        It is refit on every fold, so it is left fitted on the last fold's
        training split.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    dict
        Mean over folds of "accuracy", "precision", "recall", "f1_score" and "roc_auc".

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or if ``n_splits`` is not between
        2 and the number of samples (which would give an empty training or
        validation set).
    """
    # Work on arrays so slicing is positional even for pandas inputs.
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(f"X and y must have the same length, got {n_samples} and {len(y)}")
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}), got {n_splits}"
        )

    fold_size = n_samples // n_splits
    fold_scores = {"accuracy": [], "precision": [], "recall": [], "f1_score": [], "roc_auc": []}

    for fold in range(n_splits):
        # The last fold absorbs the rows left over by the integer division.
        start_val = fold * fold_size
        end_val = n_samples if fold == n_splits - 1 else start_val + fold_size

        val_mask = np.zeros(n_samples, dtype=bool)
        val_mask[start_val:end_val] = True
        X_train, y_train = X[~val_mask], y[~val_mask]
        X_val, y_val = X[val_mask], y[val_mask]

        # Fit on the training split, then score the held-out fold.
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)
        y_pred_proba = knn.predict_proba(X_val)

        fold_scores["accuracy"].append(accuracy_score_custom(y_val, y_pred))
        fold_scores["precision"].append(precision_score_custom(y_val, y_pred))
        fold_scores["recall"].append(recall_score_custom(y_val, y_pred))
        fold_scores["f1_score"].append(f1_score_custom(y_val, y_pred))
        fold_scores["roc_auc"].append(roc_auc_score_custom(y_val, y_pred_proba))

    return {name: np.mean(scores) for name, scores in fold_scores.items()}

# The custom metric and cross-validation implementations above are used below
# to evaluate the KNN model and to produce the final predictions.


In [29]:
# Cross-validate the chosen KNN configuration, then train on the full training
# set, predict on the test set and write the submission file.

# Reload the datasets and apply the preprocessing function with the custom scaler
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')
X, y, X_test = preprocess_data_custom(train_data, test_data)

# Initialize KNN with the chosen parameters
knn_model_final_custom = KNN(k=17, distance_metric='manhattan')

# Perform cross-validation with custom implementations
cv_results_custom = cross_validate_custom(X.values, y.values.astype(int), knn_model_final_custom, n_splits=5)

# Show the cross-validation results. A bare expression is only rendered when it
# is the last statement of a cell, so print explicitly here.
print(cv_results_custom)

# Fit the final model on the full training dataset
knn_model_final_custom.fit(X.values, y.values.astype(int))

# Predict probabilities on the test set
test_probabilities_custom = knn_model_final_custom.predict_proba(X_test.values)

# Load the original test set to get the 'id' column
test_data_full = pd.read_csv('./test.csv')

# Create the submission DataFrame
submission_custom = pd.DataFrame({'id': test_data_full['id'], 'Exited': test_probabilities_custom})

# Save the submission to a CSV file
submission_file_path_custom = './submission.csv'
submission_custom.to_csv(submission_file_path_custom, index=False)

submission_file_path_custom  # Output the file path for user to download


'./submission.csv'