In [6]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [7]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [8]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions)

    def _predict_single(self, x):
        distances = [self.compute_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))
        else:
            raise ValueError("Unsupported distance metric")



In [18]:

def preprocess_data(train, test):
    # Dropping irrelevant columns: 'id', 'CustomerId', 'Surname'
    train = train.drop(['id', 'CustomerId', 'Surname'], axis=1)
    test = test.drop(['id', 'CustomerId', 'Surname'], axis=1)
    
    # Handle missing values for numerical and categorical columns
    numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
    
    # Fill missing numerical columns with mean
    for col in numerical_cols:
        train[col].fillna(train[col].mean(), inplace=True)
        test[col].fillna(test[col].mean(), inplace=True)
    
    # Fill missing categorical columns with most frequent value
    for col in categorical_cols:
        train[col].fillna(train[col].mode()[0], inplace=True)
        test[col].fillna(test[col].mode()[0], inplace=True)
    
    # Encoding categorical variables
    label_encodings = {}
    for col in categorical_cols:
        unique_vals = train[col].unique()
        label_encodings[col] = {val: i for i, val in enumerate(unique_vals)}
        train[col] = train[col].map(label_encodings[col])
        test[col] = test[col].map(lambda x: label_encodings[col].get(x, -1))  # Handle unseen categories
    
    # Feature engineering: Balance to Salary ratio
    train['Balance_to_Salary'] = train['Balance'] / (train['EstimatedSalary'] + 1)
    test['Balance_to_Salary'] = test['Balance'] / (test['EstimatedSalary'] + 1)
    
    # Standardization of numerical features
    for col in numerical_cols:
        mean = train[col].mean()
        std = train[col].std()
        train[col] = (train[col] - mean) / std
        test[col] = (test[col] - mean) / std
    
    # Split features and target
    X_train = train.drop('Exited', axis=1).values
    y_train = train['Exited'].values
    X_test = test.values
    
    return X_train, y_train, X_test

In [10]:
def cross_validate_sample(X, y, knn, n_splits=3, sample_size=0.3):
    n_samples = len(X)
    sample_size = int(sample_size * n_samples)
    indices = np.random.choice(n_samples, sample_size, replace=False)
    
    X_sample = X[indices]
    y_sample = y[indices]
    
    fold_size = sample_size // n_splits
    auc_scores = []

    for fold in range(n_splits):
        val_start = fold * fold_size
        val_end = val_start + fold_size
        
        X_val = X_sample[val_start:val_end]
        y_val = y_sample[val_start:val_end]
        
        X_train_fold = np.concatenate((X_sample[:val_start], X_sample[val_end:]))
        y_train_fold = np.concatenate((y_sample[:val_start], y_sample[val_end:]))
        
        knn.fit(X_train_fold, y_train_fold)
        y_val_pred = knn.predict(X_val)
        
        auc = compute_auc(y_val, y_val_pred)
        auc_scores.append(auc)

    return np.mean(auc_scores)

In [20]:
def compute_auc(y_true, y_pred):
    # Custom implementation to compute AUC based on true positives, false positives, etc.
    sorted_indices = np.argsort(y_pred)
    y_true_sorted = y_true[sorted_indices]
    
    total_positive = np.sum(y_true)
    total_negative = len(y_true) - total_positive
    tp, fp = 0, 0
    auc = 0
    for label in y_true_sorted:
        if label == 1:
            tp += 1
        else:
            fp += 1
            auc += tp
    
    if total_positive * total_negative == 0:
        return 0
    
    return auc / (total_positive * total_negative)

In [13]:
train_data = pd.read_csv('/mnt/data/train.csv')
test_data = pd.read_csv('/mnt/data/test.csv')

# Preprocess data
X_train, y_train, X_test = preprocess_data(train_data, test_data)

# Hyperparameter tuning
best_k = 3
best_distance = 'euclidean'
best_auc = 0

for k in range(3, 7):
    for metric in ['euclidean', 'manhattan']:
        knn = KNN(k=k, distance_metric=metric)
        auc_score = cross_validate_sample(X_train, y_train, knn, n_splits=3, sample_size=0.3)
        print(f"k: {k}, Metric: {metric}, AUC: {auc_score}")
        if auc_score > best_auc:
            best_k = k
            best_distance = metric
            best_auc = auc_score

print(f"Best k: {best_k}, Best Distance Metric: {best_distance}, Best AUC: {best_auc}")

k: 3, Metric: euclidean, AUC: 0.7603646264836792
k: 3, Metric: manhattan, AUC: 0.7652486377825815
k: 4, Metric: euclidean, AUC: 0.7441660075040418
k: 4, Metric: manhattan, AUC: 0.7667308316820064
k: 5, Metric: euclidean, AUC: 0.7509164771349983
k: 5, Metric: manhattan, AUC: 0.7804733050370333
k: 6, Metric: euclidean, AUC: 0.775610681826167
k: 6, Metric: manhattan, AUC: 0.748368750068425
Best k: 5, Best Distance Metric: manhattan, Best AUC: 0.7804733050370333


In [15]:
# Train the final model and make predictions
knn_final = KNN(k=best_k, distance_metric=best_distance)
knn_final.fit(X_train, y_train)
test_predictions = knn_final.predict(X_test)

In [16]:
# Prepare the submission file
test_data = pd.read_csv('test.csv')
submission = pd.DataFrame({
    'id': test_data['id'],
    'Exited': test_predictions
})

submission.to_csv('submission.csv', index=False)
print("Predictions saved to 'submission.csv'.")

Predictions saved to 'submission.csv'.
