In [10]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [23]:
class KNN:
    """Brute-force k-nearest-neighbours classifier implemented with NumPy.

    Training only memorises the data; every prediction computes the distance
    from the query point to all stored training rows and looks at the ``k``
    closest ones.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours consulted per query. Must be >= 1. If ``k`` is
        larger than the number of training rows, all rows are used.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used by ``compute_all_distances``. An unsupported value is
        reported with ``ValueError`` when distances are first computed.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        # Reject k <= 0 up front: slicing with [:0] or [:-1] would otherwise
        # either crash much later or silently pick the wrong neighbours.
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``X`` and ``y`` disagree on the
            number of samples.
        """
        X = np.array(X)
        y = np.array(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return the majority label among the k nearest neighbours of each row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        self._check_is_fitted()
        X = np.array(X)
        predictions = [self._predict_single(x) for x in X]
        return np.array(predictions)

    def predict_proba(self, X):
        """Return, for each row of ``X``, the mean label of its k nearest neighbours.

        With labels encoded as 0/1 this is the fraction of positive
        neighbours, i.e. a score for class 1. The labels must be numeric.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        self._check_is_fitted()
        X = np.array(X)
        probabilities = [self._predict_single_proba(x) for x in X]
        return np.array(probabilities)

    def _check_is_fitted(self):
        # Fail with a clear message instead of the opaque "None - array" TypeError.
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN instance is not fitted yet; call fit() first")

    def _k_nearest_labels(self, x):
        # Labels of the k training rows closest to x (shared by both predictors).
        distances = self.compute_all_distances(x)
        k_indices = np.argsort(distances)[:self.k]
        return self.y_train[k_indices]

    def _predict_single(self, x):
        # np.unique works for any label dtype (strings, floats, negatives),
        # unlike np.bincount. Its values come back sorted, so ties still go to
        # the smallest label, exactly as bincount + argmax did for integers.
        labels, counts = np.unique(self._k_nearest_labels(x), return_counts=True)
        return labels[np.argmax(counts)]

    def _predict_single_proba(self, x):
        return np.mean(self._k_nearest_labels(x))

    def compute_all_distances(self, x):
        """Return the distance from ``x`` to every stored training row.

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(self.X_train - x), axis=1)
        else:
            raise ValueError("Unsupported distance metric")


In [24]:
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and return scaled, one-hot-encoded feature tables.

    Parameters
    ----------
    train_path : str or path-like
        CSV containing the feature columns listed below plus the target
        column ``Exited``.
    test_path : str or path-like
        CSV containing the same feature columns, without the target.

    Returns
    -------
    X_train_processed : pandas.DataFrame
        Transformed training features, indexed like the training CSV rows.
    y_train : pandas.Series
        The ``Exited`` column of the training CSV.
    X_test_processed : pandas.DataFrame
        Transformed test features, indexed like the test CSV rows.

    Notes
    -----
    Only the columns named in ``numerical_features`` and
    ``categorical_features`` reach the output. Every other column is dropped
    by ``ColumnTransformer`` because its default is ``remainder='drop'``.
    The preprocessor is fitted on the training data only and then applied to
    the test data.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Columns the notebook treats as unnecessary. errors='ignore' lets each
    # frame drop only the ones it actually has; previously the list was
    # filtered against the train columns and then applied to the test frame,
    # which raised KeyError if the test file lacked one of them.
    drop_columns = ['CustomerId', 'Surname']
    train_data = train_data.drop(columns=drop_columns, errors='ignore')
    test_data = test_data.drop(columns=drop_columns, errors='ignore')

    # Separate features and target; the test set has no target column.
    X_train = train_data.drop(columns=['Exited'])
    y_train = train_data['Exited']
    X_test = test_data

    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_features = ['Geography', 'Gender']

    # Numerical columns: mean-impute missing values, then standardise.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: mode-impute, then one-hot encode. The first level
    # of each category is dropped.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        # Always return a dense array so the pd.DataFrame(...) wrapping below
        # is valid regardless of how sparse the one-hot block turns out.
        sparse_threshold=0,
    )

    # Fit on train only, then apply the same fitted transform to test.
    # Keeping the source index means the features stay aligned with y_train
    # for label-based lookups such as y.loc[X.index].
    X_train_processed = pd.DataFrame(preprocessor.fit_transform(X_train), index=X_train.index)
    X_test_processed = pd.DataFrame(preprocessor.transform(X_test), index=X_test.index)

    return X_train_processed, y_train, X_test_processed


In [25]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import numpy as np

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Score ``knn`` with shuffled K-fold cross-validation using ROC AUC.

    Parameters
    ----------
    X : array-like
        Feature rows; converted to a NumPy array before splitting.
    y : array-like
        Target values, one per row of ``X``.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict_proba(X)``. It is refitted
        on every fold, so it is left fitted on the last fold's training part.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    tuple
        ``(mean_auc, auc_scores)``: the mean over folds and the list of
        per-fold ROC AUC values.
    """
    features = np.array(X)
    targets = np.array(y)

    # Fixed seed keeps the fold assignment identical across calls.
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    auc_scores = []
    for fit_rows, holdout_rows in splitter.split(features):
        knn.fit(features[fit_rows], targets[fit_rows])
        holdout_scores = knn.predict_proba(features[holdout_rows])
        auc_scores.append(roc_auc_score(targets[holdout_rows], holdout_scores))

    return np.mean(auc_scores), auc_scores

# The function is now implemented and ready for use.


In [27]:
# Configuration: everything a reader might want to tune lives here.
TRAIN_PATH = '/content/train.csv'
TEST_PATH = '/content/test.csv'
SUBMISSION_PATH = 'submissions.csv'
SEED = 42
SAMPLE_FRAC = 0.1          # fraction of the training rows used for CV / tuning
CV_SPLITS = 3              # fewer folds keeps the brute-force KNN affordable
BASELINE_K = 5
K_CANDIDATES = range(1, 21)

# Load and preprocess data
X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)

# Work on a reproducible subsample; labels are looked up by the sampled index.
X_sampled = X.sample(frac=SAMPLE_FRAC, random_state=SEED)
y_sampled = y.loc[X_sampled.index]

# Baseline: cross-validate the default k on the subsample.
baseline_knn = KNN(k=BASELINE_K, distance_metric='euclidean')
mean_auc, auc_scores = cross_validate(X_sampled, y_sampled, baseline_knn, n_splits=CV_SPLITS)
print("Cross-validation scores:", auc_scores)
print("Mean AUC score:", mean_auc)

# Hyperparameter tuning: keep the baseline unless another k is strictly better.
best_k, best_auc = BASELINE_K, mean_auc
for k in K_CANDIDATES:
    if k == BASELINE_K:
        # Already scored above. cross_validate seeds its folds, so running it
        # again would reproduce the same AUC and could never win the
        # strict ">" comparison below.
        continue
    candidate = KNN(k=k, distance_metric='euclidean')
    candidate_auc, _ = cross_validate(X_sampled, y_sampled, candidate, n_splits=CV_SPLITS)
    if candidate_auc > best_auc:
        best_k, best_auc = k, candidate_auc

print(f"Optimal k found: {best_k} with AUC: {best_auc}")

# Train on the full dataset with the chosen k and score the test set.
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(np.array(X), np.array(y))
test_predictions = knn.predict_proba(np.array(X_test))

# Only the id column is needed for the submission, so read just that column.
test_ids = pd.read_csv(TEST_PATH, usecols=['id'])['id']
assert len(test_ids) == len(test_predictions), "test ids and predictions are misaligned"

submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv(SUBMISSION_PATH, index=False)


Cross-validation scores: [0.8489186719463905, 0.8514589469808541, 0.8912643861892584]
Mean AUC score: 0.8638806683721677
Optimal k found: 13 with AUC: 0.889619440983877
