## Imports

In [43]:
import numpy as np
import pandas as pd
from collections import Counter

- Customer ID: A unique identifier for each customer
- Surname: The customer's surname or last name
- Credit Score: A numerical value representing the customer's credit score
- Geography: The country where the customer resides (France, Spain or Germany)
- Gender: The customer's gender (Male or Female)
- Age: The customer's age.
- Tenure: The number of years the customer has been with the bank
- Balance: The customer's account balance
- NumOfProducts: The number of bank products the customer uses (e.g., savings account, credit card)
- HasCrCard: Whether the customer has a credit card (1 = yes, 0 = no)
- IsActiveMember: Whether the customer is an active member (1 = yes, 0 = no)
- EstimatedSalary: The estimated salary of the customer
- **Exited: Whether the customer has churned (1 = yes, 0 = no)**

## Clean Data

In [131]:
# Load the raw train/test CSVs and drop the identifier columns
# (CustomerId, Surname, id) in the same step.
train_data = pd.read_csv("train.csv").drop(columns=['CustomerId', 'Surname', 'id'])
test_data = pd.read_csv("test.csv").drop(columns=['CustomerId', 'Surname', 'id'])


## KNN Object

In [103]:
# Define the KNN class
class KNN:
    """
        k-nearest-neighbours classifier with a selectable distance metric.

        k: number of neighbours that vote on each prediction (>= 1)
        distance_metric: one of 'euclidean', 'manhattan', 'cosine',
            'minkowski' or 'hamming'
        p: order of the Minkowski distance, only used by 'minkowski'.
            The default of 2 makes 'minkowski' equal to 'euclidean'.
    """

    SUPPORTED_METRICS = ('euclidean', 'manhattan', 'cosine', 'minkowski', 'hamming')

    def __init__(self, k=3, distance_metric='euclidean', p=2):
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        if p <= 0:
            raise ValueError(f"p must be positive, got {p}")
        self.k = k
        self.distance_metric = distance_metric
        self.p = p
        # Fail at construction time instead of deep inside predict().
        self._check_metric()

    def _check_metric(self):
        """Raise ValueError when the configured metric is not supported."""
        if self.distance_metric not in self.SUPPORTED_METRICS:
            raise ValueError(
                f"Unknown distance_metric {self.distance_metric!r}; "
                f"expected one of {self.SUPPORTED_METRICS}"
            )

    def _as_features(self, X):
        """
            Convert X (array, list or DataFrame) to a numpy array.

            Numeric metrics get a float array, so object-dtype input (for
            example mixed float/bool columns) and unsigned integers behave
            like ordinary numbers. 'hamming' only compares for equality, so
            its input is left as it is.
        """
        if self.distance_metric == 'hamming':
            return np.asarray(X)
        return np.asarray(X, dtype=float)

    def fit(self, X, y):
        """
            Store the training set; KNN does no further training.

            X: training features, shape (n_samples, n_features)
            y: training labels, one per row of X
        """
        # np.array copies, so later changes to the caller's data do not leak in.
        self.X_train = np.array(self._as_features(X))
        self.y_train = np.array(y)

    def predict(self, X):
        """
            X: features of the points to classify, one point per row
               (array, list of rows or DataFrame)

            Returns a numpy array with one predicted label per row.
        """
        # Converting first means a DataFrame is iterated by row, not by column label.
        X = self._as_features(X)
        return np.array([self.predict_single_point(row) for row in X])

    def predict_single_point(self, x):
        """Return the majority label among the k training points nearest to x."""
        # distances from the test point to all training points, in one numpy call
        distances = self._distances_to_rows(self.X_train, self._as_features(x))

        # indices of the k nearest neighbours; the stable sort breaks distance
        # ties by training order, and NaN distances are sorted last
        k_indices = np.argsort(distances, kind='stable')[:self.k]

        # labels of the k nearest neighbours, nearest first
        k_nearest_labels = self.y_train[k_indices]

        # most common label; on a tied vote Counter keeps the label seen first,
        # i.e. the one belonging to the nearer neighbour
        return Counter(k_nearest_labels).most_common(1)[0][0]

    def compute_distance(self, X1, X2):
        """
            Distance between two single points under the configured metric.

            Raises ValueError if the metric is not supported.
        """
        x1 = self._as_features(X1)
        x2 = self._as_features(X2)
        return self._distances_to_rows(x2[np.newaxis, :], x1)[0]

    def _distances_to_rows(self, rows, x):
        """Distance from point x to every row of the 2-D array rows."""
        self._check_metric()
        metric = self.distance_metric

        if metric == 'hamming':
            # number of positions where the two points differ
            return np.sum(rows != x, axis=1)

        if metric == 'cosine':
            dot_products = rows @ x
            norm_products = np.linalg.norm(rows, axis=1) * np.linalg.norm(x)
            # A zero-length vector has no direction: the division yields NaN,
            # which the neighbour sort places last. Only the warning is silenced.
            with np.errstate(divide='ignore', invalid='ignore'):
                return 1 - dot_products / norm_products

        diff = rows - x
        if metric == 'euclidean':
            return np.sqrt(np.sum(diff * diff, axis=1))
        if metric == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        # Only 'minkowski' is left once _check_metric has passed.
        return np.sum(np.abs(diff) ** self.p, axis=1) ** (1 / self.p)
        

## Pre-processing Function

In [230]:
# Define data preprocessing function

def standard_scaler(data, columns):
    """
        Standardise the given columns of a DataFrame to zero mean / unit std.

        data: pandas DataFrame (left untouched; a scaled copy is returned)
        columns: labels of the numeric (int/float) columns to scale

        Formula: (X - mean) / std, with std being the pandas sample
        standard deviation of the column.

        A column whose std is 0 (constant) or NaN (e.g. a single row) is only
        centred, not divided, so it does not turn into NaN/inf.
    """
    scaled_data = data.copy()
    for col in columns:
        mean = scaled_data[col].mean()
        std = scaled_data[col].std()
        # `not std > 0` is True for 0 and for NaN alike.
        if not std > 0:
            std = 1.0
        scaled_data[col] = (scaled_data[col] - mean) / std
    return scaled_data

def one_hot_encode(data, columns):
    """
        One-hot encode the categorical columns of a DataFrame.

        data: pandas DataFrame
        columns: labels of the categorical columns to encode

        The first category of every encoded column is dropped
        (drop_first=True), so k categories become k - 1 indicator columns.
        Indicators are emitted as floats so that the frame converts to a
        plain numeric array together with the scaled numeric columns.
    """
    return pd.get_dummies(data, columns=columns, drop_first=True, dtype=float)

def preprocess_data(train_data, test_data):
    """
        Turn the train/test frames into numeric arrays for the KNN model.

        train_data: full training data as a pandas df, including 'Exited'
        test_data: test data as a pandas df with the same feature columns
                   (assumed to share train's column names and dtypes)

        Numeric (float64/int64) columns are standardised and object columns
        are one-hot encoded. Both steps are defined by the TRAINING data:
        the test set is scaled with the training mean/std and its encoded
        columns are aligned to the training columns, so both arrays live in
        the same feature space.

        Returns (X_train, y_train, X_test) as numpy arrays.
    """
    y_train = train_data['Exited']
    X_train = train_data.drop(columns=['Exited'])
    X_test = test_data

    # Column roles are decided once, on the training features.
    numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    X_train_scaled = standard_scaler(X_train, numerical_cols)

    # Scale the test set with the statistics of the training set.
    X_test_scaled = X_test.copy()
    for col in numerical_cols:
        mean = X_train[col].mean()
        std = X_train[col].std()
        if not std > 0:  # constant (or single-row) column: centre only
            std = 1.0
        X_test_scaled[col] = (X_test[col] - mean) / std

    X_train_processed = one_hot_encode(X_train_scaled, categorical_cols)

    # Encode every test category, then keep exactly the training columns in
    # the training order. A category missing from the test set becomes an
    # all-zero column; a category unseen in training is discarded.
    X_test_processed = pd.get_dummies(X_test_scaled, columns=categorical_cols)
    X_test_processed = X_test_processed.reindex(columns=X_train_processed.columns, fill_value=0)

    return (
        X_train_processed.to_numpy(dtype=float),
        y_train.values,
        X_test_processed.to_numpy(dtype=float),
    )

## Cross-validation Function

In [111]:
# Define cross-validation function

def manual_kfold_split(X, y, n_splits=5):
    """
        Split row positions 0..len(X)-1 into consecutive, unshuffled folds.

        X: training features (only its length is used)
        y: training labels (only used to check the length matches X)
        n_splits: number of folds, between 2 and len(X)

        Returns a list of (train_indices, val_indices) tuples, one per fold.
        When len(X) is not divisible by n_splits the leftover rows are spread
        one each over the first folds, so fold sizes differ by at most one.

        Raises ValueError for an invalid n_splits or mismatched lengths.
    """
    n_samples = len(X)
    if y is not None and len(y) != n_samples:
        raise ValueError(f"X has {n_samples} rows but y has {len(y)}")
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of rows ({n_samples}), got {n_splits}"
        )

    indices = np.arange(n_samples)
    folds = []
    start = 0
    for val_indices in np.array_split(indices, n_splits):
        end = start + len(val_indices)
        # everything outside the validation window is training data
        train_indices = np.concatenate([indices[:start], indices[end:]])
        folds.append((train_indices, val_indices))
        start = end

    return folds

def compute_roc_auc(y_true, y_pred):
    """
        ROC AUC for hard 0/1 predictions.

        y_true: array-like of actual classes (0 or 1)
        y_pred: array-like of predicted classes (0 or 1), same shape

        With class labels instead of scores the ROC curve has a single
        operating point, so the area is (1 + TPR - FPR) / 2.
        A rate whose denominator is empty (no positives / no negatives in
        y_true) is taken as 0.

        Raises ValueError if the two inputs differ in shape.
    """
    # Plain lists would make `y_true == 1` a single bool instead of a mask.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    # True positives, true negatives, false positives, false negatives
    TP = np.sum((y_true == 1) & (y_pred == 1))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))

    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0
    FPR = FP / (FP + TN) if (FP + TN) > 0 else 0

    auc = (1 + TPR - FPR) / 2

    return auc

def cross_validate(X, y, knn, n_splits=5):
    """
        K-fold cross-validation of a KNN model, scored with compute_roc_auc.

        X: training data features
        y: train data results
        knn: knn object (used as a template; it is not refit)
        n_splits: number of folds for cross-validation

        Returns the mean AUC over the folds.
    """
    import copy

    # Positional row indexing below needs arrays (a DataFrame would be
    # indexed by column label instead).
    X = np.asarray(X)
    y = np.asarray(y)

    # split data
    folds = manual_kfold_split(X, y, n_splits)
    cv_scores = []

    for train_index, val_index in folds:
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Fit a copy so the caller's model keeps whatever it was trained on.
        # A shallow copy is enough when fit() rebinds its attributes, as
        # KNN.fit does - TODO confirm before passing other model types.
        fold_model = copy.copy(knn)
        fold_model.fit(X_train, y_train)

        y_val_pred = fold_model.predict(X_val)

        cv_scores.append(compute_roc_auc(y_val, y_val_pred))

    mean_auc = np.mean(cv_scores)

    return mean_auc

In [129]:
# Sample for cross-validation
# Build the scaled / one-hot-encoded arrays from the cleaned frames.
X_train, y_train, X_test = preprocess_data(train_data, test_data)

# KNN's 'minkowski' metric uses exponent 2 here, so it matches 'euclidean'.
knn = KNN(k=5, distance_metric='minkowski')
knn.fit(X_train, y_train)

# Only the first 1000 rows are cross-validated. cross_validate does its own
# fitting on each fold's training split and returns the MEAN AUC over the
# folds, so cv_scores holds a single number despite its name.
cv_scores = cross_validate(X_train[:1000], y_train[:1000], knn)

print("Cross-validation scores:", cv_scores)

Cross-validation scores: 0.7541379063441267


## Make Prediction Function

In [231]:
# Make predictions on test

def make_prediction(n_neighbors, X_train, y_train, X_test=None):
    """
        Fit a euclidean KNN on the training data and predict the test set.

        n_neighbors is k value for knn
        X_train and y_train is features + results of training data as np array
        X_test is the test features as np array. When omitted, the notebook
        global `X_test` is used, which keeps the 3-argument calls working.

        Returns the predicted labels as a numpy array.
        Raises NameError if X_test is neither passed nor defined globally.
    """
    if X_test is None:
        # The parameter shadows the global of the same name, so look it up explicitly.
        try:
            X_test = globals()['X_test']
        except KeyError:
            raise NameError(
                "X_test was not passed and no global 'X_test' is defined"
            ) from None

    knn = KNN(k=n_neighbors, distance_metric='euclidean')
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)

    return predictions

In [116]:
# Hyperparameter Tuning
def hparam_tune(max_range, distance_metric, X_train, y_train, n_samples=1000):
    """
        Pick the k in 1..max_range with the best cross-validated AUC.

        max_range: largest k to try (>= 1)
        distance_metric: metric name passed on to KNN
        X_train, y_train: training features / labels as np arrays
        n_samples: only the first n_samples rows are cross-validated, to keep
            the search fast; pass None to use every row

        Prints the score of every k and returns the best k. On equal scores
        the smallest k wins.
        Raises ValueError if max_range is below 1.
    """
    if max_range < 1:
        raise ValueError(f"max_range must be at least 1, got {max_range}")

    # The subset is the same for every k, so slice it once.
    X_cv = X_train[:n_samples]
    y_cv = y_train[:n_samples]

    n_neighbors_values = range(1, max_range+1)
    all_cv_scores = []

    for n_neighbors in n_neighbors_values:
        # No fit here: cross_validate fits on each fold's training split.
        knn = KNN(k=n_neighbors, distance_metric=distance_metric)

        cv_score = cross_validate(X_cv, y_cv, knn)
        all_cv_scores.append(cv_score)
        print(f"cv scores for k={n_neighbors} is {cv_score}")

    k_best = n_neighbors_values[np.argmax(all_cv_scores)]
    print(f"best result is {k_best} neighbors")
    return k_best

# Try k = 1..10 with the euclidean metric on the unbalanced training data;
# the returned best k is shown as the cell output.
hparam_tune(10, 'euclidean', X_train, y_train)

cv scores for k=1 is 0.7358824911889486
cv scores for k=2 is 0.7358824911889486
cv scores for k=3 is 0.738076576159838
cv scores for k=4 is 0.7447870144522485
cv scores for k=5 is 0.7541379063441267
cv scores for k=6 is 0.7395440117441825
cv scores for k=7 is 0.7237811729741781
cv scores for k=8 is 0.7342569105004643
cv scores for k=9 is 0.714410556566554
cv scores for k=10 is 0.7374420040870776
best result is 5 neighbors


5

## Balance Dataset

In [16]:
# UNDERSAMPLE MAJORITY
import random
random.seed(1212)

train_data_neg = train_data[train_data['Exited'] == 0] # this is the majority
train_data_pos = train_data[train_data['Exited'] == 1] # this is the minority

# Number of majority rows to discard so both classes end up the same size.
samples_to_remove = len(train_data_neg) - len(train_data_pos)

# Pick the rows to discard by row index, not by the 'id' column: the Clean Data
# cell drops 'id', so looking it up here raises KeyError on a top-to-bottom run.
# Assumes train_data still has the unique default index it got from read_csv.
rand_index = random.sample(list(train_data_neg.index), samples_to_remove)

train_data_balanced_1 = train_data[~train_data.index.isin(rand_index)]

# Get new X_train and y_train
X_train_balanced, y_train_balanced, X_test = preprocess_data(train_data_balanced_1, test_data)

k_best = hparam_tune(20, 'euclidean', X_train_balanced, y_train_balanced)
balanced_1_predictions = make_prediction(k_best, X_train_balanced, y_train_balanced)


cv scores for k=1 is 0.7587379768166812
cv scores for k=2 is 0.7587379768166812
cv scores for k=3 is 0.7860051218139782
cv scores for k=4 is 0.7848117883191423
cv scores for k=5 is 0.796381678773088
cv scores for k=6 is 0.7999965748291029
cv scores for k=7 is 0.8072815193923806
cv scores for k=8 is 0.7963958661798161
cv scores for k=9 is 0.7993907132554077
cv scores for k=10 is 0.7988064145747005
cv scores for k=11 is 0.7928565017446847
cv scores for k=12 is 0.7994121396254967
cv scores for k=13 is 0.7911051350869723
cv scores for k=14 is 0.795374322696998
cv scores for k=15 is 0.7820718690491231
cv scores for k=16 is 0.7894972150493632
cv scores for k=17 is 0.7802404802013365
cv scores for k=18 is 0.7875258543696313
cv scores for k=19 is 0.7792454766308342
cv scores for k=20 is 0.7870901969298033
best result is 7 neighbors


In [18]:
# OVERSAMPLE MINORITY
train_data_neg = train_data[train_data['Exited'] == 0] # this is the majority
train_data_pos = train_data[train_data['Exited'] == 1] # this is the minority

# Number of extra minority rows needed to match the majority class size.
samples_to_add = len(train_data_neg) - len(train_data_pos)

# Draw the extra minority rows with replacement (fixed seed), append them to
# the full data, then shuffle so the two classes are interleaved.
oversampled_minority = train_data_pos.sample(n=samples_to_add, replace=True, random_state=1212)
train_data_balanced_2 = pd.concat([train_data_neg, train_data_pos, oversampled_minority])
train_data_balanced_2 = train_data_balanced_2.sample(frac=1, random_state=1212).reset_index(drop=True)

# Overwrites X_train_balanced / y_train_balanced / X_test from earlier cells.
X_train_balanced, y_train_balanced, X_test = preprocess_data(train_data_balanced_2, test_data)

# NOTE(review): minority rows are duplicated BEFORE cross-validation, so copies
# of one row can presumably land in both the training and the validation fold,
# which would make these CV scores optimistic - confirm before comparing runs.
k_best = hparam_tune(20, 'euclidean', X_train_balanced, y_train_balanced)
balanced_2_predictions = make_prediction(k_best, X_train_balanced, y_train_balanced)


cv scores for k=1 is 0.7872382799243968
cv scores for k=2 is 0.7872382799243968
cv scores for k=3 is 0.8056358270528111
cv scores for k=4 is 0.8102098437641343
cv scores for k=5 is 0.8275862037984186
cv scores for k=6 is 0.8213402517735935
cv scores for k=7 is 0.8214284030384629
cv scores for k=8 is 0.8266439668106169
cv scores for k=9 is 0.8200016535116754
cv scores for k=10 is 0.8238991406920478
cv scores for k=11 is 0.8213796596566851
cv scores for k=12 is 0.8149125914829922
cv scores for k=13 is 0.8158673053564417
cv scores for k=14 is 0.8200265634495807
cv scores for k=15 is 0.8159889428768906
cv scores for k=16 is 0.8193300124277384
cv scores for k=17 is 0.8143879435841045
cv scores for k=18 is 0.8215750230545644
cv scores for k=19 is 0.8099824376280539
cv scores for k=20 is 0.81520386505682
best result is 5 neighbors


## Dropping Features

In [22]:
def preprocess_data_new(train_data, test_data, columns_to_keep):
    """
        Like preprocess_data, but keeps only a chosen subset of the
        processed feature columns.

        train_data is full train data as pandas df (must contain 'Exited')
        same for test_data (without 'Exited')

        columns_to_keep is a list of column names to keep AFTER scaling and
        one-hot encoding (so encoded columns are named e.g. 'Gender_Male')

        Features are selected by name: the identifier columns ('id',
        'CustomerId', 'Surname') and the target are dropped when present.
        For the raw CSV layout this presumably matches the former positional
        slice iloc[:, 3:13] (verify against train.csv), and it also works for
        frames that had the identifiers removed already.

        Numeric columns are standardised with the TRAINING mean/std (also for
        the test set); object columns are one-hot encoded.

        Returns (X_train, y_train, X_test) as numpy arrays.
    """
    non_feature_cols = ['id', 'CustomerId', 'Surname', 'Exited']

    y_train = train_data['Exited']
    X_train = train_data.drop(columns=non_feature_cols, errors='ignore')
    X_test = test_data.drop(columns=non_feature_cols, errors='ignore')

    # Column roles are decided once, on the training features.
    numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    X_train_scaled = standard_scaler(X_train, numerical_cols)

    # Scale the test set with the statistics of the training set.
    X_test_scaled = X_test.copy()
    for col in numerical_cols:
        mean = X_train[col].mean()
        std = X_train[col].std()
        if not std > 0:  # constant (or single-row) column: centre only
            std = 1.0
        X_test_scaled[col] = (X_test[col] - mean) / std

    X_train_processed = one_hot_encode(X_train_scaled, categorical_cols)[columns_to_keep]

    # Encode every test category and align to the requested columns; an
    # encoded column the test set lacks is filled with zeros.
    X_test_processed = pd.get_dummies(X_test_scaled, columns=categorical_cols)
    X_test_processed = X_test_processed.reindex(columns=columns_to_keep, fill_value=0)

    return (
        X_train_processed.to_numpy(dtype=float),
        y_train.values,
        X_test_processed.to_numpy(dtype=float),
    )

In [29]:
# based on ... manually trying a bunch of combinations
# (an earlier candidate was:
#  ['Age', 'Balance', 'EstimatedSalary', 'CreditScore', 'NumOfProducts'])
columns_to_keep = ['Age', 'IsActiveMember', 'CreditScore', 'Balance', 'NumOfProducts']

# Rebuild the features from the oversampled training frame and the raw test CSV,
# keeping only the selected columns, then tune k on them.
X_train_new, y_train_new, X_test = preprocess_data_new(
    train_data_balanced_2,
    pd.read_csv("test.csv"),
    columns_to_keep,
)

k_best = hparam_tune(20, 'euclidean', X_train_new, y_train_new)

cv scores for k=1 is 0.8284362255118047
cv scores for k=2 is 0.8284362255118047
cv scores for k=3 is 0.8119776481558472
cv scores for k=4 is 0.825337474396329
cv scores for k=5 is 0.8064979692056982
cv scores for k=6 is 0.8242242298570126
cv scores for k=7 is 0.819937515758307
cv scores for k=8 is 0.8206709816499395
cv scores for k=9 is 0.8157889403237697
cv scores for k=10 is 0.819613086804792
cv scores for k=11 is 0.8168783592246252
cv scores for k=12 is 0.8194999368827036
cv scores for k=13 is 0.808532372585747
cv scores for k=14 is 0.8138859859676467
cv scores for k=15 is 0.8110843804935488
cv scores for k=16 is 0.8221276638371011
cv scores for k=17 is 0.8184793367930393
cv scores for k=18 is 0.8210572686591833
cv scores for k=19 is 0.8116100892447585
cv scores for k=20 is 0.8223594500013547
best result is 1 neighbors


In [30]:
# Predict the test set with the k chosen on the reduced feature set.
# make_prediction always uses the euclidean metric.
new_predictions = make_prediction(k_best, X_train_new, y_train_new)

## Apply SVD

In [55]:
import numpy as np
import pandas as pd

def apply_svd_custom(X_train, X_test, n_components):
    """
        Rank-n_components approximation of train and test via truncated SVD.

        X_train, X_test: feature matrices (array-like, same number of columns)
        n_components: number of singular vectors to keep

        The decomposition is computed on the TRAINING data only, and both
        matrices are projected onto its top right-singular vectors. A separate
        decomposition of the test set would put it in a different subspace.

        Returns (train_reduced, test_reduced), each with the same shape as its
        input but of rank at most n_components.
    """
    def _to_float_matrix(X):
        # Ensure the data is numeric; anything non-numeric becomes 0
        return pd.DataFrame(X).apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy().astype(np.float64)

    X_train = _to_float_matrix(X_train)
    X_test = _to_float_matrix(X_test)

    # Apply SVD to the training data only
    U_train, S_train, Vt_train = get_svd(X_train, n_components)

    # Dimension mismatch checks
    print(f"U_train dims are: {U_train.shape}")
    print(f"S_train dims are: {S_train.shape}")
    print(f"Vt_train dims are: {Vt_train.shape}")

    # Train: U S V^T is the rank-n_components reconstruction.
    train_reduced = np.dot(U_train, np.dot(S_train, Vt_train))

    # Test: project onto the same subspace, X V V^T, using the training V.
    test_reduced = np.dot(np.dot(X_test, np.transpose(Vt_train)), Vt_train)

    return train_reduced, test_reduced

def get_A_t_A(dataset):
    """Return the Gram matrix A^T A of the dataset and print its dtype."""
    gram = np.dot(np.transpose(dataset), dataset)
    print(f"A_t_A type: {gram.dtype}")  # Debug: Check the type of A_t_A
    return gram

def get_s_matrix(A_t_A, n_components):
    """
        Diagonal matrix of the n_components largest singular values of A.

        A_t_A: the symmetric matrix A^T A
        n_components: number of singular values to keep

        The singular values are the square roots of the eigenvalues of
        A^T A, sorted in descending order.

        Raises ValueError if A_t_A contains NaN or infinity.
    """
    # Ensure A_t_A is a float64 array
    A_t_A = np.asarray(A_t_A).astype(np.float64)

    # Check for NaNs and Infs
    if not np.isfinite(A_t_A).all():
        raise ValueError("Input matrix contains NaN or infinity values.")

    # A^T A is symmetric, so use the symmetric solver: its eigenvalues are
    # real and come back in ascending order.
    eigenvalues_desc = np.linalg.eigvalsh(A_t_A)[::-1]

    # Round-off can push a zero eigenvalue slightly below zero; clamp it.
    eigenvalues_desc = np.where(eigenvalues_desc < 0, 0, eigenvalues_desc)

    # Take the square root of the eigenvalues to get singular values
    singular_values = np.sqrt(eigenvalues_desc[:n_components])

    # Create a diagonal matrix of singular values
    return np.diag(singular_values)

def get_V_t_matrix(A_t_A, n_components):
    """
        V^T restricted to the n_components leading right-singular vectors.

        A_t_A: the symmetric matrix A^T A
        n_components: number of singular vectors to keep

        Returns an array of shape (n_components, n_features) whose rows are
        the eigenvectors of A^T A, ordered by descending eigenvalue.
    """
    A_t_A = np.asarray(A_t_A).astype(np.float64)

    # A^T A is symmetric, so use the symmetric solver: real eigenvalues in
    # ascending order with orthonormal eigenvectors as columns.
    _, eigenvectors = np.linalg.eigh(A_t_A)

    # Reverse the columns to get descending eigenvalue order, then truncate.
    v_matrix_reduced = eigenvectors[:, ::-1][:, :n_components]

    # Transpose the matrix to get V^T
    return np.transpose(v_matrix_reduced)

def get_U_matrix(dataset, V_t, S):
    """Return U = A * V * S^+, where S^+ is the pseudo-inverse of S."""
    v_times_s_inv = np.dot(np.transpose(V_t), np.linalg.pinv(S))
    return np.dot(dataset, v_times_s_inv)

def get_svd(dataset, n_components):
    """
        Truncated SVD of the dataset built from the eigen-decomposition of A^T A.

        Returns (U, S, V^T), each limited to n_components components.
    """
    gram = get_A_t_A(dataset)

    # singular values (diagonal) and right-singular vectors, both from A^T A
    sigma = get_s_matrix(gram, n_components)
    v_t = get_V_t_matrix(gram, n_components)

    # left-singular vectors follow from the dataset, V^T and S
    u = get_U_matrix(dataset, v_t, sigma)

    return u, sigma, v_t


In [67]:
# A = USV^T
# Keep 2 components. Uses whichever X_train / X_test are currently in the
# notebook namespace.
X_train_svd, X_test_svd  = apply_svd_custom(X_train, X_test, 2)

A_t_A type: float64
A_t_A type: float64
U_train dims are: (15000, 2)
S_train dims are: (2, 2)
Vt_train dims are: (2, 11)


In [68]:
# Didn't work well
# (the recorded scores for k = 1..5 on the SVD features are all below 0.69)
hparam_tune(5, 'euclidean', X_train_svd, y_train)

cv scores for k=1 is 0.6704704530969596
cv scores for k=2 is 0.6704704530969596
cv scores for k=3 is 0.6790075914774708
cv scores for k=4 is 0.681545094360975
cv scores for k=5 is 0.6873488643914674
best result is 5 neighbors


## Feature Extraction

In [234]:
# Reload both CSVs from disk and drop the identifier columns
# (CustomerId, Surname, id), so this section starts from fresh frames.
train_data = pd.read_csv("train.csv").drop(columns=['CustomerId', 'Surname', 'id'])
test_data = pd.read_csv("test.csv").drop(columns=['CustomerId', 'Surname', 'id'])

def feature_extract(df):
    """
        Replace the three activity columns with their sum.

        df: pandas DataFrame containing 'NumOfProducts', 'HasCrCard' and
            'IsActiveMember'

        Returns a NEW frame in which those three columns are removed and
        'TotalActivity' (their sum) is appended as the last column. The input
        frame is left unchanged.

        Other engineered features that were tried and left disabled:
        BalanceToSalaryRatio = Balance / EstimatedSalary (replacing both) and
        TenureAgeRatio = Tenure / Age (replacing both).
    """
    activity_cols = ['NumOfProducts', 'HasCrCard', 'IsActiveMember']
    total_activity = df['NumOfProducts'] + df['HasCrCard'] + df['IsActiveMember']

    # drop + assign both return new frames, so the caller's df is not modified
    return df.drop(columns=activity_cols).assign(TotalActivity=total_activity)

# Apply the same feature engineering to both frames.
# NOTE: this rebinds train_data / test_data, so the cell is not re-runnable
# without first re-running the reload above (the source columns are gone).
train_data = feature_extract(train_data)
test_data = feature_extract(test_data)

# OVERSAMPLE MINORITY
train_data_neg = train_data[train_data['Exited'] == 0] # this is the majority
train_data_pos = train_data[train_data['Exited'] == 1] # this is the minority

# # DROP ROWS THAT HAVE BALANCE = 0 AND DID NOT EXIT
# train_data_neg = train_data_neg[~((train_data_neg['Balance'] == 0) & (train_data_neg['Exited'] == 0))]

# Number of extra minority rows needed to match the majority class size.
samples_to_add = len(train_data_neg) - len(train_data_pos)

# Draw the extra minority rows with replacement (fixed seed), append them,
# then shuffle the combined frame.
oversampled_minority = train_data_pos.sample(n=samples_to_add, replace=True, random_state=1212)
train_data_balanced = pd.concat([train_data_neg, train_data_pos, oversampled_minority])
train_data_balanced = train_data_balanced.sample(frac=1, random_state=1212).reset_index(drop=True)

# Remove the same three raw features from train and test so their columns stay aligned.
train_data_balanced = train_data_balanced.drop(columns=['Balance', 'Tenure', 'EstimatedSalary'], axis=1)
test_data = test_data.drop(columns=['Balance', 'Tenure', 'EstimatedSalary'], axis=1)

# Overwrites X_train_balanced / y_train_balanced / X_test from earlier sections.
X_train_balanced, y_train_balanced, X_test = preprocess_data(train_data_balanced, test_data)


In [233]:
# Sanity check of the feature matrix shape.
# NOTE(review): the recorded output of this cell and of the
# y_train_balanced.shape cell disagree on the row count, so they were
# presumably run against different states - re-run both after preprocessing.
X_train_balanced.shape

(10000, 9)

In [228]:
# Sanity check: the number of labels should equal the number of rows of X_train_balanced.
y_train_balanced.shape

(23934,)

In [204]:
# WITH TotalActivity
# Search k = 1..40 with the euclidean metric on the engineered features.
k_best = hparam_tune(40, 'euclidean', X_train_balanced, y_train_balanced)

cv scores for k=1 is 0.7640758303668094
cv scores for k=2 is 0.7640758303668094
cv scores for k=3 is 0.7799673843861004
cv scores for k=4 is 0.7833831681089535
cv scores for k=5 is 0.8083262710941191
cv scores for k=6 is 0.8102246690383053
cv scores for k=7 is 0.8104154249053014
cv scores for k=8 is 0.8112164291727331
cv scores for k=9 is 0.8091756025959542
cv scores for k=10 is 0.8165997817995596
cv scores for k=11 is 0.812272174178671
cv scores for k=12 is 0.8083934945213078
cv scores for k=13 is 0.8081137992136554
cv scores for k=14 is 0.8158549464360684
cv scores for k=15 is 0.8171260484304963
cv scores for k=16 is 0.813417703947876
cv scores for k=17 is 0.8085990540371787
cv scores for k=18 is 0.8133726915503937
cv scores for k=19 is 0.8104067278936933
cv scores for k=20 is 0.816914169915154
cv scores for k=21 is 0.8181492745850413
cv scores for k=22 is 0.8179968975025546
cv scores for k=23 is 0.8179511952286299
cv scores for k=24 is 0.8161080638355086
cv scores for k=25 is 0.8141

In [223]:
# WITH BalanceToSalaryRatio ADDED
# NOTE(review): this run assumes feature_extract was edited to add
# BalanceToSalaryRatio, which is not what the feature_extract definition in
# this notebook computes - confirm which features were in use. The recorded
# run was interrupted (KeyboardInterrupt), so k_best was not updated by it.
k_best = hparam_tune(20, 'euclidean', X_train_balanced, y_train_balanced)

cv scores for k=1 is 0.7640758303668094
cv scores for k=2 is 0.7640758303668094


KeyboardInterrupt: 

## Save Submission

In [235]:
# Final predictions for the submission. k_best comes from whichever
# hparam_tune cell completed last, so re-run the intended tuning cell first.
new_predictions = make_prediction(k_best, X_train_balanced, y_train_balanced)

In [193]:
# Write the submission file: the ids re-read from test.csv next to the predicted labels.
pd.DataFrame(
    {
        'id': pd.read_csv('test.csv')['id'],
        'Exited': new_predictions,
    }
).to_csv('submissions.csv', index=False)