In [1]:
import numpy as np
import pandas as pd

In [27]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.pts = X.to_numpy()
        self.y = y.to_numpy()
        pass

    def predict(self, X):
        Xnp = X.to_numpy()
        out = []
        for prept in Xnp: 
            mdist = [] #find nearest neighbors to each point
            for spt in range(self.k):
                mdist.append((self.compute_distance(prept, self.pts[spt]), spt))
            mdist.sort(reverse=True)
            for compt in range(self.k, len(self.pts)):
                dist = self.compute_distance(self.pts[compt], prept)
                if dist < mdist[0][0]:
                    mdist[0] = (dist, compt)
                    mdist.sort(reverse=True)
            #perform majority vote (using distance weighing this got me a better roc auc)
            votes = weight = 0.0
            for dist, idx in mdist:
                vote = self.y[idx]
                wght = 10**10
                if dist != 0:
                    whgt = 1 / dist
                votes += vote * wght
                weight += wght
            out.append(votes/weight)
        return pd.Series(out)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1-X2)) #manhattan (got higher roc auc)
        return np.linalg.norm(X1-X2) #defaults to euclidean
        


In [22]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.

    train_data = train_data.drop(['id','CustomerId'], axis=1) #dropping id column
    test_data = test_data.drop(['id','CustomerId'], axis=1) #dropping id column

    x = train_data.drop('Exited', axis=1)
    y = train_data['Exited']

    nums = x.select_dtypes(include=['float64','int64']).columns
    cats = x.select_dtypes(include=['object']).columns

    for col in nums:
        test_data[col] = (test_data[col] - x[col].mean()) / x[col].std()
        x[col] = (x[col]-x[col].mean()) / x[col].std()
    for meow in cats:
        unique = pd.concat([x[meow], test_data[meow]]).unique().tolist()
        udict = {value: np.sqrt(index/len(unique))-0.5 for index, value in enumerate(unique)}
        x[meow] = x[meow].map(udict).fillna(-1)
        test_data[meow] = test_data[meow].map(udict).fillna(-1)

    return x, y, test_data

In [29]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    splitamt = len(X) / n_splits

    roc_auc_scores = []

    for x in range(n_splits):
        st = int(x * splitamt)
        nd = int((x + 1) * splitamt)

        cxtest, cytest = X.iloc[st:nd], y.iloc[st:nd]
        cxtrain, cytrain = pd.concat([X.iloc[:st], X.iloc[nd:]], axis=0), pd.concat([y.iloc[:st], y.iloc[nd:]], axis=0)
        #print(cxtest, cytest, cxtrain, cytrain)
        knn.fit(cxtrain, cytrain)

        pred = knn.predict(cxtest)

        score = calculate_roc_auc(cytest.to_numpy(), pred.to_numpy())
        roc_auc_scores.append(score)

        print("fold ", (x+1), " out of ", n_splits, ". Score: ", score)
    return roc_auc_scores

def calculate_roc_auc(y_true, y_scores):
    sorted_indices = np.argsort(-y_scores)
    y_true_sorted = y_true[sorted_indices]
    n_positives = sum(y_true)
    n_negatives = len(y_true) - n_positives

    tpr = fpr = []

    tp = fp = 0

    for i in range(len(y_true_sorted)):
        if y_true_sorted[i] == 1:
            tp += 1
        else:
            fp += 1

        tpr.append(tp / n_positives) 
        fpr.append(fp / n_negatives) 

    auc = 0.0 #trapezoidal rule
    for i in range(1, len(fpr)):
        auc += (fpr[i] - fpr[i-1]) * (tpr[i] + tpr[i-1]) / 2

    return auc

In [26]:
# Load and preprocess data
X, y, X_test = preprocess_data('./files/506_5/train.csv', './files/506_5/test.csv')

# Create and evaluate model
knn = KNN(k=82, distance_metric='manhattan')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=82, distance_metric='manhattan')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('./files/506_5/test.csv')['id'], 'Exited': test_predictions}).to_csv('./files/506_5/submissions.csv', index=False)

fold  1  out of  5 . Score:  0.9107251808594256
fold  2  out of  5 . Score:  0.9218112129184443
fold  3  out of  5 . Score:  0.9106051831675743
fold  4  out of  5 . Score:  0.9191194761688841
fold  5  out of  5 . Score:  0.9130751254645108
Cross-validation scores: [np.float64(0.9107251808594256), np.float64(0.9218112129184443), np.float64(0.9106051831675743), np.float64(0.9191194761688841), np.float64(0.9130751254645108)]
