In [2]:
import numpy as np
import pandas as pd


In [34]:
# Define the majority-vote helper and the KNN class

def most_common(arr):
    """Return a vote-based probability and the majority label of ``arr``.

    The majority label is the value that occurs most often in ``arr``; when
    several values share the top count, the smallest of them is chosen
    (``np.unique`` returns sorted values and ``np.argmax`` keeps the first
    maximum).

    The probability is the majority's share of the votes when the majority
    label is greater than zero, and one minus that share otherwise. For 0/1
    labels this is the fraction of votes cast for label 1.

    Args:
        arr: Non-empty array-like of labels.

    Returns:
        tuple: ``(prob, label)`` where ``prob`` is rounded to two decimals.
    """
    total = len(arr)
    labels, tallies = np.unique(arr, return_counts=True)

    # Position of the most frequent label (first one wins on ties).
    winner = np.argmax(tallies)
    label = labels[winner]

    share = tallies[winner] / total
    prob = share if label > 0 else 1 - share
    return round(prob, 2), label
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    The first column of every sample is excluded from the distance
    computation (the code treats it as a row index rather than a feature).
    """

    def __init__(self, k):
        """Store ``k``, the number of neighbours that vote on each prediction."""
        self.k = k

    def fit(self, X, y):
        """Remember the training samples ``X`` and labels ``y``; nothing is computed."""
        self.X_train, self.y_train = X, y

    def predict(self, input_data):
        """Classify every row of ``input_data``.

        For each row, the ``k`` training samples with the smallest distance
        are looked up and their labels are summarised by ``most_common``.

        Returns:
            tuple: ``(predictions, probabilities)``, two lists with one entry
            per input row.
        """
        predictions, probabilities = [], []
        for sample in input_data:
            dist = self.compute_distance(self.X_train, sample)
            neighbour_idx = np.argsort(dist)[:self.k]
            prob, label = most_common(self.y_train[neighbour_idx])
            predictions.append(label)
            probabilities.append(prob)
        return predictions, probabilities

    def compute_distance(self, X1, X2):
        """Euclidean distance from the single sample ``X2`` to each row of ``X1``.

        Column 0 of ``X1`` and element 0 of ``X2`` are dropped first; the
        remaining values are converted to float.

        Returns:
            np.ndarray: one distance per row of ``X1``.
        """
        train_features = np.array(X1[:, 1:], dtype=float)
        query_features = np.array(X2[1:], dtype=float)
        delta = query_features - train_features
        return np.sqrt(np.sum(delta ** 2, axis=1))

In [33]:
def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and turn them into numeric feature arrays.

    Steps:
      1. Read both CSV files and drop every row containing a missing value.
         This also applies to the test file, so ``X_test`` can have fewer
         rows than the file itself.
      2. Drop the non-feature columns ('CustomerId', 'Surname', 'Geography',
         'Gender', plus the 'Exited' target for the training data).
      3. Standardise the numerical columns with the mean and standard
         deviation of the TRAINING data; the same statistics are applied to
         the test data.
      4. Re-index the test columns to match the training columns (missing
         columns are filled with 0).

    Args:
        train_path: Path of the training CSV (must contain 'Exited').
        test_path: Path of the test CSV.

    Returns:
        tuple: ``(X, y, X_test)`` as numpy arrays.
    """
    # Load data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Handle missing values by dropping rows with any missing values
    train_data = train_data.dropna()
    test_data = test_data.dropna()

    X = train_data.drop(columns=['CustomerId', 'Surname', 'Exited', 'Geography', 'Gender'])
    y = train_data['Exited'].values
    X_test = test_data.drop(columns=['CustomerId', 'Surname', 'Geography', 'Gender'])

    numerical_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 'Tenure', 'NumOfProducts']

    # Standardize numerical columns manually
    for col in numerical_cols:
        mean = X[col].mean()
        std = X[col].std()
        # A constant column has std == 0 (and a single-row frame gives NaN);
        # dividing by that would fill the features with NaN/inf and corrupt
        # every distance. Fall back to a unit scale so the column is only
        # centred.
        if not std > 0:
            std = 1.0
        X[col] = (X[col] - mean) / std
        X_test[col] = (X_test[col] - mean) / std

    # Align columns of X_test with X in case there are mismatches
    X_test = X_test.reindex(columns=X.columns, fill_value=0)

    # Convert the dataframes to numpy arrays for modeling
    return X.values, y, X_test.values


In [10]:
def cross_validate(X, y, knn, n_splits):
    """Evaluate ``knn`` with contiguous (unshuffled) k-fold cross-validation.

    The samples are cut into ``n_splits`` consecutive folds. Each fold is
    used once as the validation set while the model is fitted on the rest.
    The last fold absorbs the ``len(X) % n_splits`` leftover samples so
    every sample is validated exactly once.

    Args:
        X: Feature array, one row per sample.
        y: Label array aligned with ``X``.
        knn: Model exposing ``k``, ``fit(X, y)`` and ``predict(X)`` where
            ``predict`` returns ``(labels, probabilities)``.
        n_splits: Number of folds; must satisfy ``2 <= n_splits <= len(X)``.

    Returns:
        dict: mean of 'roc_auc', 'accuracy', 'precision', 'recall' and
        'f1_score' over the folds.

    Raises:
        ValueError: if ``n_splits`` is outside the valid range.
    """
    print("Called cross validate")
    n_samples = len(X)
    if n_splits < 2 or n_splits > n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}), got {n_splits}"
        )
    fold_size = n_samples // n_splits
    scores = {'roc_auc': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_score': []}

    for i in range(n_splits):
        start = i * fold_size
        # The final fold runs to the end so the remainder is not skipped.
        end = n_samples if i == n_splits - 1 else (i + 1) * fold_size
        print(f"Fold {i + 1}:")
        print(f"Start: {start}, End: {end}")

        # Create validation and training sets
        X_val = X[start:end]
        y_val = y[start:end]
        X_train = np.concatenate([X[:start], X[end:]])
        y_train = np.concatenate([y[:start], y[end:]])

        # Print sizes of train and validation sets
        print(f"Training set size: {len(X_train)}, Validation set size: {len(X_val)}")
        print(f"y_train: {y_train[:knn.k]}, y_val: {y_val[:knn.k]}")  # Show first k labels

        # Fit the KNN model
        knn.fit(X_train, y_train)
        y_pred, probs = knn.predict(X_val)

        # Print predictions
        print(f"Predictions for fold {i + 1}: {y_pred}")

        # Threshold metrics need the hard labels ...
        _, accuracy, precision, recall, f1_score = calculate_metrics(y_val, y_pred)
        # ... while ROC-AUC is a ranking metric and must be computed from the
        # predicted probabilities; only the AUC of this second call is used.
        auc_score = calculate_metrics(y_val, probs)[0]
        print(f"AUC Score for fold {i + 1}: {auc_score},accuracy:{accuracy},precision:{precision},recall:{recall},f1:{f1_score}")

        scores['roc_auc'].append(auc_score)
        scores['accuracy'].append(accuracy)
        scores['precision'].append(precision)
        scores['recall'].append(recall)
        scores['f1_score'].append(f1_score)

    return {metric: np.mean(scores[metric]) for metric in scores}

def calculate_metrics(y_true, y_scores):
    """Compute ROC-AUC, accuracy, precision, recall and F1 for binary labels.

    Accuracy, precision, recall and F1 count a sample as predicted positive
    when its score equals 1.0 and predicted negative when it equals 0.0, so
    they are only meaningful when ``y_scores`` holds hard 0/1 predictions.

    ROC-AUC is computed with the trapezoidal rule on the ROC curve. Samples
    sharing the same score are treated as ONE threshold step, so the result
    does not depend on the arbitrary order of tied scores, and the curve
    starts at (0, 0).

    Args:
        y_true: Array-like of true labels (1 = positive, anything else is
            counted as negative).
        y_scores: Array-like of predicted labels or scores, aligned with
            ``y_true``.

    Returns:
        tuple: ``(auc, accuracy, precision, recall, f1_score)``. ``auc`` is
        0.0 when the input is empty or contains only one class; the other
        metrics are 0 when their denominator is zero.
    """
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    # Sort the instances by the predicted score in descending order
    order = np.argsort(y_scores)[::-1]
    y_true = y_true[order]
    y_scores = y_scores[order]

    # Total number of positive and negative samples
    P = np.sum(y_true)
    N = len(y_true) - P

    # Print true positive and negative counts
    print(f"Total Positives (P): {P}, Total Negatives (N): {N}")
    tp = np.sum((y_true == 1.0) & (y_scores == 1.0))
    tn = np.sum((y_true == 0.0) & (y_scores == 0.0))
    fp = np.sum((y_true == 0.0) & (y_scores == 1.0))
    fn = np.sum((y_true == 1.0) & (y_scores == 0.0))

    accuracy = (tp + tn) / len(y_true) if len(y_true) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # The ROC curve is undefined without both classes; keep the 0.0 fallback.
    if len(y_true) == 0 or P <= 0 or N <= 0:
        return 0.0, accuracy, precision, recall, f1_score

    # Running true/false positive counts as the threshold is lowered.
    is_positive = y_true == 1
    cum_tp = np.cumsum(is_positive)
    cum_fp = np.cumsum(~is_positive)

    # Only emit a ROC point at the LAST sample of each run of equal scores,
    # so tied samples move the curve diagonally in a single step.
    group_end = np.append(np.flatnonzero(y_scores[1:] != y_scores[:-1]), len(y_scores) - 1)
    tpr = np.concatenate(([0.0], cum_tp[group_end] / P))
    fpr = np.concatenate(([0.0], cum_fp[group_end] / N))

    # Calculate the area under the curve using the trapezoidal rule
    auc = float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2))

    return auc, accuracy, precision, recall, f1_score


In [36]:
# Load and preprocess data
TRAIN_PATH = 'train.csv'
TEST_PATH = 'CS506 Customer Churn.csv'
X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)

# Create and evaluate model
knn = KNN(k=4)

# Perform cross-validation
cv_scores = cross_validate(X, y, knn,4)

# TODO: hyperparameter tuning
best_k = 4  # Placeholder: no tuning has been done yet, so reuse the k evaluated above
knn = KNN(k=best_k)

# Train on the full dataset with the chosen k and predict on the test set
knn.fit(X, y)
test_predictions,probabilities = knn.predict(X_test)
print(probabilities)

# Save test predictions.
# preprocess_data drops test rows that contain missing values, so the ids are
# filtered the same way here; otherwise the id column and the predictions
# would have different lengths whenever such a row exists.
test_ids = pd.read_csv(TEST_PATH).dropna()['id'].to_numpy()
pd.DataFrame({'id': test_ids, 'Exited': probabilities}).to_csv('submissions.csv', index=False)

Called cross validate
Fold 1:
Start: 0, End: 3750
Training set size: 11250, Validation set size: 3750
y_train: [1. 0. 1. 0.], y_val: [0. 0. 0. 0.]
Predictions for fold 1: [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0