In [387]:
import numpy as np
import pandas as pd

In [388]:
# Define the KNN class
class KNN:
    """k-nearest-neighbours classifier with inverse-distance-weighted voting.

    Parameters
    ----------
    k : int
        Number of nearest training points consulted for each query point.
    distance_metric : str
        One of 'euclidean', 'manhattan', 'chebyshev', 'minkowski', 'cosine',
        'hamming', 'mahalanobis' or 'jaccard'.
    minkowski_p : float
        Order of the Minkowski distance. Only read when
        ``distance_metric == 'minkowski'``.
    """

    # Added to neighbour distances before inverting them, so an exact match
    # (distance 0) does not divide by zero.
    _EPSILON = 1e-5
    # Added to the covariance diagonal so the matrix can always be inverted.
    _COV_REGULARIZATION = 1e-5

    def __init__(self, k=9, distance_metric='euclidean', minkowski_p=5):
        self.k = k
        self.distance_metric = distance_metric
        self.minkowski_p = minkowski_p
        self.sample_weight = None  # Per-training-sample weights, set in fit()
        self._inv_cov = None  # Cached inverse covariance (mahalanobis only)

    def fit(self, X, y, sample_weight=None):
        """Store the training data and the per-sample weights.

        When ``sample_weight`` is omitted every training sample gets weight 1.
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        if sample_weight is not None:
            self.sample_weight = np.array(sample_weight)
        else:
            self.sample_weight = np.ones(len(y))
        # The cached inverse covariance belongs to the previous training set.
        self._inv_cov = None

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        # Iterating a DataFrame yields column labels, not rows, so convert
        # to an array before looping.
        X = np.asarray(X)
        predictions = []

        for x in X:
            # Distances from the query point to every training point
            distances = self.compute_distance(self.X_train, x)

            # Indices of the k closest training points
            k_nearest_indices = self.manual_sort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_nearest_indices]
            k_nearest_distances = distances[k_nearest_indices]

            # Closer neighbours get a larger vote (inverse distance)
            weights = 1 / (k_nearest_distances + self._EPSILON)

            predictions.append(
                self.weighted_majority_vote(k_nearest_labels, weights)
            )

        return np.array(predictions)

    def manual_sort(self, distances):
        """Return the indices that sort ``distances`` in ascending order."""
        return np.argsort(distances)

    def weighted_majority_vote(self, labels, weights):
        """Return the label whose votes have the largest total weight.

        Ties go to the label seen first. Returns None when ``labels`` is
        empty.
        """
        label_weight = {}
        for label, weight in zip(labels, weights):
            label_weight[label] = label_weight.get(label, 0) + weight
        if not label_weight:
            return None
        return max(label_weight, key=label_weight.get)

    def _inverse_covariance(self, X_train):
        """Return the regularised inverse covariance of ``X_train``.

        The result is cached while ``X_train`` is the array stored by fit(),
        so predict() inverts the matrix once instead of once per query point.
        """
        is_fitted_data = X_train is getattr(self, 'X_train', None)
        cached = getattr(self, '_inv_cov', None)
        if is_fitted_data and cached is not None:
            return cached

        # atleast_2d: np.cov returns a 0-d array for a single feature.
        cov_matrix = np.atleast_2d(np.cov(X_train.T))
        regularised = (
            cov_matrix + np.eye(cov_matrix.shape[0]) * self._COV_REGULARIZATION
        )
        inv_cov_matrix = np.linalg.inv(regularised)
        if is_fitted_data:
            self._inv_cov = inv_cov_matrix
        return inv_cov_matrix

    def compute_distance(self, X_train, x_test):
        """Return the distance from ``x_test`` to every row of ``X_train``.

        The metric is selected by ``self.distance_metric``. When sample
        weights are set, each distance is multiplied by the weight of the
        corresponding training sample.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric name.
        """
        X_train = np.asarray(X_train)
        x_test = np.asarray(x_test)
        diff = None
        if self.distance_metric not in ('cosine', 'hamming', 'jaccard'):
            diff = X_train - x_test

        if self.distance_metric == 'euclidean':
            # sqrt(sum((x1 - x2)^2))
            distances = np.sqrt(np.sum(diff ** 2, axis=1))

        elif self.distance_metric == 'manhattan':
            # sum(|x1 - x2|)
            distances = np.sum(np.abs(diff), axis=1)

        elif self.distance_metric == 'chebyshev':
            # max(|x1 - x2|)
            distances = np.max(np.abs(diff), axis=1)

        elif self.distance_metric == 'minkowski':
            # (sum(|x1 - x2|^p))^(1/p)
            p = getattr(self, 'minkowski_p', 5)
            distances = np.sum(np.abs(diff) ** p, axis=1) ** (1 / p)

        elif self.distance_metric == 'cosine':
            # 1 - dot(x1, x2) / (||x1|| * ||x2||)
            dot_product = np.sum(X_train * x_test, axis=1)
            norm_product = (
                np.linalg.norm(X_train, axis=1) * np.linalg.norm(x_test)
            )
            # A zero-length vector has no direction: use similarity 0
            # (distance 1) instead of producing NaN.
            cosine_similarity = np.divide(
                dot_product,
                norm_product,
                out=np.zeros_like(norm_product, dtype=float),
                where=norm_product > 0,
            )
            distances = 1 - cosine_similarity

        elif self.distance_metric == 'hamming':
            # Proportion of positions whose values differ
            distances = np.mean(X_train != x_test, axis=1)

        elif self.distance_metric == 'mahalanobis':
            # sqrt((x1 - x2)^T * S^-1 * (x1 - x2)), S = covariance matrix
            inv_cov_matrix = self._inverse_covariance(X_train)
            squared = np.sum(np.dot(diff, inv_cov_matrix) * diff, axis=1)
            # Rounding can make the quadratic form slightly negative.
            distances = np.sqrt(np.maximum(0, squared))

        elif self.distance_metric == 'jaccard':
            # 1 - |intersection| / |union|, with values > 0 treated as
            # "present" and everything else as "absent".
            X_train_binary = X_train > 0
            x_test_binary = x_test > 0

            intersection = np.sum(X_train_binary & x_test_binary, axis=1)
            union = np.sum(X_train_binary | x_test_binary, axis=1)
            # Two empty sets are treated as identical (similarity 1)
            # instead of dividing 0 by 0.
            jaccard_similarity = np.divide(
                intersection,
                union,
                out=np.ones(union.shape, dtype=float),
                where=union > 0,
            )
            distances = 1 - jaccard_similarity

        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

        # Scale each distance by its training sample's weight.
        # NOTE(review): a larger weight pushes the sample further away, which
        # lowers its influence - confirm this is the intended direction when
        # the weights come from boosting.
        if self.sample_weight is not None:
            distances = distances * self.sample_weight

        return distances





In [389]:
class AdaBoostKNN:
    """AdaBoost-style ensemble built from copies of a base estimator.

    Labels are expected to be 0/1: ``fit`` maps them to -1/+1 with
    ``label * 2 - 1`` and ``predict`` maps the signed vote back to 0/1.

    Parameters
    ----------
    base_estimator : object
        Template estimator. It must provide
        ``fit(X, y, sample_weight=...)`` and ``predict(X)``. It is never
        fitted itself; every boosting round works on a shallow copy.
    n_estimators : int
        Maximum number of boosting rounds.
    """

    def __init__(self, base_estimator, n_estimators=10):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.estimators = []
        self.estimator_weights = []

    def fit(self, X, y):
        """Fit up to ``n_estimators`` weighted estimators on ``(X, y)``.

        Boosting stops early as soon as a round's weighted error exceeds
        0.5. Calling ``fit`` again discards the previous ensemble.
        """
        import copy  # Local import: only needed to clone the base estimator

        X = np.asarray(X)
        # Plain arrays keep the weight arithmetic free of pandas index
        # alignment.
        y = np.asarray(y)
        n_samples = X.shape[0]

        # Start from a clean ensemble so refitting does not accumulate
        # estimators from an earlier call.
        self.estimators = []
        self.estimator_weights = []

        # Initialize sample weights uniformly
        sample_weights = np.full(n_samples, 1 / n_samples)

        for _ in range(self.n_estimators):
            # A shallow copy keeps every hyperparameter of the template.
            estimator = copy.copy(self.base_estimator)
            estimator.fit(X, y, sample_weight=sample_weights)

            y_pred = np.asarray(estimator.predict(X))
            incorrect = (y_pred != y).astype(int)

            # Weighted training error of this round
            error = np.dot(sample_weights, incorrect)
            if error > 0.5:
                break  # Stop if error is too high

            # The small constant keeps the log finite when error == 0.
            estimator_weight = 0.5 * np.log((1 - error) / (error + 1e-10))
            self.estimator_weights.append(estimator_weight)
            self.estimators.append(estimator)

            # Increase the weight of misclassified samples and decrease the
            # weight of correct ones. New arrays are built (no in-place
            # update) so an estimator holding a reference to the weights it
            # was fitted with is not changed afterwards.
            agreement = (y * 2 - 1) * (y_pred * 2 - 1)
            sample_weights = sample_weights * np.exp(-estimator_weight * agreement)
            sample_weights = sample_weights / np.sum(sample_weights)

    def predict(self, X):
        """Return 0/1 predictions from the weighted vote of all estimators.

        A non-negative weighted vote maps to 1, so an ensemble with no
        estimators predicts 1 for every row.
        """
        X = np.asarray(X)
        final_prediction = np.zeros(X.shape[0])

        for estimator, weight in zip(self.estimators, self.estimator_weights):
            prediction = estimator.predict(X)
            # Convert labels to -1 and 1
            prediction = np.where(prediction == 1, 1, -1)
            final_prediction += weight * prediction

        # Convert the signed vote back to the original 0/1 labels
        return np.where(final_prediction >= 0, 1, 0)

In [397]:
# Min-Max scaling without leakage
def manual_min_max_scaler(X, min_val=None, max_val=None):
    """Scale each column of ``X`` with min-max normalisation.

    Parameters
    ----------
    X : array-like
        Data to scale, one column per feature.
    min_val, max_val : array-like, optional
        Per-column minimum and maximum to scale with (for example the values
        returned for the training set). If either one is missing, both are
        computed from ``X``.

    Returns
    -------
    tuple
        ``(scaled, X_min, X_max)``: the scaled data and the minimum and
        maximum that were used.
    """
    # Compute min and max for each column if not provided
    if min_val is None or max_val is None:
        X_min = np.min(X, axis=0)
        X_max = np.max(X, axis=0)
    else:
        X_min = min_val
        X_max = max_val

    # A constant column has max == min. Dividing by that zero range would
    # give NaN/inf, so divide by 1 instead: values equal to the minimum then
    # scale to 0.
    value_range = np.asarray(X_max - X_min, dtype=float)
    value_range = np.where(value_range == 0, 1.0, value_range)

    # Apply min-max normalization
    return (X - X_min) / value_range, X_min, X_max

def manual_hybrid_sample(df, target_column='Exited', target_ratio=1.0):
    """Balance ``df`` by oversampling label 1 and undersampling label 0.

    Both classes end up with ``int(n_label_1 * target_ratio)`` rows. Rows
    labelled 1 are resampled with replacement when more are needed; rows
    labelled 0 are drawn without replacement. All sampling uses
    ``random_state=42``. The result is shuffled and re-indexed from 0.
    """
    positives = df[df[target_column] == 1]
    negatives = df[df[target_column] == 0]

    # Size each class should have after resampling
    per_class_size = int(len(positives) * target_ratio)

    # Top up the label-1 rows with replacement if they fall short
    shortfall = per_class_size - len(positives)
    if shortfall > 0:
        extra = positives.sample(n=shortfall, replace=True, random_state=42)
        positives = pd.concat([positives, extra])

    # Cut the label-0 rows down to the same size
    kept_negatives = negatives.sample(
        n=per_class_size, replace=False, random_state=42
    )

    combined = pd.concat([positives, kept_negatives], axis=0)

    # Shuffle and renumber the rows
    shuffled = combined.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

def manual_oversample(df, target_column='Exited'):
    """Balance ``df`` by resampling label-1 rows up to the label-0 count.

    The extra label-1 rows are drawn with replacement using
    ``random_state=42``. The result is shuffled and re-indexed from 0.
    """
    target = df[target_column]
    positives = df[target == 1]
    negatives = df[target == 0]

    # How many additional label-1 rows are required to match label 0
    deficit = len(negatives) - len(positives)
    extra_positives = positives.sample(
        n=deficit, replace=True, random_state=42
    )

    combined = pd.concat([negatives, positives, extra_positives], axis=0)

    # Shuffle and renumber the rows
    return combined.sample(frac=1, random_state=42).reset_index(drop=True)

def manual_undersample(df, target_column='Exited'):
    """Balance ``df`` by sampling label-0 rows down to the label-1 count.

    Label-0 rows are drawn without replacement using ``random_state=42``.
    The result is shuffled and re-indexed from 0.
    """
    target = df[target_column]
    positives = df[target == 1]
    negatives = df[target == 0]

    # Keep exactly as many label-0 rows as there are label-1 rows
    kept_negatives = negatives.sample(
        n=len(positives), replace=False, random_state=42
    )

    combined = pd.concat([positives, kept_negatives], axis=0)

    # Shuffle and renumber the rows
    return combined.sample(frac=1, random_state=42).reset_index(drop=True)

def feature_engineering(df):
    """Return a copy of ``df`` with derived features appended.

    Reads the columns 'Age', 'Tenure', 'NumOfProducts' and 'Balance' and
    appends, in this order:

    - 'TenureToAge': Tenure / (Age + 1e-5)
    - 'ProductsBalanceInteraction': NumOfProducts * Balance
    - 'YearsAsMember': Age - Tenure
    - 'AgeGroup_<label>': one 0/1 indicator per age band

    The input frame is left unmodified.
    """
    # Work on a copy so the caller's frame does not gain columns as a side
    # effect.
    df = df.copy()

    # Age bands, left-inclusive: [18, 35) Youth, [35, 45) Adults,
    # [45, 60) Middle-Aged Adults, [60, 100) Seniors. Ages outside
    # [18, 100) fall in no band, so all four indicators are 0 for them.
    age_group = pd.cut(df['Age'],
                       bins=[18, 35, 45, 60, 100],
                       labels=['Youth', 'Adults', 'Middle-Aged Adults', 'Seniors'],
                       right=False)

    # Tenure to Age Ratio (small constant avoids division by zero)
    df['TenureToAge'] = df['Tenure'] / (df['Age'] + 1e-5)

    # Interaction between NumOfProducts and Balance
    df['ProductsBalanceInteraction'] = df['NumOfProducts'] * df['Balance']

    df['YearsAsMember'] = df['Age'] - df['Tenure']

    # One-hot encode the age band. dtype=int forces 0/1 integers regardless
    # of the pandas default (bool in pandas >= 2.0), so the frame stays
    # purely numeric.
    age_groups = pd.get_dummies(age_group, prefix='AgeGroup', dtype=int)

    return pd.concat([df, age_groups], axis=1)

def kmeans_train(df, num_clusters=3, max_iterations=100):
    """Run k-means on the 'Balance', 'Tenure' and 'Age' columns of ``df``.

    Initial centroids are ``num_clusters`` distinct rows chosen after
    seeding NumPy's global random generator with 42. Iteration stops when
    the centroids no longer change or after ``max_iterations`` rounds.

    Returns
    -------
    numpy.ndarray
        Centroids with shape ``(num_clusters, 3)``.
    """
    # Convert DataFrame to numpy array for easier manipulation
    X = df[['Balance', 'Tenure', 'Age']].values

    # Initialize cluster centers from randomly chosen rows
    np.random.seed(42)
    random_indices = np.random.choice(X.shape[0], size=num_clusters, replace=False)
    centroids = X[random_indices].astype(float)

    for _ in range(max_iterations):
        # Assign each point to the nearest centroid
        distances = np.array([np.sqrt(np.sum((X - centroid) ** 2, axis=1)) for centroid in centroids])
        labels = np.argmin(distances, axis=0)

        # Recalculate centroids as the mean of assigned points. A cluster
        # that received no points keeps its previous centroid: the mean of
        # an empty selection is NaN, which would poison every later distance
        # and make the convergence check below impossible to satisfy.
        new_centroids = centroids.copy()
        for k in range(num_clusters):
            members = X[labels == k]
            if len(members) > 0:
                new_centroids[k] = members.mean(axis=0)

        # Check for convergence (if centroids do not change)
        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids

    return centroids

def assign_clusters(df, centroids):
    """Return, for each row of ``df``, the index of its nearest centroid.

    Distances are Euclidean over the 'Balance', 'Tenure' and 'Age' columns.
    """
    features = df[['Balance', 'Tenure', 'Age']].values

    # One row of distances per centroid, one column per data point
    per_centroid = []
    for center in centroids:
        squared_offsets = (features - center) ** 2
        per_centroid.append(np.sqrt(np.sum(squared_offsets, axis=1)))

    return np.argmin(np.array(per_centroid), axis=0)

def preprocess_data(train_path, test_path):
    """Load both CSV files and turn them into scaled feature matrices.

    Steps: drop 'id' and 'Surname', encode 'Gender' as 0/1, one-hot encode
    'Geography' over the categories found in either file, add engineered
    features, attach a k-means cluster id (centroids fitted on the training
    data only), undersample the training data, then min-max scale both sets
    with the training minimum and maximum.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train)``.
    """
    def encode_gender(value):
        return 1 if value == 'Male' else 0

    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    both = (train_data, test_data)

    # Identifier-like columns are not used as features
    for frame in both:
        frame.drop(['id', 'Surname'], axis=1, inplace=True)

    # Gender becomes 1 for 'Male' and 0 for anything else
    for frame in both:
        frame['Gender'] = frame['Gender'].apply(encode_gender)

    # One indicator column per country seen in either file, so both frames
    # get the same set of Geography columns
    countries = np.union1d(
        train_data['Geography'].unique(),
        test_data['Geography'].unique(),
    )
    for country in countries:
        indicator = f'Geography_{country}'
        for frame in both:
            frame[indicator] = (frame['Geography'] == country).astype(int)

    for frame in both:
        frame.drop('Geography', axis=1, inplace=True)

    # Derived features for both sets
    train_data = feature_engineering(train_data)
    test_data = feature_engineering(test_data)

    # Centroids come from the training data; both sets are labelled with them
    centroids = kmeans_train(train_data, num_clusters=3)
    train_data['CustomerCluster'] = assign_clusters(train_data, centroids)
    test_data['CustomerCluster'] = assign_clusters(test_data, centroids)

    # Balance the classes of the training set
    balanced_train_data = manual_undersample(train_data, target_column='Exited')

    # Split features from the target
    y_train = balanced_train_data['Exited']
    X_train = balanced_train_data.drop('Exited', axis=1)

    has_target = 'Exited' in test_data.columns
    X_test = test_data.drop('Exited', axis=1) if has_target else test_data

    print(X_train.columns)

    # Scale the test set with the training minimum and maximum
    X_train_scaled, X_train_min, X_train_max = manual_min_max_scaler(X_train.values)
    X_test_scaled, _, _ = manual_min_max_scaler(X_test.values, X_train_min, X_train_max)

    return X_train_scaled, X_test_scaled, y_train

In [398]:
def cross_validate(X, y, knn, n_splits=5):
    """Return the mean validation ROC AUC of ``knn`` over ``n_splits`` folds.

    The rows are shuffled once after seeding NumPy's global random
    generator with 42, then cut into ``n_splits`` folds of
    ``len(X) // n_splits`` rows each. Leftover rows at the end are never
    used for validation but are part of every training split.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target labels, one per row of ``X``.
    knn : object
        Estimator providing ``fit(X, y)`` and ``predict(X)``. It is refitted
        for every fold.
    n_splits : int
        Number of folds.
    """
    # Keep features and labels together so they are shuffled consistently;
    # the label ends up in the last column.
    data = np.column_stack((X, y))

    np.random.seed(42)
    np.random.shuffle(data)

    fold_size = len(data) // n_splits
    roc_auc_scores = []
    for i in range(n_splits):
        start, stop = i * fold_size, (i + 1) * fold_size

        # Validation fold
        val_data = data[start:stop]
        X_val = val_data[:, :-1]
        y_val = val_data[:, -1]

        # Everything outside the validation fold is used for training
        train_data = np.concatenate([data[:start], data[stop:]], axis=0)
        X_train = train_data[:, :-1]
        y_train = train_data[:, -1]

        knn.fit(X_train, y_train)

        # Only the validation fold is scored. Predicting the training rows
        # as well would cost far more than the validation pass and the
        # result was never used.
        y_val_pred = knn.predict(X_val)
        roc_auc_scores.append(manual_roc_auc_score(y_val, y_val_pred))

    # Average ROC AUC across all folds
    return np.mean(roc_auc_scores)

def manual_roc_auc_score(y_true, y_pred):
    """Compute ROC AUC with the rank-sum (Mann-Whitney U) formula.

    Parameters
    ----------
    y_true : array-like
        True labels; 1 marks the positive class and 0 the negative class.
    y_pred : array-like
        Predicted scores or labels; larger means "more positive".

    Returns
    -------
    float
        The AUC, or NaN when ``y_true`` has no positive or no negative
        samples (the AUC is undefined in that case).
    """
    # Plain arrays make the indexing below positional even when pandas
    # objects are passed in.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Count positive and negative instances
    P = np.sum(y_true == 1)
    N = np.sum(y_true == 0)
    if P == 0 or N == 0:
        return float('nan')

    # Rank the scores from 1 (lowest) to n (highest). Tied scores share the
    # average of the ranks they span; otherwise the arbitrary order of tied
    # samples would change the result, which matters when y_pred holds hard
    # 0/1 predictions.
    order = np.argsort(y_pred, kind='mergesort')
    sorted_scores = y_pred[order]
    _, first_index, counts = np.unique(
        sorted_scores, return_index=True, return_counts=True
    )
    # A tie group starting at 0-based position s with c members covers the
    # ranks s+1 .. s+c, whose mean is s + (c + 1) / 2.
    group_rank = first_index + (counts + 1) / 2.0
    ranks = np.empty(len(y_pred), dtype=float)
    ranks[order] = np.repeat(group_rank, counts)

    # Rank sum of the positive class
    rank_sum = np.sum(ranks[y_true == 1])

    # AUC = U / (P * N), with U = rank_sum - P * (P + 1) / 2
    auc = (rank_sum - (P * (P + 1)) / 2) / (P * N)

    return auc



In [401]:
# Driver cell: tune k with cross-validation, fit the boosted model and write
# the submission file. preprocess_data, KNN, AdaBoostKNN and cross_validate
# must already be defined.

# Load and preprocess data
X_train, X_test, y_train = preprocess_data('train.csv', 'test.csv')

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")

# NOTE: the next three lines print the first three scaled training rows, not
# shapes; the label text is left as-is so it matches previously recorded
# output.
print(f"Shape of X_train: {X_train[0]}")
print(f"Shape of X_train: {X_train[1]}")
print(f"Shape of X_train: {X_train[2]}")

# Hyperparameter tuning: explore different values of k.
# A wider grid that can be used instead:
# [3, 5, 7, 9, 11, 20, 30, 50, 100, 150, 200]
k_values = [30]

best_k = None
# Start below any possible score so the first evaluated k is always kept,
# even when its score is 0.
best_score = float('-inf')

for k in k_values:
    knn = KNN(k=k, distance_metric='mahalanobis')

    # Perform cross-validation to evaluate the current k
    cv_score = cross_validate(X_train, y_train, knn)

    print(f"K: {k}, Cross-validation ROC AUC Score: {cv_score}")

    # Check if this is the best k so far
    if cv_score > best_score:
        best_score = cv_score
        best_k = k

# Output the best hyperparameters
print(f"Best K: {best_k} with ROC AUC Score: {best_score}")

# Base estimator with the best k. It is only a template: AdaBoostKNN fits
# its own estimators.
knn = KNN(k=best_k, distance_metric='mahalanobis')

# Initialize AdaBoostKNN with the base KNN
adaboost_knn = AdaBoostKNN(base_estimator=knn, n_estimators=10)

# Fit the AdaBoostKNN on the training data
adaboost_knn.fit(X_train, y_train)

# Make predictions on the test set
test_predictions = adaboost_knn.predict(X_test)

# Save test predictions together with the 'id' column of the test CSV
test_ids = pd.read_csv('test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

print("Predictions saved to submissions.csv")


Index(['CustomerId', 'CreditScore', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Geography_France', 'Geography_Germany', 'Geography_Spain',
       'TenureToAge', 'ProductsBalanceInteraction', 'YearsAsMember',
       'AgeGroup_Youth', 'AgeGroup_Adults', 'AgeGroup_Middle-Aged Adults',
       'AgeGroup_Seniors', 'CustomerCluster'],
      dtype='object')
Shape of X_train: (6066, 21)
Shape of y_train: (6066,)
Shape of X_train: [0.04118358 0.72553699 0.         0.76363636 0.1        0.
 0.33333333 1.         0.         0.6072403  0.         0.
 1.         0.05573771 0.         0.81355932 0.         0.
 0.         1.         1.        ]
Shape of X_train: [0.08327932 0.42959427 0.         0.32727273 0.3        0.
 0.33333333 1.         1.         0.89314062 1.         0.
 0.         0.27567568 0.         0.33898305 0.         1.
 0.         0.         1.        ]
Shape of X_train: [0.37699337 0.56801909 0.         0.509