In [55]:
import numpy as np
import matplotlib.pyplot as plt
import preprocessing as pp
import implementations as imp
import helpers as hlp

## Data cleaning and Preprocessing

In [56]:
path = "/Users/maelynenguyen/Desktop/"

x_train_ = np.load(path+"f_x_train_.npy")
x_test_ = np.load(path+"f_x_test_.npy")
y_train_ = np.load(path+"f_y_train_.npy")
test_ids = np.load(path+"f_test_ids_.npy")
train_ids = np.load(path+"f_train_ids_.npy")

In [57]:
x, x_submit, y = pp.Edited_clean_data(x_train_,  y_train_, x_test_)

#important to add a constant term for the bias
x_train = np.concatenate((x,np.zeros((x.shape[0],1))+1),axis=1)
x_test = np.concatenate((x_submit,np.zeros((x_submit.shape[0],1))+1),axis=1)

assert x.shape[1]+1 == x_train.shape[1]

y_train = y.copy()
y_train = np.where(y == -1, 0, 1)
y_train = y_train.astype(int)

#X,Y,X_T,Y_T = pp.split_data(x,y_train ,.5)


Max median NaN score rows :  0.5688708393430034
Max median NaN score columns :  0.7598051755967697
Number of rows dropped because of a NaN score > 0.5:  29455
Number of columns dropped because of a NaN score > 0.5:  149
Number of columns with std < 0.1: 5
Number of columns with correl_coef > 0.95: 19
Handling NaN values...
Number of columns with corr_coef> 0.95 after cleaning: 2
The data has been cleaned and standardized
The cleaned x-data has the following shape:  (298680, 146)
The cleaned y-data has the following shape:  (298680,)
The cleaned x-data-to-predict has the following shape:  (109379, 146)


## Logistic Regression

### Implementing a Cross Validation in a Grid Search to find the best parameters

In [61]:
#Cross-Validation x Grid Search for a logistic regression
def k_fold_split(x, y, k):
    """Utility function to split data into k folds."""
    indices = np.arange(len(y))
    np.random.shuffle(indices)
    fold_size = len(y) // k
    folds = []
    
    for i in range(k):
        test_indices = indices[i * fold_size: (i + 1) * fold_size]
        train_indices = np.concatenate((indices[:i * fold_size], indices[(i + 1) * fold_size:]))
        folds.append((train_indices, test_indices))
    
    return folds
    
def grid_search_logistic_regression(y_train, x_train_cleaned, param_grid, w_initial, k=5):
    best_params = None
    best_score = float("inf")
    best_w = w_initial
    losses = []
        
        # Generate folds for cross-validation
    folds = k_fold_split(x_train_cleaned, y_train, k)

        # Iterate over each combination of parameters
    for max_iters in param_grid["max_iters"]:
        for gamma in param_grid["gamma"]:
            total_loss = 0
                
                # Perform k-fold cross-validation
            for train_indices, test_indices in folds:
                x_train_fold = x_train_cleaned[train_indices]
                y_train_fold = y_train[train_indices]
                x_test_fold = x_train_cleaned[test_indices]
                y_test_fold = y_train[test_indices]
                    
                w, loss = imp.logistic_regression(y_train_fold, x_train_fold, w_initial, max_iters, gamma)
                total_loss += loss

            avg_loss = total_loss / k
            losses.append((max_iters, gamma, avg_loss))
            print(f"Max Iters: {max_iters}, Gamma: {gamma}, Avg Loss: {avg_loss}")

            if avg_loss < best_score:
                best_score = avg_loss
                best_w = w
                best_params = {
                    "max_iters": max_iters,
                    "gamma": gamma,
                }

    return best_w, best_params, losses

In [62]:
#Pris de chat mais à modifier si vous voyez des trucs à changer 
def f1_score_(y_true, y_pred):
    # True Positives (TP): Both predicted and actual are positive (1)
    TP = np.sum((y_true == 1) & (y_pred == 1))
    
    # False Positives (FP): Predicted positive (1) but actual negative (0)
    FP = np.sum((y_true == 0) & (y_pred == 1))
    
    # False Negatives (FN): Predicted negative (0) but actual positive (1)
    FN = np.sum((y_true == 1) & (y_pred == 0))
    
    # Precision: TP / (TP + FP)
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    
    # Recall: TP / (TP + FN)
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    
    # F1 Score: 2 * (Precision * Recall) / (Precision + Recall)
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return f1_score

In [63]:
def best_threshold(y_pred,y):
    # Search for the best threshold
    thresholds = np.arange(0.0, 1.0, 0.01)
    f1_scores = []

    for threshold in thresholds:
        y_pred_thres = np.where(y_pred > threshold, 1, -1)
        f1 = f1_score_(y, y_pred_thres)
        f1_scores.append(f1)

# Find the threshold with the highest F1 score
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    return optimal_threshold

In [64]:
param_grid = {
    "max_iters": [1500],
    "gamma": [1,0.1, 0.001],
    "lambda_": [1e-4,1e-5,1e-6],
}

In [65]:
w_grid, best_params, losses = grid_search_logistic_regression(y_train, x_train, param_grid, np.zeros(x_train.shape[1]))
print("Best hyperparameters found: ", best_params)

y_train_grid = imp.sigmoid(np.dot(x_train,w_grid))
optimal_threshold = best_threshold(y_train_grid,y_train)
print(y_train_grid)
y_pred = imp.sigmoid(np.dot(x_test, w_grid))
y_pred = np.where(y_pred > optimal_threshold , 1, -1)
print(optimal_threshold)
hlp.create_csv_submission(test_ids, y_pred, "y_pred_grid_retest.csv")

Max Iters: 1500, Gamma: 1, Avg Loss: 0.2293439389906225
Max Iters: 1500, Gamma: 0.1, Avg Loss: 0.22991550387791732
Max Iters: 1500, Gamma: 0.001, Avg Loss: 0.4828176750376191
Best hyperparameters found:  {'max_iters': 1500, 'gamma': 1}
[0.02266315 0.02811419 0.01355373 ... 0.08385015 0.02119777 0.02104295]
0.81


### Testing of an Adam optimizer 

In [66]:
#Adam optimizer 
def adam_optimizer(y, x, w, max_iters, gamma, beta1=0.9, beta2=0.999, epsilon=1e-8):
    m, v = np.zeros_like(w), np.zeros_like(w)
    for iter in range(max_iters):
        # Compute predictions and gradients
        
        predictions = imp.sigmoid(np.dot(x, w))
        gradient = np.dot(x.T, predictions - y)

        m = beta1 * m + (1 - beta1) * gradient
        v = beta2 * v + (1 - beta2) * (gradient ** 2)

        # Bias correction
        m_hat = m / (1 - beta1 ** (iter + 1))
        v_hat = v / (1 - beta2 ** (iter + 1))

        # Update weights
        w -= gamma * m_hat / (np.sqrt(v_hat) + epsilon)
    return w


In [67]:
w_adam = adam_optimizer(y_train, x_train, np.zeros(x_train.shape[1]), 1500, 0.1)

y_train_adam = imp.sigmoid(np.dot(x_train,w_adam))
optimal_threshold = best_threshold(y_train_adam,y_train)

y_pred_adam = imp.sigmoid(np.dot(x_test, w_adam))
y_pred_adam  = np.where(y_pred_adam > optimal_threshold , 1, -1)

hlp.create_csv_submission(test_ids, y_pred_adam, "y_pred_adam_old.csv")