In [101]:
import importlib
import os

import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from helpers import *
from implementations import *
from preprocess_data import *
from cross_validation import *
from utils import *
from predict import *
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the dataset once; the five returned values are unpacked into the
# notebook-level names used by every later cell.
(
    x_train,
    x_test,
    y_train,
    train_ids,
    test_ids,
) = load_csv_data('data/dataset')

In [102]:
# Dataset directory constant.
# NOTE(review): no visible cell reads DATA_FOLDER -- the loading cell passes
# the literal 'data/dataset' instead. Confirm which path is the intended one.
DATA_FOLDER = "data/dataset_to_release/"


def _fit_selected_model(name, y, tx, lambda_, gamma, initial_w, max_iters):
    """
    Fit the model called `name` on (y, tx) and return its weight vector.

    Parameters:
    name (str): One of the implementation names handled below.
    y: Preprocessed labels.
    tx: Preprocessed feature matrix.
    lambda_: Regularization strength (used by the regularized models only).
    gamma: Learning rate (used by the iterative models only).
    initial_w: Starting weights for the iterative models.
    max_iters (int): Iteration budget for the iterative models.

    Returns:
    w: Weight vector returned by the selected implementation.

    Raises:
    ValueError: If `name` is not a known model name.
    """
    # Lambdas keep the lookup of each implementation lazy, so only the
    # selected trainer is ever resolved and called.
    trainers = {
        "mean_squared_error_gd": lambda: mean_squared_error_gd(y, tx, initial_w, max_iters, gamma),
        "mean_squared_error_sgd": lambda: mean_squared_error_sgd(y, tx, initial_w, max_iters, gamma),
        "least_squares": lambda: least_squares(y, tx),
        "ridge_regression": lambda: ridge_regression(y, tx, lambda_),
        "logistic_regression": lambda: logistic_regression(y, tx, initial_w, max_iters, gamma),
        "reg_logistic_regression": lambda: reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma),
    }
    if name not in trainers:
        # Fail loudly instead of silently returning a placeholder weight.
        raise ValueError(f"Unknown model name: {name!r}")
    w, _ = trainers[name]()
    return w


def model(x_train, y_train, gammas=None, lambdas=None, max_iters=100, initial_w=None):
    """
    Pick the best model family via `hypertuning`, then refit the winner on the
    full training set with its best hyperparameters.

    The candidates searched are least squares, MSE with SGD, ridge regression,
    logistic regression and regularized logistic regression. Plain MSE gradient
    descent is deliberately not part of the search.

    Parameters:
    x_train (numpy.ndarray): Raw training feature matrix.
    y_train (numpy.ndarray): Raw training labels.
    gammas (sequence, optional): Learning rates to search. Defaults to [0.001, 0.01].
    lambdas (sequence, optional): Regularization strengths to search. Defaults to [0.01, 0.03].
    max_iters (int, optional): Iteration budget for the final fit of an
        iterative model. Defaults to 100.
    initial_w (numpy.ndarray, optional): Starting weights for the final fit of
        an iterative model. Defaults to a zero vector with one entry per
        preprocessed feature column.

    Returns:
    w (numpy.ndarray): Weight vector of the refitted best model.
    best_model (str): Name of the selected model.

    Notes:
    - Reads the notebook-level `x_test`, because `preprocess` takes the test
      matrix alongside the training data.
    - Each `hypertuning` result is indexed as result[0] = lambda,
      result[1] = gamma, result[2] = threshold and result[3][2] = the score
      used for selection. This layout is taken from how the results are
      indexed here -- TODO confirm against `hypertuning`.
    - The tuned decision threshold is printed but not returned, so the
      two-value return contract stays unchanged for existing callers.
    """
    # None sentinels avoid sharing mutable default lists between calls.
    if gammas is None:
        gammas = [0.001, 0.01]
    if lambdas is None:
        lambdas = [0.01, 0.03]

    # One row per candidate: (console label, model name, gammas, lambdas,
    # thresholds). Keeping the name next to its search space removes the need
    # for a second, index-aligned list of names.
    candidates = [
        ("least_squares", "least_squares", [0], [0], np.arange(0, 0.3, 0.01)),
        ("mean_squared_error_sgd", "mean_squared_error_sgd", gammas, [0], np.arange(0, 0.3, 0.01)),
        ("ridge linear regression", "ridge_regression", [0], lambdas, np.arange(0, 0.3, 0.01)),
        ("logistic regression", "logistic_regression", gammas, [0], np.arange(0.4, 0.6, 0.01)),
        ("regularized logistic regression", "reg_logistic_regression", gammas, lambdas, [0.5]),
    ]

    results = []
    for label, name, cand_gammas, cand_lambdas, thresholds in candidates:
        print("using " + label)
        results.append(
            hypertuning(y_train, x_train, cand_gammas, cand_lambdas, name, thresholds)
        )

    # Select the candidate with the highest selection score.
    scores = [result[3][2] for result in results]
    best_idx = int(np.argmax(scores))
    best_model = candidates[best_idx][1]
    best_lambda, best_gamma, best_threshold = results[best_idx][:3]
    print(
        f"selected {best_model}: lambda={best_lambda}, "
        f"gamma={best_gamma}, threshold={best_threshold}"
    )

    # `x_test` is the notebook-level test matrix (see Notes in the docstring).
    x_train_proc, _, y_train_proc = preprocess(x_train, x_test, y_train, best_model)

    if initial_w is None:
        # Assumes the preprocessed features are a 2-D (samples, features) array.
        initial_w = np.zeros(x_train_proc.shape[1])

    w = _fit_selected_model(
        best_model, y_train_proc, x_train_proc, best_lambda, best_gamma, initial_w, max_iters
    )
    return w, best_model

In [None]:
# Hyperparameter grids: 8 log-spaced values from 1e-4 to 1e3 for both the
# regularization strength (lambda) and the learning rate (gamma).
print("Start training")
lambdas = np.logspace(start=-4, stop=3, num=8)
gammas = np.logspace(start=-4, stop=3, num=8)

# Run model selection; keep the fitted weights and the winning model's name.
w, best_model = model(
    x_train=x_train,
    y_train=y_train,
    gammas=gammas,
    lambdas=lambdas,
)

Start training
using regularized logistic regression
Labels converted from -1 to 0.
Preprocessing for fold 1:

Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Removed 3698 outliers (z-score > 2).
Data standardized using z-score scaling.
Original sizes:
  - Majority class (0): 146297 (91.23%)
  - Minority class (1): 14072 (8.77%)
Minority class upsampled:
  - Class (0): 146297 (80.00%)
  - Class (1): 36574 (20.00%)
PCA performed to reduce features from 315 to 299.
Labels converted from -1 to 0.
Preprocessing for fold 2:

Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Removed 2908 outliers (z-score > 2).
Data standardized using z-score scaling.
Original sizes:
  - Majority class (0): 146855 (91.12%)
  - Minority class (1): 14305 (8.88%)
Minority class upsampled:
  - Class (0): 146855 (80.00%)
  - Class (1): 36713 (20.00%)
PCA performed to reduce features fr

In [94]:
# Preprocess the full training set and the test set for the selected model.
#
# `w` is intentionally NOT refitted here: model() already returns weights
# trained on the preprocessed full training set with the best hyperparameters.
# The previous version of this cell reset `w = 0` and refitted using
# best_gamma / best_lambda / initial_w / max_iters, which only exist as locals
# inside model(); after Restart Kernel -> Run All that raised NameError.
x_train_proc, x_test_proc, y_train_proc = preprocess(x_train, x_test, y_train, best_model)



Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Data standardized using z-score scaling.
Original sizes:
  - Majority class (0): 299160 (91.17%)
  - Minority class (1): 28975 (8.83%)
Minority class upsampled:
  - Class (0): 299160 (80.00%)
  - Class (1): 74790 (20.00%)
PCA performed to reduce features from 315 to 299.


In [95]:
# Report which model family won the selection.
print("The best model is ", best_model)

# Persist the trained weights. np.save does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs("model_performance", exist_ok=True)
np.save("model_performance/model_parameters.npy", w)

# Re-run preprocessing for the selected model so that x_test_proc is defined
# for the prediction step (no classification happens in this cell).
x_train_proc, x_test_proc, y_train_proc = preprocess(x_train, x_test, y_train, best_model)

The best model is  ridge_regression
Labels converted from -1 to 0.
6 features where all values are the same removed.
Replaced all NaN values with -1.
Data standardized using z-score scaling.
Original sizes:
  - Majority class (0): 299160 (91.17%)
  - Minority class (1): 28975 (8.83%)
Minority class upsampled:
  - Class (0): 299160 (80.00%)
  - Class (1): 74790 (20.00%)
PCA performed to reduce features from 315 to 299.


In [97]:
# Predict on the preprocessed test set and, in the same step, remap label 0
# to -1 (every other value becomes 1) for the submission format.
# NOTE(review): threshold is hard-coded to 0 here, while model() tunes a
# threshold per model family -- confirm that 0 is intended.
y_pred = np.where(
    predict(w, x_test_proc, best_model, threshold=0) == 0,
    -1,
    1,
)

print(y_pred)

# Write the predictions, keyed by test id, to the submission CSV.
create_csv_submission(test_ids, y_pred, "predictions.csv")

[-1 -1  1 ... -1  1  1]
