In [1]:
import numpy as np
import matplotlib.pyplot as plt
import preprocessing as pp
import implementations as imp
import helpers as hlp
import os
from functions import *
import csv

%load_ext autoreload
%autoreload 2

## Data cleaning and Preprocessing

In [10]:
# Locate the dataset folder inside the current working directory.
# os.path.join picks the platform's separator; the trailing "" keeps a
# separator at the end, so the value has the same shape as the previous
# hand-concatenated string.
path = os.path.join(os.getcwd(), "dataset_to_release", "")
if not os.path.isdir(path):
    # Fail early with an actionable message instead of deep inside the loader.
    raise FileNotFoundError("Dataset directory not found: " + path)

# Load the raw arrays; judging by the names on the left-hand side these are the
# train/test feature matrices, the training labels and the ids of both splits
# (see helpers.load_csv_data for the exact contract).
x_train_, x_test_, y_train_, train_ids_, test_ids_ = hlp.load_csv_data(path)

In [12]:
# Run the project's cleaning pipeline on the training data and on the matrix we
# have to predict on (see preprocessing.clean_data_final for the exact steps).
x, x_submit, y = pp.clean_data_final(x_train_, y_train_, x_test_)

# Logistic regression needs a constant feature to learn a bias term, so a
# column of ones is appended as the last column of both matrices.
x_train = np.hstack((x, np.ones((x.shape[0], 1))))
x_test = np.hstack((x_submit, np.ones((x_submit.shape[0], 1))))

# Exactly one column was added, and both matrices must share the same width,
# otherwise the learned weights cannot be applied to x_test.
assert x_train.shape[1] == x.shape[1] + 1
assert x_test.shape[1] == x_train.shape[1], "train/test feature counts differ"

# The loss works with 0/1 targets: -1 becomes 0, every other value becomes 1.
# `y` itself is left untouched because the threshold/F1 helpers are fed `y`.
y_train = np.where(y == -1, 0, 1).astype(int)

Unknown values replaced with NaN
Max median NaN score rows :  0.6060963867246555
Max median NaN score columns :  0.7652518774030354
Number of rows dropped because of a NaN score > 0.5:  37081
Number of columns dropped because of a NaN score > 0.5:  150
Number of columns with std < 0.1: 5
Number of columns with correl_coef > 0.95: 11
Handling NaN values...
Data clipped between 5th and 95th percentiles
Number of columns with std < 0.1 after cleaning: 11
Number of columns with corr_coef> 0.95 after cleaning: 14
The data has been cleaned and standardized
The cleaned x-data has the following shape:  (291054, 130)
The cleaned y-data has the following shape:  (291054,)
The cleaned x-data-to-predict has the following shape:  (109379, 130)


## Logistic Regression

### Test for specific hyperparameters

In [13]:
# Baseline fit with one hand-picked set of hyperparameters.
# Returns the weights, a loss value and a loss history (as named on the left).
w, loss, losses = imp.logistic_regression(
    y_train,  # targets recoded to 0/1
    x_train,  # features with the bias column appended
    np.zeros(x_train.shape[1]),  # start from the all-zero weight vector
    1500,  # presumably max_iters (same value as in param_grid) - confirm
    0.1,  # presumably gamma, the step size (also in param_grid) - confirm
    True,  # flag defined by implementations.logistic_regression - TODO confirm meaning
)

In [14]:
# Sigmoid outputs of the baseline model on the training matrix (values in (0, 1)).
y_pred = imp.sigmoid(np.dot(x_train, w))

# Decision threshold chosen by best_threshold; it is compared against the
# original -1/1 labels `y`, not the 0/1 `y_train`.
thr = best_threshold(y_pred, y)

# Hard predictions in the -1/1 convention: strictly above the threshold -> 1.
y_pred_ = np.where(y_pred > thr, 1, -1)

In [15]:
# Score the thresholded baseline predictions against the -1/1 labels `y`
# (f1_score_ comes from functions; presumably the F1 score - confirm).
# Both arrays are training-set arrays and the threshold was tuned on the same
# data, so the displayed value is an optimistic, in-sample figure.
f1_score_(y, y_pred_)

0.41686046511627906

### Grid search with cross-validation to find the best hyperparameters

In [16]:
# Candidate hyperparameter values explored by the grid searches below.
# Each key maps to the list of values to try for that hyperparameter.
param_grid = dict(
    max_iters=[1500],
    gamma=[1, 0.1, 0.001],
    lambda_=[1e-4, 1e-5, 1e-6],
)

In [39]:
# Grid search for the unregularised model, starting every fit from zero weights.
# Note: `losses` is rebound here, replacing the history returned by the baseline fit.
# The log printed by this search only lists max_iters and gamma, so the
# `lambda_` entries of param_grid appear to be ignored by it - confirm.
w_grid, best_params, losses = grid_search_logistic_regression(
    y_train,
    x_train,
    param_grid,
    np.zeros(x_train.shape[1]),
)

Max Iters: 1500, Gamma: 1, Avg Loss: 0.22977808706011765
Max Iters: 1500, Gamma: 0.1, Avg Loss: 0.22991349322823157
Max Iters: 1500, Gamma: 0.001, Avg Loss: 0.4900182968043444


In [40]:
# Show the hyperparameter combination returned by the unregularised grid search.
print("Best hyperparameters found: ", best_params)

Best hyperparameters found:  {'max_iters': 1500, 'gamma': 1}


In [41]:
# Sigmoid outputs of the grid-search model on the training matrix (values in (0, 1)).
y_train_pred = imp.sigmoid(np.dot(x_train, w_grid))

# Threshold chosen by best_threshold against the -1/1 labels `y`;
# it is reused to binarise the test predictions for the submission.
optimal_threshold = best_threshold(y_train_pred, y)
print("Optimal threshold found: ", optimal_threshold)

Optimal threshold found:  0.19


In [42]:
# Score the submission matrix with the grid-search weights and binarise the
# result with `optimal_threshold` (tuned on the training predictions).
# Dedicated names are used on purpose: `y_pred` / `y_pred_` hold the
# training-set predictions read by the F1 cell, and rebinding them to arrays
# with one entry per *test* row would break that cell on a re-run.
y_test_proba = imp.sigmoid(np.dot(x_test, w_grid))
y_test_pred = np.where(y_test_proba > optimal_threshold, 1, -1)  # labels in -1/1

# Write the predictions, keyed by the test ids, to the submission file.
hlp.create_csv_submission(test_ids_, y_test_pred, "logistic_regression24102147.csv")

### Testing an Adam optimizer

In [59]:
# Fit the model with the Adam optimiser from functions.py, starting from zero
# weights. 1500 and 0.1 are presumably the iteration count and the step size,
# mirroring the plain logistic-regression call - confirm against adam_optimizer.
w_adam = adam_optimizer(y_train, x_train, np.zeros(x_train.shape[1]), 1500, 0.1)

# Tune the decision threshold on the training predictions.
# `y` holds the -1/1 labels, y_train_adam holds sigmoid outputs in (0, 1).
# The threshold gets its own name so that the `optimal_threshold` used by the
# logistic-regression submission cell is not silently replaced when cells are
# executed out of order.
y_train_adam = imp.sigmoid(np.dot(x_train, w_adam))
optimal_threshold_adam = best_threshold(y_train_adam, y)

# Probabilities and hard labels are kept in separate variables, which makes
# this cell safe to re-run and keeps the dtype of each name stable.
y_proba_adam = imp.sigmoid(np.dot(x_test, w_adam))
y_pred_adam = np.where(y_proba_adam > optimal_threshold_adam, 1, -1)  # labels in -1/1

hlp.create_csv_submission(test_ids_, y_pred_adam, "adam_final.csv")

### Logistic Regression with Regularization 

In [45]:
# Grid search for the regularised model over every (max_iters, gamma, lambda_)
# combination in param_grid, starting each fit from zero weights.
# Note: `best_params` and `losses` are rebound here, replacing the values
# returned by the unregularised search.
# A single fit with fixed hyperparameters can be obtained instead by calling
# imp.reg_logistic_regression directly.
w_reg, best_params, losses = grid_search_reg_logistic_regression(
    y_train,
    x_train,
    param_grid,
    np.zeros(x_train.shape[1]),
)

Max Iters: 1500, Gamma: 1, Lambda: 0.0001, Avg Loss: 0.2297606158482602
Max Iters: 1500, Gamma: 1, Lambda: 1e-05, Avg Loss: 0.22976041756546514
Max Iters: 1500, Gamma: 1, Lambda: 1e-06, Avg Loss: 0.2297616920874618
Max Iters: 1500, Gamma: 0.1, Lambda: 0.0001, Avg Loss: 0.22993585946287043
Max Iters: 1500, Gamma: 0.1, Lambda: 1e-05, Avg Loss: 0.2298919415847535
Max Iters: 1500, Gamma: 0.1, Lambda: 1e-06, Avg Loss: 0.22988811346281293
Max Iters: 1500, Gamma: 0.001, Lambda: 0.0001, Avg Loss: 0.4900305808722318
Max Iters: 1500, Gamma: 0.001, Lambda: 1e-05, Avg Loss: 0.4900109562359458
Max Iters: 1500, Gamma: 0.001, Lambda: 1e-06, Avg Loss: 0.49000899364689304


In [46]:
# Show the hyperparameter combination returned by the regularised grid search
# (`best_params` was rebound by that search).
print("Best hyperparameters found: ", best_params)

Best hyperparameters found:  {'max_iters': 1500, 'gamma': 1, 'lambda_': 1e-05}


In [47]:
# Sigmoid outputs of the regularised model on the training matrix (values in (0, 1)).
y_train_reg = imp.sigmoid(np.dot(x_train, w_reg))

# Threshold chosen by best_threshold against the -1/1 labels `y`
# (the 0/1 `y_train` is only used for fitting).
optimal_threshold_reg = best_threshold(y_train_reg, y)
print("Optimal threshold found for the regularized model: ", optimal_threshold_reg)

Optimal threshold found for the regularized model:  0.19


In [48]:
# Score the submission matrix with the regularised weights and binarise in one
# step: sigmoid outputs strictly above the tuned threshold become 1, the rest -1.
y_pred_reg = np.where(
    imp.sigmoid(np.dot(x_test, w_reg)) > optimal_threshold_reg,
    1,
    -1,
)

# Write the predictions, keyed by the test ids, to the submission file.
hlp.create_csv_submission(test_ids_, y_pred_reg, "y_pred_reg27101206.csv")