In [34]:
import numpy as np
import matplotlib.pyplot as plt
import preprocessing as pp
import implementations as imp
import helpers as hlp
import os
from functions import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data cleaning and Preprocessing

In [None]:
# Optional fast path: load the data arrays from .npy files instead of parsing
# the CSV files. The directory can be overridden with the DATA_CACHE_DIR
# environment variable; when it is unset the original location is used.
path = os.environ.get("DATA_CACHE_DIR", "/Users/maelynenguyen/Desktop/")

x_train_ = np.load(os.path.join(path, "f_x_train_.npy"))
x_test_ = np.load(os.path.join(path, "f_x_test_.npy"))
y_train_ = np.load(os.path.join(path, "f_y_train_.npy"))
# The id arrays use the trailing-underscore names that the CSV loading cell
# produces and that the submission cells read (`test_ids_`); binding only the
# un-suffixed names would make those cells fail with a NameError.
test_ids_ = np.load(os.path.join(path, "f_test_ids_.npy"))
train_ids_ = np.load(os.path.join(path, "f_train_ids_.npy"))

# Un-suffixed aliases kept so any code still using the old names keeps working.
test_ids, train_ids = test_ids_, train_ids_

In [2]:
# Load the raw dataset from the `dataset_to_release` folder that sits in the
# current working directory.
path = os.getcwd() + '/dataset_to_release'
(
    x_train_,
    x_test_,
    y_train_,
    train_ids_,
    test_ids_,
) = hlp.load_csv_data(path)

In [108]:
def _append_bias_column(features):
    """Return `features` with one extra trailing column filled with ones."""
    bias = np.ones((features.shape[0], 1))
    return np.concatenate((features, bias), axis=1)


# Clean the training features/labels together with the features to predict.
x, x_submit, y = pp.clean_data(x_train_,  y_train_, x_test_)

# A constant column is appended so the model can learn a bias (intercept) term.
x_train = _append_bias_column(x)
x_test = _append_bias_column(x_submit)

# Sanity check: exactly one column was added to the training features.
assert x_train.shape[1] == x.shape[1] + 1

# Recode the labels for logistic regression: -1 becomes 0, everything else 1,
# stored as integers. `y` itself is left untouched.
y_train = np.where(y == -1, 0, 1).astype(int)


Max median NaN score rows :  0.5688708393430034
Max median NaN score columns :  0.7598051755967697
Number of rows dropped because of a NaN score > 0.5:  29455
Number of columns dropped because of a NaN score > 0.5:  149
Number of columns with std < 0.1: 5
Number of columns with correl_coef > 0.95: 19
Handling NaN values...
Number of columns with corr_coef> 0.95 after cleaning: 2
The data has been cleaned and standardized
The cleaned x-data has the following shape:  (298680, 146)
The cleaned y-data has the following shape:  (298680,)
The cleaned x-data-to-predict has the following shape:  (109379, 146)


## Logistic Regression

### Test for specific hyperparameters

In [109]:
# One training run with hand-picked settings, starting from all-zero weights
# (one weight per column of x_train, bias column included).
# NOTE(review): 1500 and 0.1 presumably are max_iters and gamma (they match the
# values used in the grid search); the meaning of the trailing True is not
# visible here -- confirm against imp.logistic_regression.
w_start = np.zeros(x_train.shape[1])
w, loss, losses = imp.logistic_regression(
    y_train,
    x_train,
    w_start,
    1500,
    0.1,
    True,
)

In [110]:
# Sigmoid of the linear scores on the training set.
train_scores = x_train @ w
y_pred = imp.sigmoid(train_scores)

# The threshold is chosen against `y`, the labels in their original -1/1
# encoding (not the 0/1 `y_train` used for fitting).
thr = best_threshold(y_pred, y)

# Turn the sigmoid outputs into -1/1 labels: strictly above the threshold -> 1.
y_pred_ = np.where(y_pred > thr, 1, -1)

In [113]:
# Score the thresholded training predictions against the original -1/1 labels
# with the project's `f1_score_` helper; the value is shown as the cell output.
f1_score_(y, y_pred_)

0.4131290271431249

### Grid search with cross-validation to find the best hyperparameters

In [114]:
# Candidate hyperparameter values explored by the grid search.
# NOTE(review): the recorded grid-search output only reports max_iters and
# gamma -- confirm that `lambda_` is actually used by
# grid_search_logistic_regression.
param_grid = dict(
    max_iters=[1500],
    gamma=[1, 0.1, 0.001],
    lambda_=[1e-4, 1e-5, 1e-6],
)

In [77]:
# Run the grid search over `param_grid`, starting every fit from all-zero
# weights. Note that this rebinds `losses` from the earlier single run.
w_grid, best_params, losses = grid_search_logistic_regression(
    y_train,
    x_train,
    param_grid,
    np.zeros(x_train.shape[1]),
)


Max Iters: 1500, Gamma: 1, Avg Loss: 0.2300075615646316
Max Iters: 1500, Gamma: 0.1, Avg Loss: 0.23050654885438465
Max Iters: 1500, Gamma: 0.001, Avg Loss: 0.48293189202025494


In [78]:
# Report the hyperparameter combination returned by the grid search.
print("Best hyperparameters found: ", best_params)

Best hyperparameters found:  {'max_iters': 1500, 'gamma': 1}


In [79]:
# Sigmoid outputs of the grid-search model on the training set.
grid_train_scores = x_train @ w_grid
y_train_pred = imp.sigmoid(grid_train_scores)

# Pick the decision threshold against the original -1/1 labels `y`.
optimal_threshold = best_threshold(y_train_pred, y)
print("Optimal threshold found: ", optimal_threshold)


Optimal threshold found:  0.2


In [82]:
# Sigmoid outputs of the grid-search model on the data to predict.
y_pred = imp.sigmoid(x_test @ w_grid)

# Apply the threshold chosen on the training set to get -1/1 labels.
y_pred_ = np.where(y_pred > optimal_threshold, 1, -1)

# Write the submission file for the test ids.
hlp.create_csv_submission(
    test_ids_,
    y_pred_,
    "logistic_regression24102147.csv",
)

### Testing an Adam optimizer

In [116]:
# Fit the weights with the Adam optimizer, starting from all-zero weights.
adam_w_start = np.zeros(x_train.shape[1])
w_adam = adam_optimizer(y_train, x_train, adam_w_start, 1500, 0.1)

# Sigmoid outputs on the training set, then the decision threshold chosen
# against `y` (original -1/1 labels; `y_train` is the 0/1 recoding).
# This rebinds `optimal_threshold` from the grid-search section.
adam_train_scores = np.dot(x_train, w_adam)
y_train_adam = imp.sigmoid(adam_train_scores)
optimal_threshold = best_threshold(y_train_adam, y)

# Sigmoid outputs on the data to predict, converted to -1/1 labels with the
# threshold found above (`y_pred_adam` ends up holding the labels).
adam_test_proba = imp.sigmoid(np.dot(x_test, w_adam))
y_pred_adam = np.where(adam_test_proba > optimal_threshold, 1, -1)

# Write the submission file for the test ids.
hlp.create_csv_submission(
    test_ids_,
    y_pred_adam,
    "y_pred_adam25101206.csv",
)