# This method is (normally) reproducible for logistic regression

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
from implementations import *
%load_ext autoreload
%autoreload 2

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = r'C:\Users\Lolo\Documents\Master_EPFL\Ma3\ML\ML_course\projects\project1\data\train.csv\train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
features_names = ['Id', 'Prediction', 'DER_mass_MMC', 'DER_mass_transverse_met_lep',
       'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet',
       'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_deltar_tau_lep',
       'DER_pt_tot', 'DER_sum_pt', 'DER_pt_ratio_lep_tau',
       'DER_met_phi_centrality', 'DER_lep_eta_centrality', 'PRI_tau_pt',
       'PRI_tau_eta', 'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta',
       'PRI_lep_phi', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet',
       'PRI_jet_num', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta',
       'PRI_jet_leading_phi', 'PRI_jet_subleading_pt',
       'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi',
       'PRI_jet_all_pt']

In [4]:
print(y.shape)

(250000,)


In [5]:
print(tX.shape)

(250000, 30)


### Preprocessing

In [6]:
def clean_data(tX):
    '''Set NaN values to the value of the mean for each feature'''
    
    tX[tX <= -999] = np.nan
    col_mean = np.nanmean(tX, axis=0)
    inds = np.where(np.isnan(tX))
    tX[inds] = np.take(col_mean, inds[1])
    
    return tX

In [7]:
def normalize_data(tX):
    
    temp_mean = []
    temp_min_max = []
    
    for i in range(tX.shape[1]):
        temp_mean.append(np.mean(tX[:,i]))
        temp_min_max.append((np.max(tX[:,i]) - np.min(tX[:,i])))
        tX[:,i] = tX[:,i] - np.mean(tX[:,i])
        tX[:,i] = tX[:,i] / (np.max(tX[:,i]) - np.min(tX[:,i]))
    
    tX[:,22] = temp_min_max[22]*tX[:,22]
    tX[:,22] += temp_mean[22]
    
    return tX

In [8]:
tX = clean_data(tX)
tX = normalize_data(tX)

In [17]:
def polynomial_regression_building(x_train, x_test, degree):
    """Creation of a polynomial regression for the current feature matrix.
       This method avoids the modification of the categorical feature, 
       needs to be called to modify the testing set """
    
    # Avoid feature 22 (= categorical) for polynomial augmentation 
    x_train_categorical = x_train[:,22]
    print(x_train_categorical.shape)
    x_test_categorical = x_test[:,22]
    x_train = np.delete(x_train,[22],axis=1)
    x_test = np.delete(x_test,[22],axis=1)
    
    print("x_train shape: ", x_train.shape)
    
    # Build polynomial train and test sets
    x_train = build_poly(x_train, degree)
    x_test = build_poly(x_test, degree)
    
    # Take feature 22 back
    x_train = np.c_[x_train, x_train_categorical]
    x_test = np.c_[x_test, x_test_categorical]
    
    print("x_train shape final: ", x_train.shape)
    
    # Initialize
    initial_w = np.zeros(x_train.shape[1])
    print("w shape: ", initial_w.shape)
    
    return initial_w, x_train, x_test

### Determination of best degree and gamma by CV

In [18]:
def cross_validation_gamma(y, x, k_indices, k, gamma, choose_model, models, degree=1):
    """Cross validation for each models"""

    x_test = x[k_indices[k]]
    y_test = y[k_indices[k]]
    x_train = x[np.concatenate(([x_train for i,x_train in enumerate(k_indices) if i!=k]), axis=0)]
    y_train = y[np.concatenate(([y_train for i,y_train in enumerate(k_indices) if i!=k]), axis=0)]
    
    initial_w, x_train, x_test = polynomial_regression_building(x_train, x_test, degree)
    
    # Calculate loss and optimal weights for each model
    
    if choose_model == models[0]:
        w, loss = least_squares_GD(y_train, x_train, initial_w, max_iters, gamma)
    elif choose_model == models[1]:
        w, loss = least_squares_SGD(y_train, x_train, initial_w, max_iters, gamma)
    elif choose_model == models[2]:
        w, loss = least_squares(y_train, x_train) 
    else:
        w, loss = logistic_regression(y_train, x_train, initial_w, max_iters, gamma)

    #loss_tr = compute_loss(y_train,x_train,w)
    #loss_te = compute_loss(y_test,x_test,w)
    
    #Note: We can also use mae to obtain the loss
    
    # To uncomment if we decide to plot rmse
    loss_tr = np.sqrt(2*compute_loss(y_train,x_train,w))
    loss_te = np.sqrt(2*compute_loss(y_test,x_test,w))

    return loss_tr, loss_te, w

### Use ctr+y to turn to code, can be modified to create the loss evolution by boxplot representation

seed = 1
degree = 1 # degree parameter not used until polynomial regression is tested
k_fold = 4
gammas = np.linspace(0, 1, 10)
max_iters = 10
initial_w = np.zeros(tX.shape[1])

# Define the model (change indice to define which model to test)
models = ["GD", "SGD", "LS", "LRGD"]   
choose_model = models[0]

# split data in k fold
k_indices = build_k_indices(y, k_fold, seed)

# define lists to store the loss of training data and test data
mse_tr = []
mse_te = []

for ind, gamma in enumerate(gammas):
    temp_tr = []
    temp_te = []
    for k in range(k_fold):
        loss_tr, loss_te,_ = cross_validation_gamma(y, tX, initial_w, k_indices, k, gamma, choose_model, models, degree)
        temp_tr.append(loss_tr)
        temp_te.append(loss_te)
    mse_tr.append(temp_tr)
    mse_te.append(temp_te)
plt.boxplot(mse_te)

In [14]:
seed = 1
degrees = [1, 3] # degree parameter not used until polynomial regression is tested
k_fold = 4
gammas = np.logspace(-8, 1, 30)
max_iters = 100

# Define the model (change indice to define which model to test)
models = ["GD", "SGD", "LS", "LRGD"]   
choose_model = models[0]

# split data in k fold
k_indices = build_k_indices(y, k_fold, seed)

temp_mse_tr = []
temp_mse_te = []
for degree in degrees:
# define lists to store the loss of training data and test data
    mse_tr = []
    mse_te = []

    for ind, gamma in enumerate(gammas):
        temp_tr = 0
        temp_te = 0

        for k in range(k_fold):
            loss_tr, loss_te,_ = cross_validation_gamma(y, tX, k_indices, k, gamma, choose_model, models, degree)
            temp_tr += loss_tr
            temp_te += loss_te
        mse_tr.append(temp_tr/k_fold)
        mse_te.append(temp_te/k_fold)
    min_gamma = np.argmin(mse_te)
    temp_mse_te.append(mse_te[min_gamma])
    
len(temp_mse_te)
min_degree = np.argmin(temp_mse_te)
print('optimal gamma: ', gammas[min_gamma], 'optimal degree: ', degrees[min_degree])

(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)


x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 31)
w shape:  (31,)
(187500,)
x_train shape:  (187500, 29)
x_train sh

(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)
x_train shape:  (187500, 29)
x_train shape final:  (187500, 89)
w shape:  (89,)
(187500,)


### Generation of the weights using the optimized gamma and degree

In [19]:
x_train, x_test, y_train, y_test = split_data(tX, y, 0.8, 1)

degree = 3
gamma = 0.5736152510448681
#lambda_ = 0.02
max_iters = 500

w, x_train, x_test = polynomial_regression_building(x_train, x_test, degree)

w, loss = least_squares_GD(y_train, x_train, w, max_iters, gamma)
rmse_te = compute_loss(y_test, x_test, w)

(200000,)
x_train shape:  (200000, 29)
x_train shape final:  (200000, 89)
w shape:  (89,)
0.35134814735063297
[-2.88042074e-01 -7.15835725e-02 -2.12074852e+00 -8.32645850e-02
  3.08125393e-01  5.47846710e-01  3.28894611e-01 -2.34378203e-01
  6.48497919e-01 -8.55612153e-02  5.61002111e-01 -1.29187349e+00
  4.06006645e-01  3.17153531e-01  1.15489252e+00 -5.16423615e-03
 -4.36335445e-03 -6.36917369e-02  7.89247805e-03  7.42082898e-03
 -7.50255158e-03  1.09783685e-02  2.77167338e-01  2.82814130e-01
 -2.34076578e-03  3.43967648e-03 -1.44075818e-01  1.16289640e-02
 -1.38374808e-02  1.14904355e-01 -2.72382574e-01  6.11768671e-02
 -1.07794070e-01  2.02591258e-02  5.19449841e-01  9.89076118e-02
  1.49293061e-01 -7.12012740e-01  2.37193573e-03 -6.65262432e-02
  2.38909530e-02  5.51027336e-01  5.95850049e-01  1.06531473e-02
 -7.91348220e-01 -1.19621629e-03  1.43819792e-02 -1.01156928e+00
 -6.79934377e-03  7.72908682e-03  1.03122486e-02 -9.22597265e-02
 -1.01855222e-02  8.40430158e-01  9.57605274e

In [20]:
print(rmse_te)
print(w)

0.35134814735063297
[-2.88042074e-01 -7.15835725e-02 -2.12074852e+00 -8.32645850e-02
  3.08125393e-01  5.47846710e-01  3.28894611e-01 -2.34378203e-01
  6.48497919e-01 -8.55612153e-02  5.61002111e-01 -1.29187349e+00
  4.06006645e-01  3.17153531e-01  1.15489252e+00 -5.16423615e-03
 -4.36335445e-03 -6.36917369e-02  7.89247805e-03  7.42082898e-03
 -7.50255158e-03  1.09783685e-02  2.77167338e-01  2.82814130e-01
 -2.34076578e-03  3.43967648e-03 -1.44075818e-01  1.16289640e-02
 -1.38374808e-02  1.14904355e-01 -2.72382574e-01  6.11768671e-02
 -1.07794070e-01  2.02591258e-02  5.19449841e-01  9.89076118e-02
  1.49293061e-01 -7.12012740e-01  2.37193573e-03 -6.65262432e-02
  2.38909530e-02  5.51027336e-01  5.95850049e-01  1.06531473e-02
 -7.91348220e-01 -1.19621629e-03  1.43819792e-02 -1.01156928e+00
 -6.79934377e-03  7.72908682e-03  1.03122486e-02 -9.22597265e-02
 -1.01855222e-02  8.40430158e-01  9.57605274e-02 -3.49215881e-02
  3.91890604e-01  1.17585692e-01 -9.48765212e-02 -6.18105480e-02
  1.6