#### Packages

In [None]:
import numpy as np
from extra_helpers import *
from feature_importance import *
from proj1_helpers import *
from data_processing import *
from implementations import *
from objective_functions import *
from run_functions import *

## Data

In [None]:
## Step 1 - Read the training file: response vector, data matrix and ids.
y, tX, ids = load_csv_data("data/train.csv")

## Step 2 - Recode every "-1" label as "0", the convention used by
##          most of the scientific literature.
y[y == -1] = 0

## Step 3 - Move the categorical feature (PRI_jet_num) to the last column of the
##          data matrix via rearrange_continuous_categorical_features().
tX = rearrange_continuous_categorical_features(tX)

In [None]:
def _prepare_jet_subset(features, labels, row_index):
    """Extract and clean the training subset of one PRI_jet_num value.

    Args:
        features: full data matrix.
        labels: response vector aligned with the rows of ``features``.
        row_index: positions of the rows that belong to the subset.

    Returns:
        A tuple ``(subset, subset_labels, null_var_index)`` where ``subset``
        is the cleaned data matrix, ``subset_labels`` the matching responses
        and ``null_var_index`` the positions of the columns that were removed
        because their standard deviation inside the subset is zero.
    """
    subset = features[row_index, :]
    subset_labels = labels[row_index]
    # Columns that are constant inside the subset are dropped; their positions
    # are returned so the same columns can be removed elsewhere.
    null_var_index = np.where(np.std(subset, axis=0) == 0)[0]
    subset = np.delete(subset, null_var_index, axis=1)
    # -999 marks a missing value: turn it into NaN, then impute with the median.
    subset[subset == -999] = np.nan
    subset = median_imputation(subset)
    return subset, subset_labels, null_var_index


## 1 - The last column is the categorical feature: isolate it and collect the
##     row indexes of each case, i.e. PRI_jet_num = 0, 1, 2 or 3.
categories = tX[:, -1]
zeros_index = np.where(categories == 0)[0]
one_index = np.where(categories == 1)[0]
two_index = np.where(categories == 2)[0]
three_index = np.where(categories == 3)[0]

## 2 - Build the four datasets matching the four cases of PRI_jet_num.
##     Each one goes through the same steps (see _prepare_jet_subset):
##      - null-variance features are identified and deleted,
##      - all -999 values become np.nan,
##      - missing values are imputed with the median imputation technique.
zeros, y_zero, null_var_index_zero = _prepare_jet_subset(tX, y, zeros_index)
ones, y_one, null_var_index_one = _prepare_jet_subset(tX, y, one_index)
two, y_two, null_var_index_two = _prepare_jet_subset(tX, y, two_index)
three, y_three, null_var_index_three = _prepare_jet_subset(tX, y, three_index)

## Searching the optimal data augmentation for each datasets

After concluding that the optimal penalty parameter ($\lambda^{*}_{i}$) was zero for every model, we lightened the code by deleting this part. For the sake of transparency, we leave visible, as a text cell, the code we developed using the golden-section search algorithm for the model $i = 0$. In the same way, we noted that pairwise interactions were always beneficial.

The following cells show how we found the most adequate polynomial degree for each model $i$.

The procedure is the following :

For each degree $j = 1, 2, \dots, 20$ of the polynomial augmentation:

    1 - We augment the data.
    2 - We scale the data.
    3 - We apply a change of basis.
    4 - We add a bias term.
    5 - We perform 5-fold cross validation.

### PRI_jet_num = 0

In [None]:
## Degree search for the PRI_jet_num = 0 subset: one 5-fold cross-validation
## per polynomial degree from 1 to 20.
## NOTE(review): scaling and the change of basis are computed on the whole subset
## before cross_validation is called - confirm this is intended.
means_0, medians_0, stds_0 = [], [], []
for degree in range(1, 21):
    print("*****************")
    print("\n")
    print(f"Polynomial of degree = {degree}")
    # Data augmentation: polynomial terms and pairwise interactions, no bias yet.
    design = process_data(x=zeros, degree=degree, pairwise=True, bias=False)
    # Scaling; the two extra outputs are not used during the search.
    design, _, __ = gaussian_scaling(design)
    # Change of basis; the second output is not used during the search.
    design, _basis = orthogonal_basis(design)
    # Bias added after scaling and the change of basis.
    design = process_data(x=design, degree=0, pairwise=False, bias=True)
    # Cross-validating; only the last three outputs are recorded.
    _first, cv_mean, cv_median, cv_std = cross_validation(y_zero, design, k_fold=5)
    means_0.append(cv_mean)
    medians_0.append(cv_median)
    stds_0.append(cv_std)
    print("\n")

### PRI_jet_num = 1

In [None]:
## Degree search for the PRI_jet_num = 1 subset: one 5-fold cross-validation
## per polynomial degree from 1 to 20.
## NOTE(review): scaling and the change of basis are computed on the whole subset
## before cross_validation is called - confirm this is intended.
means_1, medians_1, stds_1 = [], [], []
for degree in range(1, 21):
    print("*****************")
    print("\n")
    print(degree)
    # Data augmentation: polynomial terms and pairwise interactions, no bias yet.
    design = process_data(x=ones, degree=degree, pairwise=True, bias=False)
    # Scaling; the two extra outputs are not used during the search.
    design, _, __ = gaussian_scaling(design)
    # Change of basis; the second output is not used during the search.
    design, _basis = orthogonal_basis(design)
    # Bias added after scaling and the change of basis.
    design = process_data(x=design, degree=0, pairwise=False, bias=True)
    # Cross-validating; only the last three outputs are recorded.
    _first, cv_mean, cv_median, cv_std = cross_validation(y_one, design, k_fold=5)
    means_1.append(cv_mean)
    medians_1.append(cv_median)
    stds_1.append(cv_std)

### PRI_jet_num = 2

In [None]:
## Degree search for the PRI_jet_num = 2 subset: one 5-fold cross-validation
## per polynomial degree from 1 to 20.
## NOTE(review): scaling and the change of basis are computed on the whole subset
## before cross_validation is called - confirm this is intended.
means_2, medians_2, stds_2 = [], [], []
for degree in range(1, 21):
    print("*****************")
    print("\n")
    print(degree)
    # Data augmentation: polynomial terms and pairwise interactions, no bias yet.
    design = process_data(x=two, degree=degree, pairwise=True, bias=False)
    # Scaling; the two extra outputs are not used during the search.
    design, _, __ = gaussian_scaling(design)
    # Change of basis; the second output is not used during the search.
    design, _basis = orthogonal_basis(design)
    # Bias added after scaling and the change of basis.
    design = process_data(x=design, degree=0, pairwise=False, bias=True)
    # Cross-validating; only the last three outputs are recorded.
    _first, cv_mean, cv_median, cv_std = cross_validation(y_two, design, k_fold=5)
    means_2.append(cv_mean)
    medians_2.append(cv_median)
    stds_2.append(cv_std)

### PRI_jet_num = 3

In [None]:
## Degree search for the PRI_jet_num = 3 subset: one 5-fold cross-validation
## per polynomial degree from 1 to 20.
## NOTE(review): scaling and the change of basis are computed on the whole subset
## before cross_validation is called - confirm this is intended.
means_3, medians_3, stds_3 = [], [], []
for degree in range(1, 21):
    print("\n")
    print("*****************")
    print("\n")
    print(degree)
    # Data augmentation: polynomial terms and pairwise interactions, no bias yet.
    design = process_data(x=three, degree=degree, pairwise=True, bias=False)
    # Scaling; the two extra outputs are not used during the search.
    design, _, __ = gaussian_scaling(design)
    # Change of basis; the second output is not used during the search.
    design, _basis = orthogonal_basis(design)
    # Bias added after scaling and the change of basis.
    design = process_data(x=design, degree=0, pairwise=False, bias=True)
    # Cross-validating; only the last three outputs are recorded.
    _first, cv_mean, cv_median, cv_std = cross_validation(y_three, design, k_fold=5)
    means_3.append(cv_mean)
    medians_3.append(cv_median)
    stds_3.append(cv_std)

## Model training

At this stage, the optimal parameters for all four models have been determined. In this section, we train the four models, starting with Newton's method (because of the nice convergence properties of this algorithm); if the optimization stops prematurely because of a singular Hessian or because of numerical instabilities, we continue with a gradient descent algorithm.

### Model$_{PRI jet num = 0}$ i.e. $w_{0}$

In [None]:
## Design matrix of the PRI_jet_num = 0 model, built with the settings retained
## for it: degree-13 polynomial augmentation with pairwise interactions.
tx_zeros = process_data(
    x=zeros,
    degree=13,
    pairwise=True,
    bias=False,
)
## Scaling; the two extra outputs are kept for the test-set processing.
tx_zeros, mean_tx_zeros, std_tx_zeros = gaussian_scaling(tx_zeros)
## Change of basis; the second output is kept for the test-set processing.
tx_zeros, tosolve_tx_zeros = orthogonal_basis(tx_zeros)
## Bias term, added after scaling and the change of basis.
tx_zeros = process_data(
    x=tx_zeros,
    degree=0,
    pairwise=False,
    bias=True,
)

In [None]:
## Stage 1 - Newton's method, started from a zero vector with one entry per
##           column of tx_zeros.
loss_0, w_0, grad_norm_0 = logistic_newton_descent(
    y_zero,
    tx_zeros,
    w=np.zeros(tx_zeros.shape[1]),
    lambda_=0,
    max_iters=1000,
    eps=1e-10,
    w_start_OLS=True,
)
## Stage 2 - Gradient descent, started from the weights returned by stage 1.
##           The loss, weights and gradient norm of stage 1 are overwritten.
loss_0, w_0, grad_norm_0 = logistic_gradient_descent(
    y_zero,
    tx_zeros,
    w=w_0,
    max_iters=30000,
    lambda_=0,
    gamma=0.05,
    eps=1e-4,
    w_start_OLS=False,
)

In [None]:
## 1 - We search for the optimal threshold that maximizes the accuracy.
## 2 - We check the in-sample performance of our model.
## The predicted probabilities are computed once and reused for both steps.
proba_0 = sigmoid(tx_zeros @ w_0)
thresh_0 = threshold(y_zero, proba_0)
pred = (proba_0 > thresh_0) * 1
## Accuracy = 1 - misclassification rate; np.sum counts the errors without a Python-level loop.
accuracy = 1 - np.sum(np.abs(pred - y_zero)) / len(y_zero)
accuracy

### Model$_{PRI jet num = 1}$ i.e. $w_{1}$

In [None]:
## Design matrix of the PRI_jet_num = 1 model, built with the settings retained
## for it: degree-17 polynomial augmentation with pairwise interactions.
tx_ones = process_data(
    x=ones,
    degree=17,
    pairwise=True,
    bias=False,
)
## Scaling; the two extra outputs are kept for the test-set processing.
tx_ones, mean_tx_ones, std_tx_ones = gaussian_scaling(tx_ones)
## Change of basis; the second output is kept for the test-set processing.
tx_ones, tosolve_tx_ones = orthogonal_basis(tx_ones)
## Bias term, added after scaling and the change of basis.
tx_ones = process_data(
    x=tx_ones,
    degree=0,
    pairwise=False,
    bias=True,
)

In [None]:
## Stage 1 - Newton's method, started from a zero vector with one entry per
##           column of tx_ones.
loss_1, w_1, grad_norm_1 = logistic_newton_descent(
    y_one,
    tx_ones,
    w=np.zeros(tx_ones.shape[1]),
    lambda_=0,
    max_iters=1000,
    eps=1e-10,
    w_start_OLS=True,
)
## Stage 2 - Gradient descent, started from the weights returned by stage 1.
##           The loss, weights and gradient norm of stage 1 are overwritten.
##           Note the stopping tolerance here is 2e-4.
loss_1, w_1, grad_norm_1 = logistic_gradient_descent(
    y_one,
    tx_ones,
    w=w_1,
    max_iters=30000,
    lambda_=0,
    gamma=0.05,
    eps=2e-4,
    w_start_OLS=False,
)

In [None]:
## 1 - We search for the optimal threshold that maximizes the accuracy.
## 2 - We check the in-sample performance of our model.
## The predicted probabilities are computed once and reused for both steps.
proba_1 = sigmoid(tx_ones @ w_1)
thresh_1 = threshold(y_one, proba_1)
pred = (proba_1 > thresh_1) * 1
## Accuracy = 1 - misclassification rate; np.sum counts the errors without a Python-level loop.
accuracy = 1 - np.sum(np.abs(pred - y_one)) / len(y_one)
accuracy

### Model$_{PRI jet num = 2}$ i.e. $w_{2}$

In [None]:
## Design matrix of the PRI_jet_num = 2 model, built with the settings retained
## for it: degree-13 polynomial augmentation with pairwise interactions.
tx_two = process_data(
    x=two,
    degree=13,
    pairwise=True,
    bias=False,
)
## Scaling; the two extra outputs are kept for the test-set processing.
tx_two, mean_tx_two, std_tx_two = gaussian_scaling(tx_two)
## Change of basis; the second output is kept for the test-set processing.
tx_two, tosolve_tx_two = orthogonal_basis(tx_two)
## Bias term, added after scaling and the change of basis.
tx_two = process_data(
    x=tx_two,
    degree=0,
    pairwise=False,
    bias=True,
)

In [None]:
## Newton's method & gradient descent.
## Stage 1 - Newton's method, started from a zero vector with one entry per
##           column of tx_two.
loss_2, w_2, grad_norm_2 = logistic_newton_descent(
    y_two,
    tx_two,
    w=np.zeros(tx_two.shape[1]),
    lambda_=0,
    max_iters=1000,
    eps=1e-10,
    w_start_OLS=True,
)
## Stage 2 - Gradient descent, started from the weights returned by stage 1.
##           The loss, weights and gradient norm of stage 1 are overwritten.
loss_2, w_2, grad_norm_2 = logistic_gradient_descent(
    y_two,
    tx_two,
    w=w_2,
    max_iters=30000,
    lambda_=0,
    gamma=0.05,
    eps=1e-4,
    w_start_OLS=False,
)

In [None]:
## 1 - We search for the optimal threshold that maximizes the accuracy.
## 2 - We check the in-sample performance of our model.
## The predicted probabilities are computed once and reused for both steps.
proba_2 = sigmoid(tx_two @ w_2)
thresh_2 = threshold(y_two, proba_2)
pred = (proba_2 > thresh_2) * 1
## Accuracy = 1 - misclassification rate; np.sum counts the errors without a Python-level loop.
accuracy = 1 - np.sum(np.abs(pred - y_two)) / len(y_two)
accuracy

### Model$_{PRI jet num = 3}$ i.e. $w_{3}$

In [None]:
## Design matrix of the PRI_jet_num = 3 model, built with the settings retained
## for it: degree-10 polynomial augmentation with pairwise interactions.
tx_three = process_data(
    x=three,
    degree=10,
    pairwise=True,
    bias=False,
)
## Scaling; the two extra outputs are kept for the test-set processing.
tx_three, mean_tx_three, std_tx_three = gaussian_scaling(tx_three)
## Change of basis; the second output is kept for the test-set processing.
tx_three, tosolve_tx_three = orthogonal_basis(tx_three)
## Bias term, added after scaling and the change of basis.
tx_three = process_data(
    x=tx_three,
    degree=0,
    pairwise=False,
    bias=True,
)

In [None]:
## Stage 1 - Newton's method, started from a zero vector with one entry per
##           column of tx_three.
loss_3, w_3, grad_norm_3 = logistic_newton_descent(
    y_three,
    tx_three,
    w=np.zeros(tx_three.shape[1]),
    lambda_=0,
    max_iters=1000,
    eps=1e-10,
    w_start_OLS=True,
)
## Stage 2 - Gradient descent, started from the weights returned by stage 1.
##           The loss, weights and gradient norm of stage 1 are overwritten.
loss_3, w_3, grad_norm_3 = logistic_gradient_descent(
    y_three,
    tx_three,
    w=w_3,
    max_iters=30000,
    lambda_=0,
    gamma=0.05,
    eps=1e-4,
    w_start_OLS=False,
)

In [None]:
## 1 - We search for the optimal threshold that maximizes the accuracy.
## 2 - We check the in-sample performance of our model.
## The predicted probabilities are computed once and reused for both steps.
proba_3 = sigmoid(tx_three @ w_3)
thresh_3 = threshold(y_three, proba_3)
pred = (proba_3 > thresh_3) * 1
## Accuracy = 1 - misclassification rate; np.sum counts the errors without a Python-level loop.
accuracy = 1 - np.sum(np.abs(pred - y_three)) / len(y_three)
accuracy

## Predictions

### Test set processing

In [None]:
## Step 1 - Read the test file. The first value returned by load_csv_data is
##          bound to `_`; the data matrix and the ids get explicit names.
_, tX_test, ids_test = load_csv_data("data/test.csv")

## Step 2 - Split the test matrix according to PRI_jet_num: four sub-matrices
##          followed by the four matching sets of row indexes.
(
    zeros_test,
    ones_test,
    two_test,
    three_test,
    zeros_index_test,
    one_index_test,
    two_index_test,
    three_index_test,
) = PRI_jet_num_split(tX_test)

In [None]:
## Each test subset receives the transformations established on its training subset:
##      - removal of the features that had null variance in the training subset,
##      - imputation of missing values,
##      - data augmentation with the degree retained for that model,
##      - scaling with the training statistics,
##      - change of basis with the training eigenvectors.
## Arguments, in order: test subset, null-variance column indexes, polynomial
## degree, training mean, training std, training change-of-basis output.
zeros_test = process_testdata(
    zeros_test,
    null_var_index_zero,
    13,
    mean_tx_zeros,
    std_tx_zeros,
    tosolve_tx_zeros,
)
ones_test = process_testdata(
    ones_test,
    null_var_index_one,
    17,
    mean_tx_ones,
    std_tx_ones,
    tosolve_tx_ones,
)
two_test = process_testdata(
    two_test,
    null_var_index_two,
    13,
    mean_tx_two,
    std_tx_two,
    tosolve_tx_two,
)
three_test = process_testdata(
    three_test,
    null_var_index_three,
    10,
    mean_tx_three,
    std_tx_three,
    tosolve_tx_three,
)

### Predictions

In [None]:
## 1 - Predicting labels using trained models and optimal thresholds.
## 2 - Transforming back "0" to "-1" (expected to be done inside predict - confirm).
## 3 - Mapping the predictions to their original place in the response vector.
## The output vector is allocated explicitly, one entry per test row, instead of
## reusing the throwaway name `_`: that name is rebound by the degree-search cells
## and is also IPython's last-output variable, so its content depends on which
## cells were run last. A row covered by none of the four index sets stays at 0.
predictions = np.zeros(tX_test.shape[0])
predictions[zeros_index_test] = predict(zeros_test, w_0, thresh_0)
predictions[one_index_test] = predict(ones_test, w_1, thresh_1)
predictions[two_index_test] = predict(two_test, w_2, thresh_2)
predictions[three_index_test] = predict(three_test, w_3, thresh_3)

In [2]:
## Share of "-1" among the predictions equal to -1 or 1.
n_negative = np.count_nonzero(predictions == -1)
n_positive = np.count_nonzero(predictions == 1)
n_negative / (n_negative + n_positive)

0.676399677599879

In [None]:
## Creating the prediction csv file: the test ids and the assembled predictions
## are handed to create_csv_submission with "submission.csv" as the file name.
create_csv_submission(ids_test, predictions, "submission.csv")