In [None]:
from proj1_helpers import *
from functions import *
import pandas as pd
import numpy as np
# Load labels, feature matrix and ids from the training CSV.
y, tX, ids = load_csv_data("data/train.csv")
# Recode every -1 label as 0.
y[y == -1] = 0
# Project helper; the cells below read the category from the last column of
# its result (`tX[:, -1]`).
tX = rearrange_continuous_categorical_features(tX)

## Data preparation

In [None]:
def _drop_constant_and_impute(subset):
    """Clean the feature rows belonging to one category.

    Columns whose standard deviation over `subset` is exactly zero are
    removed, every remaining -999 entry is replaced by NaN, and the result is
    passed through `median_imputation`.

    Returns the cleaned matrix and the indices of the removed columns.
    """
    constant_cols = np.where(np.std(subset, axis=0) == 0)[0]
    cleaned = np.delete(subset, constant_cols, axis=1)
    cleaned[cleaned == -999] = np.nan
    return median_imputation(cleaned), constant_cols


# The last column of tX holds the category; values 0, 1, 2 and 3 are handled.
categories = tX[:, -1]
zeros_index, one_index, two_index, three_index = (
    np.where(categories == value)[0] for value in range(4)
)

# `null_var_index` is overwritten for every category, so after this cell it
# refers to category 3 only.
zeros, null_var_index = _drop_constant_and_impute(tX[zeros_index, :])
y_zero = y[zeros_index]

ones, null_var_index = _drop_constant_and_impute(tX[one_index, :])
y_one = y[one_index]

two, null_var_index = _drop_constant_and_impute(tX[two_index, :])
y_two = y[two_index]

three, null_var_index = _drop_constant_and_impute(tX[three_index, :])
y_three = y[three_index]

## Training

### Case 0

In [None]:
# Cross-validation statistics for category 0, one entry per polynomial degree.
means_0, medians_0, stds_0 = [], [], []
for degree in range(1, 21):
    print("\n", "*****************", "\n", sep="\n")
    print(f"Polynomial of degree = {degree}")
    # Expand (polynomial + pairwise terms), scale, orthogonalise, and only
    # then append the bias column.
    tx = process_data(x=zeros, degree=degree, pairwise=True, bias=False)
    tx, _mean, _std = gaussian_scaling(tx)
    tx, tosolve_tx = orthogonal_basis(tx)
    tx = process_data(x=tx, degree=0, pairwise=False, bias=True)
    # 5-fold cross-validation; the 2nd to 4th return values are recorded as
    # mean / median / std.
    acc, cv_mean, cv_median, cv_std = cross_validation(y_zero, tx, k_fold=5)
    means_0.append(cv_mean)
    medians_0.append(cv_median)
    stds_0.append(cv_std)
    print("\n")

In [None]:
def golden_search(y, tx, lambda_min=0, lambda_max=0.2, eps=0.001, k_fold=3):
    """Golden-section search for the `lambda_` with the highest CV score.

    The interval [lambda_min, lambda_max] is shrunk until it is narrower than
    `eps`. At every step the second value returned by `cross_validation` is
    compared at the two interior golden-section points, and the side holding
    the lower score is discarded (so the score is assumed to be unimodal on
    the interval).

    Parameters
    ----------
    y, tx : labels and design matrix forwarded to `cross_validation`.
    lambda_min, lambda_max : bounds of the initial search interval.
    eps : interval width at which the search stops.
    k_fold : number of folds forwarded to `cross_validation`.

    Returns
    -------
    The midpoint of the final interval.
    """
    delta = (3 - np.sqrt(5)) / 2
    while abs(lambda_max - lambda_min) > eps:
        width = lambda_max - lambda_min
        a = lambda_min + delta * width
        b = lambda_max - delta * width
        print("\n")
        print("first bound : ")
        _acc, fa, _md, _std = cross_validation(y, tx, k_fold=k_fold, lambda_=a)
        print("\n")
        print("Second bound : ")
        _acc, fb, _md, _std = cross_validation(y, tx, k_fold=k_fold, lambda_=b)
        # Keep the sub-interval on the side of the better score.
        if fa < fb:
            lambda_min = a
        else:
            lambda_max = b
        print("\n")
        print(f"Current lambda : {(lambda_min + lambda_max)/2}                 \r", end="")
        print("\n")
        print(f"Uncertainty : {np.abs(lambda_min - lambda_max)}                 \r", end="")
        print("\n")
    return (lambda_min + lambda_max) / 2


# Results of the regularised search for degrees 20 to 24 (3-fold CV).
# The statistics use *_golden names so that the degree 1-20 lists collected
# above (means_0, medians_0, stds_0), which the next cell writes to CSV, are
# not overwritten by this cell.
accuracies_0 = list()
means_0_golden = list()
medians_0_golden = list()
stds_0_golden = list()
lambdas_0 = list()

for degree in range(20, 25):
    print("*****************")
    print("\n")
    print(f"Polynomial of degree = {degree}")
    tx = process_data(x=zeros, degree=degree, pairwise=True, bias=False)
    tx, _mean, _std = gaussian_scaling(tx)
    tx, tosolve_tx = orthogonal_basis(tx)
    tx = process_data(x=tx, degree=0, pairwise=False, bias=True)
    print("\n")

    golden_lambda = golden_search(y_zero, tx)
    # A lambda below the search resolution is treated as "no regularisation".
    if golden_lambda < 0.001:
        golden_lambda = 0
    print("\n")
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    print("\n")
    print(f"For polynomial of degree = {degree} with golden lambda = {golden_lambda}, we have : ")
    acc, m, md, std_ = cross_validation(
        y_zero, tx, k_fold=3, lambda_=golden_lambda)
    print("\n")
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    print("\n")
    accuracies_0.append(acc)
    means_0_golden.append(m)
    medians_0_golden.append(md)
    stds_0_golden.append(std_)
    lambdas_0.append(golden_lambda)

In [None]:
# One row per polynomial degree; columns are mean, median and std.
res_0 = np.c_[means_0, medians_0, stds_0]
res_0_path = "data/case_0_CV5_statistics_median_null_var_pairwise_bias_scaling_orth_pol_1_to_20_1e4.csv"
res_0_frame = pd.DataFrame(res_0)
res_0_frame.to_csv(res_0_path, index=False)

### Case 1

In [None]:
# Cross-validation statistics for category 1, one entry per polynomial degree.
means_1, medians_1, stds_1 = [], [], []
for degree in range(1, 21):
    print("\n", "*****************", "\n", sep="\n")
    print(degree)
    # Expand (polynomial + pairwise terms), scale, orthogonalise, and only
    # then append the bias column.
    tx = process_data(x=ones, degree=degree, pairwise=True, bias=False)
    tx, _mean, _std = gaussian_scaling(tx)
    tx, tosolve_tx = orthogonal_basis(tx)
    tx = process_data(x=tx, degree=0, pairwise=False, bias=True)
    # 5-fold cross-validation; the 2nd to 4th return values are recorded as
    # mean / median / std.
    acc, cv_mean, cv_median, cv_std = cross_validation(y_one, tx, k_fold=5)
    means_1.append(cv_mean)
    medians_1.append(cv_median)
    stds_1.append(cv_std)

In [None]:
# One row per polynomial degree; columns are mean, median and std.
res_1 = np.c_[means_1, medians_1, stds_1]
res_1_path = "data/case_1_CV5_statistics_median_null_var_pairwise_bias_scaling_orth_pol_1_to_20_1e4.csv"
res_1_frame = pd.DataFrame(res_1)
res_1_frame.to_csv(res_1_path, index=False)

### Case 2

In [None]:
# Cross-validation statistics for category 2, one entry per polynomial degree.
means_2, medians_2, stds_2 = [], [], []
for degree in range(1, 21):
    print("\n", "*****************", "\n", sep="\n")
    print(degree)
    # Expand (polynomial + pairwise terms), scale, orthogonalise, and only
    # then append the bias column.
    tx = process_data(x=two, degree=degree, pairwise=True, bias=False)
    tx, _mean, _std = gaussian_scaling(tx)
    tx, tosolve_tx = orthogonal_basis(tx)
    tx = process_data(x=tx, degree=0, pairwise=False, bias=True)
    # 5-fold cross-validation; the 2nd to 4th return values are recorded as
    # mean / median / std.
    acc, cv_mean, cv_median, cv_std = cross_validation(y_two, tx, k_fold=5)
    means_2.append(cv_mean)
    medians_2.append(cv_median)
    stds_2.append(cv_std)

In [None]:
# One row per polynomial degree; columns are mean, median and std.
res_2 = np.c_[means_2, medians_2, stds_2]
res_2_path = "data/case_2_CV5_statistics_median_null_var_pairwise_bias_scaling_orth_pol_1_to_20_1e4.csv"
res_2_frame = pd.DataFrame(res_2)
res_2_frame.to_csv(res_2_path, index=False)

### Case 3

In [None]:
# Cross-validation statistics for category 3, one entry per polynomial degree.
means_3, medians_3, stds_3 = [], [], []
for degree in range(1, 21):
    print("\n", "*****************", "\n", sep="\n")
    print(degree)
    # Expand (polynomial + pairwise terms), scale, orthogonalise, and only
    # then append the bias column.
    tx = process_data(x=three, degree=degree, pairwise=True, bias=False)
    tx, _mean, _std = gaussian_scaling(tx)
    tx, tosolve_tx = orthogonal_basis(tx)
    tx = process_data(x=tx, degree=0, pairwise=False, bias=True)
    # 5-fold cross-validation; the 2nd to 4th return values are recorded as
    # mean / median / std.
    acc, cv_mean, cv_median, cv_std = cross_validation(y_three, tx, k_fold=5)
    means_3.append(cv_mean)
    medians_3.append(cv_median)
    stds_3.append(cv_std)

In [None]:
# One row per polynomial degree; columns are mean, median and std.
res_3 = np.c_[means_3, medians_3, stds_3]
res_3_path = "data/case_3_CV5_statistics_median_null_var_pairwise_bias_scaling_orth_pol_1_to_20_1e4.csv"
res_3_frame = pd.DataFrame(res_3)
res_3_frame.to_csv(res_3_path, index=False)

## Reloading the dataset properly

In [2]:
from proj1_helpers import *
from functions import *
def _prepare_category(features, labels, rows):
    """Extract and clean the samples of a single category.

    The rows `rows` are taken from `features`; columns whose standard
    deviation over those rows is exactly zero are dropped, every remaining
    -999 entry is replaced by NaN, and the result goes through
    `median_imputation`.

    Returns (cleaned features, labels[rows], indices of the dropped columns).
    """
    subset = features[rows, :]
    constant_cols = np.where(np.std(subset, axis=0) == 0)[0]
    subset = np.delete(subset, constant_cols, axis=1)
    subset[subset == -999] = np.nan
    return median_imputation(subset), labels[rows], constant_cols


y, tX, ids = load_csv_data("data/train.csv")
# Recode every -1 label as 0.
y[y == -1] = 0
tX = rearrange_continuous_categorical_features(tX)

# The last column of tX holds the category; values 0, 1, 2 and 3 are handled.
categories = tX[:, -1]
zeros_index, one_index, two_index, three_index = (
    np.where(categories == value)[0] for value in range(4)
)

# The dropped-column indices are kept per category because the test set is
# later reduced with exactly the same indices.
zeros, y_zero, null_var_index_zero = _prepare_category(tX, y, zeros_index)
ones, y_one, null_var_index_one = _prepare_category(tX, y, one_index)
two, y_two, null_var_index_two = _prepare_category(tX, y, two_index)
three, y_three, null_var_index_three = _prepare_category(tX, y, three_index)

# Outlier removal is disabled. When it was tried, it was applied per category
# right after the imputation, e.g.
#   y_zero, zeros = remove_outliers(y_zero, zeros, quantile=0.5)

## w0

In [3]:
# Training design matrix for category 0.
# Degree-13 expansion with pairwise terms (degree presumably picked from the
# cross-validation runs above -- TODO confirm), no bias column yet.
tx_zeros = process_data(x = zeros, degree=13, pairwise=True, bias=False)
# Keep the scaling statistics: the test set is scaled with them later.
tx_zeros, mean_tx_zeros, std_tx_zeros = gaussian_scaling(tx_zeros)
# Keep `tosolve_tx_zeros`: the test set is mapped into the same basis by
# solving a linear system with it.
tx_zeros, tosolve_tx_zeros = orthogonal_basis(tx_zeros)
# Append the bias column last, after scaling and orthogonalisation.
tx_zeros = process_data(x = tx_zeros, degree=0, pairwise=False, bias=True)


Polynomial augmentation progress : 100.0%                 
Pairwise interaction progress : 100.0%                 
Bias : ✔                                  

In [4]:
# Stage 1: Newton descent from an all-zero weight vector passed as `w`
# (`w_start_OLS=True` presumably replaces it by an OLS start -- TODO confirm).
loss_0, w_0, grad_norm_0 = logistic_newton_descent(
    y_zero,
    tx_zeros,
    w=np.zeros(tx_zeros.shape[1]),
    lambda_=0,
    max_iters=1000,
    eps=1e-10,
    w_start_OLS=True,
)
# Stage 2: gradient descent warm-started from the Newton weights.
loss_0, w_0, grad_norm_0 = logistic_gradient_descent(
    y_zero,
    tx_zeros,
    w=w_0,
    max_iters=30000,
    lambda_=0,
    gamma=0.05,
    eps=1e-4,
    w_start_OLS=False,
)

Gradient norm = 0.0013761                 

  return 1.0 / (1 + np.exp(-x))
  return 1.0 / (1 + np.exp(-x))


Gradient norm = 9.96e-05                  

In [5]:
# Choose the decision threshold from the training probabilities, then report
# the accuracy of the thresholded predictions on the same training data.
thresh_0 = threshold(y_zero, sigmoid(tx_zeros @ w_0))
pred = np.where(sigmoid(tx_zeros @ w_0) > thresh_0, 1, 0)
n_errors = np.abs(pred - y_zero).sum()
accuracy = 1 - n_errors / len(y_zero)
accuracy

0.8512305705964189

## w1

In [6]:
# Training design matrix for category 1.
# Degree-17 expansion with pairwise terms (degree presumably picked from the
# cross-validation runs above -- TODO confirm), no bias column yet.
tx_ones = process_data(x = ones, degree=17, pairwise=True, bias=False)
# Keep the scaling statistics: the test set is scaled with them later.
tx_ones, mean_tx_ones, std_tx_ones = gaussian_scaling(tx_ones)
# Keep `tosolve_tx_ones`: the test set is mapped into the same basis by
# solving a linear system with it.
tx_ones, tosolve_tx_ones = orthogonal_basis(tx_ones)
# Append the bias column last, after scaling and orthogonalisation.
tx_ones = process_data(x = tx_ones, degree=0, pairwise=False, bias=True)


Polynomial augmentation progress : 100.0%                 
Pairwise interaction progress : 100.0%                 
Bias : ✔                                  

In [7]:
# Stage 1: Newton descent from an all-zero weight vector passed as `w`
# (`w_start_OLS=True` presumably replaces it by an OLS start -- TODO confirm).
loss_1, w_1, grad_norm_1 = logistic_newton_descent(
    y_one,
    tx_ones,
    w=np.zeros(tx_ones.shape[1]),
    lambda_=0,
    max_iters=1000,
    eps=1e-10,
    w_start_OLS=True,
)
# Stage 2: gradient descent warm-started from the Newton weights.
# Note that eps is 2e-4 here, whereas the other categories use 1e-4.
loss_1, w_1, grad_norm_1 = logistic_gradient_descent(
    y_one,
    tx_ones,
    w=w_1,
    max_iters=30000,
    lambda_=0,
    gamma=0.05,
    eps=2e-4,
    w_start_OLS=False,
)

Gradient norm = 0.0001999                 

In [8]:
# Choose the decision threshold from the training probabilities, then report
# the accuracy of the thresholded predictions on the same training data.
thresh_1 = threshold(y_one, sigmoid(tx_ones @ w_1))
pred = np.where(sigmoid(tx_ones @ w_1) > thresh_1, 1, 0)
n_errors = np.abs(pred - y_one).sum()
accuracy = 1 - n_errors / len(y_one)
accuracy

0.8190962550294026

## w2

In [9]:
# Training design matrix for category 2.
# Degree-13 expansion with pairwise terms (degree presumably picked from the
# cross-validation runs above -- TODO confirm), no bias column yet.
tx_two = process_data(x = two, degree=13, pairwise=True, bias=False)
# Keep the scaling statistics: the test set is scaled with them later.
tx_two, mean_tx_two, std_tx_two = gaussian_scaling(tx_two)
# Keep `tosolve_tx_two`: the test set is mapped into the same basis by
# solving a linear system with it.
tx_two, tosolve_tx_two = orthogonal_basis(tx_two)
# Append the bias column last, after scaling and orthogonalisation.
tx_two = process_data(x = tx_two, degree=0, pairwise=False, bias=True)


Polynomial augmentation progress : 100.0%                 
Pairwise interaction progress : 100.0%                 
Bias : ✔                                  

In [10]:
# Stage 1: Newton descent from an all-zero weight vector passed as `w`
# (`w_start_OLS=True` presumably replaces it by an OLS start -- TODO confirm).
loss_2, w_2, grad_norm_2 = logistic_newton_descent(
    y_two,
    tx_two,
    w=np.zeros(tx_two.shape[1]),
    lambda_=0,
    max_iters=1000,
    eps=1e-10,
    w_start_OLS=True,
)
# Stage 2: gradient descent warm-started from the Newton weights.
loss_2, w_2, grad_norm_2 = logistic_gradient_descent(
    y_two,
    tx_two,
    w=w_2,
    max_iters=30000,
    lambda_=0,
    gamma=0.05,
    eps=1e-4,
    w_start_OLS=False,
)

Gradient norm = 0.0001                    

In [11]:
# Choose the decision threshold from the training probabilities, then report
# the accuracy of the thresholded predictions on the same training data.
thresh_2 = threshold(y_two, sigmoid(tx_two @ w_2))
pred = np.where(sigmoid(tx_two @ w_2) > thresh_2, 1, 0)
n_errors = np.abs(pred - y_two).sum()
accuracy = 1 - n_errors / len(y_two)
accuracy

0.8543639214752179

## w3

In [12]:
# Training design matrix for category 3.
# Degree-10 expansion with pairwise terms (degree presumably picked from the
# cross-validation runs above -- TODO confirm), no bias column yet.
tx_three = process_data(x = three, degree=10, pairwise=True, bias=False)
# Keep the scaling statistics: the test set is scaled with them later.
tx_three, mean_tx_three, std_tx_three = gaussian_scaling(tx_three)
# Keep `tosolve_tx_three`: the test set is mapped into the same basis by
# solving a linear system with it.
tx_three, tosolve_tx_three = orthogonal_basis(tx_three)
# Append the bias column last, after scaling and orthogonalisation.
tx_three = process_data(x = tx_three, degree=0, pairwise=False, bias=True)


Polynomial augmentation progress : 100.0%                 
Pairwise interaction progress : 100.0%                 
Bias : ✔                                  

In [13]:
# Stage 1: Newton descent from an all-zero weight vector passed as `w`
# (`w_start_OLS=True` presumably replaces it by an OLS start -- TODO confirm).
loss_3, w_3, grad_norm_3 = logistic_newton_descent(
    y_three,
    tx_three,
    w=np.zeros(tx_three.shape[1]),
    lambda_=0,
    max_iters=1000,
    eps=1e-10,
    w_start_OLS=True,
)
# Stage 2: gradient descent warm-started from the Newton weights.
loss_3, w_3, grad_norm_3 = logistic_gradient_descent(
    y_three,
    tx_three,
    w=w_3,
    max_iters=30000,
    lambda_=0,
    gamma=0.05,
    eps=1e-4,
    w_start_OLS=False,
)

Gradient norm = 0.0157439                 

  (1 - y).T.dot(np.log(1 - proba))) + 2*lambda_*np.linalg.norm(w)


Gradient norm = 0.0001                    

In [14]:
# Choose the decision threshold from the training probabilities, then report
# the accuracy of the thresholded predictions on the same training data.
thresh_3 = threshold(y_three, sigmoid(tx_three @ w_3))
pred = np.where(sigmoid(tx_three @ w_3) > thresh_3, 1, 0)
n_errors = np.abs(pred - y_three).sum()
accuracy = 1 - n_errors / len(y_three)
accuracy

0.8558924381880527

### Export w and t*

In [15]:
import pandas as pd

In [21]:
# Everything needed to reproduce the category-0 predictions, keyed by the CSV
# file it is written to. The threshold is stored as a 1x2 row [thresh_0, 1].
exports_0 = {
    "data/run/w_0.csv": w_0,
    "data/run/thresh_0.csv": np.c_[thresh_0, 1],
    "data/run/null_var_index_zero.csv": null_var_index_zero,
    "data/run/mean_tx_zeros.csv": mean_tx_zeros,
    "data/run/std_tx_zeros.csv": std_tx_zeros,
    "data/run/tosolve_tx_zeros.csv": tosolve_tx_zeros,
}
for csv_path, payload in exports_0.items():
    pd.DataFrame(payload).to_csv(csv_path, index=False)

In [22]:
# Everything needed to reproduce the category-1 predictions, keyed by the CSV
# file it is written to. The threshold is stored as a 1x2 row [thresh_1, 1].
exports_1 = {
    "data/run/w_1.csv": w_1,
    "data/run/thresh_1.csv": np.c_[thresh_1, 1],
    "data/run/null_var_index_one.csv": null_var_index_one,
    "data/run/mean_tx_ones.csv": mean_tx_ones,
    "data/run/std_tx_ones.csv": std_tx_ones,
    "data/run/tosolve_tx_ones.csv": tosolve_tx_ones,
}
for csv_path, payload in exports_1.items():
    pd.DataFrame(payload).to_csv(csv_path, index=False)

In [23]:
# Everything needed to reproduce the category-2 predictions, keyed by the CSV
# file it is written to. The threshold is stored as a 1x2 row [thresh_2, 1].
exports_2 = {
    "data/run/w_2.csv": w_2,
    "data/run/thresh_2.csv": np.c_[thresh_2, 1],
    "data/run/null_var_index_two.csv": null_var_index_two,
    "data/run/mean_tx_two.csv": mean_tx_two,
    "data/run/std_tx_two.csv": std_tx_two,
    "data/run/tosolve_tx_two.csv": tosolve_tx_two,
}
for csv_path, payload in exports_2.items():
    pd.DataFrame(payload).to_csv(csv_path, index=False)

In [24]:
# Everything needed to reproduce the category-3 predictions, keyed by the CSV
# file it is written to. The threshold is stored as a 1x2 row [thresh_3, 1].
exports_3 = {
    "data/run/w_3.csv": w_3,
    "data/run/thresh_3.csv": np.c_[thresh_3, 1],
    "data/run/null_var_index_three.csv": null_var_index_three,
    "data/run/mean_tx_three.csv": mean_tx_three,
    "data/run/std_tx_three.csv": std_tx_three,
    "data/run/tosolve_tx_three.csv": tosolve_tx_three,
}
for csv_path, payload in exports_3.items():
    pd.DataFrame(payload).to_csv(csv_path, index=False)

## Loading test set

In [25]:
# Load the test set. The first return value of load_csv_data (the slot bound
# to `y` for the training file) is bound to `_` here; keep that name stable,
# since cells further down may read `_`.
_, tX_test, ids_test = load_csv_data("data/test.csv")
# Same column rearrangement as applied to the training features.
tX_test = rearrange_continuous_categorical_features(tX_test)

In [26]:
# Row indices of the test samples for each category value (last column).
categories_test = tX_test[:, -1]
zeros_index_test, one_index_test, two_index_test, three_index_test = (
    np.where(categories_test == value)[0] for value in range(4)
)

In [27]:
def _build_test_design(rows, null_var_index, degree, train_mean, train_std, train_basis):
    """Apply one category's training-time transformation to its test rows.

    Steps, in order: select `rows` of `tX_test`, drop the columns listed in
    `null_var_index`, replace -999 by NaN and run `median_imputation`, expand
    to polynomial degree `degree` with pairwise terms, scale with the
    training mean / std, change basis by solving the linear system defined by
    `train_basis`, and finally append the bias column.

    NOTE(review): `median_imputation` is called on the test subset itself; if
    it derives the medians from its argument, the test features are imputed
    with test-set medians rather than the training ones -- confirm.
    """
    subset = np.delete(tX_test[rows, :], null_var_index, axis=1)
    subset[subset == -999] = np.nan
    subset = median_imputation(subset)
    subset = process_data(x=subset, degree=degree, pairwise=True, bias=False)
    subset = (subset - train_mean) / train_std
    subset = np.linalg.solve(train_basis, subset.T).T
    return process_data(x=subset, degree=0, pairwise=False, bias=True)


# The degrees must match the ones used to build the training design matrices
# (13, 17, 13 and 10 for categories 0 to 3).
zeros_test = _build_test_design(
    zeros_index_test, null_var_index_zero, 13,
    mean_tx_zeros, std_tx_zeros, tosolve_tx_zeros)

ones_test = _build_test_design(
    one_index_test, null_var_index_one, 17,
    mean_tx_ones, std_tx_ones, tosolve_tx_ones)

two_test = _build_test_design(
    two_index_test, null_var_index_two, 13,
    mean_tx_two, std_tx_two, tosolve_tx_two)

three_test = _build_test_design(
    three_index_test, null_var_index_three, 10,
    mean_tx_three, std_tx_three, tosolve_tx_three)


Polynomial augmentation progress : 100.0%                 
Pairwise interaction progress : 100.0%                 
Bias : ✔                                  
Polynomial augmentation progress : 100.0%                 
Pairwise interaction progress : 100.0%                 
Bias : ✔                                  
Polynomial augmentation progress : 100.0%                 
Pairwise interaction progress : 100.0%                 
Bias : ✔                                  
Polynomial augmentation progress : 100.0%                 
Pairwise interaction progress : 100.0%                 
Bias : ✔                                  

### Predictions

#### w0

In [28]:
# Label +1 where the predicted probability exceeds the category-0 threshold,
# -1 everywhere else.
y_pred_zero = np.where(sigmoid(zeros_test @ w_0) > thresh_0, 1, -1)

  return 1.0 / (1 + np.exp(-x))
  return 1.0 / (1 + np.exp(-x))


#### w1

In [29]:
# Label +1 where the predicted probability exceeds the category-1 threshold,
# -1 everywhere else.
y_pred_one = np.where(sigmoid(ones_test @ w_1) > thresh_1, 1, -1)

#### w2

In [30]:
# Label +1 where the predicted probability exceeds the category-2 threshold,
# -1 everywhere else.
y_pred_two = np.where(sigmoid(two_test @ w_2) > thresh_2, 1, -1)

#### w3

In [31]:
# Label +1 where the predicted probability exceeds the category-3 threshold,
# -1 everywhere else.
y_pred_three = np.where(sigmoid(three_test @ w_3) > thresh_3, 1, -1)

### Mapping predictions back to their original positions

In [32]:
# Assemble the per-category predictions into one vector aligned with ids_test.
# A dedicated array is allocated rather than reusing `_`: in IPython that name
# normally tracks the last cell output, so its content depends on which cells
# ran before, and writing into it would also overwrite the loaded test labels.
predictions = np.zeros(len(ids_test), dtype=int)
predictions[zeros_index_test] = y_pred_zero
predictions[one_index_test] = y_pred_one
predictions[two_index_test] = y_pred_two
predictions[three_index_test] = y_pred_three

# Every row must have received +1 or -1; a leftover 0 means that its category
# value was not one of 0, 1, 2, 3.
assert np.all(predictions != 0), "some test rows were not assigned a prediction"

In [33]:
# Sanity check: there is exactly one prediction per test id.
len(predictions) == len(ids_test)

True

In [34]:
# Share of -1 predictions among all +1 / -1 predictions.
n_negative = len(predictions[predictions == -1])
n_positive = len(predictions[predictions == 1])
n_negative / (n_negative + n_positive)

0.6849084362538232

In [80]:
# Write the submission file from the test ids and the assembled predictions.
# NOTE(review): the file name mentions "mean" and "outliers_0.5", whereas the
# visible pipeline uses median imputation and has the outlier removal
# commented out -- confirm which configuration produced this file.
create_csv_submission(ids_test, predictions, "anthony_submission_0123_complex_1e5_2_mean_outliers_0.5.csv")

In [None]:
# Debug inspection: display the transformed test design matrix of category 0.
zeros_test

In [None]:
# Debug inspection: display the training design matrix of category 0.
tx_zeros