In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *
from functions import *
# Labels and event ids are read from the raw training CSV; the feature matrix
# returned with them is replaced right away by the median-imputed file.
y, tX, ids = load_csv_data("data/train.csv")
tX = rearrange_continuous_categorical_features(
    np.genfromtxt("data/X_train_median_imputed.csv", delimiter=",", skip_header=1)
)
# y, tX = remove_outliers(y, tX, 1.96)
# Recode the -1 labels as 0 in place.
y[y == -1] = 0

In [None]:
# Expand the features with build_poly (second argument 10, presumably the polynomial
# degree; pairwise interactions on, trigo/logabs/exp terms off), rescale them with
# gaussian_scaling, then run bootstrap_validation with 70 repeats.
# NOTE(review): the scaling is computed before any train/validation split, so the
# validation rows contribute to the scaling statistics — confirm this is acceptable.
tx = build_poly(tX, 10, pairwise_interaction=True, trigo=False, logabs=False, exp=False)
tx, _, __ = gaussian_scaling(tx)
acc, m, md, std_, seeds = bootstrap_validation(y, tx, repeats=70)

In [None]:
# Display one of the seeds returned by bootstrap_validation.
# NOTE(review): index 27 is hard-coded; presumably this is where the seed used for
# the split that follows comes from — confirm and record why this repeat was picked.
seeds[27]

In [None]:
# Split labels and features into training and validation parts with a fixed seed.
# Assumes the third argument (0.8) is the training ratio — TODO confirm against split_data.
x_train, x_validation, y_train, y_validation = split_data(y, tx, 0.8, seed=491)

In [None]:
# Fit on the training split with logistic_newton_descent, no regularisation
# (lambda_=0), at most 100 iterations and a 1e-6 stopping tolerance.
newton_loss, w, newton_grad_norm = logistic_newton_descent(
    y_train,
    x_train,
    w=np.zeros(x_train.shape[1]),
    lambda_=0,
    max_iters=100,
    eps=1e-6,
    w_start_OLS=True,
)

In [None]:
# Continue from the weights currently held in `w` with logistic_gradient_descent
# (step size gamma=0.05, at most 1000 iterations, no regularisation).
GD_loss, w, GD_grad_norm = logistic_gradient_descent(
    y_train,
    x_train,
    w=w,
    max_iters=1000,
    lambda_=0,
    gamma=0.05,
    eps=1e-4,
    w_start_OLS=False,
)

# Do your crazy machine learning thing here :) ...

## Bootstrap validation mean.

In [None]:
# Bootstrap validation (5 repeats) for build_poly settings 9 to 19; the accuracies
# returned for each setting are collected in `Accuracies`.
Accuracies = list()
for i in range(9,20):
    print("\n")
    print("*****************")
    print(i)
    tx = build_poly(tX, i, pairwise_interaction=True, trigo=False, logabs=False, exp=False)
    tx, _, __ = gaussian_scaling(tx)
    # The earlier call to bootstrap_validation unpacks a fifth value (the seeds), so a
    # plain four-name unpack here raises "too many values to unpack". The starred
    # target absorbs any trailing return values and also works if only four are returned.
    acc, m, md, std_, *_extra = bootstrap_validation(y, tx, repeats=5)
    Accuracies.append(acc)

In [None]:
# Box plot of the stored bootstrap accuracies for the SKN-imputed data.
res = np.genfromtxt("data/Bootstrap_validation_SKN.csv", delimiter=",", skip_header=1)
# The third CSV column is reshaped to 10 rows x 19 columns; boxplot draws one box per column.
# NOTE(review): assumes the CSV rows are ordered so that each column of the reshaped
# array holds the repeats of a single degree — confirm against the code that wrote the file.
accuracies = np.array(res[:,2]).reshape((10,19))
fig, ax = plt.subplots()
ax.boxplot(accuracies)
ax.set_ylim(0.75, 0.86)
ax.set_xlabel('Polynomial degree')
ax.set_ylabel('Accuracy')
ax.set_title('Polynomial from 1 to 19 against accuracies \n Stochastic k-nearest neighbor imputation');

In [None]:
# Box plot of the stored bootstrap accuracies for the mean-imputed data.
res = np.genfromtxt("data/Bootstrap_validation_mean.csv", delimiter=",", skip_header=1)
# The third CSV column is reshaped to 10 rows x 19 columns; boxplot draws one box per column.
# NOTE(review): assumes the CSV rows are ordered so that each column of the reshaped
# array holds the repeats of a single degree — confirm against the code that wrote the file.
accuracies = np.array(res[:,2]).reshape((10,19))
fig, ax = plt.subplots()
ax.boxplot(accuracies)
ax.set_ylim(0.75, 0.86)
ax.set_xlabel('Polynomial degree')
ax.set_ylabel('Accuracy')
ax.set_title('Polynomial from 1 to 19 against accuracies \n Mean imputation');

## Golden search on accuracy

In [None]:
# For each polynomial degree, pick the regularisation strength by golden-section
# search on validation accuracy and record (degree, lambda, accuracy).
Polynomial_degrees = list()
Golden_lambdas = list()
Accuracies = list()

for z in range(1,20):

    # Rebuild the design matrix for degree z and split it 50/50 with a fixed seed.
    # NOTE(review): tX was already read from the median-imputed file and rearranged when
    # it was loaded; confirm that repeating both steps here leaves the columns unchanged.
    tx = median_imputation(tX)
    tx = rearrange_continuous_categorical_features(tx)
    tx = build_poly(tx, z, pairwise_interaction=True, triplewise_interaction=False, triple_dimensions=False)
    tx, mean_, std_ = gaussian_scaling(tx)
    tx, todot = eigen(tx)
    tx = add_bias(tx)
    x_train, x_validation, y_train, y_validation = split_data(y, tx, 0.5, seed=1)

    def accuracy2(w):
        """Return the validation accuracy of weights `w`.

        The cut-off comes from threshold() applied to the training labels and
        training probabilities; it is then applied to the validation probabilities.
        """
        thresh = threshold(y_train, sigmoid(x_train@w))
        pred = sigmoid(x_validation@w)
        pred = (pred>thresh)*1
        return (1 - sum(np.abs(pred - y_validation))/len(y_validation))

    def accuracy(lambda_):
        """Fit on the current training split with penalty `lambda_`.

        Runs logistic_newton_descent, then refines the result with
        logistic_gradient_descent. Returns (validation accuracy, fitted weights).
        """
        newton_loss, w, newton_grad_norm = logistic_newton_descent(y_train,
                                                                   x_train,
                                                                   w=np.zeros(x_train.shape[1]),
                                                                   lambda_=lambda_,
                                                                   max_iters=100,
                                                                   eps=1e-6,
                                                                   w_start_OLS=True)
        GD_loss, w, GD_grad_norm = logistic_gradient_descent(y_train,
                                                             x_train,
                                                             w=w,
                                                             max_iters=500,
                                                             lambda_=lambda_,
                                                             gamma=0.05,
                                                             eps=1e-2,
                                                             w_start_OLS=False)
        return accuracy2(w), w

    def golden_search():
        """Golden-section search for the lambda maximising validation accuracy.

        Returns (lambda, weights), where the weights are fitted at the returned lambda.
        """
        Delta = (3 - np.sqrt(5))/2
        counter = 0
        ######################
        # The lower bound is 0, not negative: a negative bracket end lets the search
        # evaluate and return a negative regularisation strength.
        lambda_min = 0
        lambda_max = 1000
        eps = 10
        ######################
        while abs(lambda_max - lambda_min) > eps:
            L = (lambda_max - lambda_min)
            a = lambda_min + Delta*L
            b = lambda_max - Delta*L
            counter += 1
            fa, _ = accuracy(a)
            print(f"Degree {z} polynome progress : {round((counter/20)*100, 2)}%                 \r", end="")
            counter += 1
            fb, _ = accuracy(b)
            print(f"Degree {z} polynome progress : {round((counter/20)*100, 2)}%                 \r", end="")
            # Maximisation: keep the side of the bracket that holds the better probe.
            if fa < fb:
                lambda_min = a
            else:
                lambda_max = b
        lambda_opt = (lambda_min + lambda_max)/2
        # Refit at the lambda that is actually returned. The weights of the last probe
        # `b` belong to a different lambda, so using them would report an accuracy
        # that does not correspond to the reported lambda.
        _, w = accuracy(lambda_opt)
        return lambda_opt, w

    golden_lambda, w_opt = golden_search()
    AC = accuracy2(w_opt)
    print(f"Degree {z} polynome Golden Lambda = {round(golden_lambda, 5)} --- Validation accuracy : {round(AC*100,2)}%")
    Polynomial_degrees.append(z)
    Golden_lambdas.append(golden_lambda)
    Accuracies.append(AC)

## Golden search on cross-loss

In [None]:
# For each polynomial degree, pick the regularisation strength by golden-section
# search on validation loss and record (degree, lambda, validation accuracy).
Polynomial_degrees2 = list()
Golden_lambdas2 = list()
Accuracies2 = list()

for z in range(1,20):

    # Rebuild the design matrix for degree z and split it 50/50 with a fixed seed.
    # NOTE(review): tX was already read from the median-imputed file and rearranged when
    # it was loaded; confirm that repeating both steps here leaves the columns unchanged.
    tx = median_imputation(tX)
    tx = rearrange_continuous_categorical_features(tx)
    tx = build_poly(tx, z, pairwise_interaction=True, triplewise_interaction=False, triple_dimensions=False)
    tx, mean_, std_ = gaussian_scaling(tx)
    tx, todot = eigen(tx)
    tx = add_bias(tx)
    x_train, x_validation, y_train, y_validation = split_data(y, tx, 0.5, seed=1)

    def cross_loss(lambda_):
        """Fit on the current training split with penalty `lambda_`.

        Runs logistic_newton_descent, then refines the result with
        logistic_gradient_descent. Returns (validation loss, fitted weights).
        """
        newton_loss, w, newton_grad_norm = logistic_newton_descent(y_train,
                                                                   x_train,
                                                                   w=np.zeros(x_train.shape[1]),
                                                                   lambda_=lambda_,
                                                                   max_iters=100,
                                                                   eps=1e-6,
                                                                   w_start_OLS=True)
        GD_loss, w, GD_grad_norm = logistic_gradient_descent(y_train,
                                                             x_train,
                                                             w=w,
                                                             max_iters=500,
                                                             lambda_=lambda_,
                                                             gamma=0.05,
                                                             eps=1e-2,
                                                             w_start_OLS=False)

        # Score every candidate with lambda_=0 so that candidates are compared on the
        # data term only; scoring with the candidate's own lambda_ adds a different
        # penalty to each value and biases the comparison.
        # NOTE(review): assumes logit_loss adds a lambda_-weighted penalty — confirm.
        return logit_loss(y_validation, x_validation, w, lambda_=0), w

    def accuracy2(w):
        """Return the validation accuracy of weights `w`.

        The cut-off comes from threshold() applied to the training labels and
        training probabilities; it is then applied to the validation probabilities.
        """
        thresh = threshold(y_train, sigmoid(x_train@w))
        pred = sigmoid(x_validation@w)
        pred = (pred>thresh)*1
        return (1 - sum(np.abs(pred - y_validation))/len(y_validation))

    def golden_search():
        """Golden-section search for the lambda minimising validation loss.

        Returns (lambda, weights), where the weights are fitted at the returned lambda.
        """
        Delta = (3 - np.sqrt(5))/2
        counter = 0
        ######################
        # The lower bound is 0, not negative: a negative bracket end lets the search
        # evaluate and return a negative regularisation strength.
        lambda_min = 0
        lambda_max = 1000
        eps = 10
        ######################
        while abs(lambda_max - lambda_min) > eps:
            L = (lambda_max - lambda_min)
            a = lambda_min + Delta*L
            b = lambda_max - Delta*L
            counter += 1
            fa, _ = cross_loss(a)
            print(f"Degree {z} polynome progress : {round((counter/20)*100, 2)}%                 \r", end="")
            counter += 1
            fb, _ = cross_loss(b)
            print(f"Degree {z} polynome progress : {round((counter/20)*100, 2)}%                 \r", end="")
            # Minimisation: keep the side of the bracket that holds the lower probe.
            if fa > fb:
                lambda_min = a
            else:
                lambda_max = b
        lambda_opt = (lambda_min + lambda_max)/2
        # Refit at the lambda that is actually returned. The weights of the last probe
        # `b` belong to a different lambda, so using them would report an accuracy
        # that does not correspond to the reported lambda.
        _, w = cross_loss(lambda_opt)
        return lambda_opt, w

    golden_lambda, w_opt = golden_search()
    AC = accuracy2(w_opt)
    print(f"Degree {z} polynome Golden Lambda = {round(golden_lambda, 5)} --- Validation accuracy : {round(AC*100,2)}%")
    Polynomial_degrees2.append(z)
    Golden_lambdas2.append(golden_lambda)
    Accuracies2.append(AC)

## Grid search

In [None]:
# Grid search: for every polynomial degree and every lambda in `grid`, fit with
# logistic_newton_descent and record the validation accuracy. The three lists
# `acc`, `degree` and `lambdas` are filled in parallel (one entry per fit).
grid = np.logspace(-6, -15, 20)
acc = list()
degree = list()
lambdas = list()

for z in range(1,20):

    # Design matrix for degree z.
    # NOTE(review): tX was already read from the median-imputed file and rearranged when
    # it was loaded; confirm that repeating both steps here leaves the columns unchanged.
    tx = median_imputation(tX)
    tx = rearrange_continuous_categorical_features(tx)
    tx = build_poly(tx, z, pairwise_interaction=True, triplewise_interaction=False, triple_dimensions=False)
    tx, mean_, std_ = gaussian_scaling(tx)
    tx = add_bias(tx)
    x_train, x_validation, y_train, y_validation = split_data(y, tx, 0.8, seed=1)

    for j in grid:
        newton_loss, w, newton_grad_norm = logistic_newton_descent(y_train,
                                                                   x_train,
                                                                   w=np.zeros(x_train.shape[1]),
                                                                   lambda_=j,
                                                                   max_iters=100,
                                                                   eps=1e-6,
                                                                   w_start_OLS=True)
        # Cut-off from threshold() on the training split, applied to the validation split.
        thresh = threshold(y_train, sigmoid(x_train@w))
        pred = sigmoid(x_validation@w)
        pred = (pred>thresh)*1
        acc.append((1 - sum(np.abs(pred - y_validation))/len(y_validation)))
        degree.append(z)
        lambdas.append(j)
        # Report the entry just appended: acc[counter+z-1] pointed at an unrelated
        # earlier result for every z > 1. Lambda is printed in scientific notation
        # because round() turns every value of this grid into 0.
        print(f"Degree {z} polynome --- Lambda = {j:.2e} --- Validation accuracy : {round(acc[-1]*100,2)}%")
    

# Ready to publish

## Compute on the whole data set

In [None]:
from proj1_helpers import *
from functions import *
# Start again from the files: labels and ids from the raw CSV, features from the
# median-imputed CSV (rearranged), labels recoded from -1 to 0.
y, _, ids = load_csv_data("data/train.csv")
tX = rearrange_continuous_categorical_features(
    np.genfromtxt("data/X_train_median_imputed.csv", delimiter=",", skip_header=1)
)
y[y == -1] = 0

In [None]:
# Final design matrix: build_poly with second argument 9, pairwise interactions on.
tx = build_poly(tX, 9, pairwise_interaction=True, trigo=False, logabs=False, exp=False)
# NOTE: this rebinds `y` and `tx` to the training part only, so the cell is not
# idempotent — re-run the loading cell before executing it a second time.
tx, x_validation, y, y_validation = split_data(y, tx, 0.8, seed=491)
# Scaling statistics come from the training part only.
tx, mean_, std_ = gaussian_scaling(tx)
# Put the validation features on the training scale, the same way the test matrix is
# scaled before prediction. Without this, the weights learned on scaled features are
# applied to raw validation features and the validation accuracy is meaningless.
x_validation = (x_validation - mean_) / std_

In [None]:
# Stage 1: logistic_newton_descent on the training part, no regularisation.
newton_loss, w, newton_grad_norm = logistic_newton_descent(
    y,
    tx,
    w=np.zeros(tx.shape[1]),
    lambda_=0,
    max_iters=1000,
    eps=1e-10,
    w_start_OLS=True,
)

# Stage 2: refine the weights from stage 1 with logistic_gradient_descent.
GD_loss, w, GD_grad_norm = logistic_gradient_descent(
    y,
    tx,
    w=w,
    max_iters=1500,
    lambda_=0,
    gamma=0.05,
    eps=1e-4,
    w_start_OLS=False,
)

# Training accuracy: cut-off from threshold() on the training probabilities,
# then the share of training rows whose prediction matches the label.
thresh = threshold(y, sigmoid(tx@w))
pred = (sigmoid(tx@w) > thresh)*1
accuracy = 1 - sum(np.abs(pred - y))/len(y)

In [None]:
# Display the most recently computed accuracy value.
accuracy

In [None]:
# Further logistic_gradient_descent iterations (up to 10000), starting from the
# weights currently held in `w`.
GD_loss, w, GD_grad_norm = logistic_gradient_descent(
    y,
    tx,
    w=w,
    max_iters=10000,
    lambda_=0,
    gamma=0.05,
    eps=1e-4,
    w_start_OLS=False,
)

In [None]:
# Validation accuracy of the current weights.
# NOTE(review): the cut-off is chosen by threshold() on the validation labels and is
# then scored on the same validation rows, so this figure is optimistic. It also
# rebinds `thresh`, which is the value later applied to the test predictions.
# NOTE(review): x_validation must be on the same scale as the matrix `w` was trained
# on (standardised with the training mean_/std_) — confirm before trusting this number.
thresh = threshold(y_validation, sigmoid(x_validation@w), step=0.001)
pred = (sigmoid(x_validation@w) > thresh)*1
accuracy = 1 - sum(np.abs(pred - y_validation))/len(y_validation)
accuracy

In [None]:
from proj1_helpers import *
from functions import *
# Test set: only the ids are kept from the raw CSV; the features come from the
# median-imputed file and go through the same rearrange / build_poly steps as the
# training features.
_, __, ids_test = load_csv_data("data/test.csv")
x_test = np.genfromtxt("data/X_test_median_imputed.csv", delimiter=",", skip_header=1)
x_test = rearrange_continuous_categorical_features(x_test)
A = build_poly(x_test, 9, pairwise_interaction=True, trigo=False, logabs=False, exp=False)
# Scale with the mean_/std_ currently in scope (expected to be the training
# statistics returned by gaussian_scaling), not with statistics of the test set.
A = (A - mean_) / std_

In [None]:
# Create the test predictions: probabilities from the fitted weights, turned into
# 0/1 with whatever value `thresh` currently holds.
y_pred = sigmoid(A@w)
y_pred = (y_pred>thresh)*1

In [None]:
# Fraction of test rows predicted as 1 (y_pred holds 0/1 at this point).
sum(y_pred)/len(y_pred)

In [None]:
# Map the 0 predictions to -1 in place, then print the ratio of +1 to -1 predictions.
y_pred[y_pred == 0] = -1
print(np.count_nonzero(y_pred == 1)/np.count_nonzero(y_pred == -1))

In [None]:
# Write the test ids and the -1/+1 predictions to the submission file.
create_csv_submission(ids_test, y_pred, "anthony_submission_stochpol9.csv")

In [None]:
from proj1_helpers import *
from functions import *
# Prepare the training data for the H2O export: labels and ids from the raw CSV,
# features from the median-imputed file.
# NOTE(review): unlike the other loading cells, no rearrange step is applied here — confirm intended.
y, _, ids = load_csv_data("data/train.csv")
tX = np.genfromtxt("data/X_train_median_imputed.csv", delimiter=",", skip_header=1)
# build_poly with second argument 0 and all interaction options off, then gaussian_scaling.
tx = build_poly(tX, 0, pairwise_interaction=False, triplewise_interaction=False, triple_dimensions=False)
tx, _, __ = gaussian_scaling(tx)

In [None]:
import pandas as pd
# Recode the -1 labels as 0, then put the labels in front of the features as the
# first column of one combined array.
y[y == -1] = 0
dt = np.column_stack((y, tx))

In [None]:
# Save labels + features (label in the first column) without the index column.
pd.DataFrame(dt).to_csv("data/h2o_train.csv", index=False)

In [None]:
# Export the test features for H2O. The test arrays get their own names: reusing
# `tX`/`tx` would overwrite the training matrix, and the later `clf.fit(tx, y)` would
# then pair test features with training labels.
tX_test = np.genfromtxt("data/X_test_median_imputed.csv", delimiter=",", skip_header=1)
tx_test = build_poly(tX_test, 0, pairwise_interaction=False, triplewise_interaction=False, triple_dimensions=False)
# NOTE(review): this scales the test features with their own statistics, whereas the
# submission pipeline scales the test matrix with the training mean/std — confirm
# which one the H2O experiment should use.
tx_test, _, __ = gaussian_scaling(tx_test)
pd.DataFrame(tx_test).to_csv("data/h2o_test.csv", index=False)

In [None]:
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron
# Perceptron baseline fitted on `tx` / `y`; `tx` must be the training feature matrix
# whose rows match `y`. The stray `Perceptron()` expression (a no-op that built and
# discarded an unused estimator) has been removed.
clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(tx, y)
# Mean accuracy on the same data the model was fitted on (not a held-out estimate).
clf.score(tx, y)