In [1]:
import numpy as np

from preprocessing import PRI_jet_num_split
from preprocessing import standardize, minmax_normalize
from preprocessing import clean_nan
from preprocessing import map_0_1, map_minus_1_1

from cross_validation import get_model, calculate_loss, accuracy, total_cross_validation

from implementations import build_poly

from helpers import load_csv_data
from helpers import predict_labels, create_csv_submission

from cross_validation import gamma_lambda_selection_cv
from cross_validation import plotting_graphs

In [2]:
def sort_arr(ids, y_pred):
    """Reorder ``ids`` and ``y_pred`` together by ascending id.

    The permutation comes from ``ids.argsort()`` and is applied to both
    arrays, so each prediction stays paired with its id.

    Returns:
        A tuple ``(ids, y_pred)`` holding the reordered arrays.
    """
    order = ids.argsort()
    sorted_ids = ids[order]
    sorted_pred = y_pred[order]
    return sorted_ids, sorted_pred

In [3]:
# Locations of the input datasets and of the generated submission file.
# All three live in the same directory, so it is named once here.
DATA_DIR = "data"

train_fname = f"{DATA_DIR}/train.csv"
test_fname = f"{DATA_DIR}/test.csv"
submission_fname = f"{DATA_DIR}/submission.csv"

# Backward-compatible alias: the original (misspelled) name is still used by
# the cell that writes the submission, so it must keep resolving.
sumbission_fname = submission_fname

In [4]:
# Read both CSV files; each call is unpacked as (y, X, ids).
y_train, X_train, ids_train = load_csv_data(train_fname)
y_test, X_test, ids_test = load_csv_data(test_fname)

# Report the shapes in the order X, y, ids: train on the first line, test on the second.
print("Shapes")
print(*(arr.shape for arr in (X_train, y_train, ids_train)))
print(*(arr.shape for arr in (X_test, y_test, ids_test)))

Shapes
(250000, 30) (250000,) (250000,)
(568238, 30) (568238,) (568238,)


In [5]:
# Flag handed to PRI_jet_num_split as its last argument (alternative value: True).
combine_vals = False

# Split the train and the test data with the same flag.
train_subsets = PRI_jet_num_split(
    y_train, X_train, ids_train, combine_vals
)
test_subsets = PRI_jet_num_split(
    y_test, X_test, ids_test, combine_vals
)

print(f"Number of train subsets: { len(train_subsets) }")
print(f"Number of test subsets:  { len(test_subsets) }")

# Train and test must have been split into equally many subsets.
assert len(test_subsets) == len(train_subsets)
num_subsets = len(test_subsets)

Number of train subsets: 8
Number of test subsets:  8


In [6]:
# Per-subset pipeline:
#   standardize -> polynomial expansion -> feature selection from the weights of
#   an L1-regularised fit -> cross-validated (gamma, lambda) search -> final fit
#   -> test predictions.
# The test predictions of all subsets are merged, sorted by id and written to
# the submission file.

# --- Per-subset hyper-parameters (entry i is used for subset i) --------------
max_degree = [3, 3, 3, 2, 3, 3, 3, 2]                 # degree passed to build_poly
fs_perc = [0.8, 0.9, 0.9, 0.9, 0.8, 0.8, 0.8, 0.65]   # fraction of expanded features to keep
# Candidate step sizes / regularisation factors searched by cross-validation.
gammas = [[2e-6], [5e-6], [5e-6], [2e-5], [5e-6], [1e-4], [5e-5, 1e-5, 5e-6], [2e-4]]
lambdas = [[1], [1], [1e-1], [0], [1], [1e-1], [1e-1, 1, 5], [0]]
# Not referenced anywhere in this cell; kept because other cells may read them.
# NOTE(review): lambdas_opt[4] (1e-2) is not among the candidates in
# lambdas[4] ([1]) - confirm which value is intended.
gammas_opt = [2e-6, 5e-6, 5e-6, 2e-5, 5e-6, 1e-4, 5e-5, 2e-4]
lambdas_opt = [1, 1, 1e-1, 0, 1e-2, 1e-1, 1, 0]

# Fail before any training starts if a configuration list is too short for the
# number of subsets (e.g. after changing how the data is split).
assert all(len(cfg) >= num_subsets for cfg in (max_degree, fs_perc, gammas, lambdas)), \
    "every per-subset configuration list needs at least num_subsets entries"

# --- Settings shared by all subsets ------------------------------------------
# IMPORTANT: if the regularisation factor (lambda) is increased, the step size
# (gamma) must be decreased, otherwise the step becomes too big and the model
# diverges.

# Feature selection with L1 regularisation.
max_iters_fs = 300
gamma_fs, lambda_fs = 1e-7, 1e2
model_fs = 'LOG_REG_L1'

# Cross-validation and final model.
k_fold = 5
max_iters = 1500
seed, batch_size = 17, 1
metric, model = 'CA', 'MSE_OPT_REG'

# --- Accumulators ------------------------------------------------------------
exp_measure_tr, exp_measure_te = 0, 0
# Per-subset results are collected in lists and concatenated once after the
# loop, so the ids keep the dtype they were loaded with instead of being
# upcast to float by concatenation onto an empty float array.
ids_parts, y_pred_parts = [], []

np.random.seed(6)

for i in range(num_subsets):
    y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
    y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

    y_train_subset = map_0_1(y_train_subset)
    X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
    print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")
    X_train_subset = build_poly(X_train_subset, max_degree[i])
    X_test_subset = build_poly(X_test_subset, max_degree[i])
    print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")

    # Set n_best_features to X_train_subset.shape[1] to disable feature selection.
    n_best_features = round(fs_perc[i] * X_train_subset.shape[1])
    D = n_best_features
    N, _ = X_train_subset.shape

    # Baseline: accuracy obtained by always predicting the majority class of
    # the training subset (labels are 0/1 after map_0_1, so the sum counts ones).
    CA_one = y_train_subset.sum() / N
    CA_zero = 1 - CA_one
    CA_baseline = max(CA_zero, CA_one)

    # Feature selection: fit the L1 model on all expanded features and keep the
    # n_best_features columns whose weights have the largest magnitude.
    initial_w_fs = np.random.randn(X_train_subset.shape[1])
    # get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
    w_fs = get_model(model_fs, y_train_subset, X_train_subset, initial_w_fs,
                     max_iters_fs, gamma_fs, lambda_fs, 1)
    features = np.argsort(np.abs(w_fs))[::-1][:n_best_features]
    print(w_fs.min(), w_fs.max(), w_fs.mean())

    X_train_subset = X_train_subset[:, features]
    X_test_subset = X_test_subset[:, features]

    # Hyper-parameter search over this subset's candidate gammas / lambdas.
    initial_w = np.random.randn(D)
    optimal_gamma, optimal_lambda_, measure_tr, measure_te = gamma_lambda_selection_cv(
        y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas[i], lambdas[i],
        seed = seed, batch_size = batch_size, metric = metric, model = model)
    print('CA_bs:', CA_baseline)
    print('Iter:', i, ' Best gamma:', optimal_gamma, ' Best lambda:', optimal_lambda_, '\n')

    # Weight each subset's score by its share of rows in the full dataset.
    # NOTE(review): measure_te is returned by cross-validation on the training
    # subset but is weighted by the *test* subset's share of rows - confirm
    # this is the intended estimate.
    exp_measure_tr += measure_tr * X_train_subset.shape[0] / X_train.shape[0]
    exp_measure_te += measure_te * X_test_subset.shape[0] / X_test.shape[0]

    # Final fit on the whole training subset with the selected hyper-parameters.
    # get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
    w = get_model(model, y_train_subset, X_train_subset, initial_w, max_iters,
                  optimal_gamma, optimal_lambda_, batch_size)

    y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))
    ids_parts.append(ids_test_subset)
    y_pred_parts.append(y_pred_test)

# Merge the per-subset results; fall back to empty arrays when there were no subsets.
ids = np.concatenate(ids_parts) if ids_parts else np.array([])
y_pred = np.concatenate(y_pred_parts) if y_pred_parts else np.array([])

ids, y_pred = sort_arr(ids, y_pred)
create_csv_submission(ids, y_pred, sumbission_fname)

print("Expected training accuracy / loss:", exp_measure_tr)
print("Expected test accuracy / loss:", exp_measure_te)

Train shape before feature expansion:  (73790, 17)   Test shape: (168195, 17)
Train shape after  feature expansion:  (73790, 52)   Test shape: (168195, 52)
-2.7251935826121225 2.644102679544718 -0.07394490563081245
(0, 0)/(1, 1)
CA_tr:
 [[0.78793875]]
CA_te:
 [[0.78844017]]
LOSS_tr:
 [[31323.18619984]]
LOSS_te:
 [[7817.83809558]]
CA_tr: 0.7879387450874102
CA_te: 0.7884401680444505
CA_bs: 0.67566065862583
Iter: 0  Best gamma: 2e-06  Best lambda: 1 

Train shape before feature expansion:  (26123, 16)   Test shape:  (59263, 16)
Train shape after  feature expansion:  (26123, 49)   Test shape:  (59263, 49)
-2.786973847818426 1.0524792515652646 -0.12921517202363705
(0, 0)/(1, 1)
CA_tr:
 [[0.94218989]]
CA_te:
 [[0.93970138]]
LOSS_tr:
 [[5095.11938997]]
LOSS_te:
 [[1354.09386244]]
CA_tr: 0.9421898928024502
CA_te: 0.9397013782542114
CA_bs: 0.9403207901083337
Iter: 1  Best gamma: 5e-06  Best lambda: 1 

Train shape before feature expansion:  (69982, 19)   Test shape: (158095, 19)
Train shape aft