In [1]:
import numpy as np

from preprocessing import PRI_jet_num_split
from preprocessing import standardize, minmax_normalize
from preprocessing import clean_nan
from preprocessing import map_0_1, map_minus_1_1

from cross_validation import get_model, calculate_loss, accuracy, total_cross_validation

from implementations import build_poly

from scripts.helpers import load_csv_data
from scripts.helpers import predict_labels, create_csv_submission

from cross_validation import gamma_lambda_selection_cv
from cross_validation import plotting_graphs

In [2]:
def sort_arr(ids, y_pred):
    """Sort `ids` in ascending order and reorder `y_pred` the same way.

    Both arguments are indexed with the same permutation, so element k of
    the returned predictions still belongs to element k of the returned ids.

    Returns:
        (sorted_ids, reordered_y_pred)
    """
    order = np.argsort(ids)
    sorted_ids = ids[order]
    aligned_pred = y_pred[order]
    return sorted_ids, aligned_pred

In [3]:
# Seed NumPy's global random generator so that later np.random calls
# (e.g. the np.random.randn initial weights) are repeatable across runs.
np.random.seed(1)

In [4]:
# Input CSV files and the output file the predictions are written to.
train_fname = "data/train.csv"
test_fname = "data/test.csv"
# NOTE: the name is misspelled ("sumbission"); it is kept as-is because the
# create_csv_submission call later in the notebook refers to it by this name.
sumbission_fname = "data/submission.csv"

In [5]:
# Load both CSV files; load_csv_data hands back (labels, features, ids).
y_train, X_train, ids_train = load_csv_data(train_fname)
y_test, X_test, ids_test = load_csv_data(test_fname)

# Sanity check: one line of shapes for the train set, one for the test set.
print("Shapes")
print(*(arr.shape for arr in (X_train, y_train, ids_train)))
print(*(arr.shape for arr in (X_test, y_test, ids_test)))
print()

Shapes
(250000, 30) (250000,) (250000,)
(568238, 30) (568238,) (568238,)



In [6]:
# Flag forwarded to PRI_jet_num_split as its last argument.
# combine_vals = True
combine_vals = False

# Split train and test data with the same flag so the subsets line up by index.
train_subsets = PRI_jet_num_split(y_train, X_train, ids_train, combine_vals)
test_subsets = PRI_jet_num_split(y_test, X_test, ids_test, combine_vals)

print(
    f"Number of train subsets: { len(train_subsets) }",
    f"Number of test subsets:  { len(test_subsets) }",
    "",
    sep="\n",
)

# Later cells index both lists with the same i, so the counts must match.
assert len(train_subsets) == len(test_subsets)
num_subsets = len(train_subsets)

Number of train subsets: 8
Number of test subsets:  8



In [32]:
# Empty (float64, length-0) accumulators for the test ids and predictions;
# later cells extend them with np.concatenate.
ids, y_pred = np.zeros(0), np.zeros(0)

# Polynomial max degree
max_degree = 2

# GD step sizes, regularization factors
# lambdas = np.logspace(0, 1, 5)
# gammas = np.logspace(0, 1, 5)
# gammas, lambdas = [1e-6, 1e-5, 1e-4], [0.0, 0.1, 1.0]

# getting the score by total cross_validation
# testing_accuracy, testing_loss = [], []

In [None]:
# Train one model per subset, predict the matching test subset, and write the
# combined predictions to the submission file.

# --- Settings shared by every subset ----------------------------------------
# None of these depend on the subset, so they are set once instead of being
# re-assigned on every loop iteration.
# tweak params
k_fold = 5
# dont use high step sizes with high regularization
max_iters = 200
gammas, lambdas = [1e-6, 1e-5, 1e-4], [0.0, 1e-3, 1, 10, 100, 500]
# gammas, lambdas = [1e-6], [1e4]
# gammas, lambdas = [1e-2], [0]
# overflow warning/error for lambda values more than 10 (need to check this)

seed = 17
batch_size = 1
metric = 'CA'
model = 'LOG_REG_GD'

# Start from empty accumulators inside this cell. Otherwise running the cell a
# second time would append every id/prediction again and the submission file
# would contain duplicated rows.
ids = np.array([])
y_pred = np.array([])

for i in range(num_subsets):
    y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
    y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

    # Preprocessing: remap the training labels, standardize train and test
    # features in a single call, then add polynomial features up to max_degree.
    y_train_subset = map_0_1(y_train_subset)
    X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
    print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")
    X_train_subset, X_test_subset = build_poly(X_train_subset, max_degree), build_poly(X_test_subset, max_degree)
    print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}\n")

    N, D = X_train_subset.shape
    initial_w = np.random.randn(D)

    # Baseline accuracy of always predicting the more frequent label
    # (assumes map_0_1 leaves the labels in {0, 1} -- TODO confirm).
    CA_baseline = y_train_subset.sum() / N
    CA_baseline = max(CA_baseline, 1 - CA_baseline)

    # gamma_lambda_selection_cv(y, tx, k_fold, initial_w, max_iters, gammas, lambdas, seed = 1, metric = 'CA', model = 'LOG_REG_GD')
    optimal_gamma, optimal_lambda_ = \
        gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas, lambdas,
                                  seed = seed, batch_size = batch_size, metric = metric, model = model)

    print('CA_bs:', CA_baseline)
    print('Iter:', i, ' Best gamma:', optimal_gamma, ' Best lambda:', optimal_lambda_)

    # calling total cross_validation function to get tr_ac, te_ac, loss_tr, loss_te for each subset
    # loss_tr, loss_te, tr_ca, te_ca = total_cross_validation(y_train_subset, X_train_subset, k_fold, initial_w, \
    #                             max_iters, optimal_gamma, optimal_lambda_, seed = 1, batch_size = 1, model = 'LOG_REG_GD')
    # testing_loss.append(loss_te)
    # testing_accuracy.append(te_ca)

    # plotting graphs (between accuracy/loss and subset of hyperparameter values)
    # will only plot for Reg_logistic after getting maximum accuracy on ai_crowd
    # plotting_graphs(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas, \
    #                 lambdas, optimal_lambda_, optimal_gamma, i, seed = 1, model = 'LOG_REG_GD')

    # Refit on the whole training subset with the selected hyperparameters.
    # get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
    w = get_model(model, y_train_subset, X_train_subset, initial_w, max_iters, optimal_gamma, optimal_lambda_, batch_size)

    y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))

    ids = np.concatenate((ids, ids_test_subset))
    y_pred = np.concatenate((y_pred, y_pred_test))

# The subsets were predicted group by group; restore ascending id order before
# writing the file.
ids, y_pred = sort_arr(ids, y_pred)
create_csv_submission(ids, y_pred, sumbission_fname)

# the below example run is when combined=True.

Train shape before feature expansion:  (73790, 18)   Test shape: (168195, 18)
Train shape after  feature expansion:  (73790, 37)   Test shape: (168195, 37)

(0, 0)/(3, 6)
(0, 1)/(3, 6)
(0, 2)/(3, 6)
(0, 3)/(3, 6)
(0, 4)/(3, 6)
(0, 5)/(3, 6)
(1, 0)/(3, 6)
(1, 1)/(3, 6)
(1, 2)/(3, 6)
(1, 3)/(3, 6)
(1, 4)/(3, 6)
(1, 5)/(3, 6)
(2, 0)/(3, 6)
(2, 1)/(3, 6)
(2, 2)/(3, 6)
(2, 3)/(3, 6)
(2, 4)/(3, 6)
(2, 5)/(3, 6)
CA_tr:
 [[0.75928988 0.75928988 0.75928649 0.75919501 0.75859195 0.7559527 ]
 [0.79058138 0.79058138 0.79058138 0.7905746  0.7904628  0.78469305]
 [0.76813593 0.72671432 0.73584158 0.72041943 0.72308578 0.74498238]]
CA_te:
 [[0.75912725 0.75912725 0.75912725 0.75915436 0.75842255 0.75595609]
 [0.79054072 0.79054072 0.79054072 0.79054072 0.79055428 0.7845643 ]
 [0.7673804  0.72645345 0.73554682 0.72091069 0.72253693 0.74478927]]
LOSS_tr:
 [[3.46700618e+04 3.46700751e+04 3.46833347e+04 3.48032192e+04
  3.60453454e+04 4.26177027e+04]
 [2.61743892e+04 2.61743959e+04 2.61811626e+04 2.62441

In [None]:
from implementations import compute_loss_reg_logistic_regression

def fn(x):
    """Logistic sigmoid 1 / (1 + exp(-x)); works element-wise on NumPy arrays."""
    return 1.0 / (1.0 + np.exp(-x))

# Tiny hand-made problem (3 samples, 1 feature) for checking the
# logistic-regression code by eye.
y_example = np.array([0, 1, 1])
# Two-feature variant; it used to be assigned and then immediately overwritten
# by the single-feature matrix below, so it is kept only as a comment.
# x_example = np.array([[-1, 2], [2, -2], [4, 4]])
x_example = np.array([[-1], [2], [4]])
lambda_example = 0

# Float start vector: an integer array would break (or truncate) any in-place
# gradient update inside the optimizer -- presumably harmless otherwise.
w_start = np.array([0.0])
w_example = get_model('LOG_REG_GD', y_example, x_example, w_start, 100000, 1e-2, lambda_example, 1)

# w_example[0] = 33.379
print(w_example)
# Predicted probabilities for the three samples under the fitted weights.
print(fn(x_example.dot(w_example)))
# Loss at the start weights vs. loss at the fitted weights.
print(compute_loss_reg_logistic_regression(y_example, x_example, w_start, lambda_example))
print(compute_loss_reg_logistic_regression(y_example, x_example, w_example, lambda_example))

#gamma_lambda_selection_cv(y_example, x_example, 2, w_start, 100, [1e-5, 5e-1], [0.0, 1e-3],
#                                  seed = 1, batch_size = 1, metric = 'CA', model = 'LOG_REG_GD')


In [None]:
# Scratch check of np.maximum: compares each element of v with 0 and keeps the
# larger one, so the negative entries are replaced by 0.
v = np.array([-1, -2, 3])
np.maximum(v, 0)

In [None]:
# USE THIS TO TEST 1 SUBSET

# Index of the subset to experiment with.
i = 0
(
    (y_train_subset, X_train_subset, ids_train_subset),
    (y_test_subset, X_test_subset, ids_test_subset),
) = train_subsets[i], test_subsets[i]

# Same preprocessing steps as the full training loop, applied to one subset.
y_train_subset = map_0_1(y_train_subset)
X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}\n")
X_train_subset = build_poly(X_train_subset, max_degree)
X_test_subset = build_poly(X_test_subset, max_degree)
print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}\n")

# One random starting weight per (expanded) feature column.
N, D = X_train_subset.shape
initial_w = np.random.randn(D)

In [None]:
# Hyperparameter search and fit for the single subset prepared in the
# "USE THIS TO TEST 1 SUBSET" cell.

# need to choose optimal lambda and optimal gamma together
k_fold = 5 # can experiment with different numbers
max_iters = 50

# GD step sizes, regularization factors
gammas, lambdas = [1e-5, 1], [0.0, 0.001]
optimal_gamma, optimal_lambda_ = gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas, lambdas)

print(optimal_gamma, optimal_lambda_)

# Fit through get_model, which is imported at the top of the notebook.
# reg_logistic_regression is not imported anywhere in this notebook, so calling
# it directly raises NameError. Argument order follows
# get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size).
w = get_model('LOG_REG_GD', y_train_subset, X_train_subset, initial_w, max_iters, optimal_gamma, optimal_lambda_, 1)
y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))

# NOTE(review): this appends to whatever `ids` / `y_pred` currently hold. If
# the full training loop already ran, this subset's rows will be duplicated --
# re-create the empty accumulators first when testing a subset in isolation.
ids = np.concatenate((ids, ids_test_subset))
y_pred = np.concatenate((y_pred, y_pred_test))

ids, y_pred = sort_arr(ids, y_pred)