In [1]:
# need to wrap main in a function so that it runs all possible combinations
# need to store optimal gamma and lambda for different possible combinations

import numpy as np
from preprocessing import PRI_jet_num_split
from preprocessing import standardize, minmax_normalize
from preprocessing import clean_nan
from preprocessing import map_0_1, map_minus_1_1
from implementations import build_poly
from implementations import reg_logistic_regression
from helpers import load_csv_data
from helpers import predict_labels, create_csv_submission
from cross_validation import gamma_lambda_selection_cv

In [2]:
def sort_arr(ids, y_pred):
    """Sort ``ids`` in ascending order and reorder ``y_pred`` to match.

    Args:
        ids: NumPy array of identifiers; its ``argsort`` defines the ordering.
        y_pred: NumPy array indexed in parallel with ``ids``.

    Returns:
        Tuple ``(sorted_ids, reordered_y_pred)``; both are permuted by the
        same index array, so element ``k`` of each still belongs together.
    """
    order = ids.argsort()
    sorted_ids = ids[order]
    aligned_pred = y_pred[order]
    return sorted_ids, aligned_pred

In [3]:
# Seed NumPy's global random generator so that the np.random.randn draws used
# for the initial weights further down repeat when the cells are re-run in the
# same order.
np.random.seed(1)

In [4]:
# Relative paths of the input CSV files and of the submission file to write.
train_fname = "data/train.csv"
test_fname = "data/test.csv"
# NOTE(review): "sumbission" is a misspelling of "submission"; the spelling is
# kept because the cell that writes the CSV refers to the variable by this name.
sumbission_fname = "data/submission.csv"

In [5]:
# Read both CSV files; each call is unpacked as (labels, features, ids).
y_train, X_train, ids_train = load_csv_data(train_fname)
y_test, X_test, ids_test = load_csv_data(test_fname)

# Report the shapes of the loaded arrays: one line for train, one for test.
print("Shapes")
print(*(arr.shape for arr in (X_train, y_train, ids_train)))
print(*(arr.shape for arr in (X_test, y_test, ids_test)))
print()

Shapes
(250000, 30) (250000,) (250000,)
(568238, 30) (568238,) (568238,)



In [6]:
# Flag forwarded unchanged to PRI_jet_num_split for both datasets.
# NOTE(review): presumably controls whether some PRI_jet_num values are merged
# into a single subset — confirm in preprocessing.PRI_jet_num_split.
combine_vals = False

# Partition train and test with the same settings. Each entry of the returned
# sequences is unpacked further down as a (y, X, ids) triple.
train_subsets = PRI_jet_num_split(y_train, X_train, ids_train, combine_vals)
test_subsets = PRI_jet_num_split(y_test, X_test, ids_test, combine_vals)

print(f"Number of train subsets: { len(train_subsets) }")
print(f"Number of test subsets:  { len(test_subsets) }")
print()

# The training loop pairs train_subsets[i] with test_subsets[i], so both splits
# must contain the same number of subsets.
assert len(train_subsets) == len(test_subsets)
num_subsets = len(train_subsets)

Number of train subsets: 8
Number of test subsets:  8



In [9]:
# Accumulators for the test ids and predictions gathered over all subsets.
ids = np.array([])
y_pred = np.array([])

# Polynomial max degree
max_degree = 2

# GD step sizes, regularization factors
lambdas = np.logspace(0, 1, 5) 
gammas = np.logspace(0, 1, 5)
# NOTE(review): the next line rebinds both names, so the two logspace grids
# above are never used and the search runs over a single (gamma, lambda) pair.
# Remove whichever of the two definitions is not intended.
gammas, lambdas = [1e-5], [0.0]

In [10]:
# Reset the accumulators in this cell so that re-running it starts from scratch.
# Without the reset, a second run would append another copy of every prediction
# to the arrays left over from the previous run, and the submission file would
# be written with duplicate ids.
ids = np.array([])
y_pred = np.array([])

# Cross-validation fold count and GD iteration budget; the same for every
# subset, so they are set once instead of on each pass through the loop.
k_fold = 4
max_iters = 50

for i in range(num_subsets):
    y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
    y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

    # Training labels are passed through map_0_1 before fitting.
    y_train_subset = map_0_1(y_train_subset)
    # standardize receives the train and test features together and returns both.
    X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
    print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}\n")
    X_train_subset, X_test_subset = build_poly(X_train_subset, max_degree), build_poly(X_test_subset, max_degree)
    print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}\n")

    # One randomly drawn initial weight per column of the expanded features.
    N, D = X_train_subset.shape
    initial_w = np.random.randn(D)

    # Gamma and lambda are selected together by cross-validation over the grids.
    optimal_gamma, optimal_lambda_ = gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas, lambdas)

    # Fit on the whole training subset with the selected pair, then predict the
    # matching test subset and pass the labels through map_minus_1_1.
    _, w = reg_logistic_regression(y_train_subset, X_train_subset, optimal_lambda_, initial_w, max_iters, optimal_gamma)
    y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))

    ids = np.concatenate((ids, ids_test_subset))
    y_pred = np.concatenate((y_pred, y_pred_test))

# The subsets were appended one after another; sort by id (keeping predictions
# aligned) before writing the submission file.
ids, y_pred = sort_arr(ids, y_pred)
create_csv_submission(ids, y_pred, sumbission_fname)

Train shape before feature expansion:  (73790, 18)   Test shape: (168195, 18)

Train shape after  feature expansion:  (73790, 37)   Test shape: (168195, 37)

Train shape before feature expansion:  (26123, 17)   Test shape:  (59263, 17)

Train shape after  feature expansion:  (26123, 35)   Test shape:  (59263, 35)

Train shape before feature expansion:  (69982, 22)   Test shape: (158095, 22)

Train shape after  feature expansion:  (69982, 45)   Test shape: (158095, 45)

Train shape before feature expansion:   (7562, 21)   Test shape:  (17243, 21)

Train shape after  feature expansion:   (7562, 43)   Test shape:  (17243, 43)

Train shape before feature expansion:  (47427, 29)   Test shape: (107905, 29)

Train shape after  feature expansion:  (47427, 59)   Test shape: (107905, 59)

Train shape before feature expansion:   (2952, 28)   Test shape:   (6743, 28)

Train shape after  feature expansion:   (2952, 57)   Test shape:   (6743, 57)

Train shape before feature expansion:  (20687, 29)  

In [None]:
# USE THIS TO TEST 1 SUBSET

# Index of the subset to experiment with.
i = 0
y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

# Same preparation steps as the full loop: label mapping, standardisation,
# then polynomial expansion of both feature matrices.
y_train_subset = map_0_1(y_train_subset)
X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}\n")
X_train_subset, X_test_subset = (
    build_poly(features, max_degree)
    for features in (X_train_subset, X_test_subset)
)
print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}\n")

# One randomly drawn initial weight per column of the expanded features.
N, D = X_train_subset.shape
initial_w = np.random.randn(D)

In [None]:
# need to choose optimal lambda and optimal gamma together
k_fold = 5 # can experiment with different numbers
max_iters = 50

# GD step sizes, regularization factors
# NOTE: this rebinds the notebook-level gammas/lambdas. If the full training
# loop is re-run afterwards without re-running the hyper-parameter cell, it
# will search this grid instead.
gammas, lambdas = [1e-5, 1], [0.0, 0.001]
optimal_gamma, optimal_lambda_ = gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas, lambdas)

print(optimal_gamma, optimal_lambda_)

# Fit on the whole training subset with the selected pair and predict the
# matching test subset.
_, w = reg_logistic_regression(y_train_subset, X_train_subset, optimal_lambda_, initial_w, max_iters, optimal_gamma)
y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))

# Build the result from this subset only. Appending to whatever ids/y_pred
# already hold would mix in (or duplicate) predictions from an earlier run of
# this cell or of the full loop. Concatenating with an empty array keeps the
# same dtype promotion as appending to freshly initialised accumulators.
ids = np.concatenate((np.array([]), ids_test_subset))
y_pred = np.concatenate((np.array([]), y_pred_test))

ids, y_pred = sort_arr(ids, y_pred)