In [1]:
import numpy as np

from preprocessing import PRI_jet_num_split
from preprocessing import standardize, minmax_normalize
from preprocessing import clean_nan
from preprocessing import map_0_1, map_minus_1_1

from cross_validation import get_model, calculate_loss, accuracy, total_cross_validation

from implementations import build_poly

from scripts.helpers import load_csv_data
from scripts.helpers import predict_labels, create_csv_submission

from cross_validation import gamma_lambda_selection_cv
from cross_validation import plotting_graphs

In [2]:
def sort_arr(ids, y_pred):
    """Reorder ids and their predictions so that the ids are ascending.

    Args:
        ids: 1-D NumPy array of sample identifiers.
        y_pred: NumPy array of predictions aligned element-wise with `ids`.

    Returns:
        Tuple `(sorted_ids, sorted_pred)`: both arrays permuted by the
        ordering that sorts `ids`.
    """
    # One permutation is computed from the ids and applied to both arrays,
    # so every prediction stays attached to its id.
    order = ids.argsort()
    sorted_ids = ids[order]
    sorted_pred = y_pred[order]
    return sorted_ids, sorted_pred

In [3]:
# Seed NumPy's global RNG so the np.random.randn weight initialisations in
# later cells are repeatable when the notebook is run top to bottom.
np.random.seed(1)

In [7]:
# Input / output locations, relative to the notebook's working directory.
train_fname = "data/train.csv"
test_fname = "data/test.csv"
# NOTE(review): the variable name is misspelled ("sumbission"); it is kept
# as-is because the commented-out submission code refers to this spelling.
sumbission_fname = "data/submission.csv"

In [8]:
# Load both CSV files. load_csv_data's result is unpacked as
# (labels, feature matrix, ids).
y_train, X_train, ids_train = load_csv_data(train_fname)
y_test, X_test, ids_test = load_csv_data(test_fname)

# Quick sanity check: print (features, labels, ids) shapes for each split.
print("Shapes")
print(X_train.shape, y_train.shape, ids_train.shape)
print(X_test.shape, y_test.shape, ids_test.shape)
print()

Shapes
(250000, 30) (250000,) (250000,)
(568238, 30) (568238,) (568238,)



In [9]:
# Flag forwarded to PRI_jet_num_split; switch by swapping the two lines below.
# NOTE(review): its exact effect is defined in preprocessing.py — presumably it
# merges some of the subsets; confirm there.
# combine_vals = True
combine_vals = False

# Partition train and test data into subsets. Each subset is later unpacked as
# a (y, X, ids) triple.
train_subsets = PRI_jet_num_split(y_train, X_train, ids_train, combine_vals)
test_subsets = PRI_jet_num_split(y_test, X_test, ids_test, combine_vals)

print(f"Number of train subsets: { len(train_subsets) }")
print(f"Number of test subsets:  { len(test_subsets) }")
print()

# Train subset i is paired with test subset i by index further down, so both
# splits must yield the same number of subsets.
assert len(train_subsets) == len(test_subsets)
num_subsets = len(train_subsets)

Number of train subsets: 8
Number of test subsets:  8



In [18]:
# Empty accumulators that later cells extend with np.concatenate.
# NOTE: re-run this cell before re-running any cell that appends to them,
# otherwise the same ids / predictions are appended twice.
ids = np.array([])
y_pred = np.array([])

# Polynomial max degree
# (passed as the second argument of build_poly in the cells below)
max_degree = 2

# GD step sizes, regularization factors
# (earlier candidate grids, kept for reference; the grids actually used are
# defined in the experiment cells)
# lambdas = np.logspace(0, 1, 5)
# gammas = np.logspace(0, 1, 5)
# gammas, lambdas = [1e-6, 1e-5, 1e-4], [0.0, 0.1, 1.0]

# getting the score by total cross_validation
# testing_accuracy, testing_loss = [], []

In [19]:
# Single-subset experiment (subset index i): preprocess, expand features,
# select features with an L1-regularised model, grid-search (gamma, lambda)
# by k-fold cross-validation, then refit on the whole training subset.
# Relies on `train_subsets`, `test_subsets` and `max_degree` from earlier cells.
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: setting an array element with a sequence." after the line
# "(0, 0)/(4, 5)" was printed; the cause is not visible in this notebook.
i = 0
y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

# Re-encode the training labels (presumably to {0, 1}, judging by the helper's
# name) and standardise train and test features together.
y_train_subset = map_0_1(y_train_subset)
X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")
X_train_subset, X_test_subset = build_poly(X_train_subset, max_degree), build_poly(X_test_subset, max_degree)
print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")

# set n_best_features to X_train_subset.shape[1] if you don't want feature selection
n_best_features = round(0.75 * X_train_subset.shape[1])
# n_best_features = X_train_subset.shape[1]
# D is the width of the design matrix after feature selection; it sizes the
# initial weight vector of the final model.
D = n_best_features
N, _ = X_train_subset.shape

# accuracy by predicting the majority class in the training dataset
# (y.sum() / N is the fraction of ones, which assumes 0/1 labels)
CA_baseline = y_train_subset.sum() / N
CA_baseline = max(CA_baseline, 1 - CA_baseline)

# !!!!!! IMPORTANT
# If we increase the reg. factor (lambda), then we must decrease the step size (gamma)
# else the model diverges as the step size becomes too big.

# FEATURE SELECTION WITH L1 REG.
max_iters_fs = 200
gamma_fs, lambda_fs = 1e-7, 1e2
model_fs = 'LOG_REG_L1'

initial_w_fs = np.random.randn(X_train_subset.shape[1])
# get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
# The result is unpacked as (loss, weights).
loss_, w_fs = get_model(model_fs, y_train_subset, X_train_subset, initial_w_fs, max_iters_fs, gamma_fs, lambda_fs, 1)
# Column indices ordered by decreasing |weight|, truncated to the n_best_features largest.
features = np.argsort(abs(w_fs))[::-1][:n_best_features]
print(w_fs.min(), w_fs.max())

# Feature selection
# (the same column subset is applied to train and test)
X_train_subset, X_test_subset = X_train_subset[:, features], X_test_subset[:, features]

# tweak params
k_fold = 5
max_iters = 2000
# gammas, lambdas = [1e-6, 1e-5], [0.0, 1e-3, 1, 10, 100, 500]
gammas, lambdas = [5e-6, 1e-5, 1e-4, 1e-3], [0, 1e-2, 1, 10, 100]
# gammas, lambdas = [1e-2], [0]
seed, batch_size = 17, 1
metric, model = 'CA', 'LOG_REG_GD'

# The same random starting weights are used for the grid search and the final fit.
initial_w = np.random.randn(D)
# gamma_lambda_selection_cv(y, tx, k_fold, initial_w, max_iters, gammas, lambdas, seed = 1, metric = 'CA', model = 'LOG_REG_GD')
optimal_gamma, optimal_lambda_ = \
    gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas, lambdas,
                              seed = seed, batch_size = batch_size, metric = metric, model = model)
print('CA_bs:', CA_baseline)
print('Iter:', i, ' Best gamma:', optimal_gamma, ' Best lambda:', optimal_lambda_, '\n')

# get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
# Final fit on the full training subset with the selected hyper-parameters.
loss_, w = get_model(model, y_train_subset, X_train_subset, initial_w, max_iters, optimal_gamma, optimal_lambda_, batch_size)


# Prediction / submission steps, currently disabled:
# y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))

# ids = np.concatenate((ids, ids_test_subset))
# y_pred = np.concatenate((y_pred, y_pred_test))

# ids, y_pred = sort_arr(ids, y_pred)
# create_csv_submission(ids, y_pred, sumbission_fname)

# the below example run is when combined=True.
# NOTE(review): the split cell currently sets combine_vals = False — confirm
# which setting produced the recorded output.

Train shape before feature expansion:  (73790, 17)   Test shape: (168195, 17)
Train shape after  feature expansion:  (73790, 35)   Test shape: (168195, 35)
-1.6468000927981619 2.104501190256358
(0, 0)/(4, 5)


ValueError: setting an array element with a sequence.

In [17]:
# exp_measure_tr, exp_measure_te = 0, 0

# i = 0
# y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
# y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

# y_train_subset = map_0_1(y_train_subset)
# X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
# print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")
# X_train_subset, X_test_subset = build_poly(X_train_subset, max_degree), build_poly(X_test_subset, max_degree)
# print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")

# # set n_best_features to X_train_subset.shape[1] if you don't want feature selection
# n_best_features = round(0.85 * X_train_subset.shape[1])
# # n_best_features = X_train_subset.shape[1]
# D = n_best_features
# N, _ = X_train_subset.shape

# # accuracy by predicting the majority class in the training dataset
# CA_one = y_train_subset.sum() / N
# CA_zero = 1 - CA_one
# CA_baseline = max(CA_zero, CA_one)

# # !!!!!! IMPORTANT
# # If we increase the reg. factor (lambda), then we must decrease the step size (gamma)
# # else the model diverges as the step size becomes too big.

# # FEATURE SELECTION WITH L1 REG.
# max_iters_fs = 300
# gamma_fs, lambda_fs = 1e-7, 1e2
# model_fs = 'LOG_REG_L1'

# initial_w_fs = np.random.randn(X_train_subset.shape[1])
# # get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
# w_fs = get_model(model_fs, y_train_subset, X_train_subset, initial_w_fs, max_iters_fs, gamma_fs, lambda_fs, 1)
# features = np.argsort(abs(w_fs))[::-1][:n_best_features]
# print(w_fs.min(), w_fs.max(), w_fs.mean())

# # Feature selection
# X_train_subset, X_test_subset = X_train_subset[:, features], X_test_subset[:, features]

# # tweak params
# k_fold = 5
# max_iters = 500
# # gammas, lambdas = [1e-6, 1e-5], [0.0, 1e-3, 1, 10, 100, 500]
# # gammas[0] ... use it for the large datasets with mass feature
# # gammas[1] ... use it for the small datasets without mass feature
# # gammas, lambdas = [2e-6, 5e-6], [1, 5] #[[1e-5, 3e-5], [2e-7, 1e-6]], [[0, 1e-2, 1, 10, 50], [0, 1, 1e1, 5e1, 2e2, 1e3]]
# gammas, lambdas = [5e-6, 1e-5, 1e-4, 1e-3], [0, 1e-2, 1, 10, 100]
# seed, batch_size = 17, 1
# metric, model = 'CA', 'LOG_REG_GD'
    
# initial_w = np.random.randn(D)
# # gamma_lambda_selection_cv(y, tx, k_fold, initial_w, max_iters, gammas, lambdas, seed = 1, metric = 'CA', model = 'LOG_REG_GD')
# optimal_gamma, optimal_lambda_, measure_tr, measure_te = \
#     gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas, lambdas,
#                       seed = seed, batch_size = batch_size, metric = metric, model = model)
# print('CA_bs:', CA_baseline)
# print('Iter:', i, ' Best gamma:', optimal_gamma, ' Best lambda:', optimal_lambda_, '\n')

# exp_measure_tr += measure_tr * X_train_subset.shape[0] / X_train.shape[0]
# exp_measure_te += measure_te * X_test_subset.shape[0] / X_test.shape[0]
# # get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
# w = get_model(model, y_train_subset, X_train_subset, initial_w, max_iters, optimal_gamma, optimal_lambda_, batch_size)

# print("Expected training accuracy / loss:", exp_measure_tr)
# print("Expected test accuracy / loss:", exp_measure_te)

Train shape before feature expansion:  (73790, 17)   Test shape: (168195, 17)
Train shape after  feature expansion:  (73790, 35)   Test shape: (168195, 35)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
# for i in range(num_subsets):
#     y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
#     y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

#     y_train_subset = map_0_1(y_train_subset)
#     X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
#     print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")
#     X_train_subset, X_test_subset = build_poly(X_train_subset, max_degree), build_poly(X_test_subset, max_degree)
#     print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}")
    
#     # set n_best_features to X_train_subset.shape[1] if you don't want feature selection
#     n_best_features = round(0.75 * X_train_subset.shape[1])
#     # n_best_features = X_train_subset.shape[1]
#     D = n_best_features
#     N, _ = X_train_subset.shape
    
#     # accuracy by predicting the majority class in the training dataset
#     CA_baseline = y_train_subset.sum() / N
#     CA_baseline = max(CA_baseline, 1 - CA_baseline)
    
#     # !!!!!! IMPORTANT
#     # If we increase the reg. factor (lambda), then we must decrease the step size (gamma)
#     # else the model diverges as the step size becomes too big.
    
#     # FEATURE SELECTION WITH L1 REG.
#     max_iters_fs = 200
#     gamma_fs, lambda_fs = 1e-7, 1e2
#     model_fs = 'LOG_REG_L1'
    
#     initial_w_fs = np.random.randn(X_train_subset.shape[1])
#     # get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
#     loss_, w_fs = get_model(model_fs, y_train_subset, X_train_subset, initial_w_fs, max_iters_fs, gamma_fs, lambda_fs, 1)
#     features = np.argsort(abs(w_fs))[::-1][:n_best_features]
#     print(w_fs.min(), w_fs.max())
    
#     # Feature selection
#     X_train_subset, X_test_subset = X_train_subset[:, features], X_test_subset[:, features]
    
#     # tweak params
#     k_fold = 5
#     max_iters = 2000
#     # gammas, lambdas = [1e-6, 1e-5], [0.0, 1e-3, 1, 10, 100, 500]
#     gammas, lambdas = [5e-6, 3e-5], [0, 1e-2, 1, 10, 100]
#     # gammas, lambdas = [1e-2], [0]
#     seed, batch_size = 17, 1
#     metric, model = 'CA', 'LOG_REG_GD'
    
#     initial_w = np.random.randn(D)
#     # gamma_lambda_selection_cv(y, tx, k_fold, initial_w, max_iters, gammas, lambdas, seed = 1, metric = 'CA', model = 'LOG_REG_GD')
#     optimal_gamma, optimal_lambda_ = \
#         gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas, lambdas,
#                                   seed = seed, batch_size = batch_size, metric = metric, model = model)
#     print('CA_bs:', CA_baseline)
#     print('Iter:', i, ' Best gamma:', optimal_gamma, ' Best lambda:', optimal_lambda_, '\n')
    
#     # get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
#     loss_, w = get_model(model, y_train_subset, X_train_subset, initial_w, max_iters, optimal_gamma, optimal_lambda_, batch_size)

    
#     y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))

#     ids = np.concatenate((ids, ids_test_subset))
#     y_pred = np.concatenate((y_pred, y_pred_test))

# ids, y_pred = sort_arr(ids, y_pred)
# create_csv_submission(ids, y_pred, sumbission_fname)

# # the below example run is when combined=True.

In [None]:
from implementations import compute_loss_reg_logistic_regression
from implementations import compute_loss_reg_logistic_regression_L1

def fn(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), applied element-wise.

    Args:
        x: scalar or NumPy array.

    Returns:
        Value(s) in (0, 1), with the same shape as `x`.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

# Toy sanity check: fit both logistic-regression variants on a 3-sample problem
# and compare the regularised loss at the starting point with the loss at the
# fitted weights.
y_example = np.array([0, 1, 1])
x_example = np.array([[-1, 2, 5, 8], [2, -2, 1, 6], [4, 4, 6, 8]])
# Float starting weights: an integer array cannot hold the fractional updates
# if the optimiser modifies the weights in place.
w_start = np.zeros(x_example.shape[1])
#x_example, w_start = np.array([[-1], [2], [4]]), np.array([0])
lambda_example = 10

# get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
# Elsewhere in this notebook the result is unpacked as (loss, weights), so the
# weights are unpacked here too instead of binding the whole return value.
# A copy of w_start is passed so that both fits, and the "loss at start"
# prints below, see the same untouched starting point.
_, w_example_GD = get_model('LOG_REG_GD', y_example, x_example, w_start.copy(), 100000, 1e-4, lambda_example, 1)

# w_example[0] = 33.379
# NOTE(review): reference value left by the original author; which
# configuration it belongs to is not recorded.
print(w_example_GD)
print(fn(x_example.dot(w_example_GD)))  # predicted probabilities per sample
print(compute_loss_reg_logistic_regression(y_example, x_example, w_start, lambda_example))
print(compute_loss_reg_logistic_regression(y_example, x_example, w_example_GD, lambda_example))
print()

_, w_example_L1 = get_model('LOG_REG_L1', y_example, x_example, w_start.copy(), 100000, 1e-4, lambda_example, 1)

print(w_example_L1)
print(fn(x_example.dot(w_example_L1)))  # predicted probabilities per sample
print(compute_loss_reg_logistic_regression_L1(y_example, x_example, w_start, lambda_example))
print(compute_loss_reg_logistic_regression_L1(y_example, x_example, w_example_L1, lambda_example))
print()


#gamma_lambda_selection_cv(y_example, x_example, 2, w_start, 100, [1e-5, 5e-1], [0.0, 1e-3],
#                                  seed = 1, batch_size = 1, metric = 'CA', model = 'LOG_REG_GD')


In [None]:
# Scratch check: np.maximum(v, 0) is element-wise, so negative entries are
# replaced by 0 and positive entries are left unchanged.
v = np.array([-1, -2, 3])
np.maximum(v, 0)

In [None]:
# USE THIS TO TEST 1 SUBSET
# Prepares subset i without feature selection: label re-encoding,
# standardisation and polynomial expansion only. Relies on `train_subsets`,
# `test_subsets` and `max_degree` from earlier cells.

i = 0
y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

y_train_subset = map_0_1(y_train_subset)
X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
print(f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}\n")
X_train_subset, X_test_subset = build_poly(X_train_subset, max_degree), build_poly(X_test_subset, max_degree)
print(f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}\n")

# One random starting weight per column of the expanded design matrix.
N, D = X_train_subset.shape
initial_w = np.random.randn(D)

In [None]:
# Grid-search the step size (gamma) and regularisation factor (lambda) jointly
# by k-fold cross-validation, refit on the whole training subset, then predict
# the matching test subset. Relies on `y_train_subset`, `X_train_subset`,
# `X_test_subset`, `ids_test_subset`, `initial_w`, `ids` and `y_pred` from
# earlier cells.

# need to choose optimal lambda and optimal gamma together
k_fold = 5 # can experiment with different numbers
max_iters = 50

# GD step sizes, regularization factors
gammas, lambdas = [1e-5, 1], [0.0, 0.001]
optimal_gamma, optimal_lambda_ = gamma_lambda_selection_cv(y_train_subset, X_train_subset, k_fold, initial_w, max_iters, gammas, lambdas)

print(optimal_gamma, optimal_lambda_)

# `reg_logistic_regression` is never imported in this notebook, so calling it
# raised NameError. The final fit goes through get_model instead, which is
# already imported:
# get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size)
# 'LOG_REG_GD' and batch size 1 match the settings used by the other
# experiment cells in this notebook.
_, w = get_model('LOG_REG_GD', y_train_subset, X_train_subset, initial_w, max_iters, optimal_gamma, optimal_lambda_, 1)
y_pred_test = np.array(map_minus_1_1(predict_labels(w, X_test_subset)))

# NOTE: this appends to the shared accumulators, so re-running the cell without
# first resetting `ids` / `y_pred` duplicates the entries.
ids = np.concatenate((ids, ids_test_subset))
y_pred = np.concatenate((y_pred, y_pred_test))

ids, y_pred = sort_arr(ids, y_pred)