In [1]:
import numpy as np

import data_io
import data_preprocessing
from implementations import *
import validation
import attribute_selection
import evaluators
import metrics

# Autoreload modules
%load_ext autoreload
%autoreload 2

In [2]:
# Mount Google Drive so the data files under /content/drive are readable.
# The `google.colab` import means this cell only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive')

# Directory prefix of the CSV files; the load cells append 'train.csv' / 'test.csv' to it.
DATA_FILE_PREFIX = '/content/drive/My Drive/mlproject1_higgs_data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Load the training CSV. The third return value is discarded here; the test-set
# load further down binds the same position to `ids_test`.
y_train, x_train, _, cols = data_io.load_csv_data(f'{DATA_FILE_PREFIX}train.csv')
# The test set is loaded in a later cell, after the training arrays have been deleted,
# so the equivalent call stays disabled here:
#_, x_test, ids_test, cols_train = data_io.load_csv_data(f'{DATA_FILE_PREFIX}test.csv')

In [32]:
# Feature name -> column index in the feature matrix. The first two entries of
# `cols` are skipped, so the third entry maps to index 0.
col_to_index_mapping = {
    name: position
    for position, name in enumerate(cols, start=-2)
    if position >= 0
}

# Relabel targets: -1 -> 0 and +1 -> 1 (values already in {0, 1} stay unchanged).
# Presumably the loader returns -1/+1 labels — confirm against data_io.
y_train = (1 + y_train) // 2

In [7]:
import operator

def tune_lambda(y, x, grid, seed=42, history=True):
    """Pick the regularisation strength with the best mean cross-validation score.

    For each candidate in `grid`, the NumPy global RNG is re-seeded with `seed`,
    then `validation.cross_validation` is run (last argument 5) with a trainer
    built on `reg_logistic_regression_sgd` (positional settings 5, 1000, 0.5).
    The mean of the first element of the cross-validation result is the score.

    Args:
        y: targets, forwarded to the cross-validation helper.
        x: feature matrix; its column count sizes the zero initial weights.
        grid: iterable of candidate lambda values.
        seed: value passed to `np.random.seed` before each candidate.
        history: when true, print one "<lambda>: <score>" line per candidate.

    Returns:
        The `(lambda, score)` pair with the highest score; ties go to the
        candidate seen first. Raises ValueError when `grid` is empty.
    """
    start_weights = np.zeros(x.shape[1])
    scores = {}

    for candidate in grid:
        # Same seed for every candidate so they see identical random draws.
        np.random.seed(seed)

        def fit(y_fold, x_fold):
            fitted = reg_logistic_regression_sgd(
                y_fold, x_fold, candidate, start_weights, 5, 1000, 0.5,
            )
            return make_predictor(fitted[0])

        score = validation.cross_validation(y, x, fit, 5)[0].mean()
        scores[candidate] = score
        if history:
            print(f"{candidate}: {score:.4f}")

    return max(scores.items(), key=lambda entry: entry[1])

In [8]:
def make_predictor(w):
    """Build a 0/1 classifier from a weight vector.

    The returned callable labels a row 1 when its linear score `features @ w`
    is strictly positive, and 0 otherwise (a score of exactly 0 gives 0).
    """
    def predict_labels(features):
        scores = features @ w
        is_positive = scores > 0
        return is_positive.astype(int)

    return predict_labels

In [9]:
def train_model(y, x):
    """Fit the regularised logistic regression with fixed settings.

    Starts from zero weights, uses lambda = 1e-5 and the positional settings
    (5, 1000, 0.5) of `reg_logistic_regression_sgd`, and wraps the first element
    of its result with `make_predictor`.

    Returns:
        A callable mapping a feature matrix to 0/1 predictions.
    """
    start_weights = np.zeros(x.shape[1])
    fitted = reg_logistic_regression_sgd(y, x, 1e-5, start_weights, 5, 1000, 0.5)
    return make_predictor(fitted[0])

In [10]:
def build_pairwise_plus(x, column_idx):
    """Append the pairwise sums of the selected columns to `x`.

    For every pair (i, j) with i < j among the columns picked by `column_idx`,
    one new column `selected[:, i] + selected[:, j]` is appended. Pairs are
    ordered (0, 1), (0, 2), ..., (1, 2), ...

    Args:
        x: 2-D array (rows x features); a 1-D array is treated as one column.
        column_idx: sequence of column indices of `x` to combine.

    Returns:
        A new array holding the columns of `x` followed by the pairwise-sum
        columns. `x` is not modified. With fewer than two selected columns
        there are no pairs and a plain copy of `x` is returned.
    """
    if x.ndim == 1:
        x = x[:, np.newaxis]

    columns = x[:, column_idx]
    n_selected = columns.shape[1]

    # j must reach the last selected column; stopping one short (as the previous
    # bound did) silently dropped every pair involving that column.
    pairwise = [
        columns[:, i] + columns[:, j]
        for i in range(n_selected - 1)
        for j in range(i + 1, n_selected)
    ]

    # No pairs: concatenating an empty 1-D array with a 2-D one would raise.
    if not pairwise:
        return np.copy(x)

    return np.concatenate([x, np.array(pairwise).T], axis=1)

def transformation_pipeline_median_selected(x, col_to_index_mapping=col_to_index_mapping):
    """Turn raw features into the reduced design matrix (no pairwise terms).

    Steps, in order:
      1. copy `x` and replace every -999 entry with NaN;
      2. apply `standardize_with_nans` to every column except 'PRI_jet_num';
      3. fill the entries that are still NaN through `median_missing_values`;
      4. one-hot encode 'PRI_jet_num' and drop the last column of the result;
      5. append the element-wise sin and cos of every column;
      6. prepend the bias column via `prepend_bias_column`;
      7. keep two successive hard-coded column subsets.

    Args:
        x: 2-D raw feature array; it is copied, not modified.
        col_to_index_mapping: feature name -> column index in `x`.

    Returns:
        The transformed 2-D array, one column per entry of the second subset.
    """
    features = np.copy(x)
    features[features == -999.] = np.nan

    continuous_columns = [
        index
        for name, index in col_to_index_mapping.items()
        if 'PRI_jet_num' not in name
    ]
    features = data_preprocessing.apply_transformation(
        features, continuous_columns, data_preprocessing.standardize_with_nans,
    )

    # The mask is taken after standardisation: it marks entries that are NaN at this point.
    still_missing = np.isnan(features)
    features = data_preprocessing.median_missing_values(features, still_missing)

    # One-hot encode the categorical column; the updated mapping is not needed.
    # Dropping the last column removes one level, per the original author's note
    # (assumes the one-hot columns are appended last — TODO confirm).
    features, _unused_mapping = data_preprocessing.one_hot_transformation(
        features, 'PRI_jet_num', col_to_index_mapping,
    )
    features = features[:, :-1]

    features = np.concatenate([features, np.sin(features), np.cos(features)], axis=1)
    features = data_preprocessing.prepend_bias_column(features)

    # Hard-coded results of two attribute-selection rounds; the second list
    # indexes into the output of the first.
    first_pass = [0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 40, 41, 42, 44, 45, 47, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 94, 95, 96]
    second_pass = [0, 1, 2, 3, 4, 6, 7, 8, 9, 12, 13, 15, 18, 19, 20, 22, 23, 24, 25, 27, 31, 33, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
    return features[:, first_pass][:, second_pass]

def transformation_pipeline_median_selected_pairwise(x, col_to_index_mapping=col_to_index_mapping):
    """Turn raw features into the expanded design matrix used for the final model.

    Steps, in order:
      1. copy `x`; for each column containing at least one -999, record a 0/1
         indicator column of where the -999 entries are;
      2. replace every -999 entry with NaN;
      3. apply `standardize_with_nans` to every column except 'PRI_jet_num';
      4. fill the entries that are still NaN through `median_missing_values`;
      5. one-hot encode 'PRI_jet_num' and drop the last column of the result;
      6. append element-wise sin and cos, then prepend the bias column;
      7. keep one hard-coded column subset (unlike the non-pairwise pipeline,
         no second subset is applied);
      8. build standardised powers 2 and 3 (`build_poly`) and the
         `build_pairwise_alt` expansion over all kept columns;
      9. stack [pairwise expansion, powers, missing indicators] column-wise and
         prepend the bias column again.

    Args:
        x: 2-D raw feature array; it is copied, not modified.
        col_to_index_mapping: feature name -> column index in `x`.

    Returns:
        The expanded 2-D design matrix. The number of indicator columns depends
        on which columns of `x` contain -999.
    """
    features = np.copy(x)

    # Capture the missing-value pattern before the sentinel is overwritten.
    is_missing = features == -999
    affected_columns = is_missing.max(axis=0)
    missing_flags = is_missing[:, affected_columns].astype(int)

    features[is_missing] = np.nan

    continuous_columns = [
        index
        for name, index in col_to_index_mapping.items()
        if 'PRI_jet_num' not in name
    ]
    features = data_preprocessing.apply_transformation(
        features, continuous_columns, data_preprocessing.standardize_with_nans,
    )

    # The mask is taken after standardisation: it marks entries that are NaN at this point.
    still_missing = np.isnan(features)
    features = data_preprocessing.median_missing_values(features, still_missing)

    # One-hot encode the categorical column; the updated mapping is not needed.
    features, _unused_mapping = data_preprocessing.one_hot_transformation(
        features, 'PRI_jet_num', col_to_index_mapping,
    )
    features = features[:, :-1]

    features = np.concatenate([features, np.sin(features), np.cos(features)], axis=1)
    features = data_preprocessing.prepend_bias_column(features)

    # Hard-coded result of the first attribute-selection round.
    kept_columns = [0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 40, 41, 42, 44, 45, 47, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 94, 95, 96]
    features = features[:, kept_columns]

    every_column = list(range(features.shape[1]))
    powers = data_preprocessing.standardize(
        data_preprocessing.build_poly(features, every_column, [2, 3])
    )
    pairwise = data_preprocessing.build_pairwise_alt(features, list(every_column))

    expanded = np.c_[pairwise, powers, missing_flags]
    return data_preprocessing.prepend_bias_column(expanded)

In [34]:
# Build the expanded training design matrix (pairwise expansion, powers 2-3,
# missing-value indicators, bias) from the raw training features.
tx_train_2 = transformation_pipeline_median_selected_pairwise(x_train)


In [11]:
# Sanity check: (rows, engineered feature count) of the training design matrix.
tx_train_2.shape

(250000, 1662)

In [12]:
# The raw training features are not read by any later cell; drop the reference
# so the memory can be reclaimed.
del x_train

In [None]:
# NOTE(review): on a top-to-bottom run this removes tx_train_2 while later cells
# (shape print, lambda tuning, final training) still read it, so they would raise
# NameError. Presumably a leftover from an interactive session — confirm and remove.
del tx_train_2

In [None]:
# Shape check of the training design matrix.
# NOTE(review): the recorded output differs from the other shape cells, so it was
# presumably produced by an earlier version of the pipeline — re-run to refresh.
print(tx_train_2.shape)

(250000, 1654)


In [None]:
import operator

def tune_lambda(y, x, grid, seed=42, history=True):
    """Pick the regularisation strength with the best mean cross-validation score.

    This definition replaces the `tune_lambda` from the earlier cell; it differs
    only in the positional settings passed to `reg_logistic_regression_sgd`
    (100, 2000, 0.1).

    For each candidate in `grid`, the NumPy global RNG is re-seeded with `seed`,
    then `validation.cross_validation` is run (last argument 5). The mean of the
    first element of its result is the candidate's score.

    Args:
        y: targets, forwarded to the cross-validation helper.
        x: feature matrix; its column count sizes the zero initial weights.
        grid: iterable of candidate lambda values.
        seed: value passed to `np.random.seed` before each candidate.
        history: when true, print one "<lambda>: <score>" line per candidate.

    Returns:
        The `(lambda, score)` pair with the highest score; ties go to the
        candidate seen first. Raises ValueError when `grid` is empty.
    """
    start_weights = np.zeros(x.shape[1])
    scores = {}

    for candidate in grid:
        # Same seed for every candidate so they see identical random draws.
        np.random.seed(seed)

        def fit(y_fold, x_fold):
            fitted = reg_logistic_regression_sgd(
                y_fold, x_fold, candidate, start_weights, 100, 2000, 0.1,
            )
            return make_predictor(fitted[0])

        score = validation.cross_validation(y, x, fit, 5)[0].mean()
        scores[candidate] = score
        if history:
            print(f"{candidate}: {score:.4f}")

    return max(scores.items(), key=lambda entry: entry[1])

# Aliases for the data handed to the tuner (no copy is made).
tx_train_to_tune = tx_train_2
ty_train_to_tune = y_train

# Cross-validate each candidate lambda. tune_lambda prints the per-candidate scores
# (history defaults to True); the returned best (lambda, score) pair is discarded.
_ = tune_lambda(ty_train_to_tune, tx_train_to_tune, [0, 1e-5, 1e-7, 1e-8, 1e-9])

  exp_x = np.exp(x)
  exp_x / (1 + exp_x))


0: 0.8409
1e-05: 0.8412
1e-07: 0.8414
1e-08: 0.8415
1e-09: 0.8416


In [19]:
# Weights of the most recent fit; train_model stores them here so later cells
# can inspect them.
w = None


def train_model(y_, x_):
    """Fit the final regularised logistic regression and publish its weights.

    Starts from zero weights, uses lambda = 1e-9 and the positional settings
    (400, 2000, 0.04) of `reg_logistic_regression_sgd`.

    Side effect:
        Rebinds the module-level `w` to the fitted weights.

    Returns:
        A callable mapping a feature matrix to 0/1 predictions.
    """
    global w
    start_weights = np.zeros(x_.shape[1])
    # The second element of the result (bound to `loss` before) is not used.
    fitted_weights, _loss = reg_logistic_regression_sgd(
        y_, x_, 1e-9, start_weights, 400, 2000, 0.04,
    )
    w = fitted_weights
    return make_predictor(fitted_weights)

# Fit on the full training set, then score on that same data: the number shown is
# a training-set accuracy, not a held-out estimate.
predict = train_model(y_train, tx_train_2)
metrics.accuracy(y_train, predict(tx_train_2))

  exp_x = np.exp(x)
  exp_x / (1 + exp_x))


0.841824

In [35]:
# Shape of the design matrix the final model was trained on.
tx_train_2.shape

(250000, 1662)

In [36]:
# threshold=np.inf stops NumPy from abbreviating long arrays when printing.
import numpy as np
np.set_printoptions(threshold=np.inf)

# Dump the learned weights in full so they can be copied out of the notebook.
# The percentage in the label is typed by hand, not computed from the model.
print('84.1824% weights:', list(w))

84.1824% weights: [-0.02602270315451367, -0.02602270315451367, 0.08712381130629011, 0.027529917772325246, 0.08191662751623512, -0.00233757183024629, -0.004025270468256667, 0.0021481144252863058, 0.1180850431405614, 0.05443842210771049, -0.10469642544284224, -0.0049708901300631525, 0.00595878881359088, -0.0171905055246726, 0.02768855382633608, 0.0291364696365455, -0.11378008140509095, 0.0007362057491584342, 0.020937266973202532, 0.009310484402131568, 0.2723260750895802, 0.01919263684238056, 0.17631070692326745, -0.009424789679751548, 0.05638078167649599, -0.0970006282887165, 0.09755720110718993, -0.029655968353008374, 0.12719849246182036, 0.0041780700330494806, -0.003353328628483768, -0.03418557727910871, 0.024976596589535643, -0.0035433527066504908, 0.0804126409029341, -0.01128713446296356, 0.017618102659126535, 0.007834502478900088, -0.07079672542369114, -0.03888458844399814, 0.09331843755690854, -0.05098711180302382, 0.008037683278065538, 0.020874393807811784, -0.022489597402793476, 

In [35]:
# Training-set accuracy of the fitted predictor (same data it was trained on).
metrics.accuracy(y_train, predict(tx_train_2))

0.841824

In [22]:
# No later cell reads the training matrix or labels; drop both references
# before the test set is loaded.
del tx_train_2
del y_train

In [23]:
# Load the test CSV; the first return value is discarded.
# NOTE(review): despite its name, `cols_train` is read from test.csv here
# (presumably the column names), and no later cell uses it.
_, x_test, ids_test, cols_train = data_io.load_csv_data(f'{DATA_FILE_PREFIX}test.csv')

In [24]:
import time

# Transform and score the test set in slices, so the expanded feature matrix
# never has to exist for all test rows at once.
batch_size = 100000
pred = []

# range() stops before x_test.shape[0], so every batch has at least one row.
# The earlier `while current_ind <= n` form ran one extra, empty batch whenever the
# row count was an exact multiple of batch_size, and the pipeline's max-reduction
# over rows raises on zero-row input.
# NOTE(review): the pipeline is given only the batch, so any statistics it
# standardises or imputes with come from that batch and predictions can depend on
# batch_size — confirm this is intended.
for current_ind in range(0, x_test.shape[0], batch_size):
  # Slicing past the end is clamped by NumPy, so the last batch is simply shorter.
  x_test_batch = x_test[current_ind: current_ind + batch_size]
  tx_test_batch = transformation_pipeline_median_selected_pairwise(x_test_batch)
  predictions = predict(tx_test_batch)
  pred.append(predictions)
  # Drop the large per-batch matrix before the next one is built.
  del tx_test_batch
  # NOTE(review): the pause does not affect results; presumably left in to give
  # the runtime time to reclaim memory — confirm or remove.
  time.sleep(1)

In [25]:
# Stitch the per-batch 0/1 predictions into one vector, in batch (test-row) order.
prediction_test = np.concatenate(pred)

In [26]:
# Map the classifier's 0/1 output to -1/+1 labels for the submission.
# Recomputed from `pred` rather than from `prediction_test` itself, so re-running
# this cell cannot apply the mapping twice (which would yield -3/+1).
prediction_test = np.concatenate(pred) * 2 - 1

In [27]:
# Peek at the first few submission labels (expected to be -1 or +1).
prediction_test[:5]

array([-1, -1,  1,  1, -1])

In [28]:
# (rows, raw feature count) of the test set; the row count should match len(prediction_test).
x_test.shape

(568238, 30)

In [29]:
# Write the submission from the test ids and the -1/+1 predictions.
# The file name is hard-coded; presumably it is resolved against the current
# working directory — confirm in data_io.
data_io.create_csv_submission(ids_test, prediction_test, '8418acc.csv')