In [1]:
import numpy as np

import data_io
import data_preprocessing
from implementations import *
import validation
import attribute_selection
import evaluators
import metrics

# Autoreload modules
%load_ext autoreload
%autoreload 2

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# files can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

# Directory prefix that the loading cell concatenates with the CSV file names.
DATA_FILE_PREFIX = '/content/drive/My Drive/mlproject1_higgs_data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the training and the test CSV. load_csv_data returns four values; judging
# by the names they are unpacked into, these are (labels, features, ids,
# column names) -- TODO confirm against data_io.
y_train, x_train, _, cols = data_io.load_csv_data(f'{DATA_FILE_PREFIX}train.csv')
# NOTE(review): despite its name, `cols_train` receives the fourth value returned
# for test.csv, not for train.csv.
_, x_test, ids_test, cols_train = data_io.load_csv_data(f'{DATA_FILE_PREFIX}test.csv')

In [4]:
# Map every column name from the third entry of `cols` onwards to (position - 2).
# Presumably the first two entries are the id and label columns, which are not
# part of the feature matrix -- TODO confirm.
col_to_index_mapping = {col_name: index - 2 for index, col_name in enumerate(cols) if index >= 2}
# Re-code the labels: -1 becomes 0 and 1 stays 1 (assumes the loader yields -1/1
# labels). Values that are already 0/1 are left unchanged by this formula.
y_train = (y_train + 1) // 2

In [5]:
def transformation_pipeline_base(x, col_to_index_mapping=col_to_index_mapping):
    """Turn a raw feature matrix into the baseline design matrix.

    Steps, in order: copy the input, replace the -999 sentinel with NaN, apply
    ``standardize_with_nans`` to every column whose name does not contain
    'PRI_jet_num', hand the NaN positions to ``nullify_missing_values``,
    one-hot encode 'PRI_jet_num', drop the last column and prepend a bias
    column.

    Args:
        x: raw feature matrix; it is copied and never modified.
        col_to_index_mapping: column name -> column index of ``x``. The default
            is the module-level mapping captured when this function is defined.

    Returns:
        The transformed feature matrix.
    """
    features = np.copy(x)
    sentinel_mask = features == -999.
    features[sentinel_mask] = np.nan

    # Every column except the categorical jet count is standardized.
    numeric_columns = [
        index
        for name, index in col_to_index_mapping.items()
        if 'PRI_jet_num' not in name
    ]
    features = data_preprocessing.apply_transformation(
        features, numeric_columns, data_preprocessing.standardize_with_nans
    )

    # The NaN mask is computed on the already-standardized matrix (this relies
    # on standardize_with_nans leaving missing entries as NaN -- TODO confirm).
    nan_mask = np.isnan(features)
    features = data_preprocessing.nullify_missing_values(features, nan_mask)

    # One-hot encode the categorical column; the updated name mapping that is
    # returned alongside the matrix is not needed here.
    features, _ = data_preprocessing.one_hot_transformation(
        features, 'PRI_jet_num', col_to_index_mapping
    )
    # Drop the last column (presumably one one-hot level, so the remaining
    # levels are not collinear with the bias -- TODO confirm column order).
    features = features[:, :-1]

    return data_preprocessing.prepend_bias_column(features)

def transformation_pipeline_median(x, col_to_index_mapping=col_to_index_mapping):
    """Build the sin/cos-augmented design matrix with ``median_missing_values``.

    Steps, in order: copy the input, replace the -999 sentinel with NaN, apply
    ``standardize_with_nans`` to every column whose name does not contain
    'PRI_jet_num', hand the NaN positions to ``median_missing_values``,
    one-hot encode 'PRI_jet_num', drop the last column, append the
    element-wise sine and cosine of every remaining column, and prepend a bias
    column.

    Args:
        x: raw feature matrix; it is copied and never modified.
        col_to_index_mapping: column name -> column index of ``x``. The default
            is the module-level mapping captured when this function is defined.

    Returns:
        The transformed feature matrix: bias, base columns, sines, cosines.
    """
    features = np.copy(x)
    sentinel_mask = features == -999.
    features[sentinel_mask] = np.nan

    # Every column except the categorical jet count is standardized.
    numeric_columns = [
        index
        for name, index in col_to_index_mapping.items()
        if 'PRI_jet_num' not in name
    ]
    features = data_preprocessing.apply_transformation(
        features, numeric_columns, data_preprocessing.standardize_with_nans
    )

    # The NaN mask is computed on the already-standardized matrix (this relies
    # on standardize_with_nans leaving missing entries as NaN -- TODO confirm).
    nan_mask = np.isnan(features)
    features = data_preprocessing.median_missing_values(features, nan_mask)

    # One-hot encode the categorical column; the updated name mapping that is
    # returned alongside the matrix is not needed here.
    features, _ = data_preprocessing.one_hot_transformation(
        features, 'PRI_jet_num', col_to_index_mapping
    )
    # Drop the last column (presumably one one-hot level -- TODO confirm).
    features = features[:, :-1]

    # Widen the matrix with sin(.) and cos(.) of every column. A degree-2
    # polynomial expansion via data_preprocessing.build_poly is left disabled.
    augmented = np.concatenate(
        (features, np.sin(features), np.cos(features)), axis=1
    )

    return data_preprocessing.prepend_bias_column(augmented)

In [None]:
# Build the full sin/cos-augmented training matrix.
tx_train = transformation_pipeline_median(x_train)

# First reduction: these indices equal the size-55 result printed by the
# backward attribute selection run recorded in this notebook.
first_selection_attr = [0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 40, 41, 42, 44, 45, 47, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 94, 95, 96]
tx_train = tx_train[:, first_selection_attr]
# Second reduction: positions refer to the 55-column matrix produced above, not
# to the original columns (presumably taken from a later selection run whose
# recorded output is truncated -- TODO confirm). 41 columns remain.
tx_train = tx_train[:, [0, 1, 2, 3, 4, 6, 7, 8, 9, 12, 13, 15, 18, 19, 20, 22, 23, 24, 25, 27, 31, 33, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]]

In [36]:
import operator

def tune_lambda(y, x, grid, seed=42):
    """Pick the regularization strength with the best mean cross-validation score.

    For every candidate in ``grid`` the global NumPy RNG is re-seeded, a
    regularized logistic regression is evaluated with 5-fold
    ``validation.cross_validation``, and the mean of the first returned score
    array (presumably the per-fold accuracies -- TODO confirm) is printed and
    recorded.

    Args:
        y: label vector.
        x: feature matrix; its second dimension sizes the weight vector.
        grid: iterable of candidate lambda values.
        seed: value passed to ``np.random.seed`` before each candidate, so that
            every candidate is evaluated with the same random stream.

    Returns:
        The candidate with the highest recorded score (the earliest one on ties).

    Raises:
        ValueError: if ``grid`` contains no candidates.
    """
    n_features = x.shape[1]

    def make_trainer(lambda_):
        # Bind lambda_ explicitly instead of relying on the loop variable, and
        # give every fit its own zero vector so that an optimizer updating the
        # weights in place cannot leak state between folds or candidates.
        def train(y_, x_):
            w_init = np.zeros(n_features)
            return make_predictor(reg_logistic_regression_sgd(
                y_, x_, lambda_, w_init, 5, 1000, 0.5,
            )[0])
        return train

    scores = {}
    for lambda_ in grid:
        np.random.seed(seed)
        scores[lambda_] = validation.cross_validation(y, x, make_trainer(lambda_), 5)[0].mean()
        print(f"{lambda_}: {scores[lambda_]:.4f}")

    if not scores:
        raise ValueError("grid must contain at least one lambda value")
    # max() over the dict keeps insertion order, so ties go to the first candidate.
    return max(scores, key=scores.get)

In [7]:
def make_predictor(w):
    """Build a binary classifier from a weight vector.

    The returned callable labels a row 1 when its linear score
    ``features @ w`` is strictly positive and 0 otherwise.
    """
    def predict(features):
        scores = features @ w
        is_positive = scores > 0
        return is_positive.astype(int)

    return predict

In [8]:
def train_model(y, x):
    """Fit the regularized logistic regression and wrap it as a predictor.

    Training starts from an all-zero weight vector with lambda = 1e-5; the
    remaining settings (5, 1000, 0.5) are forwarded positionally to
    ``reg_logistic_regression_sgd``.

    Returns:
        The callable produced by ``make_predictor`` for the fitted weights.
    """
    initial_weights = np.zeros(x.shape[1])
    regularization = 1e-5
    fit_result = reg_logistic_regression_sgd(
        y, x, regularization, initial_weights, 5, 1000, 0.5)
    # The first element of the result is the weight vector handed to the predictor.
    fitted_weights = fit_result[0]
    return make_predictor(fitted_weights)

In [None]:
# 10-fold cross-validation of train_model on the selected features. The return
# value is assigned to `_` so that only the verbose summary is displayed.
_ = validation.cross_validation(y_train, tx_train, train_model, 10, verbose=True)

------ 10-fold cross validation results ------
    Accuracy: avg 0.8156, max 0.81852, min 0.81212, stddev 0.002433
    Fbeta score: avg 0.72166, max 0.73235, min 0.7091, stddev 0.0071623


In [None]:
# Sweep lambda over powers of ten from 1 down to 1e-12; the cell's value is the
# best-scoring candidate.
tune_lambda(y_train, tx_train, [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12])

1: 0.3429
0.1: 0.7464
0.01: 0.7926
0.001: 0.8122
0.0001: 0.8173
1e-05: 0.8180
1e-06: 0.8180
1e-07: 0.8180
1e-08: 0.8179
1e-09: 0.8179
1e-10: 0.8179
1e-11: 0.8179
1e-12: 0.8179


1e-05

In [None]:
# Backward attribute selection on tx_train, scoring each candidate subset with
# the mean cross-validation accuracy of train_model. The positional 20 is
# presumably the number of attributes to keep -- TODO confirm. The recorded
# output stops at size 55.
attribute_selection.backward_attribute_selector(
    y_train, 
    tx_train, 
    evaluators.cross_validation_mean_acc_evaluator(train_model),
    20,
    verbose=True)

 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 96: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.816472)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 95: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.816076)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 94: 0, 1, 2, 3,

 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 96: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.816472)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 95: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.816076)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 94: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.81616)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 93: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 22, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.81628)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 92: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 22, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.816204)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 91: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.815888)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 90: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.815732)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 89: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.815844)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 88: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.815968)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 87: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.816172)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 86: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.81604)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 85: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.815936)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 84: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 79, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96 (score 0.815892)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 83: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 79, 81, 82, 84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816144)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 82: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 79, 81, 82, 84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816476)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 81: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 79, 81, 82, 84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.815988)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 80: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 79, 81, 82, 84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816132)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 79: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 79, 81, 82, 84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816268)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 78: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816168)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 77: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816288)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 76: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 38, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816164)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 75: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816292)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 74: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 52, 53, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816296)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 73: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.8162)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 72: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96 (score 0.816536)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 71: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 93, 94, 95, 96 (score 0.816344)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 70: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 55, 57, 58, 59, 60, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 93, 94, 95, 96 (score 0.816724)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 69: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 27, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 57, 58, 59, 60, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 93, 94, 95, 96 (score 0.816496)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 68: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 57, 58, 59, 60, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 84, 86, 87, 88, 89, 91, 93, 94, 95, 96 (score 0.816308)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 67: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 19, 20, 22, 25, 26, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 57, 58, 59, 60, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 86, 87, 88, 89, 91, 93, 94, 95, 96 (score 0.816376)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 66: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 20, 22, 25, 26, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 57, 58, 59, 60, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 86, 87, 88, 89, 91, 93, 94, 95, 96 (score 0.816384)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 65: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 20, 22, 25, 26, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 53, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 86, 87, 88, 89, 91, 93, 94, 95, 96 (score 0.816332)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 64: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 20, 22, 25, 26, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 86, 87, 88, 89, 91, 93, 94, 95, 96 (score 0.816292)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 63: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 20, 22, 25, 26, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 86, 87, 88, 91, 93, 94, 95, 96 (score 0.816276)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 62: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 20, 22, 25, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 86, 87, 88, 91, 93, 94, 95, 96 (score 0.81638)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 61: 0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 28, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 86, 87, 88, 91, 93, 94, 95, 96 (score 0.81636)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 60: 0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 86, 87, 88, 91, 93, 94, 95, 96 (score 0.816424)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 59: 0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 93, 94, 95, 96 (score 0.816528)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 58: 0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 37, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 93, 94, 95, 96 (score 0.816624)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 57: 0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 93, 94, 95, 96 (score 0.816368)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 56: 0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 40, 41, 42, 44, 45, 47, 48, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 94, 95, 96 (score 0.81642)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 55: 0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 40, 41, 42, 44, 45, 47, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 94, 95, 96 (score 0.816328)

In [None]:
# Same call as the previous selection cell. Its recorded output starts at size
# 54, so this run operated on a tx_train that had already been reduced to 55
# columns. The positional 20 is presumably the number of attributes to keep
# (the cell's value is a list of 20 indices) -- TODO confirm.
attribute_selection.backward_attribute_selector(
    y_train, 
    tx_train, 
    evaluators.cross_validation_mean_acc_evaluator(train_model),
    20,
    verbose=True)

 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 54: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54 (score 0.816288)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 53: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54 (score 0.816396)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 52: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54 (score 0.816316)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 51: 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19,

[3, 4, 6, 9, 19, 20, 27, 37, 38, 39, 40, 42, 45, 46, 48, 50, 51, 52, 53, 54]

In [None]:
# Restrict tx_train to the 20 columns returned by the previous selection run.
# NOTE(review): the largest index is 54, so this requires a tx_train with at
# least 55 columns.
tx_train_sub = tx_train[:, [3, 4, 6, 9, 19, 20, 27, 37, 38, 39, 40, 42, 45, 46, 48, 50, 51, 52, 53, 54]]

# Continue the backward elimination on the 20-column subset; the recorded
# output ends with a single remaining attribute. Indices in that output are
# positions within tx_train_sub.
attribute_selection.backward_attribute_selector(
    y_train, 
    tx_train_sub, 
    evaluators.cross_validation_mean_acc_evaluator(train_model),
    1,
    verbose=True)

 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 19: 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 (score 0.812068)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 18: 0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 (score 0.811316)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 17: 0, 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19 (score 0.810528)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 16: 0, 2, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19 (score 0.8097)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 15: 0, 2, 3, 4, 5, 7, 9, 11, 12, 14, 15, 16, 17, 18, 19 (score 0.809308)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 14: 0, 2, 3, 4, 5, 7, 9, 11, 12, 14, 15, 17, 18, 19 (score 0.80796)
 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 13: 0, 2, 3, 4, 5, 7, 9, 12, 14, 15, 17, 18, 19 (score 0.806528)


  return true_positive_count / (true_positive_count + false_positive_count)


 --- BACKWARD ATTRIBUTE SELECTION: Best attribute indexes for size 1: 5 (score 0.66678)


[5]

In [33]:
def build_pairwise_plus(x, column_idx):
    """Append the pairwise sums of the selected columns to ``x``.

    For every unordered pair (i, j) with i < j of the columns listed in
    ``column_idx``, a new column ``x[:, i] + x[:, j]`` is appended. Pairs are
    emitted in lexicographic order: (0, 1), (0, 2), ..., (1, 2), ...

    Args:
        x: 2-D feature matrix, or a 1-D vector that is treated as one column.
        column_idx: indices of the columns of ``x`` to combine.

    Returns:
        A new array holding the columns of ``x`` followed by one column per
        pair. With fewer than two selected columns nothing is appended and the
        result is a copy of ``x``.
    """
    if x.ndim == 1:
        x = x[:, np.newaxis]

    columns = x[:, column_idx]
    # Upper-triangle indices (k=1 skips the diagonal) enumerate each pair exactly
    # once, including the pairs that involve the last selected column.
    left, right = np.triu_indices(columns.shape[1], k=1)
    pairwise = columns[:, left] + columns[:, right]
    # concatenate always allocates a new array, so the input is never aliased.
    return np.concatenate([x, pairwise], axis=1)

def transformation_pipeline_median_selected(x, col_to_index_mapping=col_to_index_mapping):
    """Run ``transformation_pipeline_median`` and keep the selected columns.

    The preprocessing itself is delegated to ``transformation_pipeline_median``
    rather than repeated here, so the two can never drift apart. The result is
    then narrowed in two stages with hard-coded index lists.

    Args:
        x: raw feature matrix; it is not modified.
        col_to_index_mapping: column name -> column index of ``x``, forwarded
            to ``transformation_pipeline_median``.

    Returns:
        The 41-column matrix left after both selection stages.
    """
    tx = transformation_pipeline_median(x, col_to_index_mapping)

    # Stage 1: column indices into the full augmented matrix.
    first_selection_attr = [0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 40, 41, 42, 44, 45, 47, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 94, 95, 96]
    # Stage 2: positions within the stage-1 matrix (not original column indices).
    second_selection_attr = [0, 1, 2, 3, 4, 6, 7, 8, 9, 12, 13, 15, 18, 19, 20, 22, 23, 24, 25, 27, 31, 33, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]

    tx = tx[:, first_selection_attr]
    return tx[:, second_selection_attr]

def transformation_pipeline_median_selected_pairwise(x, col_to_index_mapping=col_to_index_mapping):
    """Selected median-pipeline features expanded with pairwise features.

    The preprocessing is delegated to ``transformation_pipeline_median`` rather
    than repeated here. The result is narrowed in two stages with hard-coded
    index lists, expanded with ``data_preprocessing.build_pairwise`` over all
    remaining columns, and finally given a bias column.

    Args:
        x: raw feature matrix; it is not modified.
        col_to_index_mapping: column name -> column index of ``x``, forwarded
            to ``transformation_pipeline_median``.

    Returns:
        The expanded feature matrix with a bias column prepended.
    """
    tx = transformation_pipeline_median(x, col_to_index_mapping)

    # Stage 1: column indices into the full augmented matrix.
    first_selection_attr = [0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 20, 22, 25, 30, 31, 33, 34, 35, 36, 40, 41, 42, 44, 45, 47, 49, 50, 52, 57, 58, 59, 62, 63, 65, 66, 67, 68, 69, 72, 74, 75, 76, 79, 81, 82, 87, 88, 91, 94, 95, 96]
    # Stage 2: positions within the stage-1 matrix. Position 0 is left out here
    # (presumably the bias column added by the median pipeline -- TODO confirm),
    # so it does not take part in the pairwise expansion; a bias column is
    # prepended again at the end.
    second_selection_attr = [1, 2, 3, 4, 6, 7, 8, 9, 12, 13, 15, 18, 19, 20, 22, 23, 24, 25, 27, 31, 33, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]

    tx = tx[:, first_selection_attr]
    tx = tx[:, second_selection_attr]

    tx = data_preprocessing.build_pairwise(tx, list(range(tx.shape[1])))

    return data_preprocessing.prepend_bias_column(tx)

In [92]:
# Full pairwise-expanded training matrix.
tx_train_2 = transformation_pipeline_median_selected_pairwise(x_train)

# Keep 363 hard-coded columns of the expanded matrix. The list appears to be the
# pasted result of the `np.where(w > 1e-5)[0]` cell in this notebook (the visible
# part of that recorded output matches) -- TODO confirm before regenerating.
tx_train_2 = tx_train_2[:, [  4,   6,   7,   9,  10,  11,  13,  15,  16,  18,  19,  21,  28,
        42,  46,  49,  50,  54,  55,  56,  57,  61,  62,  63,  64,  70,
        72,  75,  80,  81,  84,  87,  88,  90,  98,  99, 100, 101, 102,
       103, 105, 106, 107, 108, 109, 110, 111, 113, 114, 115, 117, 119,
       120, 124, 126, 128, 130, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 145, 149, 151, 154, 156, 158, 159, 162, 165, 167, 168, 170,
       171, 173, 176, 177, 178, 179, 182, 183, 185, 186, 188, 190, 191,
       192, 193, 194, 196, 198, 200, 201, 203, 205, 206, 208, 211, 212,
       216, 218, 219, 224, 225, 226, 227, 231, 238, 239, 241, 243, 245,
       246, 247, 251, 252, 253, 254, 255, 258, 259, 260, 261, 263, 264,
       265, 266, 274, 275, 280, 281, 283, 284, 287, 288, 291, 292, 293,
       295, 296, 297, 298, 300, 301, 302, 303, 306, 307, 312, 316, 317,
       318, 319, 320, 322, 326, 329, 330, 331, 332, 333, 337, 339, 340,
       341, 342, 344, 345, 346, 347, 348, 352, 353, 355, 357, 361, 362,
       363, 366, 368, 369, 371, 372, 373, 374, 378, 381, 383, 384, 385,
       386, 387, 389, 391, 393, 394, 395, 398, 399, 400, 404, 405, 408,
       410, 411, 412, 418, 426, 433, 435, 441, 443, 444, 445, 446, 447,
       449, 450, 451, 454, 455, 456, 457, 458, 460, 466, 467, 470, 472,
       480, 482, 483, 484, 486, 488, 489, 490, 491, 493, 494, 496, 498,
       501, 502, 503, 504, 506, 507, 511, 513, 514, 515, 516, 519, 522,
       525, 526, 528, 530, 533, 534, 535, 542, 543, 544, 547, 548, 551,
       553, 554, 556, 557, 558, 559, 562, 563, 565, 567, 568, 569, 570,
       571, 573, 574, 577, 580, 581, 582, 583, 584, 585, 586, 587, 592,
       594, 596, 597, 598, 604, 605, 606, 607, 608, 609, 611, 613, 614,
       615, 617, 618, 619, 620, 623, 624, 625, 626, 627, 628, 631, 632,
       633, 641, 647, 649, 650, 652, 654, 658, 662, 663, 665, 666, 668,
       681, 682, 684, 685, 692, 694, 702, 708, 710, 716, 719, 726, 731,
       734, 748, 749, 752, 755, 757, 758, 759, 760, 761, 771, 774]]

In [93]:
# Sanity check of the matrix dimensions after the column selection.
print(tx_train_2.shape)

(250000, 363)


In [94]:
def train_model_pairwise(y, x):
  """Fit the regularized logistic-regression SGD model and return a predictor.

  Args:
    y: training labels, passed through unchanged to the optimizer.
    x: training feature matrix; `x.shape[1]` sets the length of the weight vector.

  Returns:
    Whatever `make_predictor` builds from the first element (the weights)
    returned by `reg_logistic_regression_sgd`.
  """
  n_features = x.shape[1]
  # Regularization strength 1e-5 and an all-zero starting point; the trailing
  # positional hyper-parameters (5, 1000, 0.5) are forwarded as-is.
  fit_result = reg_logistic_regression_sgd(
      y, x, 1e-5, np.zeros(n_features), 5, 1000, 0.5)
  weights = fit_result[0]
  return make_predictor(weights)

# 10-fold cross-validation of train_model_pairwise on tx_train_2. The return value is
# discarded (`_`); only the summary printed by verbose=True is used here.
_ = validation.cross_validation(y_train, tx_train_2, train_model_pairwise, 10, verbose=True)

------ 10-fold cross validation results ------
    Accuracy: avg 0.82856, max 0.83292, min 0.82388, stddev 0.002321
    Fbeta score: avg 0.7392, max 0.74837, min 0.73026, stddev 0.0055279


In [95]:
# Sweep the regularization strength over powers of ten from 1 down to 1e-10.
# One score per candidate is printed; in the recorded run the score never decreases as
# lambda shrinks. The value displayed last is presumably the best lambda — confirm
# against tune_lambda's definition.
tune_lambda(y_train, tx_train_2, [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10])

1: 0.4238
0.1: 0.7742
0.01: 0.8155
0.001: 0.8280
0.0001: 0.8314
1e-05: 0.8326
1e-06: 0.8329
1e-07: 0.8329
1e-08: 0.8332
1e-09: 0.8332
1e-10: 0.8333


1e-10

In [64]:
# Fit the lasso variant of logistic regression on tx_train_2 from an all-zero weight vector.
# NOTE(review): the positional arguments (.1, 20, 200, .01) are presumably penalty strength,
# iteration count, batch size and step size — confirm against lasso_logistic_regression_sgd.
w, loss = lasso_logistic_regression_sgd(y_train, tx_train_2, .1, np.zeros(tx_train_2.shape[1]), 20, 200, .01)

In [90]:
# Indices of the coefficients that are strictly greater than 1e-5.
# NOTE(review): this keeps only positive weights, so features with large negative weights
# are dropped. If the intent is "non-zero after shrinkage", np.abs(w) > 1e-5 is probably
# what is meant — confirm before reusing these indices.
np.where(w > 1e-5)[0]

array([  4,   6,   7,   9,  10,  11,  13,  15,  16,  18,  19,  21,  28,
        42,  46,  49,  50,  54,  55,  56,  57,  61,  62,  63,  64,  70,
        72,  75,  80,  81,  84,  87,  88,  90,  98,  99, 100, 101, 102,
       103, 105, 106, 107, 108, 109, 110, 111, 113, 114, 115, 117, 119,
       120, 124, 126, 128, 130, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 145, 149, 151, 154, 156, 158, 159, 162, 165, 167, 168, 170,
       171, 173, 176, 177, 178, 179, 182, 183, 185, 186, 188, 190, 191,
       192, 193, 194, 196, 198, 200, 201, 203, 205, 206, 208, 211, 212,
       216, 218, 219, 224, 225, 226, 227, 231, 238, 239, 241, 243, 245,
       246, 247, 251, 252, 253, 254, 255, 258, 259, 260, 261, 263, 264,
       265, 266, 274, 275, 280, 281, 283, 284, 287, 288, 291, 292, 293,
       295, 296, 297, 298, 300, 301, 302, 303, 306, 307, 312, 316, 317,
       318, 319, 320, 322, 326, 329, 330, 331, 332, 333, 337, 339, 340,
       341, 342, 344, 345, 346, 347, 348, 352, 353, 355, 357, 36

In [96]:
def create_submission(x_train, y_train, x_test, ids, train_model, file_name):
  """Train a model, report its training accuracy and optionally write a submission CSV.

  The user is asked interactively (via `input`) whether to continue; the file is
  written only when the answer, stripped and upper-cased, is exactly 'Y'.

  Args:
    x_train: training features handed to `train_model` and to the resulting predictor.
    y_train: training labels used for fitting and for the accuracy report.
    x_test: test features to label for the submission.
    ids: sample ids forwarded to `data_io.create_csv_submission`.
    train_model: callable `(y, x) -> predict`, where `predict(x)` returns labels.
    file_name: output name without extension; '.csv' is appended.

  Returns:
    None.
  """
  predict = train_model(y_train, x_train)
  train_acc = metrics.accuracy(y_train, predict(x_train)) * 100
  full_file_name = f'{file_name}.csv'
  # Fixed-point formatting: a bare '.3' precision falls back to scientific notation once
  # the value has three integer digits, so a 100 % score used to print as '1e+02 %'.
  prompt = (
      f'Train accuracy is {train_acc:.1f} %. \n'
      f'Do you want to continue and create submission `{full_file_name}`? [y/N]'
  )
  if input(prompt).strip().upper() != 'Y':
    return
  # Map predictions from {0, 1} back to the {-1, 1} encoding before writing them out.
  labels = predict(x_test) * 2 - 1
  data_io.create_csv_submission(ids, labels, full_file_name)

In [100]:
# Build the test design matrix with the same pipeline, then keep a hard-coded subset of
# its columns. The list below has 363 indices, which matches the 363 columns reported for
# tx_train_2, so train and test share the same feature layout.
# NOTE(review): this literal is duplicated elsewhere in the notebook; presumably it was
# pasted from a cell output — consider storing it once in a named variable.
tx_test = transformation_pipeline_median_selected_pairwise(x_test)
tx_test = tx_test[:, [  4,   6,   7,   9,  10,  11,  13,  15,  16,  18,  19,  21,  28,
        42,  46,  49,  50,  54,  55,  56,  57,  61,  62,  63,  64,  70,
        72,  75,  80,  81,  84,  87,  88,  90,  98,  99, 100, 101, 102,
       103, 105, 106, 107, 108, 109, 110, 111, 113, 114, 115, 117, 119,
       120, 124, 126, 128, 130, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 145, 149, 151, 154, 156, 158, 159, 162, 165, 167, 168, 170,
       171, 173, 176, 177, 178, 179, 182, 183, 185, 186, 188, 190, 191,
       192, 193, 194, 196, 198, 200, 201, 203, 205, 206, 208, 211, 212,
       216, 218, 219, 224, 225, 226, 227, 231, 238, 239, 241, 243, 245,
       246, 247, 251, 252, 253, 254, 255, 258, 259, 260, 261, 263, 264,
       265, 266, 274, 275, 280, 281, 283, 284, 287, 288, 291, 292, 293,
       295, 296, 297, 298, 300, 301, 302, 303, 306, 307, 312, 316, 317,
       318, 319, 320, 322, 326, 329, 330, 331, 332, 333, 337, 339, 340,
       341, 342, 344, 345, 346, 347, 348, 352, 353, 355, 357, 361, 362,
       363, 366, 368, 369, 371, 372, 373, 374, 378, 381, 383, 384, 385,
       386, 387, 389, 391, 393, 394, 395, 398, 399, 400, 404, 405, 408,
       410, 411, 412, 418, 426, 433, 435, 441, 443, 444, 445, 446, 447,
       449, 450, 451, 454, 455, 456, 457, 458, 460, 466, 467, 470, 472,
       480, 482, 483, 484, 486, 488, 489, 490, 491, 493, 494, 496, 498,
       501, 502, 503, 504, 506, 507, 511, 513, 514, 515, 516, 519, 522,
       525, 526, 528, 530, 533, 534, 535, 542, 543, 544, 547, 548, 551,
       553, 554, 556, 557, 558, 559, 562, 563, 565, 567, 568, 569, 570,
       571, 573, 574, 577, 580, 581, 582, 583, 584, 585, 586, 587, 592,
       594, 596, 597, 598, 604, 605, 606, 607, 608, 609, 611, 613, 614,
       615, 617, 618, 619, 620, 623, 624, 625, 626, 627, 628, 631, 632,
       633, 641, 647, 649, 650, 652, 654, 658, 662, 663, 665, 666, 668,
       681, 682, 684, 685, 692, 694, 702, 708, 710, 716, 719, 726, 731,
       734, 748, 749, 752, 755, 757, 758, 759, 760, 761, 771, 774]]

lambda_ = 1e-8

def train_model(y_, x_, lambda_=lambda_):
  """Fit the final regularized logistic-regression model and return a predictor.

  Args:
    y_: training labels.
    x_: training feature matrix; `x_.shape[1]` sets the length of the weight vector.
    lambda_: regularization strength. The default is captured when this cell runs,
      so reassigning the notebook-level `lambda_` later (e.g. by re-running another
      cell) cannot silently change the model used for the submission.

  Returns:
    Whatever `make_predictor` builds from the first element (the weights)
    returned by `reg_logistic_regression_sgd`.
  """
  weights = reg_logistic_regression_sgd(
      y_, x_, lambda_, np.zeros(x_.shape[1]), 20, 1000, 0.5)[0]
  return make_predictor(weights)

# Keyword arguments spell out which matrix is which: create_submission takes
# (x_train, y_train, ...) whereas train_model is called as (y, x).
create_submission(
    x_train=tx_train_2,
    y_train=y_train,
    x_test=tx_test,
    ids=ids_test,
    train_model=train_model,
    file_name='832accSub',
)

Train accuracy is 83.2 %. 
Do you want to continue and create submission `832accSub.csv`? [y/N]y
