In [54]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [55]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids, features = load_csv_data(DATA_TRAIN_PATH, sub_sample=True)

print(features)

['DER_mass_MMC' 'DER_mass_transverse_met_lep' 'DER_mass_vis' 'DER_pt_h'
 'DER_deltaeta_jet_jet' 'DER_mass_jet_jet' 'DER_prodeta_jet_jet'
 'DER_deltar_tau_lep' 'DER_pt_tot' 'DER_sum_pt' 'DER_pt_ratio_lep_tau'
 'DER_met_phi_centrality' 'DER_lep_eta_centrality' 'PRI_tau_pt'
 'PRI_tau_eta' 'PRI_tau_phi' 'PRI_lep_pt' 'PRI_lep_eta' 'PRI_lep_phi'
 'PRI_met' 'PRI_met_phi' 'PRI_met_sumet' 'PRI_jet_num'
 'PRI_jet_leading_pt' 'PRI_jet_leading_eta' 'PRI_jet_leading_phi'
 'PRI_jet_subleading_pt' 'PRI_jet_subleading_eta' 'PRI_jet_subleading_phi'
 'PRI_jet_all_pt']


## Feature Engineering

In [59]:
# removes features from the data from string input
# also removes wanted features from features list
# example
# tX, features = remove_features(['DER_mass_MMC','DER_mass_transverse_met_lep'], verbose=True)
def remove_features(feats, verbose=False):
    
    idx_to_remove = np.ones(len(feats))
    removed = []

    for i, feat in enumerate(feats):

        idx = np.where(features == feat)[0]
        if (len(idx) == 1) :
            idx_to_remove[i] = idx
            removed.append(feat)

    idx_to_remove = idx_to_remove.astype(np.int)

    if verbose:
        print("Features removed:", *removed, sep='\n')

    return np.delete(tX, idx_to_remove, 1), np.delete(features, idx_to_remove)

In [57]:
tX, features = remove_features(['DER_sum_pt','PRI_jet_all_pt','PRI_met_sumet','DER_pt_h'],verbose=True)

1
1
1
1
Features removed:
DER_sum_pt
PRI_jet_all_pt
PRI_met_sumet
DER_pt_h


## Do your thing crazy machine learning thing here :) ...

In [60]:
def tXToX(tX):
    X = np.c_[np.ones(tX.shape[0]), tX]

    X_safe = X
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            if X[i,j] < -990:
                X[i,j] = 0
    
    return X_safe

X = np.c_[np.ones(len(y)), tX]
n, d = X.shape

X_safe = tXToX(tX)

In [61]:
from implementations import *

In [62]:
w_mse, loss_mse = least_squares(y, X)

In [63]:
accuracy_mse = compute_accuracy(predict_labels(w_mse, X), y)
print(accuracy_mse)

0.7522


In [64]:
def ridge_classifier(lambda_):
    return ridge_regression(y, X, lambda_)

lambda_ridge, _, _ = find_max_hyperparam(ridge_classifier, [10**c for c in range(-3,3)])
print("Optimal lambda: %f" % lambda_ridge)

Testing hyperparameter value 0.001000 - loss: 0.327
Testing hyperparameter value 0.010000 - loss: 0.328
Testing hyperparameter value 0.100000 - loss: 0.335
Testing hyperparameter value 1.000000 - loss: 0.345
Testing hyperparameter value 10.000000 - loss: 0.350
Testing hyperparameter value 100.000000 - loss: 0.357
Optimal lambda: 0.001000


In [65]:
def ridge_train(y_train, X_train):
    return ridge_regression(y, X, lambda_ridge)

def ridge_test(X_test, w):
    return np.sign(X_test@w)

accuracy_ridge = cross_validate(y, X, ridge_train, ridge_test, 0.8, 100)
print(accuracy_ridge)

0.7515100000000001


In [66]:
def log_reg_train(y_train, X_train):
    return logistic_regression(y_train, X_train, 0.01*np.ones(X_train.shape[1]), 1000, verbose=False)

def log_reg_test(X_test, w):
    return np.sign(X_test@w)

accuracy_log_reg = cross_validate(y, X_safe, log_reg_train, log_reg_test, 0.7, 20)
print(accuracy_log_reg)

0.7427666666666668


In [67]:
def reg_log_reg_classifier(lambda_):
    return reg_logistic_regression(y, X_safe, lambda_, np.zeros(X_safe.shape[1]), 1000)

def log_reg_sparse_classifier(lambda_):
    return logistic_regression_sparse(y, X_safe, lambda_, np.zeros(X_safe.shape[1]), 1000)

def mse_sparse_classifier(lambda_):
    return least_squares_sparse(y, X_safe, lambda_, np.zeros(X_safe.shape[1]), 1000)

In [68]:
lambda_log_reg_l2, _, _ = find_max_hyperparam(reg_log_reg_classifier, [10**c for c in range(-3,3)])

Testing hyperparameter value 0.001000 - loss: 2536.553
Testing hyperparameter value 0.010000 - loss: 2541.668
Testing hyperparameter value 0.100000 - loss: 2538.592
Testing hyperparameter value 1.000000 - loss: 2502.761
Testing hyperparameter value 10.000000 - loss: 2549.846
Testing hyperparameter value 100.000000 - loss: 2535.697


In [69]:
lambda_log_reg_l1, _, _ = find_max_hyperparam(log_reg_sparse_classifier, [10**c for c in range(-3,3)])

Testing hyperparameter value 0.001000 - loss: 2535.763
Testing hyperparameter value 0.010000 - loss: 2536.310
Testing hyperparameter value 0.100000 - loss: 2536.681
Testing hyperparameter value 1.000000 - loss: 2548.953
Testing hyperparameter value 10.000000 - loss: 2544.463
Testing hyperparameter value 100.000000 - loss: 2609.466


In [70]:
lambda_mse_l2, _, _ = find_max_hyperparam(ridge_classifier, [10**c for c in range(-6,1)])

Testing hyperparameter value 0.000001 - loss: 0.327
Testing hyperparameter value 0.000010 - loss: 0.327
Testing hyperparameter value 0.000100 - loss: 0.327
Testing hyperparameter value 0.001000 - loss: 0.327
Testing hyperparameter value 0.010000 - loss: 0.328
Testing hyperparameter value 0.100000 - loss: 0.335
Testing hyperparameter value 1.000000 - loss: 0.345


In [71]:
lambda_mse_l1, w_mse_l1, _ = find_max_hyperparam(mse_sparse_classifier, [10**c for c in range(-6,1)])

Testing hyperparameter value 0.000001 - loss: 0.338
Testing hyperparameter value 0.000010 - loss: 0.340
Testing hyperparameter value 0.000100 - loss: 0.340
Testing hyperparameter value 0.001000 - loss: 0.339
Testing hyperparameter value 0.010000 - loss: 0.348
Testing hyperparameter value 0.100000 - loss: 0.359
Testing hyperparameter value 1.000000 - loss: 0.364


In [72]:
w_mse_l1, _ = least_squares_sparse(y, X_safe, 0.01, np.zeros(X_safe.shape[1]), 1000)
print("Non-zero weights: %i / %i" % (np.sum(w_mse_l1 != 0), len(w_mse_l1)))

Non-zero weights: 23 / 27


In [73]:
p = 1
ypred_kernel = kernel_predict(kernel_poly, y, X_safe, X_safe, p, lambda_=1)
print(compute_accuracy(ypred_kernel, y))

0.7548


## Generate predictions and save ouput in csv format for submission:

In [74]:
DATA_TEST_PATH = '../data/test.csv'
y_test, tX_test, ids_test, _ = load_csv_data(DATA_TEST_PATH)

In [78]:
X_test_safe = tXToX(tX_test)

In [79]:
compute_accuracy(np.sign(X_test_safe @ w_mse_l1), y_test)

0.25873489629345453

In [21]:
OUTPUT_PATH = '../results/predictions.csv'
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

NameError: name 'weights' is not defined