In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids, features = load_csv_data(DATA_TRAIN_PATH)

In [3]:
from implementations import *

In [4]:
print(features)

['DER_mass_MMC' 'DER_mass_transverse_met_lep' 'DER_mass_vis' 'DER_pt_h'
 'DER_deltaeta_jet_jet' 'DER_mass_jet_jet' 'DER_prodeta_jet_jet'
 'DER_deltar_tau_lep' 'DER_pt_tot' 'DER_sum_pt' 'DER_pt_ratio_lep_tau'
 'DER_met_phi_centrality' 'DER_lep_eta_centrality' 'PRI_tau_pt'
 'PRI_tau_eta' 'PRI_tau_phi' 'PRI_lep_pt' 'PRI_lep_eta' 'PRI_lep_phi'
 'PRI_met' 'PRI_met_phi' 'PRI_met_sumet' 'PRI_jet_num'
 'PRI_jet_leading_pt' 'PRI_jet_leading_eta' 'PRI_jet_leading_phi'
 'PRI_jet_subleading_pt' 'PRI_jet_subleading_eta' 'PRI_jet_subleading_phi'
 'PRI_jet_all_pt']


In [21]:
def sanitizeX(tX, features):
    X_safe, features_safe = remove_features(tX, features, 
                                            ['DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 
                                             'DER_prodeta_jet_jet', 'DER_lep_eta_centrality', 
                                             'PRI_jet_leading_pt', 'PRI_jet_leading_eta',
                                             'PRI_jet_leading_phi', 'PRI_jet_subleading_pt',
                                             'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi'], 
                                            verbose=True)
    
    for j in range(X_safe.shape[1]):
        for i in range(X_safe.shape[0]):
            if X_safe[i,j] < -990:
                mean = np.sum(X_safe[X_safe[:,j] > -990,j]) / np.sum(X_safe[:,j] > -990)
                X_safe[X_safe[:,j] < -990,j] = mean
                break
                
    X_safe = standardize(X_safe)
    
    return np.c_[np.ones(X_safe.shape[0]), X_safe], features_safe

## Do your thing crazy machine learning thing here :) ...

In [11]:
X_safe, features_safe = sanitizeX(tX, features)
n, d = X_safe.shape

Features removed:
DER_deltaeta_jet_jet
DER_mass_jet_jet
DER_prodeta_jet_jet
DER_lep_eta_centrality
PRI_jet_leading_pt
PRI_jet_leading_eta
PRI_jet_leading_phi
PRI_jet_subleading_pt
PRI_jet_subleading_eta
PRI_jet_subleading_phi


In [12]:
w_mse, loss_mse = least_squares(y, X_safe)

In [13]:
accuracy_mse = compute_accuracy(predict_labels(w_mse, X_safe), y)
print(accuracy_mse)

0.733812


In [22]:
def ridge_classifier(lambda_):
    return ridge_regression(y, X_safe, lambda_)

lambda_ridge, _, _ = find_max_hyperparam(ridge_classifier, [10**c for c in range(-3,3)])
print("Optimal lambda: %f" % lambda_ridge)

Testing hyperparameter value 0.001000 - loss: 0.353
Testing hyperparameter value 0.010000 - loss: 0.356
Testing hyperparameter value 0.100000 - loss: 0.365
Testing hyperparameter value 1.000000 - loss: 0.373
Testing hyperparameter value 10.000000 - loss: 0.377
Testing hyperparameter value 100.000000 - loss: 0.384
Optimal lambda: 0.001000


In [15]:
def ridge_train(y_train, X_train):
    return ridge_regression(y, X_safe, lambda_ridge)

def ridge_test(X_test, w):
    return np.sign(X_test@w)

accuracy_ridge = cross_validate(y, X_safe, ridge_train, ridge_test, 0.8, 100)
print(accuracy_ridge)

0.7337803999999999


In [16]:
def log_reg_train(y_train, X_train):
    return logistic_regression(y_train, X_train, 0.01*np.ones(X_train.shape[1]), 1000, verbose=False)

def log_reg_test(X_test, w):
    return np.sign(X_test@w)

accuracy_log_reg = cross_validate(y, X_safe, log_reg_train, log_reg_test, 0.7, 20)
print(accuracy_log_reg)

KeyboardInterrupt: 

In [None]:
def reg_log_reg_classifier(lambda_):
    return reg_logistic_regression(y, X_safe, lambda_, np.zeros(X_safe.shape[1]), 1000)

def log_reg_sparse_classifier(lambda_):
    return logistic_regression_sparse(y, X_safe, lambda_, np.zeros(X_safe.shape[1]), 1000)

def mse_sparse_classifier(lambda_):
    return least_squares_sparse(y, X_safe, lambda_, np.zeros(X_safe.shape[1]), 1000)

In [None]:
lambda_log_reg_l2, _, _ = find_max_hyperparam(reg_log_reg_classifier, [10**c for c in range(-3,3)])

In [None]:
lambda_log_reg_l1, _, _ = find_max_hyperparam(log_reg_sparse_classifier, [10**c for c in range(-3,3)])

In [23]:
lambda_mse_l2, w_mse_l2, _ = find_max_hyperparam(ridge_classifier, [10**c for c in range(-6,1)])

Testing hyperparameter value 0.000001 - loss: 0.353
Testing hyperparameter value 0.000010 - loss: 0.353
Testing hyperparameter value 0.000100 - loss: 0.353
Testing hyperparameter value 0.001000 - loss: 0.353
Testing hyperparameter value 0.010000 - loss: 0.356
Testing hyperparameter value 0.100000 - loss: 0.365
Testing hyperparameter value 1.000000 - loss: 0.373


In [None]:
lambda_mse_l1, w_mse_l1, _ = find_max_hyperparam(mse_sparse_classifier, [10**c for c in range(-6,1)])

In [None]:
#w_mse_l1, _ = least_squares_sparse(y, X_safe, 0.1, np.zeros(X_safe.shape[1]), 1000)
print("Non-zero weights: %i / %i" % (np.sum(w_mse_l1 != 0), len(w_mse_l1)))

print(compute_accuracy(np.sign(X_safe@w_mse_l2), y))

In [None]:
p = 1
ypred_kernel = kernel_predict(kernel_RBF, y, X_safe, X_safe, p, lambda_=1)
print(compute_accuracy(ypred_kernel, y))

1e-06

## Generate predictions and save ouput in csv format for submission:

In [24]:
DATA_TEST_PATH = '../data/test.csv'
y_test, tX_test, ids_test, features_test = load_csv_data(DATA_TEST_PATH)

In [25]:
X_test_safe, features_test_safe = sanitizeX(tX_test, features_test)

Features removed:
DER_deltaeta_jet_jet
DER_mass_jet_jet
DER_prodeta_jet_jet
DER_lep_eta_centrality
PRI_jet_leading_pt
PRI_jet_leading_eta
PRI_jet_leading_phi
PRI_jet_subleading_pt
PRI_jet_subleading_eta
PRI_jet_subleading_phi


In [None]:
#compute_accuracy(np.sign(X_test_safe @ w_mse_l2), y_test)

In [26]:
OUTPUT_PATH = '../results/predictions_mean_clean_2.csv'
y_pred = predict_labels(w_mse_l2, X_test_safe)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)