In [1]:
import numpy as np
from proj1_helpers import *
from costs import *
from helpers import *
from implementations import *


In [2]:
def ridge_trials(y, tx, tx_sub, degree_range, lambda_range, partitions=2):
    ## Split data into test and training sets
    ## If partitions > 2, use k-fold cross-validation
    glob_tx_tr, glob_tx_te, glob_y_tr, glob_y_te = split_data(tx, y, 0.8)

    ## Initial results: losses, weights, preditions and (test) losses
    models = []
    losses = []
    accuracies = []
    predictions = []
    
    ## Loops over range of degrees
    degrees = range(degree_range[0], degree_range[1])
    lambdas = np.logspace(lambda_range[0], lambda_range[1], num=1+(lambda_range[1]-lambda_range[0]))
    for degree in degrees:
        ## Loops over range of lambdas
        for lambda_ in lambdas:
            print("Trying degree", degree,"with lambda =", lambda_,":")

            tx_tr, tx_te, tx_pred = expand(degree, glob_tx_tr, glob_tx_te, tx_sub)

            w, loss = ridge_regression(glob_y_tr, tx_tr, lambda_)
            print("\tTraining Loss = ", loss)

            y_test = predict_labels(w, tx_te)
            test_loss = compute_loss(glob_y_te, tx_te, w)
            accuracy = compute_accuracy((y_test+1)/2, glob_y_te)
            y_pred = predict_labels(w, tx_pred)

            print("\tTest Loss = ", test_loss, " Test Accuracy = ", accuracy )
            models.append(("ridge_regression", degree, lambda_, w))
            losses.append(test_loss)
            accuracies.append(accuracy)
            predictions.append(y_pred)
    return models, losses, accuracies, predictions
    
MAX_ITERS = 100  
GAMMA = 0.6

## Performs logistic trials over set of hyper-parameters (degrees)
## Results result from these trials with corresponding test losses
def logistic_trials(y, tx, tx_sub, degree_range, partitions=2):
    ## Split data into test and training sets
    ## If partitions > 2, use k-fold cross-validation
    glob_tx_tr, glob_tx_te, glob_y_tr, glob_y_te = split_data(tx, y, 0.8)

    ## Initial results: losses, weights, preditions and (test) losses
    models = []
    losses = []
    accuracies = []
    predictions = []
    
    ## Loops over range of degrees
    degrees = range(degree_range[0], degree_range[1])
    for degree in degrees:
        print("Trying degree", degree, ":")

        tx_tr, tx_te, tx_pred = expand(degree, glob_tx_tr, glob_tx_te, tx_sub)        
        initial_w = np.ones(tx_tr.shape[1])
        
        w, loss = logistic_regression(glob_y_tr, tx_tr, initial_w, MAX_ITERS, GAMMA)
        print("\tTraining Loss = ", loss)
        
        y_test = predict_labels(w, tx_te)
        test_loss = compute_loss(glob_y_te, tx_te, w, func="logistic")
        accuracy = compute_accuracy((y_test+1)/2, glob_y_te)
        y_pred = predict_labels(w, tx_pred)

        print("\tTest Loss = ", test_loss, " Test Accuracy = ", accuracy )
        models.append(("logistic_SGD", degree, w))
        losses.append(test_loss)
        accuracies.append(accuracy)
        predictions.append(y_pred)
    return models, losses, accuracies, predictions

In [3]:
## Because expansion and standardization are transformations of our initial feature set
## We must apply identical transformations to all feature sets we wish to make predictions upon
def expand(degree, tx_tr, tx_te, tx_pred):
    ## Extract jet numbers as three indicator variables
    ## Remove them so they will not be standardized or expanded
    jets_tr = jet_nums(tx_tr)
    jets_te= jet_nums(tx_te)
    jets_pred= jet_nums(tx_pred)
    ## Remove redundant columns
    res_tr = extract_col(tx_tr)
    res_te = extract_col(tx_te)
    red_pred = extract_col(tx_pred)
    ## Expand features to include polynomial terms
    res_tr = build_poly(tx_tr, degree)
    res_te = build_poly(tx_te, degree)
    res_pred = build_poly(tx_pred, degree)
    ## Standardize
    res_tr, mean, std = standardize(res_tr)
    res_te = (res_te-mean)/std
    res_pred = (res_pred-mean)/std
    ## Fix NaNs resulting from division by 0
    res_tr[np.isnan(res_tr)]=1
    res_te[np.isnan(res_te)]=1
    res_pred[np.isnan(res_pred)]=1
    ## Reconcatenate jet indicator columns
    res_tr = np.c_[res_tr, jets_tr]
    res_te = np.c_[res_te, jets_te]
    res_pred = np.c_[res_pred, jets_pred]
    return res_tr, res_te, res_pred

In [4]:
def clean_tx(tx):
    result = np.copy(tx)
    ## Convert -999 to NaN as we believe these are misidentified data
    ## Recording as NaN prevents them from influencing nanmean calculations
    result[result==-999]=np.nan
    
    ## Now replace each NaN index with the mean of its column
    means = np.nanmean(result, axis=0)
    nans = np.where(np.isnan(result))
    result[nans] = np.take(means, nans[1])
    return result

In [5]:
## Jet number seems to be categorical, taking on three discrete values
## Relative values do not seem to have meaning, so coefficients are not a good way to treat this
## Solution: Split this into three indicator vectors. Each indicator takes a different coefficient
def jet_nums(tx):
    jets = tx[:,22]
    new_tx = np.delete(tx, 22, axis=1)
    jet0 = np.zeros((jets.shape[0],1))
    jet0[jets==0] = 1
    jet1 = np.zeros((jets.shape[0],1))
    jet1[jets==1] = 1
    jet2 = np.zeros((jets.shape[0],1))
    jet2[jets==2] = 1
    jet3 = np.zeros((jets.shape[0],1))
    jet3[jets==3] = 1
    result = np.c_[jet0, jet1, jet2, jet3]
    return result

def extract_col(tx):
    result = np.delete(tx, 22, axis=1)
    return result
    

In [6]:
## Load training sets
y, tx, ids = load_csv_data("data/train.csv")

## For training, let y exist on {0, 1} rather than {-1, 1} to improve cost calculations
y = (y + 1) / 2

## Fix issues with dataset involving suspect outliers
tx = clean_tx(tx)

## Load submission dataset
y_sub, tx_sub, ids_sub = load_csv_data("data/test.csv")
tx_sub = clean_tx(tx_sub)

In [7]:
models, losses, accuracies, preds =  logistic_trials(y, tx, tx_sub, (1,4))

Trying degree 1 :


  x = x/stds
  x = x/stds


	Training Loss =  3.845498660169477e-08
	Test Loss =  nan  Test Accuracy =  0.63436
Trying degree 2 :


  loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))


	Training Loss =  5.1733383021348905e-05
	Test Loss =  nan  Test Accuracy =  0.62944
Trying degree 3 :
	Training Loss =  7.320090438504591
	Test Loss =  nan  Test Accuracy =  0.6658


In [8]:
best_logsgd = preds[np.argmax(accuracies)]

In [9]:
best_logsgd

array([-1., -1., -1., ...,  1., -1., -1.])

In [10]:
rr_models, rr_losses, rr_accuracies, rr_preds =  ridge_trials(y, tx, tx_sub, (7,8),(-2,3))

Trying degree 7 with lambda = 0.01 :


  x = x/stds
  x = x/stds


	Training Loss =  0.0723559123960287
	Test Loss =  0.07498035939430843  Test Accuracy =  0.80082
Trying degree 7 with lambda = 0.1778279410038923 :
	Training Loss =  0.07339981408960228
	Test Loss =  0.07405314214841499  Test Accuracy =  0.79726
Trying degree 7 with lambda = 3.1622776601683795 :
	Training Loss =  0.07387292435457403
	Test Loss =  0.07433472669753109  Test Accuracy =  0.79602
Trying degree 7 with lambda = 56.23413251903491 :
	Training Loss =  0.07452479379977583
	Test Loss =  0.07490395262337601  Test Accuracy =  0.7934
Trying degree 7 with lambda = 1000.0 :
	Training Loss =  0.07614689821873045
	Test Loss =  0.07703775381349459  Test Accuracy =  0.78494


In [11]:
best_rr = rr_preds[np.argmax(rr_accuracies)]

array([0., 0., 0.])

In [None]:
create_csv_submission(ids_sub, best, "predictions.csv")