In [1]:
import numpy as np
from proj1_helpers import *
from costs import *
from helpers import *
from implementations import *


In [6]:
## Read the raw training and test sets from disk
glob_y_tr, glob_tx_tr, ids_tr = load_csv_data("data/train.csv")
glob_y_te, glob_tx_te, ids_te = load_csv_data("data/test.csv")

## Shift the training labels from {-1, 1} onto {0, 1} to improve cost calculations.
## Only glob_y_tr is rescaled; glob_y_te is kept exactly as loaded.
glob_y_tr = (1 + glob_y_tr) / 2

## Repair the suspicious outlier entries in both feature matrices.
## NOTE: clean_tx is defined in a cell further down this notebook, so that cell
## has to be executed before this one on a fresh kernel.
glob_tx_tr, glob_tx_te = clean_tx(glob_tx_tr), clean_tx(glob_tx_te)

In [5]:
def ridge_trials(degree_range, lambda_range):
    """Grid-search ridge regression over polynomial degree and lambda.

    Reads the notebook globals ``glob_tx_tr``, ``glob_tx_te``, ``glob_y_tr``
    and ``glob_y_te``. One trial is run per (degree, lambda) pair, degrees in
    the outer loop, and a progress line is printed for each trial.

    Args:
        degree_range: indexable pair ``(start, stop)``; degrees are
            ``range(start, stop)``, so ``stop`` is excluded.
        lambda_range: indexable pair ``(lo, hi)`` of integer base-10
            exponents; lambdas are ``np.logspace(lo, hi, num=hi - lo)``, with
            both endpoints included.

    Returns:
        ``(weights_res, losses, preds)``: three lists with one entry per trial
        holding the fitted weights, the test loss and the predicted test
        labels, in trial order.
    """
    degrees = range(degree_range[0], degree_range[1])
    # NOTE(review): num = hi - lo with both endpoints included gives a
    # non-decade grid (e.g. 0.01, 0.215, 4.64, 100 for (-2, 2)); confirm that
    # hi - lo + 1 points were not intended.
    lambdas = np.logspace(lambda_range[0], lambda_range[1], num=(lambda_range[1]-lambda_range[0]))
    losses = []
    weights_res = []
    preds = []
    for degree in degrees:
        # The expanded features depend only on the degree, so build them once
        # per degree instead of once per (degree, lambda) pair.
        # Assumes the helpers below do not modify tx_tr / tx_te in place.
        tx_tr, tx_te = expand(degree, glob_tx_tr, glob_tx_te)
        for lambda_ in lambdas:
            print("Trying degree", degree,"with lambda =", lambda_,":")

            # The training loss returned by ridge_regression is not used here.
            weights, _ = ridge_regression(glob_y_tr, tx_tr, lambda_)
            y_pred = predict_labels(weights, tx_te)
            # NOTE(review): glob_y_tr is rescaled to {0, 1} in the loading
            # cell but glob_y_te is not; verify that compute_loss and
            # compute_accuracy are meaningful for glob_y_te as loaded.
            test_loss = compute_loss(glob_y_te, tx_te, weights)
            accuracy = compute_accuracy(y_pred, glob_y_te)

            print("\tTest Loss = ", test_loss, " Test Accuracy = ", accuracy )
            losses.append(test_loss)
            weights_res.append(weights)
            preds.append(y_pred)
    return weights_res, losses, preds
            
    
    #def logistic_GD(y, tx, initial_w, max_iters, gamma):
    
## Settings that logistic_trials forwards to logistic_GD
MAX_ITERS = 150  # passed as max_iters
GAMMA = 0.2  # passed as gamma (presumably the gradient step size)

def logistic_trials(degree_range):
    """Fit logistic regression once per polynomial degree and collect the results.

    Reads the notebook globals ``glob_tx_tr``, ``glob_tx_te``, ``glob_y_tr``,
    ``glob_y_te``, ``MAX_ITERS`` and ``GAMMA``. Every fit starts from an
    all-ones weight vector and prints its training loss, test loss and test
    accuracy.

    Args:
        degree_range: indexable pair ``(start, stop)``; degrees are
            ``range(start, stop)``, so ``stop`` is excluded.

    Returns:
        ``(weights_res, losses, preds)``: three lists with one entry per
        degree holding the fitted weights, the test loss and the predicted
        test labels.
    """
    fitted_weights, test_losses, predictions = [], [], []

    first_degree, stop_degree = degree_range[0], degree_range[1]
    for degree in range(first_degree, stop_degree):
        print("Trying degree", degree, ":")

        train_feats, test_feats = expand(degree, glob_tx_tr, glob_tx_te)

        # NOTE(review): the recorded run of this notebook reports NaN losses;
        # the all-ones starting point may saturate the sigmoid - to be
        # confirmed against logistic_GD.
        start_w = np.ones(train_feats.shape[1])
        w_fit, train_loss = logistic_GD(glob_y_tr, train_feats, start_w, MAX_ITERS, GAMMA)
        print("\tTraining Loss = ", train_loss)

        labels_hat = predict_labels(w_fit, test_feats)
        loss_te = compute_loss(glob_y_te, test_feats, w_fit, func="logistic")
        acc_te = compute_accuracy(labels_hat, glob_y_te)
        print("\tTest Loss = ", loss_te, " Test Accuracy = ", acc_te )

        test_losses.append(loss_te)
        fitted_weights.append(w_fit)
        predictions.append(labels_hat)

    return fitted_weights, test_losses, predictions

In [4]:
def expand(degree, tx_tr, tx_te):
    """Build standardized polynomial features for the training and test sets.

    Both matrices go through ``build_poly`` with the same ``degree``. The
    training expansion is standardized with ``standardize``, and the mean and
    std it returns are reused to scale the test expansion.

    A column whose training std is 0 cannot be scaled. Its entries are set to
    the constant 1 in both outputs: on the training side by replacing the
    NaNs left by the division, on the test side by overwriting the whole
    column. The test side needs the explicit column mask because a test value
    that differs from the training mean divides to +/-inf, which an
    ``isnan`` check alone would miss.

    Args:
        degree: polynomial degree forwarded to ``build_poly``.
        tx_tr: training feature matrix.
        tx_te: test feature matrix.

    Returns:
        ``(res_tr, res_te)``: the expanded and scaled training and test
        matrices.
    """
    ## Expand features to include polynomial terms
    res_tr = build_poly(tx_tr, degree)
    res_te = build_poly(tx_te, degree)

    ## Standardize; division by a zero std is expected and repaired below,
    ## so the numpy warnings it raises are silenced for these two steps only
    with np.errstate(divide="ignore", invalid="ignore"):
        res_tr, mean, std = standardize(res_tr)
        res_te = (res_te - mean) / std

    ## Fix NaNs resulting from division by 0
    res_tr[np.isnan(res_tr)] = 1

    ## On the test side also force every zero-std column to 1, which catches
    ## the +/-inf entries as well as the NaNs
    zero_std = np.broadcast_to(np.asarray(std) == 0, res_te.shape)
    res_te[np.isnan(res_te) | zero_std] = 1
    return res_tr, res_te

In [3]:
def clean_tx(tx):
    """Return a copy of ``tx`` with missing entries replaced by column means.

    Entries equal to -999 are treated as missing. Each missing entry, and any
    NaN already present, is replaced by the mean of the remaining values in
    its column. The input is never modified.

    Integer (or other non-float) input is promoted to float first, because
    NaN cannot be stored in an integer array. Float input keeps its dtype.

    A column with no valid entry at all has a NaN mean, so its entries stay
    NaN.

    Args:
        tx: 2-D feature matrix (samples x features).

    Returns:
        A new float array of the same shape as ``tx``.
    """
    result = np.copy(tx)
    if not np.issubdtype(result.dtype, np.floating):
        result = result.astype(np.float64)

    ## Convert -999 to NaN as we believe these are misidentified data
    ## Recording as NaN prevents them from influencing mean calculations
    result[result == -999] = np.nan

    ## Now replace each NaN index with the mean of its column
    means = np.nanmean(result, axis=0)
    nans = np.where(np.isnan(result))
    # nans[1] holds the column index of every missing entry
    result[nans] = np.take(means, nans[1])
    return result

In [7]:
## Sweep logistic regression over polynomial degrees 1 and 2 (the upper bound is exclusive).
## NOTE: the ridge cell further down rebinds trial_weights, trial_losses and preds.
trial_weights, trial_losses, preds =  logistic_trials((1,3))

Trying degree 1 :


  x = x/stds
  x = x/stds
  import sys
  import sys
  loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))


	Training Loss =  nan
	Test Loss =  nan  Test Accuracy =  0.28562855704827905
Trying degree 2 :
	Training Loss =  nan
	Test Loss =  nan  Test Accuracy =  0.2879744050908246


In [9]:
## Sweep ridge regression over degrees 1-3 and four log-spaced lambdas from 1e-2 to 1e2.
## NOTE: this rebinds trial_weights, trial_losses and preds, replacing the logistic results.
trial_weights, trial_losses, preds =  ridge_trials((1,4),(-2,2))

Trying degree 1 with lambda = 0.01 :


  x = x/stds
  x = x/stds
  import sys
  import sys


	Test Loss =  0.24338779667219024  Test Accuracy =  0.25286939627409644
Trying degree 1 with lambda = 0.21544346900318834 :
	Test Loss =  0.24338790090242854  Test Accuracy =  0.25286939627409644
Trying degree 1 with lambda = 4.6415888336127775 :
	Test Loss =  0.24339014411691806  Test Accuracy =  0.25285355784020075
Trying degree 1 with lambda = 100.0 :
	Test Loss =  0.24343858823298772  Test Accuracy =  0.2524980729905427
Trying degree 2 with lambda = 0.01 :
	Test Loss =  0.24947699442144283  Test Accuracy =  0.274054181522531
Trying degree 2 with lambda = 0.21544346900318834 :
	Test Loss =  0.2494769173210402  Test Accuracy =  0.27405242169654265
Trying degree 2 with lambda = 4.6415888336127775 :
	Test Loss =  0.24947525894170797  Test Accuracy =  0.2740471422185774
Trying degree 2 with lambda = 100.0 :
	Test Loss =  0.24944220066652417  Test Accuracy =  0.2733766485169946
Trying degree 3 with lambda = 0.01 :
	Test Loss =  0.25466230969845277  Test Accuracy =  0.2840376743547598
Try

In [10]:
## Keep the predictions of the trial with the smallest test loss.
## np.nanargmin ignores NaN losses (np.argmin would return the index of a NaN,
## i.e. select a diverged trial) and raises ValueError if every loss is NaN.
best = preds[np.nanargmin(trial_losses)]
best

array([-1., -1., -1., ..., -1.,  1., -1.])

In [11]:
## Score the selected predictions against glob_y_te.
## NOTE(review): confirm that data/test.csv carries real labels; if it does not,
## this figure is not a true accuracy.
compute_accuracy(best, glob_y_te)

0.25286939627409644

In [48]:
## Export the selected predictions together with the test ids
## (presumably written to prediction.csv in the working directory).
create_csv_submission(ids_te, best, "prediction.csv")