In [2]:
import numpy as np
from implementations import *
from cross_validation import *
from data_preprocessing import *
from proj1_helpers import *
import math

# Read the labelled CSV into targets, feature matrix and sample ids.
# NOTE(review): despite the variable's name, this path points at train.csv.
print('loading data' + "\n")
DATA_TEST_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TEST_PATH)
print('data loaded')

# Partition the samples into subsets and split the features and the targets
# with the same indices, so data_sets[k] and y_sets[k] stay aligned.
# presumably the grouping is by jet number — confirm in data_preprocessing.
jet_set = jet(tX)
inds = create_inds(jet_set, False)
data_sets = jet_split(tX, inds)
y_sets = split_y(y, inds)

loading data

data loaded


In [3]:
def replace_aberrant_values(tX, missing_value=-999):
    '''Return a copy of tX in which missing entries are replaced by the column mean.

    For every feature (column), the mean is computed over the entries that
    are different from `missing_value`, and every entry equal to
    `missing_value` in that column is overwritten with this mean.

    Parameters
    ----------
    tX : 2-D numpy array, one row per sample and one column per feature.
    missing_value : sentinel marking a missing measurement (default -999).

    Returns
    -------
    A new array with the same shape as tX; the input is left untouched.

    Notes
    -----
    A column made up exclusively of `missing_value` has no observed mean.
    It is filled with 0.0 instead of NaN, so that a fully missing feature
    does not propagate NaN into later computations.
    '''
    tX_repl_feat = np.copy(tX)

    for j in range(tX_repl_feat.shape[1]):
        # Basic slicing returns a view, so writes below land in tX_repl_feat.
        column = tX_repl_feat[:, j]
        mask = column == missing_value
        if not mask.any():
            continue

        observed = column[~mask]
        # Mean of an empty selection would be NaN; fall back to 0.0.
        column[mask] = observed.mean() if observed.size else 0.0

    return tX_repl_feat

In [4]:
def standardize(x):
    '''Center each feature of x to zero mean and scale it to unit standard deviation.

    Parameters
    ----------
    x : numpy array; statistics are taken along axis 0 (per column for 2-D input).

    Returns
    -------
    Array with the same shape as x. Features whose standard deviation is zero
    (constant columns) come out as all zeros rather than NaN.
    '''
    centered_data = x - np.mean(x, axis=0)
    std = np.std(centered_data, axis=0)

    # A constant feature has zero spread; dividing by 1 keeps its centered
    # zeros instead of producing 0/0 = NaN.
    safe_std = np.where(std == 0, 1.0, std)

    return centered_data / safe_std

In [5]:
def cross_validation(y, tX, lambda_, degree, ratio):
    '''Estimate ridge-regression train/test loss over repeated train/test splits.

    The data is split int(1 / (1 - ratio)) times with split_data; for each
    split a polynomial expansion of the given degree is built, a ridge model
    is fitted on the training part, and the loss is measured on both parts.

    Parameters
    ----------
    y : targets.
    tX : feature matrix.
    lambda_ : ridge regularisation strength passed to ridge_regression.
    degree : polynomial degree passed to build_poly.
    ratio : fraction of the data used for training, in [0, 1).

    Returns
    -------
    (mean test loss, mean train loss), averaged over all rounds.

    Raises
    ------
    ValueError if ratio is outside [0, 1).
    '''
    if not 0 <= ratio < 1:
        raise ValueError("ratio must be in [0, 1), got {}".format(ratio))

    n_rounds = int(1 / (1 - ratio))

    # The accumulators live outside the loop so the averages cover every
    # round; re-creating them per iteration would keep only the last one.
    trainlosses = []
    testlosses = []

    for _ in range(n_rounds):
        # NOTE(review): if split_data seeds its RNG with a fixed default,
        # every round sees the same split — confirm in its implementation.
        xtrain, ytrain, xtest, ytest = split_data(tX, y, ratio)

        data_set = build_poly(xtrain, degree)
        data_set_test = build_poly(xtest, degree)

        w, loss = ridge_regression(ytrain, data_set, lambda_)

        trainlosses.append(loss)
        testlosses.append(compute_loss(ytest, data_set_test, w))

    return np.mean(testlosses), np.mean(trainlosses)

In [6]:
# Clean and rescale every subset, overwriting the entries of data_sets:
# missing values are replaced by the column mean first, then each feature
# is standardized.
for i, subset in enumerate(data_sets):
    data_sets[i] = standardize(replace_aberrant_values(subset))
    print('hello')

hello
hello
hello


In [8]:
# Grid search over (lambda, degree). Each row appended to `matrix` holds
# [lambda, degree, test loss of subset 0, subset 1, subset 2].
# Row 0 is a placeholder of ones that only fixes the column count.
matrix = [[1, 1, 1, 1, 1]]
weights_ = []
losses = []

lambda_ = np.logspace(1, -8, num=10)
# NOTE(review): 9 integer-truncated points between 1 and 10 give
# 1, 2, ..., 8, 10 — degree 9 is skipped. Later cells index `matrix` by row
# number, so changing this grid shifts those rows.
degree = np.linspace(1, 10, num=9, dtype=int)

for lam in lambda_:
    for deg in degree:
        # One cross-validated test loss per subset; the train loss is ignored.
        for data_set, y_set in zip(data_sets, y_sets):
            test_error, train_error = cross_validation(y_set, data_set, lam, deg, 0.8)
            losses.append(test_error)

        row = [lam, deg, losses[0], losses[1], losses[2]]
        matrix = np.append(matrix, [row], axis=0)

        # Start the next (lambda, degree) pair with an empty list.
        losses = []

In [15]:
# Show the whole grid-search table. Columns: lambda, degree, then the test
# loss of each of the three subsets; row 0 is the placeholder row of ones.
matrix

array([[1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00],
       [1.00000000e+01, 1.00000000e+00, 4.60213654e-01, 4.84485972e-01,
        4.97396007e-01],
       [1.00000000e+01, 2.00000000e+00, 4.38356430e-01, 4.78644499e-01,
        4.96305243e-01],
       [1.00000000e+01, 3.00000000e+00, 4.51310401e-01, 4.85160443e-01,
        4.99200067e-01],
       [1.00000000e+01, 4.00000000e+00, 4.46135815e-01, 4.80169354e-01,
        4.99167001e-01],
       [1.00000000e+01, 5.00000000e+00, 8.16176045e-01, 4.88001276e-01,
        5.08116593e-01],
       [1.00000000e+01, 6.00000000e+00, 1.98788424e+01, 5.16062382e-01,
        5.37492869e+00],
       [1.00000000e+01, 7.00000000e+00, 4.86724105e-01, 4.90990453e-01,
        1.16630843e+00],
       [1.00000000e+01, 8.00000000e+00, 2.40819138e+00, 5.57784496e-01,
        5.64722620e-01],
       [1.00000000e+01, 1.00000000e+01, 7.39690604e+25, 1.38823239e+01,
        7.29855878e+00],
       [1.00000000e+00, 1.0000

In [12]:
# Row 47 (row 0 is the placeholder, 9 degrees per lambda): lambda = 1e-4, degree = 2.
matrix[47]

array([1.00000000e-04, 2.00000000e+00, 4.21682077e-01, 4.74182192e-01,
       4.95914865e-01])

In [13]:
# Row 20 (row 0 is the placeholder, 9 degrees per lambda): lambda = 0.1, degree = 2.
matrix[20]

array([0.1       , 2.        , 0.42176892, 0.47214813, 0.49712175])

In [14]:
# Row 56 (row 0 is the placeholder, 9 degrees per lambda): lambda = 1e-5, degree = 2.
matrix[56]

array([1.00000000e-05, 2.00000000e+00, 4.26361599e-01, 4.74731921e-01,
       4.95908475e-01])