In [2]:
import numpy as np
from implementations import *
from cross_validation import *
from data_preprocessing import *
from proj1_helpers import *
from costs import *
import math

# Initialisation

In [3]:
# Load the labelled training set; y and tX are reused by the model cells below.
print('loading training data'+"\n")
# Training-set path. Kept under its own name so it is not confused with
# DATA_TEST_PATH, which the submission section sets to the test file.
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
print('training data loaded'+"\n")


loading training data

training data loaded



# Least squares Gradient Descent implementation

In [4]:
# Cross-validate least-squares gradient descent separately on every subset
# returned by jet(tX).
jet_tX = jet(tX)

# One `degree` argument per subset (presumably the polynomial expansion
# degree used by the cross-validation helper — confirm in cross_validation).
degree = [1, 1, 1]

for i in range(len(jet_tX)):
    print("Subset: " + str(i))
    # Preprocess only the rows of this subset; per the original note,
    # preprocess_data cleans the -999 values and standardizes.
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    # NOTE: acc/testloss/trainloss/weights are overwritten on every iteration,
    # so only the last subset's values remain after the loop.
    acc, testloss, trainloss, weights = cross_validation_for_GD(y[jet_tX[i]], preprocessed_tX, degree[i])

Subset: 0
test error: 0.499935094242
train error: 0.49993572488
accuracy: 0.784395956361
Subset: 1
test error: 0.499960251879
train error: 0.49996061264
accuracy: 0.694931648182
Subset: 2
test error: 0.499936268265
train error: 0.499936855991
accuracy: 0.694871794872


# Least squares Stochastic Gradient Descent implementation

In [5]:
# Cross-validate least-squares *stochastic* gradient descent (stoch=True)
# separately on every subset returned by jet(tX).
jet_tX = jet(tX)

# One `degree` argument per subset (presumably the polynomial expansion
# degree used by the cross-validation helper — confirm in cross_validation).
degree = [1, 1, 1]

for i in range(len(jet_tX)):
    print("Subset: " + str(i))
    # Preprocess only the rows of this subset; per the original note,
    # preprocess_data cleans the -999 values and standardizes.
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    # NOTE: the result variables are overwritten on every iteration, so only
    # the last subset's values remain after the loop.
    acc, testloss, trainloss, weights = cross_validation_for_GD(y[jet_tX[i]], preprocessed_tX, degree[i], stoch = True)

Subset: 0
test error: 0.499954534955
train error: 0.499891886642
accuracy: 0.731208087279
Subset: 1
test error: 0.499975946183
train error: 0.499979199915
accuracy: 0.67010575187
Subset: 2
test error: 0.499947153216
train error: 0.4998881644
accuracy: 0.681816928591


# Least squares using normal equations

In [6]:
# Cross-validate least squares (normal equations) separately on every subset
# returned by jet(tX).
jet_tX = jet(tX)

# One `degree` argument per subset (presumably the polynomial expansion
# degree used by the cross-validation helper — confirm in cross_validation).
degrees = [4, 4, 11]

for i in range(len(jet_tX)):
    # Preprocess only the rows of this subset; per the original note,
    # preprocess_data cleans the -999 values and standardizes.
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    print("subset: " + str(i))

    # NOTE: the result variables are overwritten on every iteration, so only
    # the last subset's values remain after the loop.
    acc, testloss, trainloss, weights = cross_validation_for_leastsquares(y[jet_tX[i]], preprocessed_tX, degrees[i])

subset: 0
test error: 34179422.732
train error: 0.241024145617
accuracy: 0.832529276349
subset: 1
test error: 0.334557967773
train error: 0.322174455215
accuracy: 0.775973690998
subset: 2
test error: 93691.61798
train error: 0.328329868807
accuracy: 0.782271850014


# Ridge Regression

In [22]:
# Cross-validate ridge regression separately on every subset returned by
# jet(tX), with one regularisation strength per subset.
jet_tX = jet(tX)

# Regularisation strength passed to cross_validation_ridge, one per subset.
lams = [1e-4, 0.001, 1e-5]

# The same `degree` argument (12) for every subset, stored as an integer array
# with one entry per lambda.
degs = np.full(len(lams), 12, dtype=int)

for i in range(len(jet_tX)):
    # Preprocess only the rows of this subset.
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    print("Subset: " + str(i))
    # NOTE: the result variables are overwritten on every iteration, so only
    # the last subset's values remain after the loop.
    acc, testloss, trainloss, weights = cross_validation_ridge(y[jet_tX[i]], preprocessed_tX, lams[i], degs[i])




Subset: 0
test error: 6.42360649512e+38
train error: 0.230647760947
accuracy: 0.84288859974
Subset: 1
test error: 56838.9501348
train error: 0.287514196408
accuracy: 0.804526695899
Subset: 2
test error: 9114959.92082
train error: 0.260537970987
accuracy: 0.829432037497


# Logistic Regression implementation

In [21]:
# Cross-validate logistic regression separately on every subset returned by
# jet(tX).
jet_tX = jet(tX)

# One `degree` argument per subset (presumably the polynomial expansion
# degree used by the cross-validation helper — confirm in cross_validation).
degree = [1, 1, 1]

for i in range(len(jet_tX)):
    # Preprocess only the rows of this subset; per the original note,
    # preprocess_data cleans the -999 values and standardizes.
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    print("Subset: " + str(i))
    # NOTE: the result variables are overwritten on every iteration, so only
    # the last subset's values remain after the loop.
    acc, testloss, trainloss, weights = cross_validation_for_logistic(y[jet_tX[i]], preprocessed_tX, degree[i])

Subset: 0
test error: 0.607156524739
train error: 35868.3143443
accuracy: 0.814272845561
Subset: 1
test error: 0.491508456431
train error: 35341.3732367
accuracy: 0.705751870003
Subset: 2
test error: 0.55110153738
train error: 32326.4341239
accuracy: 0.709925558313


# Regularised logistic regression implementation

Here, we perform a grid search for the best lambda of each independent model using regularised logistic regression. The grid-search cell below takes a few minutes to run; the lambdas it found are already hard-coded in the "After grid search" cell, so the search can be skipped.

In [10]:
# Grid search for the best lambda of each subset (regularised logistic regression).
jet_tX = jet(tX)

# One `degree` argument per subset (presumably the polynomial expansion
# degree used by the cross-validation helper — confirm in cross_validation).
degree = [1, 1, 1]

# Accuracies are appended in (lambda, subset) order: all subsets for the first
# lambda, then all subsets for the second lambda, and so on.
accs = []
# 20 candidate lambdas, log-spaced from 1e0 to 1e3.
lambdas = np.logspace(0, 3, num=20)
for lambda_ in lambdas:
    print("lambda=",lambda_)
    for i in range(len(jet_tX)):
        # Preprocess only the rows of this subset; per the original note,
        # preprocess_data cleans the -999 values and standardizes.
        # NOTE(review): this is recomputed for every lambda; it could be hoisted
        # out of the lambda loop if preprocess_data is deterministic and the
        # cross-validation helper does not modify its input — confirm first.
        preprocessed_tX = preprocess_data(tX[jet_tX[i]])
        acc, testloss, trainloss, weights = cross_validation_for_reglogistic(y[jet_tX[i]], preprocessed_tX, lambda_, degree[i])
        accs.append(acc)

lambda= 1.0
lambda= 1.43844988829
lambda= 2.06913808111
lambda= 2.97635144163
lambda= 4.28133239872
lambda= 6.15848211066
lambda= 8.8586679041
lambda= 12.742749857
lambda= 18.3298071083
lambda= 26.3665089873
lambda= 37.9269019073
lambda= 54.5559478117
lambda= 78.4759970351
lambda= 112.883789168
lambda= 162.377673919
lambda= 233.572146909
lambda= 335.981828628
lambda= 483.293023857
lambda= 695.192796178
lambda= 1000.0
0.687425844725


We pick, for each subset, the lambda that gives the highest accuracy:

In [17]:
# `accs` is a flat Python list appended in (lambda, subset) order, and a list
# has no .reshape(); convert it to an array first. After reshaping, each row
# corresponds to one lambda and each column to one subset.
accsme = np.asarray(accs).reshape(-1, len(jet_tX))
# For every subset (column), take the row with the highest accuracy and map
# that row index back to the lambda that produced it.
goodlambdas = [lambdas[best_row] for best_row in np.argmax(accsme, axis=0)]
print(goodlambdas)

[233.57214690901213, 2.9763514416313179, 78.475997035146108]


## After grid search

In [20]:
# Cross-validate regularised logistic regression with the best lambda found
# for each subset in the grid search.
jet_tX = jet(tX)

# Hard-coded grid-search result, so this cell does not require re-running the
# slow search. The grid-search variables `lambdas` and `accs` are deliberately
# left untouched so the selection cell can still be re-run afterwards.
goodlambdas = [233.57214690901213, 2.9763514416313179, 78.475997035146108]

# One `degree` argument per subset (presumably the polynomial expansion
# degree used by the cross-validation helper — confirm in cross_validation).
degree = [1, 1, 1]

for i in range(len(jet_tX)):
    print("Subset: " + str(i))
    # Preprocess only the rows of this subset; per the original note,
    # preprocess_data cleans the -999 values and standardizes.
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    acc, testloss, trainloss, weights = cross_validation_for_reglogistic(y[jet_tX[i]], preprocessed_tX, goodlambdas[i], degree[i])
    print("test error:", testloss)
    print("train error:", trainloss)
    print("accuracy:", acc)


Subset: 0
test error: 0.643346313135
train error: 36149.1896223
accuracy: 0.815273746372
Subset: 1
test error: 0.491512282785
train error: 35342.9228542
accuracy: 0.705738973433
Subset: 2
test error: 0.551191449409
train error: 32381.4164391
accuracy: 0.710669975186


## Generate predictions and save output in csv format for submission:

In [64]:
# Load the test set for the submission; the first value returned by
# load_csv_data is discarded here.
DATA_TEST_PATH = '../data/test.csv' # path to the test-set CSV (not the training file)
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [75]:
# Write the predictions to a submission CSV.
# NOTE(review): `y_preds` is not assigned in any visible cell, so unless one of
# the star imports provides it this cell raises NameError on a fresh kernel —
# confirm where these predictions are computed and restore that cell.
OUTPUT_PATH = '../data/submission_splitt.csv'
create_csv_submission(ids_test, y_preds, OUTPUT_PATH)

In [65]:
# Predict labels for the test set and write them to a submission CSV.
# The polynomial expansion of the test data below is currently disabled:
#tX_testpol=build_poly(tX_test,degree)

# NOTE(review): `w` is not assigned in any visible cell (the model cells store
# their result in `weights`), so this cell presumably fails on a fresh kernel —
# confirm which weights are meant.
# NOTE(review): tX_test is passed as loaded, without the jet()/preprocess_data
# steps applied to the training data above — verify this is intended.
OUTPUT_PATH = '../data/submission.csv' # output file for this submission
y_pred = predict_labels(w, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)