In [1]:
import numpy as np
from implementations import *
from cross_validation import *
from data_preprocessing import *
from proj1_helpers import *
from costs import *
import math

# Initialisation

In [2]:
print('loading training data'+"\n")
DATA_TEST_PATH = '../data/train.csv'
y,tX,ids = load_csv_data(DATA_TEST_PATH)
print('training data loaded'+"\n")


loading training data

training data loaded



# Least squares Gradient Descent implementation

In [3]:
jet_tX = jet(tX)

means = []
devs = []
degree = [1, 1, 1]
# cleans -999 and standardizes
for i in range(len(jet_tX)):
    print("Subset: " + str(i))
    # preprocess every train subset
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    acc, testloss, trainloss, weights = cross_validation_for_GD(y[jet_tX[i]], preprocessed_tX, degree[i])

Subset: 0
test error: 0.49993509424190474
train error: 0.4999357248800381
accuracy: 0.7843959563607246
Subset: 1
test error: 0.4999602518791192
train error: 0.49996061263993086
accuracy: 0.6949316481815837
Subset: 2
test error: 0.4999362682649859
train error: 0.49993685599111737
accuracy: 0.6948717948717948


# Least squares Stochastic Gradient Descent implementation

In [4]:
jet_tX = jet(tX)

means = []
devs = []
degree = [1, 1, 1]
# cleans -999 and standardizes
for i in range(len(jet_tX)):
    print("Subset: " + str(i))
    # preprocess every train subset
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    acc, testloss, trainloss, weights = cross_validation_for_GD(y[jet_tX[i]], preprocessed_tX, degree[i], stoch = True)

Subset: 0
test error: 0.4999545349554637
train error: 0.49989188664237744
accuracy: 0.7312080872785508
Subset: 1
test error: 0.4999759461830184
train error: 0.49997919991484263
accuracy: 0.6701057518700027
Subset: 2
test error: 0.4999471532164
train error: 0.499888164400482
accuracy: 0.6818169285911222


# Least squares using normal equations

In [5]:
jet_tX = jet(tX)

means = []
devs = []
degrees = [4,4,11]
# cleans -999 and standardizes
for i in range(len(jet_tX)):
    # preprocess every train subset
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    print("subset: " + str(i))

    acc, testloss, trainloss, weights = cross_validation_for_leastsquares(y[jet_tX[i]], preprocessed_tX, degrees[i])

subset: 0
test error: 34179422.74176813
train error: 0.2410241456261793
accuracy: 0.8325292763487138
subset: 1
test error: 0.3345579466053833
train error: 0.3221744551437434
accuracy: 0.775960794428682
subset: 2
test error: 90222.0608500605
train error: 0.5801349865593991
accuracy: 0.7769644334160463


# Ridge Regression

In [6]:
jet_tX = jet(tX)

lams= [1e-4,0.001,1e-5]

degs = 12*np.ones(3).astype(int)
losses=[]

for i in range(len(jet_tX)):
    # preprocess every train subset
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    print("Subset: " + str(i))
    acc, testloss, trainloss, weights = cross_validation_ridge(y[jet_tX[i]], preprocessed_tX,lams[i],degs[i])




Subset: 0
test error: 1.310991676774038e+35
train error: 0.22984703002509174
accuracy: 0.8435191672505254
Subset: 1
test error: 70208.33203895975
train error: 0.2874576360086217
accuracy: 0.8047201444415786
Subset: 2
test error: 8420811.02692854
train error: 0.25997575075353124
accuracy: 0.8294182519988972


# Logistic Regression implementation

In [6]:
jet_tX = jet(tX)

means = []
devs = []
degree = [1, 1, 1]
# cleans -999 and standardizes
for i in range(len(jet_tX)):
    # preprocess every train subset
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    print("Subset: " + str(i))
    acc, testloss, trainloss, weights = cross_validation_for_logistic(y[jet_tX[i]], preprocessed_tX, degree[i])

Subset: 0
test error: 0.607156524739386
train error: 35868.314344267805
accuracy: 0.8142728455610049
Subset: 1
test error: 0.49150845643121277
train error: 35341.373236731735
accuracy: 0.7057518700025793
Subset: 2
test error: 0.5511015373801456
train error: 32326.434123875497
accuracy: 0.7099255583126551


# Regularised logistic regression implementation

## Grid Search

In [8]:
#grid search for best lambdas
jet_tX = jet(tX)

means = []
devs = []
degree = [1,1,1]
# cleans -999 and standardizes
accs=[]
lambdas=np.logspace(0,3,num=20)
for lambda_ in lambdas:
    for i in range(len(jet_tX)):
        # preprocess every train subset
        preprocessed_tX = preprocess_data(tX[jet_tX[i]])
        acc, testloss, trainloss, weights = cross_validation_for_reglogistic(y[jet_tX[i]], preprocessed_tX,lambda_, degree[i])
        accs.append(acc)
accs=np.asarray(accs)
accs.reshape(-1,3)
print(min(accs,axis=0))

TypeError: 'axis' is an invalid keyword argument for min()

In [9]:
accsme=np.mean(accs,axis=1)
accsme=accsme.reshape(-1,3)
goodlambdas=[lambdas[np.argmax(accsme[:,0])], lambdas[np.argmax(accsme[:,1])], lambdas[np.argmax(accsme[:,2])]]
print(goodlambdas)

[233.57214690901213, 2.976351441631318, 78.47599703514611]


## After grid search

In [11]:
#function called with the bestlambdas found in previous step
jet_tX = jet(tX)

goodlambdas= [233.57214690901213, 2.9763514416313179, 78.475997035146108]

means = []
devs = []
degree = [1,1,1]
# cleans -999 and standardizes
accs=[]

lambdas=goodlambdas

for i in range(len(jet_tX)):
    print("Subset: " + str(i))
    # preprocess every train subset
    preprocessed_tX = preprocess_data(tX[jet_tX[i]])
    acc, testloss, trainloss, weights = cross_validation_for_reglogistic(y[jet_tX[i]], preprocessed_tX,lambdas[i], degree[i])
    


Subset: 0
test error: 0.643346313135406
train error: 36149.18962227507
accuracy: 0.8152737463717346
Subset: 1
test error: 0.49151228278468817
train error: 35342.92285417614
accuracy: 0.7057389734330668
Subset: 2
test error: 0.5511914494093625
train error: 32381.416439138622
accuracy: 0.7106699751861042


## Generate predictions and save ouput in csv format for submission:

In [94]:
print(losses)

[0.8631864728562834, 0.4832607028424577, 0.49314131426161195, 0.8631873777247719, 0.4832607030257133, 0.49314131431398334, 0.8631842263651718, 0.48326070330115534, 0.4931413144520695, 0.8631856999364111, 0.4832607022911984, 0.49314131481615386, 0.8631836416254774, 0.4832607028340903, 0.4931413157761151, 0.863187178550533, 0.4832607026229851, 0.4931413183071827, 0.8631850475996731, 0.48326070391096154, 0.4931413249806259, 0.863182907667257, 0.48326070396344206, 0.49314134257547887, 0.8631828153865209, 0.48326070720978426, 0.49314138896218335, 0.8631876272593108, 0.483260714985934, 0.4931415112345318, 0.8631830796997476, 0.48326073463548824, 0.49314183339374673, 0.86318573481566, 0.4832607863565696, 0.493142681223469, 0.863184120470095, 0.4832609234871037, 0.4931449057555667, 0.8631831628658986, 0.4832612841797311, 0.4931506982039143, 0.8631892409904786, 0.4832622362335288, 0.49316551134594866, 0.8631867393985223, 0.48326474540578473, 0.493202016571479, 0.863193392293753, 0.4832713548082

In [64]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [75]:
OUTPUT_PATH = '../data/submission_splitt.csv'
create_csv_submission(ids_test, y_preds, OUTPUT_PATH)

In [65]:
#polynomial expansion of test data
#tX_testpol=build_poly(tX_test,degree)

OUTPUT_PATH = '../data/submission.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(w, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)