In [22]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *
from data_cleaning import *
import implementations as imp
import plots
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

COLUMN_TO_DROP = 22

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import data

In [23]:
y_train, x_train, ids_train = load_csv_data("../data/train.csv")
y_test, x_test, ids_test = load_csv_data("../data/test.csv")

In [24]:
x_train.shape

(250000, 30)

# Clean data and add features

In [25]:
NUM_JETS = 4

PRI_jet_num_train = np.array([x_train[:, COLUMN_TO_DROP]]).astype(int)
print(PRI_jet_num_train)
del_x_train = np.delete(x_train, COLUMN_TO_DROP, axis=1)
print(del_x_train.shape)

replaced_x_train = replace_undefined_with_mean(del_x_train, UNDEFINED_VALUE)

norm_x_train, train_data_mean, train_data_std = mean_std_normalization(replaced_x_train)

print(norm_x_train[0][0])
print(norm_x_train.shape)

[[2 1 1 ... 1 0 0]]
(250000, 29)
0.4736512480675119
(250000, 29)


# Do the same for the test data

In [26]:
PRI_jet_num_test = np.array([x_test[:, COLUMN_TO_DROP]]).astype(int)
print(PRI_jet_num_test)
print(PRI_jet_num_test.shape)
del_x_test = np.delete(x_test, COLUMN_TO_DROP, axis=1)
print(del_x_test.shape)

replaced_x_test = replace_undefined_with_mean(del_x_test, UNDEFINED_VALUE)

norm_x_test, test_data_mean, test_data_std = mean_std_normalization(replaced_x_test, train_data_mean, train_data_std)
print(norm_x_test[0][0])
print(norm_x_test.shape)

[[0 1 0 ... 0 1 0]]
(1, 568238)
(568238, 29)
46.72265694612496
(568238, 29)


# Make and train model
### GD

In [45]:
gamma = 0.1
max_iters = 100
#_, initial_w = imp.least_squares(y_train, norm_x_train)
initial_w = np.zeros(norm_x_train.shape[1], dtype=np.float64)
seed = 1
ratio = 0.5
(tr_x, tr_y, te_x,te_y) = split_data(norm_x_train, y_train, ratio, seed)
gd_loss, gd_weights = imp.least_squares_SGD(tr_y, tr_x, initial_w, max_iters, gamma)
gd_loss

Iteration 0 loss=1.0
Iteration 1 loss=1.4885054938929285
Iteration 2 loss=2.215648605349431
Iteration 3 loss=3.2980051215988344
Iteration 4 loss=4.909098742386882
Iteration 5 loss=7.307220448105742
Iteration 6 loss=10.876837782092146
Iteration 7 loss=16.190232794826333
Iteration 8 loss=24.099250462504457
Iteration 9 loss=35.87186671213957
Iteration 10 loss=53.395470677214604
Iteration 11 loss=79.4794514520327
Iteration 12 loss=118.30560013794695
Iteration 13 loss=176.098535763634
Iteration 14 loss=262.1236379506695
Iteration 15 loss=390.1724751687724
Iteration 16 loss=580.7738728545197
Iteration 17 loss=864.4851004534258
Iteration 18 loss=1286.790821413505
Iteration 19 loss=1915.3952071649967
Iteration 20 loss=2851.076288841281
Iteration 21 loss=4243.842719448107
Iteration 22 loss=6316.983203116015
Iteration 23 loss=9402.864202667537
Iteration 24 loss=13996.21502399978
Iteration 25 loss=20833.442956930423
Iteration 26 loss=31010.694298095874
Iteration 27 loss=46159.58883214982
Iteratio

1.2655920832054022e+17

In [46]:
y_validation = predict_labels(gd_weights, te_x)
score = sum(y_validation == te_y)/len(te_y)
score

0.614352

In [52]:
gamma_list = [0.0001,0.001, 0.01, 0.1]
degree_list = range(2,15)
max_iters = 100
seed = 1
ratio = 0.8

best_gamma = np.zeros(4)
best_score = np.zeros(4)
best_degree = np.zeros(4)
all_scores = np.zeros([len(gamma_list), len(degree_list)])
g = 0
d = 0


for degree in degree_list:
    print("Running with degree = {}".format(degree))
    for i in range(NUM_JETS):
        curr_x = norm_x_train[PRI_jet_num_train[0,:]==i]
        curr_y = y_train[PRI_jet_num_train[0,:]==i]
        
        (tr_x, tr_y, te_x,te_y) = split_data(curr_x, curr_y, ratio, seed)
        
        px_tr = create_poly_features(tr_x, degree)
        px_te = create_poly_features(te_x, degree)
        
        initial_w = np.zeros(px_tr.shape[1], dtype=np.float64)
        
        g = 0
        for gamma in gamma_list:

            gd_loss, gd_weights = imp.least_squares_GD(tr_y, px_tr, initial_w, max_iters, gamma)

            y_validation = predict_labels(gd_weights, px_te)
            score = sum(y_validation == te_y)/len(te_y)

            if score > best_score[i]:
                best_gamma[i] = gamma
                best_degree[i] = degree
                best_score[i] = score
            all_scores[g,d] = all_scores[g,d] + score*sum(PRI_jet_num_train[0,:]==i)/len(norm_x_train)
            g = g+1
    d = d+1
        


Running with degree = 2
Running with degree = 3


  y_pred[np.where(y_pred <= 0)] = -1
  y_pred[np.where(y_pred > 0)] = 1
  w = w - (grad*gamma)


Running with degree = 4
Running with degree = 5
Running with degree = 6
Running with degree = 7
Running with degree = 8
Running with degree = 9
Running with degree = 10
Running with degree = 11
Running with degree = 12
Running with degree = 13
Running with degree = 14


In [54]:
#calc actual score
actual_score = 0
for i in range(NUM_JETS):
    actual_score = actual_score + best_score[i]*sum(PRI_jet_num_train[0,:]==i)/len(norm_x_train)

print(actual_score)

0.7656847279875


In [55]:
print("Gamma=" + str(best_gamma))
print("Degree=" + str(best_degree))

Gamma=[0.01  0.01  0.001 0.001]
Degree=[2. 2. 2. 2.]


### SGD

In [None]:
gamma = 0.1
max_iters = 50
initial_w = np.zeros(new_x_train.shape[1], dtype=np.float64)
batch_size = int(np.floor(new_x_train.shape[0] / 100))

# Training
sgd_loss, sgd_weights = imp.least_squares_SGD(y_train, new_x_train, initial_w, max_iters, gamma, batch_size)

In [None]:
# Plot
plt.subplots(figsize=(20,10))
plt.plot(sgd_loss)
plt.legend(["Training loss"])
plt.grid()
plt.title("Loss for Stochastic Gradient Decent")
plt.xlabel("Iteration number")
plt.ylabel("MSE")
plt.show()

### Testing

In [None]:
y_pred = predict_labels(sgd_weights[-1], new_x_test)
n = len(y_pred)
correct = 0
for i in range(n):
    if (y_pred[i] == y_test[i]):
        correct += 1
print(str(correct) + " of " + str(n) + " correct, precentage: " + str(correct/n)) 

In [None]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):

    ws = [initial_w]
    losses = []
    w = initial_w
    loss = 0
    #threshold = 1e-8
    for n_iter in range(max_iters):
        loss = sum(sum(np.logaddexp(0, tx.dot(w)) - y*(tx.dot(w))))
        prediction = sigmoid(tx.dot(w))
        gradient = tx.T.dot(prediction - y)

        # gradient w by descent update
        w = w - (gamma * gradient)
        ws.append(w)
        losses.append(loss)

        #if (len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold):
        #   break

    #finds best parameters
    min_ind = np.argmin(losses)
    loss = losses[min_ind]
    w = ws[min_ind][:]
    
    return w, loss