In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from proj1_helpers import *
from data_cleaning import *
import implementations as imp
import plots
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Index of the feature column that the cleaning cells below read out as the jet
# number (PRI_jet_num_*) and then delete from the feature matrices.
COLUMN_TO_DROP = 22

# Import data

In [None]:
# Read both CSV files. Each load_csv_data result is unpacked as (y, x, ids);
# presumably labels, feature matrix and sample ids -- confirm in proj1_helpers.
y_train, x_train, ids_train = load_csv_data("../data/train.csv")
y_test, x_test, ids_test = load_csv_data("../data/test.csv")

In [None]:
# Quick sanity check: show the dimensions of the raw training matrix.
x_train.shape

# Clean data and add features

In [None]:
# Number of distinct jet categories; later cells loop over range(NUM_JETS).
NUM_JETS = 4

# Keep the jet-number column as a (1, n_samples) integer row before it is
# removed from the feature matrix.
PRI_jet_num_train = x_train[:, COLUMN_TO_DROP].astype(int).reshape(1, -1)
print(PRI_jet_num_train)

del_x_train = np.delete(x_train, COLUMN_TO_DROP, axis=1)
print(del_x_train.shape)

# Fill in entries equal to UNDEFINED_VALUE, then normalise; the returned
# mean/std are reused when the test set is normalised.
replaced_x_train = replace_undefined_with_mean(del_x_train, UNDEFINED_VALUE)
(norm_x_train, train_data_mean, train_data_std) = mean_std_normalization(
    replaced_x_train
)

print(norm_x_train[0][0])
print(norm_x_train.shape)

# Do the same for the test data

In [None]:
# Same cleaning steps as for the training data, applied to the test matrix.
# The jet-number column is kept as a (1, n_samples) integer row.
PRI_jet_num_test = x_test[:, COLUMN_TO_DROP].astype(int).reshape(1, -1)
print(PRI_jet_num_test)
print(PRI_jet_num_test.shape)

del_x_test = np.delete(x_test, COLUMN_TO_DROP, axis=1)
print(del_x_test.shape)

replaced_x_test = replace_undefined_with_mean(del_x_test, UNDEFINED_VALUE)

# The training mean/std are handed to the normalisation call, presumably so the
# test features are scaled with the training statistics -- confirm in data_cleaning.
(norm_x_test, test_data_mean, test_data_std) = mean_std_normalization(
    replaced_x_test, train_data_mean, train_data_std
)
print(norm_x_test[0][0])
print(norm_x_test.shape)

# Make and train model
### GD

In [None]:
# Gradient descent on the full cleaned training set.
gamma = 0.1
max_iters = 100

# Start from the weights returned by imp.least_squares; a zero vector of length
# norm_x_train.shape[1] is the alternative starting point.
_, initial_w = imp.least_squares(y_train, norm_x_train)

# This is the GD section and the result is consumed as a single weight vector
# by the scoring cell, so call the GD routine (the SGD routine's weights are
# indexed with [-1] where it is used further down).
gd_loss, gd_weights = imp.least_squares_GD(y_train, norm_x_train, initial_w, max_iters, gamma)
gd_loss

In [None]:
# Fraction of test rows whose predicted label equals the label loaded from the
# test file.
y_validation = predict_labels(gd_weights, norm_x_test)
matches = y_validation == y_test
score = np.sum(matches, axis=0) / len(y_test)
score

In [None]:
# Grid search over step size (gamma) and polynomial degree, run separately for
# each jet category on a train/validation split of the training data.
gamma_list = [0.0001, 0.001, 0.01, 0.1]
degree_list = range(8, 9)
max_iters = 100
seed = 1
ratio = 0.8

# One slot per jet category (sized from NUM_JETS rather than a literal 4).
best_gamma = np.zeros(NUM_JETS)
best_score = np.zeros(NUM_JETS)
best_degree = np.zeros(NUM_JETS)
# all_scores[g, d]: accuracy for gamma_list[g] / degree_list[d], averaged over
# the jet categories with each category weighted by its share of the rows.
all_scores = np.zeros([len(gamma_list), len(degree_list)])

jet_numbers = PRI_jet_num_train[0, :]
n_total = len(norm_x_train)

for d, degree in enumerate(degree_list):
    print("Running with degree = {}".format(degree))
    for i in range(NUM_JETS):
        # Rows belonging to jet category i; computed once and reused below.
        jet_mask = jet_numbers == i
        jet_count = np.sum(jet_mask)
        curr_x = norm_x_train[jet_mask]
        curr_y = y_train[jet_mask]

        (tr_x, tr_y, te_x, te_y) = split_data(curr_x, curr_y, ratio, seed)

        px_tr = create_poly_features(tr_x, degree)
        px_te = create_poly_features(te_x, degree)

        # The warm start does not depend on gamma, so compute it once per
        # (degree, jet) instead of once per gamma.
        _, initial_w = imp.least_squares(tr_y, px_tr)

        for g, gamma in enumerate(gamma_list):
            # Pass a copy so every gamma starts from the same weights even if
            # the optimiser were to update its argument in place.
            gd_loss, gd_weights = imp.least_squares_GD(
                tr_y, px_tr, np.copy(initial_w), max_iters, gamma
            )

            y_validation = predict_labels(gd_weights, px_te)
            score = np.sum(y_validation == te_y) / len(te_y)

            if score > best_score[i]:
                best_gamma[i] = gamma
                best_degree[i] = degree
                best_score[i] = score
            all_scores[g, d] = all_scores[g, d] + score * jet_count / n_total
        


In [None]:
# Overall score: the best per-jet scores averaged with each jet category
# weighted by its share of the training rows.
jet_row = PRI_jet_num_train[0, :]
n_rows = len(norm_x_train)
actual_score = sum(
    best_score[jet] * np.sum(jet_row == jet, axis=0) / n_rows
    for jet in range(NUM_JETS)
)

print(actual_score)

### SGD

In [None]:
# Stochastic gradient descent on the cleaned, normalised training set.
# norm_x_train is the matrix produced by the cleaning cell above; the name
# previously used here is not defined by any cell in this notebook.
gamma = 0.1
max_iters = 50
initial_w = np.zeros(norm_x_train.shape[1], dtype=np.float64)
# Mini-batches of 1% of the rows, but never an empty batch on small inputs.
batch_size = max(1, norm_x_train.shape[0] // 100)

# Training
sgd_loss, sgd_weights = imp.least_squares_SGD(y_train, norm_x_train, initial_w, max_iters, gamma, batch_size)

In [None]:
# Plot the SGD training loss against the iteration number.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(sgd_loss)
ax.legend(["Training loss"])
ax.grid()
ax.set_title("Loss for Stochastic Gradient Descent")
ax.set_xlabel("Iteration number")
ax.set_ylabel("MSE")
plt.show()

### Testing

In [None]:
# Evaluate the SGD model on the cleaned test matrix (norm_x_test from the
# cleaning cell above; the name previously used here is not defined by any cell
# in this notebook).
# sgd_weights[-1] takes the last entry of what least_squares_SGD returned --
# presumably the final weight vector of the run; confirm in implementations.
y_pred = predict_labels(sgd_weights[-1], norm_x_test)
n = len(y_pred)
# Count matching labels in one vectorised comparison instead of a Python loop.
correct = int(np.sum(y_pred == y_test))
print(str(correct) + " of " + str(n) + " correct, percentage: " + str(correct/n))

In [None]:
def _sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow."""
    # exp(-log(1 + exp(-z))) stays finite for large |z|, unlike the direct formula.
    return np.exp(-np.logaddexp(0, -z))


def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Fit logistic regression with fixed-step gradient descent.

    The loss is sum(log(1 + exp(tx @ w)) - y * (tx @ w)), i.e. the negative
    log-likelihood for labels encoded as 0/1.

    Args:
        y: targets; must have the same shape as tx.dot(w), either (N,) with a
            (D,) weight vector or (N, 1) with a (D, 1) weight vector.
        tx: feature matrix of shape (N, D).
        initial_w: starting weights; not modified.
        max_iters: number of gradient steps.
        gamma: step size.

    Returns:
        (w, loss): the weights with the lowest loss among the iterates that
        were evaluated (the starting point and every update except the last
        one), and that loss. With max_iters == 0 the starting weights and
        their loss are returned. Note the order is (w, loss), whereas the
        imp.* calls in this notebook are unpacked as (loss, w).
    """
    def loss_at(z):
        # np.sum reduces over all axes, so 1-D and column-vector y both work.
        return np.sum(np.logaddexp(0, z) - y * z)

    w = initial_w
    best_w, best_loss = initial_w, None

    for _ in range(max_iters):
        z = tx.dot(w)  # computed once per iteration, shared by loss and gradient
        loss = loss_at(z)

        # Strict '<' keeps the earliest minimum, matching argmin's tie-breaking.
        if best_loss is None or loss < best_loss:
            best_w, best_loss = w, loss

        gradient = tx.T.dot(_sigmoid(z) - y)
        w = w - (gamma * gradient)

    if best_loss is None:
        # No iteration ran: report the loss of the starting point.
        best_loss = loss_at(tx.dot(best_w))

    # Return a copy so the result never aliases the caller's initial_w.
    return np.array(best_w, copy=True), best_loss