In [1]:
    # Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt 
import implementations
import helpers
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# NOTE(review): the star import presumably supplies load_csv_data, predict_labels
# and create_csv_submission, which are used below without any other visible definition.
from proj1_helpers import *

# Relative path to the training CSV.
DATA_TRAIN_PATH = '../data/train.csv'
# Load the full training set (no sub-sampling): labels, feature matrix and event ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=False)

# Sanity check: number of labels must match the number of feature rows.
print(y.shape)
print(tX.shape)

(250000,)
(250000, 30)


## Cleaning and standardization of x

### Clean Data 

In [3]:
# Threshold forwarded to helpers.build_data
# (presumably a cutoff used while cleaning the features — TODO confirm in helpers).
threshold = 0.15
# build_data returns two parallel collections (feature matrices and their labels);
# later cells index them by position and iterate over them with zip().
all_tx, all_y = helpers.build_data(y, tX, threshold)

### Basic clean data

In [4]:
# Baseline preprocessing used by the quick model checks below:
# clean the raw features, standardize them, then prepend a constant column of ones.
clean_tX = helpers.clean_tx(tX, 0.15)
std_tX = helpers.standardize(clean_tX)
row = len(std_tX)  # number of samples
model_data = np.column_stack((np.ones(row), std_tX))

# Do your crazy machine learning thing here :) ...


In [5]:
# Shared settings for the baseline model cells that follow.
# Start every iterative solver from an all-zero weight vector, one entry per column of model_data.
initial_w = np.zeros(model_data.shape[1])
# Passed as `gamma` to the iterative solvers (presumably the step size — TODO confirm).
gamma = 0.01
# Passed as `max_iters` to the iterative solvers.
max_iters = 300
# Passed as `lambda_` to the regularized models.
lambda_ = 0.1

## Least squares GD

In [6]:
# Fit the least-squares GD model on the baseline data and show the weights and returned loss.
w, loss = implementations.least_square_GD(y, model_data, initial_w, max_iters, gamma)
print(w, loss)
# Training accuracy: share of samples whose predicted label equals the true label.
tmp = predict_labels(w, model_data) == y
print(np.count_nonzero(tmp) / len(tmp))

[-0.2992326  -0.21957221 -0.05760512  0.06179666  0.12080422 -0.05052733
  0.02068824 -0.08682256  0.1348509   0.13329037 -0.00137684 -0.0025637
  0.06636619 -0.00074148  0.00253707  0.01658314  0.00179607 -0.02090431
  0.00766294 -0.02099584] 0.36210667089730614
0.723796


## Least squares SGD

In [7]:
# Fit the least-squares SGD model on the baseline data and show the weights and returned loss.
w, loss = implementations.least_square_SGD(y, model_data, initial_w, max_iters, gamma)
print(w, loss)
# Training accuracy: share of samples whose predicted label equals the true label.
tmp = predict_labels(w, model_data) == y
print(np.count_nonzero(tmp) / len(tmp))

[-0.31117809 -0.24709017 -0.08969226  0.11493254  0.13271232 -0.05327159
  0.0866703   0.07194741  0.18004492  0.0367816  -0.006251    0.02718262
  0.0762358  -0.01958307 -0.02983458 -0.00608005  0.06051118 -0.00625945
  0.02182407  0.07673993] 0.1645397727593384
0.682708


## Least Squares

In [8]:
# Fit the (non-iterative) least-squares model on the baseline data and show the weights and returned loss.
w, loss = implementations.least_square(y, model_data)
print(w, loss)
# Training accuracy: share of samples whose predicted label equals the true label.
tmp = predict_labels(w, model_data) == y
print(np.count_nonzero(tmp) / len(tmp))

[-3.14664000e-01 -2.61046202e-01 -2.60495896e-01  2.18600813e-03
  2.84091422e-01 -4.08633500e-02 -3.60254350e+02 -1.93462667e-01
  1.33047784e-01  6.99737217e+01 -9.06834519e-04 -8.61974632e-04
  6.89948892e+01 -5.98583316e-04  2.31205241e-03  1.08372889e-01
  8.37090896e-04 -7.03488098e-02  8.36189386e-02  3.05118660e+02] 0.35251271029898895
0.73356


## Ridge Regression

In [9]:
# Fit ridge regression with the shared lambda_ and show the weights and returned loss.
w, loss = implementations.ridge_regression(y, model_data, lambda_)
print(w, loss)
# Training accuracy: share of samples whose predicted label equals the true label.
tmp = predict_labels(w, model_data) == y
print(np.count_nonzero(tmp) / len(tmp))

[-0.26222    -0.20226141 -0.08383322  0.04792501  0.13147698 -0.0408149
  0.02176297 -0.09289289  0.11972484  0.1317687  -0.0011034  -0.00200653
  0.08737519 -0.00056122  0.00224169  0.02863349  0.0015625  -0.02559142
  0.02186715 -0.02410863] 0.36257036691673894
0.726424


## Logistic regression

In [10]:
# Fit logistic regression on the baseline data and show the weights and returned loss.
w_final, loss_final = implementations.logistic_regression(y, model_data, initial_w, max_iters, gamma)
print(w_final, loss_final)
# Training accuracy: share of samples whose predicted label equals the true label.
tmp = predict_labels(w_final, model_data) == y
print(np.count_nonzero(tmp) / len(tmp))

[-1.87323399e+00 -3.52547965e-01 -2.43545244e-01  2.02169028e-01
  1.59490471e-03 -3.79791710e-01  2.41255210e-01 -2.30879062e-01
  3.91687125e-01  2.45400008e-01  2.54048954e-01 -1.78613377e-01
  6.92948436e-02  2.54387707e-01  6.54765833e-02  1.40053927e-01
 -1.49373093e-01  1.00753657e-01  1.48191673e-01  2.13086194e-01] 1.305518724957287
0.67544


## Regularized Logistic Regression

In [11]:
# Fit regularized logistic regression (shared lambda_) and show the weights and returned loss.
w_final, loss_final = implementations.reg_logistic_regression(y, model_data, initial_w, max_iters, gamma, lambda_)
print(w_final, loss_final)
# Training accuracy: share of samples whose predicted label equals the true label.
tmp = predict_labels(w_final, model_data) == y
print(np.count_nonzero(tmp) / len(tmp))

[-1.27903735 -0.45183278  0.14902619  0.23402499 -0.00367915 -0.12349203
  0.23428769 -0.39292466  0.37454519  0.51603803 -0.11286803 -0.18927387
  0.05038865 -0.26159962 -0.02583562 -0.00721014  0.08826443  0.13539957
 -0.03452798  0.14723233] 0.9140614985863633
0.688768


# The main code we submit (everything above is just a test with basic data cleaning)

## Find the best Lambda_ by cross-validation (Gamma is held fixed)

In [12]:
# Search over lambda_ with k-fold cross-validation on a single data subset.
# gamma and max_iters stay fixed for the whole search.
k_fold = 5
seed = 0
max_iters = 100
gamma = 1e-5
lambdas = np.logspace(-2, 0, num=30)

# Subset (position in all_tx / all_y) on which the search is run.
wanted_index = 0
tx_wanted, y_wanted = all_tx[wanted_index], all_y[wanted_index]

k_indices = helpers.build_k_indices(y_wanted, k_fold, seed)
initial_w = np.zeros(tx_wanted.shape[1])

# Per-lambda scores averaged over the folds (train and test side respectively).
arr_pred_tr = []
arr_pred_te = []

for lambda_ in lambdas:
    # One (train score, test score) pair per fold.
    fold_scores = [
        helpers.cross_validation(y_wanted, tx_wanted, initial_w, max_iters, gamma, lambda_, k_indices, k)
        for k in range(k_fold)
    ]
    fold_tr, fold_te = zip(*fold_scores)
    arr_pred_tr.append(np.mean(fold_tr))
    arr_pred_te.append(np.mean(fold_te))

# Higher score is treated as better: pick the lambda with the largest mean score.
maxIndex_tr = np.argmax(arr_pred_tr)
maxIndex_te = np.argmax(arr_pred_te)

print(max(arr_pred_tr), lambdas[maxIndex_tr])
print(max(arr_pred_te), lambdas[maxIndex_te])

0.7894379941947752 0.2395026619987486
0.789270343308978 0.2395026619987486


## Generate predictions and save output in csv format for submission:

In [13]:
# Train one logistic-regression model per data subset returned by helpers.build_data
# and keep the fitted weights for the submission step.
gamma = 1e-5
max_iter = 300
all_w = []     # fitted weight vector of each subset, in the order of all_tx
all_pred = []  # training accuracy of each subset's model
# The loop variables are deliberately NOT named `y`/`tx`: reusing `y` would overwrite
# the global training labels and break earlier cells when they are re-run.
for tx_subset, y_subset in zip(all_tx, all_y):
    initial_w = np.zeros(tx_subset.shape[1])
    # Use the iteration budget defined in this cell (`max_iter`), not the stale
    # `max_iters` global left behind by the cross-validation cell.
    w_final, _loss = implementations.logistic_regression(y_subset, tx_subset, initial_w, max_iter, gamma)
    # Accuracy is normalised by this subset's own size, not by the length of an
    # unrelated array from a previous cell.
    correct = y_subset == predict_labels(w_final, tx_subset)
    all_pred.append(np.count_nonzero(correct) / len(y_subset))
    all_w.append(w_final)

weights = all_w

In [14]:
# Relative path to the test CSV.
DATA_TEST_PATH = '../data/test.csv'
# Load the test set; the first returned value (labels for the training file) is not used here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [15]:
# Split the test features with the same helper and threshold value (0.15) used for training.
# The event ids are passed in the position where the training call passed the labels,
# so the second returned collection holds ids rather than labels.
all_tx_te, all_id_te = helpers.build_data(ids_test, tX_test, 0.15)
# Predict with the per-subset weights trained above
# (presumably weights[i] is applied to all_tx_te[i] — TODO confirm in helpers.pred_labels).
y_pred = helpers.pred_labels(all_tx_te, all_id_te, weights)

In [16]:
# Destination of the submission file.
OUTPUT_PATH = '../data/submission.csv'
# Write the test ids together with their predicted labels to OUTPUT_PATH.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)