In [51]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load the data

In [52]:
training_data = 'train.csv'


def load_data(path=None):
    """Load the labels and the feature matrix from the training CSV.

    The first line of the file is skipped as a header. Column 1 holds the
    class label and is mapped to 0 for ``b`` and to 1 for anything else.
    Columns 2 and onwards are returned as the feature matrix.

    Parameters
    ----------
    path : str, optional
        CSV file to read. When omitted, the module-level ``training_data``
        is used (looked up at call time, so reassigning it still works).

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, n_features)
        Numeric columns 2.. of the file.
    Y : numpy.ndarray, shape (n_samples,)
        Integer labels, 0 for ``b`` and 1 otherwise.
    """
    if path is None:
        path = training_data

    def label_to_int(raw):
        # genfromtxt hands converters `bytes` with the legacy default
        # encoding and `str` otherwise (the default since NumPy 2.0).
        # Comparing against b'b' alone would silently label every row 1
        # in the `str` case, so normalise before comparing.
        if isinstance(raw, bytes):
            raw = raw.decode('latin1')
        return 0 if raw == 'b' else 1

    Y = np.genfromtxt(path, delimiter=',', dtype=None, skip_header=1,
                      usecols=[1], converters={1: label_to_int})

    # Second pass parses everything as float; the textual label column
    # becomes NaN there and is dropped by the slice below.
    data = np.genfromtxt(path, delimiter=',', skip_header=1)

    # A file with a single data row comes back with one dimension fewer;
    # normalise so the column slice and the returned shapes stay valid.
    X = np.atleast_2d(data)[:, 2:]

    return X, np.atleast_1d(Y)

In [53]:
# Read the feature matrix X and the label vector Y from the training CSV,
# then peek at the labels to check they were encoded as integers.
X, Y = load_data()
print(Y)

[1 0 0 ... 1 0 0]


In [55]:
# Sanity check: Y should hold one label per row of X.
print(Y.shape)
print(X.shape)

(250000,)
(250000, 30)


## Standardize the variables

In [59]:
# Z-score each feature column: subtract the column mean, then divide by the
# column standard deviation.
# NOTE(review): the printed X contains many -999. entries; if those are
# missing-value placeholders they distort the mean/std used here -- confirm.
X_standardized = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

In [60]:
# Show the raw feature matrix for comparison with the standardized version.
print(X)

[[ 138.47    51.655   97.827 ...    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ... -999.    -999.      46.226]
 [-999.     162.172  125.953 ... -999.    -999.      44.251]
 ...
 [ 105.457   60.526   75.839 ... -999.    -999.      41.992]
 [  94.951   19.362   68.812 ... -999.    -999.       0.   ]
 [-999.      72.756   70.831 ... -999.    -999.       0.   ]]


In [61]:
# Show the features after per-column standardization.
print(X_standardized)

[[ 0.46141372  0.06833197  0.40768027 ...  1.5668      1.55858439
   0.4125105 ]
 [ 0.51670419  0.55250482  0.54013641 ... -0.63936657 -0.63936694
  -0.27381996]
 [-2.33785898  3.19515553  1.09655998 ... -0.63936657 -0.63936694
  -0.29396985]
 ...
 [ 0.38016991  0.31931645 -0.13086367 ... -0.63936657 -0.63936694
  -0.31701723]
 [ 0.35431502 -0.84532397 -0.30297338 ... -0.63936657 -0.63936694
  -0.74543941]
 [-2.33785898  0.66533608 -0.25352276 ... -0.63936657 -0.63936694
  -0.74543941]]


## Methods from lab1 and lab2

In [62]:
def compute_loss(y, tx, w, method='mse'):
    """Compute the loss of the linear model ``tx @ w`` against targets ``y``.

    Parameters
    ----------
    y : numpy.ndarray, shape (n_samples,)
        Target values.
    tx : numpy.ndarray, shape (n_samples, n_features)
        Feature matrix.
    w : numpy.ndarray with n_features elements
        Model weights (flattened before use).
    method : {'mse', 'mae'}, optional
        ``'mse'`` returns ``sum(e**2) / (2 * n)``; ``'mae'`` returns
        ``sum(|e|) / n``, where ``e = y - tx @ w``. Defaults to ``'mse'``.

    Returns
    -------
    float
        The requested loss.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'mse'`` nor ``'mae'``.

    Notes
    -----
    This used to read an undefined global ``method`` and had the two
    formulas attached to the wrong names; the selector is now an explicit
    keyword argument so the function works on a fresh kernel.
    """
    error = y - tx @ np.ravel(w)
    n_samples = len(y)

    if method == 'mse':
        # The 1/2 factor makes the gradient exactly -tx.T @ error / n.
        return np.sum(error * error) / (2 * n_samples)
    if method == 'mae':
        return np.sum(np.abs(error)) / n_samples
    raise ValueError("method must be 'mse' or 'mae', got {!r}".format(method))

In [63]:
def grid_search(y, tx, w0, w1):
    """Evaluate ``compute_loss`` on every pair ``(w0[i], w1[j])``.

    Returns an array of shape ``(len(w0), len(w1))`` whose entry ``[i, j]``
    is the loss obtained with the two-element weight vector
    ``[w0[i], w1[j]]``.
    """
    grid_shape = (len(w0), len(w1))
    losses = np.zeros(grid_shape)
    # ndindex walks the grid in row-major order, same as two nested loops.
    for row, col in np.ndindex(*grid_shape):
        candidate = np.array([w0[row], w1[col]])
        losses[row, col] = compute_loss(y, tx, candidate)
    return losses

In [64]:
def compute_gradient(y, tx, w):
    """Return ``-tx.T @ (y - tx @ w) / len(y)``.

    This is the gradient with respect to ``w`` of the half mean squared
    error ``sum((y - tx @ w)**2) / (2 * len(y))``.

    Parameters
    ----------
    y : numpy.ndarray, shape (n_samples,)
        Target values.
    tx : numpy.ndarray, shape (n_samples, n_features)
        Feature matrix.
    w : numpy.ndarray with n_features elements
        Current weights (flattened before use).

    Returns
    -------
    numpy.ndarray, shape (n_features,)
        The gradient vector.
    """
    # Matrix-vector products avoid materialising the (n_samples, n_features)
    # temporaries that `tx * w` and `scalar * tx.T` would create.
    error = y - tx @ np.ravel(w)
    # Scale the small result vector rather than the whole matrix.
    return -(tx.T @ error) / len(y)

In [65]:
def gradient_descent(y, tx, initial_w, max_iters, gamma):
    """Run ``max_iters`` steps of full-batch gradient descent.

    Each step performs ``w <- w - gamma * compute_gradient(y, tx, w)``.

    Parameters
    ----------
    y, tx
        Targets and feature matrix, forwarded to ``compute_gradient`` and
        ``compute_loss``.
    initial_w
        Starting weights.
    max_iters : int
        Number of update steps.
    gamma : float
        Step size.

    Returns
    -------
    (loss, w)
        The final weights and the loss evaluated at those weights.

    Notes
    -----
    The loss is computed once, after the last update, so it always
    corresponds to the returned ``w`` (including when ``max_iters`` is 0,
    where it is the loss at ``initial_w``). Previously the loss was
    recomputed and discarded on every iteration, and the value returned
    belonged to the weights *before* the final step.
    """
    w = initial_w
    for _ in range(max_iters):
        w = w - gamma * compute_gradient(y, tx, w)
    return compute_loss(y, tx, w), w

In [67]:
def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma):
    """Run mini-batch stochastic gradient descent.

    For every ``(minibatch_y, minibatch_tx)`` pair produced by
    ``batch_iter(y, tx, batch_size, max_iters)`` the weights are updated
    with ``w <- w - gamma * compute_gradient(minibatch_y, minibatch_tx, w)``.

    Parameters
    ----------
    y, tx
        Full targets and feature matrix.
    initial_w
        Starting weights.
    batch_size : int
        Forwarded to ``batch_iter`` as its third argument. It used to be
        ignored in favour of a hard-coded 32.
    max_iters : int
        Forwarded to ``batch_iter`` as its fourth argument.
    gamma : float
        Step size.

    Returns
    -------
    (loss, w)
        The final weights and ``compute_loss(y, tx, w)`` evaluated on the
        full data at those weights. Previously the returned loss was the
        last mini-batch loss measured before the final update.

    Notes
    -----
    NOTE(review): ``batch_iter`` is not defined in the visible part of this
    notebook; presumably it is the lab helper yielding mini-batches --
    confirm it is imported or defined before this cell runs.
    """
    w = initial_w
    for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size, max_iters):
        w = w - gamma * compute_gradient(minibatch_y, minibatch_tx, w)
    return compute_loss(y, tx, w), w

In [76]:
def label_results(Y_predicted):
    """Map numeric predictions to the byte labels ``b'b'`` / ``b's'``.

    Values strictly below 0.5 become ``b'b'``; everything else becomes
    ``b's'``.

    Parameters
    ----------
    Y_predicted : scalar or array-like
        Predicted value(s).

    Returns
    -------
    bytes or numpy.ndarray
        A single ``bytes`` label for scalar input, otherwise an array of
        byte labels with the same shape as the input.
    """
    scores = np.asarray(Y_predicted)
    # Vectorised threshold; the scalar-only lambda used before raised
    # "truth value of an array is ambiguous" on array input.
    labels = np.where(scores < 0.5, b'b', b's')
    if labels.ndim == 0:
        # Keep returning a plain bytes object for scalar input.
        return labels.item()
    return labels

In [79]:
def performance(Y, Y_predicted):
    """Return the number of positions where ``Y == Y_predicted``, divided by
    the length of ``Y`` along its first axis.
    """
    n_samples = Y.shape[0]
    n_matches = np.sum(Y == Y_predicted)
    return n_matches / n_samples