In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
# Path is relative to the directory the notebook is run from.
DATA_TRAIN_PATH = '../data/train.csv'
# load_csv_data is not defined in this notebook (presumably provided by the
# star import above). Its three results are bound to the class labels (y),
# the feature matrix (tX) and the event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [25]:
from costs import *
from helpers import *

In [4]:
def compute_gradient(y, tx, w, costf="MSE"):
    """Return the gradient of the selected cost with respect to w.

    Parameters
    ----------
    y : target values.
    tx : feature matrix; ``tx.dot(w)`` gives the model predictions.
    w : current weights.
    costf : ``"MSE"`` (default) or ``"MAE"``.

    Returns
    -------
    With ``e = y - tx.dot(w)`` and ``N = len(y)``:
    ``-tx.T.dot(e) / N`` for "MSE", ``-tx.T.dot(sign(e)) / N`` for "MAE".

    Raises
    ------
    CostFunctionNotRecognisedError
        For any other value of ``costf``.
    """
    residual = y - tx.dot(w)
    n_samples = len(y)
    if costf == "MSE":
        direction = residual
    elif costf == "MAE":
        # Subgradient of the absolute error: only the sign of each residual counts.
        direction = np.sign(residual)
    else:
        # NOTE(review): this exception class is not defined anywhere in the
        # notebook; presumably it comes from one of the star imports - confirm,
        # otherwise this line surfaces as a NameError.
        raise CostFunctionNotRecognisedError
    return -(tx.T.dot(direction)) / n_samples

In [5]:
def gradient_descent(y, tx, initial_w, max_iters, gamma, costf="MSE"):
    """Run full-batch gradient descent and record the trajectory.

    Parameters
    ----------
    y : target values.
    tx : feature matrix.
    initial_w : starting weights; must have at least two entries because the
        progress line prints ``w[0]`` and ``w[1]``.
    max_iters : number of update steps to perform.
    gamma : step size multiplying the gradient in each update.
    costf : cost function name forwarded to ``compute_gradient`` and
        ``compute_loss`` ("MSE" by default, matching
        ``stochastic_gradient_descent``).

    Returns
    -------
    (losses, ws)
        ``losses[k]`` is the loss evaluated at ``ws[k]`` (i.e. before step k);
        ``ws`` holds ``initial_w`` followed by the weights after every step,
        so it has ``max_iters + 1`` entries.

    Notes
    -----
    ``compute_loss`` is not defined in this notebook; presumably it is
    provided by ``from costs import *`` - that cell must be run first.
    """
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        # Gradient and loss are both evaluated at the current w, before the step.
        g = compute_gradient(y, tx, w, costf)
        loss = compute_loss(y, tx, w, costf)
        w = w - gamma * g
        # store w and loss
        ws.append(w)
        losses.append(loss)
        # The printed loss belongs to the pre-step weights, w0/w1 to the post-step ones.
        print("Gradient Descent({bi}/{ti}): ||gradient||={grad}, loss={l}, w0={w0}, w1={w1}".format(
              bi=n_iter, ti=max_iters - 1, grad=np.linalg.norm(g), l=loss, w0=w[0], w1=w[1]))

    return losses, ws

In [6]:
# Sanity check: number of entries in the training label vector.
len(y)

250000

In [7]:
# Sanity check: dimensions of the training feature matrix (rows, columns).
tX.shape

(250000, 30)

In [9]:
# Define the parameters of the algorithm.
max_iters = 200  # number of gradient steps
gamma = 0.0000001  # step size applied to the gradient in each update

# Initialization
w_initial = np.zeros(tX.shape[1])  # one zero weight per column of tX

# Start gradient descent.
# gradient_descent prints one progress line per iteration and returns the
# per-iteration losses together with the list of visited weight vectors.
gradient_losses, gradient_ws = gradient_descent(y, tX, w_initial, max_iters, gamma)

Gradient Descent(0/199): ||gradient||=841.1056518611562, loss=0.5, w0=1.0766649510400011e-05, w1=-2.7284198603999986e-06
Gradient Descent(1/199): ||gradient||=366.64727324883336, loss=0.44933580794127903, w0=1.9619683262463213e-05, w1=-4.558792209042017e-06
Gradient Descent(2/199): ||gradient||=173.05481195643364, loss=0.4395521360766416, w0=2.758125136812426e-05, w1=-6.003897270664246e-06
Gradient Descent(3/199): ||gradient||=103.55566845478772, loss=0.4372285641519502, w0=3.5088986369807325e-05, w1=-7.2824826037755856e-06
Gradient Descent(4/199): ||gradient||=83.80629762975822, loss=0.4362834492886377, w0=4.233115283840755e-05, w1=-8.487879573590174e-06
Gradient Descent(5/199): ||gradient||=78.48028176587161, loss=0.4356091012507515, w0=4.938960304479309e-05, w1=-9.65992452714045e-06
Gradient Descent(6/199): ||gradient||=76.29875603358957, loss=0.4350028547308642, w0=5.630075471501667e-05, w1=-1.0815635842764712e-05
Gradient Descent(7/199): ||gradient||=74.76360134150336, loss=0.4344

Gradient Descent(61/199): ||gradient||=36.60934299528717, loss=0.4193217208294874, w0=0.00030220161594083066, w1=-6.929920411038802e-05
Gradient Descent(62/199): ||gradient||=36.24576296766769, loss=0.4191883635058284, w0=0.0003049597913631369, w1=-7.02964496092189e-05
Gradient Descent(63/199): ||gradient||=35.88915450608346, loss=0.41905763577132277, w0=0.00030767502343755884, w1=-7.129184084625115e-05
Gradient Descent(64/199): ||gradient||=35.5393775027259, loss=0.4189294617799317, w0=0.00031034801036091645, w1=-7.228539927791339e-05
Gradient Descent(65/199): ||gradient||=35.196295598528685, loss=0.4188037681493648, w0=0.0003129794375730283, w1=-7.327714607277825e-05
Gradient Descent(66/199): ||gradient||=34.85977597382074, loss=0.41868048386827167, w0=0.00031556997805270705, w1=-7.426710211548333e-05
Gradient Descent(67/199): ||gradient||=34.52968915546861, loss=0.4185595402079736, w0=0.0003181202926043038, w1=-7.525528801059931e-05
Gradient Descent(68/199): ||gradient||=34.20590883

Gradient Descent(121/199): ||gradient||=23.522394950127406, loss=0.4142256430066455, w0=0.0004106110868419557, w1=-0.00012646477498330638
Gradient Descent(122/199): ||gradient||=23.40543236617264, loss=0.41417045069913244, w0=0.00041169880708518395, w1=-0.0001273805469668556
Gradient Descent(123/199): ||gradient||=23.290519694227374, loss=0.4141158041783422, w0=0.0004127698825265543, w1=-0.00012829535433752935
Gradient Descent(124/199): ||gradient||=23.177611998634692, loss=0.41406169124802145, w0=0.00041382456838083656, w1=-0.00012920920670530194
Gradient Descent(125/199): ||gradient||=23.066665217278015, loss=0.4140081000588974, w0=0.00041486311585379527, w1=-0.00013012211355388332
Gradient Descent(126/199): ||gradient||=22.95763615040384, loss=0.41395501909822535, w0=0.00041588577220872416, w1=-0.0001310340842424398
Gradient Descent(127/199): ||gradient||=22.85048244943297, loss=0.4139024371796532, w0=0.0004168927808317172, w1=-0.0001319451280072909
Gradient Descent(128/199): ||grad

Gradient Descent(181/199): ||gradient||=18.960394448868765, loss=0.4115877928962569, w0=0.00045350149236594953, w1=-0.0001799793413868077
Gradient Descent(182/199): ||gradient||=18.911381362395584, loss=0.41155188980158336, w0=0.00045393185066954026, w1=-0.0001808506502232975
Gradient Descent(183/199): ||gradient||=18.86289376717839, loss=0.4115161717087715, w0=0.0004543555720953225, w1=-0.00018172139649267908
Gradient Descent(184/199): ||gradient||=18.814920307549485, loss=0.41148063616954544, w0=0.0004547727568980714, w1=-0.0001825915846031165
Gradient Descent(185/199): ||gradient||=18.76744991149234, loss=0.41144528079323006, w0=0.00045518350379908633, w1=-0.0001834612189071734
Gradient Descent(186/199): ||gradient||=18.720471783990984, loss=0.4114101032451014, w0=0.00045558791000994704, w1=-0.00018433030370256426
Gradient Descent(187/199): ||gradient||=18.67397540050622, loss=0.41137510124478804, w0=0.00045598607125589216, w1=-0.0001851988432328954
Gradient Descent(188/199): ||grad

In [19]:
def compute_stoch_gradient(y, tx, w, costf="MSE"):
    """Compute a stochastic gradient from a mini-batch of samples.

    The formula is exactly the full-batch one applied to the rows passed in,
    so this delegates to ``compute_gradient`` instead of duplicating it.

    Parameters
    ----------
    y : targets of the mini-batch.
    tx : feature rows of the mini-batch.
    w : current weights.
    costf : ``"MSE"`` (default) or ``"MAE"``.

    Returns
    -------
    The gradient of the selected cost on the given samples, averaged over
    ``len(y)``.

    Raises
    ------
    Whatever ``compute_gradient`` raises for an unrecognised ``costf``.
    """
    return compute_gradient(y, tx, w, costf)


def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma, costf="MSE"):
    """Stochastic gradient descent algorithm.

    Parameters
    ----------
    y : target values of the full data set.
    tx : feature matrix of the full data set.
    initial_w : starting weights; must have at least two entries because the
        progress line prints ``w[0]`` and ``w[1]``.
    batch_size : forwarded to ``batch_iter`` as the mini-batch size.
    max_iters : number of outer iterations.
    gamma : step size multiplying the gradient in each update.
    costf : cost function name forwarded to the gradient and loss helpers.

    Returns
    -------
    (losses, ws)
        ``losses[k]`` is the loss on the full data set after outer iteration
        k; ``ws`` holds ``initial_w`` followed by the weights after each outer
        iteration.

    Notes
    -----
    ``batch_iter`` and ``compute_loss`` are not defined in this notebook
    (presumably they come from the ``helpers`` / ``costs`` star imports).
    No random seed is set here, so runs are not reproducible if ``batch_iter``
    shuffles - TODO confirm.
    """
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        g = None  # stays None only if batch_iter yields no batch at all
        for yn, xn in batch_iter(y, tx, batch_size):
            g = compute_stoch_gradient(yn, xn, w, costf)
            w = w - gamma * g
        # Evaluate the full-data loss once per outer iteration: only the value
        # after the last mini-batch step is ever stored or printed.
        loss = compute_loss(y, tx, w, costf)
        grad_norm = np.linalg.norm(g) if g is not None else float("nan")
        # store w and loss
        ws.append(w)
        losses.append(loss)
        print("SGD({bi}/{ti}): |gradient|={grad}, loss={l}, w0={w0}, w1={w1}".format(
              bi=n_iter, ti=max_iters - 1, grad=grad_norm, l=loss, w0=w[0], w1=w[1]))

    return losses, ws

In [31]:
# stochastic_gradient_descent is defined inline in this notebook, so no
# external module import is needed here.

# Define the parameters of the algorithm.
max_iters = 50  # number of outer SGD iterations
gamma = 0.0000001  # step size applied to each mini-batch gradient
batch_size = 1  # passed through to batch_iter as the mini-batch size

# Initialization
w_initial = np.zeros(tX.shape[1])  # one zero weight per column of tX

# Start SGD.
# NOTE(review): no random seed is set before this call; if batch_iter samples
# randomly the printed trajectory will differ between runs - confirm.
sgd_losses, sgd_ws = stochastic_gradient_descent(
    y, tX, w_initial, batch_size, max_iters, gamma)


SGD(0/49): |gradient|=3315.789732239365, loss=0.5124687757386779, w0=9.989999999999999e-05, w1=-8.1236e-06
SGD(1/49): |gradient|=402.81046269515053, loss=0.5072239554008168, w0=8.295307243480032e-05, w1=-1.3557347124958942e-05
SGD(2/49): |gradient|=526.4032381002135, loss=0.5006468080144686, w0=7.110745002600888e-05, w1=-2.1624725651194117e-05
SGD(3/49): |gradient|=4396.613529800658, loss=0.622723007678385, w0=8.507690041131784e-05, w1=-2.086995625350684e-05
SGD(4/49): |gradient|=2492.731383146835, loss=1.1412026307985264, w0=9.686864066699177e-05, w1=-2.0681629991579653e-05
SGD(5/49): |gradient|=3.1774712722362484, loss=1.1420826245163278, w0=9.687648972658865e-05, w1=-2.0679122194085047e-05
SGD(6/49): |gradient|=1.1061126550533136, loss=1.1423891183906771, w0=9.687922305325756e-05, w1=-2.0678033352763887e-05
SGD(7/49): |gradient|=5471.7846538404965, loss=0.43501278559071815, w0=8.043363803039353e-05, w1=-2.5325175541772855e-05
SGD(8/49): |gradient|=450.17519493477454, loss=0.43435499

In [32]:
def ridge_regression(y, tx, lambda_):
    """Closed-form ridge regression.

    Solves the regularised normal equations

        (tx^T tx + 2 * N * lambda_ * I) w = tx^T y,    N = len(y),

    with ``np.linalg.solve`` rather than forming the explicit inverse, which
    is both cheaper and numerically more stable.

    Parameters
    ----------
    y : target values.
    tx : feature matrix.
    lambda_ : regularisation strength; with ``lambda_ = 0`` this reduces to
        ordinary least squares (and requires ``tx^T tx`` to be invertible).

    Returns
    -------
    The weight vector ``w`` solving the system above.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised matrix is singular.
    """
    n_samples = len(y)
    gram = tx.T.dot(tx)
    regularised = gram + 2 * n_samples * lambda_ * np.eye(gram.shape[0])
    return np.linalg.solve(regularised, tx.T.dot(y))

In [51]:
lambda_ = 0.5  # regularisation strength passed to ridge_regression
# Fit on the full training set; `w` is reused below as the submission weights.
w = ridge_regression(y, tX, lambda_)
# Training loss of the fitted weights, using compute_loss's default cost.
loss = compute_loss(y, tX, w)
print(w, loss)

[ 2.48519324e-04 -8.96465390e-03 -2.24753247e-03 -2.19688733e-03
 -8.28785842e-04  5.27482032e-04 -1.23580011e-02  2.78401608e-02
  6.37660259e-05  3.17281194e-03 -2.55552723e-02  4.56064526e-02
  1.07748012e-02  6.17277403e-03 -3.38880835e-04 -1.20732147e-03
  2.51582382e-03 -4.25916162e-04  8.32986709e-04  4.90332267e-03
  4.17025377e-04 -7.70891369e-04 -2.48302577e-02  1.45686817e-03
 -7.63838506e-04 -5.43201301e-04  2.59945303e-04  1.61198112e-03
  1.83180262e-04 -5.51499664e-03] 0.35160668414902113


## Generate predictions and save output in CSV format for submission:

In [52]:
# Use the ridge-regression solution as the weights for the submission.
weights = w
weights

array([ 2.48519324e-04, -8.96465390e-03, -2.24753247e-03, -2.19688733e-03,
       -8.28785842e-04,  5.27482032e-04, -1.23580011e-02,  2.78401608e-02,
        6.37660259e-05,  3.17281194e-03, -2.55552723e-02,  4.56064526e-02,
        1.07748012e-02,  6.17277403e-03, -3.38880835e-04, -1.20732147e-03,
        2.51582382e-03, -4.25916162e-04,  8.32986709e-04,  4.90332267e-03,
        4.17025377e-04, -7.70891369e-04, -2.48302577e-02,  1.45686817e-03,
       -7.63838506e-04, -5.43201301e-04,  2.59945303e-04,  1.61198112e-03,
        1.83180262e-04, -5.51499664e-03])

In [53]:
# Path is relative to the directory the notebook is run from.
DATA_TEST_PATH = '../data/test.csv'
# The first return value (the labels slot for the training file) is discarded;
# only the features and event ids of the test set are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [54]:
# Destination of the submission file, relative to the notebook directory.
OUTPUT_PATH = '../data/submission.csv'
# predict_labels and create_csv_submission are not defined in this notebook
# (presumably provided by proj1_helpers).
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [55]:
# NOTE(review): y_pred was itself produced from `weights` (== w) and tX_test,
# so this compares the model's own predicted labels with its raw outputs.
# It is NOT an error against ground-truth test labels, and it overwrites the
# training `loss` computed earlier - interpret the number with care.
loss = compute_loss(y_pred, tX_test, w)
loss

0.210197199747105