In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from proj1_helpers import *
from helpers import *
from data_modification import replace_by_mean

""" Load TRAINING data """
# Path to the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# load_csv_data returns three values, bound here as y, raw_tx and ids
# (presumably labels, feature matrix and sample ids -- confirm against the helper).
y, raw_tx, ids = load_csv_data(DATA_TRAIN_PATH)

# Replace -999 by the mean of its respective column
processed_tx = replace_by_mean(raw_tx)

# Standardize (subtract mean and divide by standard deviation).
# The call also returns two extra values, kept as mean_pr_tx and std_pr_tx.
# NOTE(review): the meaning of the second argument (True) is not visible in
# this notebook -- confirm against standardize().
processed_tx,mean_pr_tx,std_pr_tx = standardize(processed_tx, True)

## Test Least squares with gradient descent

In [3]:
from implementations import least_squares_GD

# Start from all-zero weights: a column vector with one row per column of processed_tx.
initial_w = np.zeros((processed_tx.shape[1],1))
# max_iters and gamma are forwarded unchanged to least_squares_GD
# (presumably iteration count and step size -- confirm against implementations).
max_iters = 5000
gamma = 0.01

# NOTE: initial_w, max_iters, gamma, w and loss are notebook-global names that
# other test cells rebind, so their values reflect whichever cell ran last.
w, loss = least_squares_GD(y, processed_tx, initial_w, max_iters, gamma)

# Show the returned weights and loss.
print("Weights:\n", w)
print("\n")
print("Loss:\n", loss)

Weights:
 [ -2.10143501e-01   2.51724232e-03  -2.60505491e-01  -2.46137310e-01
  -1.82979695e-02  -8.23612150e-03   1.00697248e-01   1.85207973e-03
   2.64019458e-01  -2.80149795e-02   1.00018767e-01  -1.79183852e-01
   1.22953669e-01   9.61122113e-02   1.64255318e-01   4.57586102e-04
   2.00926577e-03   2.57819491e-01  -4.32891513e-05   5.72497372e-03
   1.05679970e-01   3.47645245e-03  -5.54607535e-02  -2.12954943e-02
  -8.70220705e-02   2.79081053e-03   4.29395177e-03  -7.11935655e-02
   4.30635863e-03   1.84018336e-03   2.67614434e-02]


Loss:
 85200.9610964


## Test Least squares with stochastic gradient descent

In [4]:
from implementations import least_squares_SGD

# Start from all-zero weights: a column vector with one row per column of processed_tx.
initial_w = np.zeros((processed_tx.shape[1],1))
# max_iters and gamma are forwarded unchanged to least_squares_SGD
# (presumably iteration count and step size -- confirm against implementations).
max_iters = 50
gamma = 0.01

# NOTE(review): no random seed is set in the visible cells; if
# least_squares_SGD samples data randomly, the printed weights and loss will
# differ between runs -- confirm and seed if reproducibility matters.
w, loss = least_squares_SGD(y, processed_tx, initial_w, max_iters, gamma)

# Show the returned weights and loss.
print("Weights:\n", w)
print("\n")
print("Loss:\n", loss)

Weights:
 [ 0.01316851 -0.0328513  -0.1032939  -0.0039369  -0.02081322  0.00811093
 -0.04822099 -0.01961342  0.0109631  -0.05058128  0.00184863 -0.10209009
 -0.06204941  0.05566102  0.11644048 -0.02014612 -0.0894478  -0.03129034
 -0.0367632   0.11528398 -0.04197272  0.05580443  0.04639581  0.00277032
 -0.01292013 -0.1077839  -0.0926331   0.01832733  0.05634268 -0.07242884
 -0.01766777]


Loss:
 0.129235233693


## Test Least squares

In [5]:
from implementations import least_squares

# least_squares takes only the targets and the feature matrix: no initial
# weights, iteration count or step size are passed.
# NOTE(review): in the recorded output a few weights have magnitude > 60 while
# the rest are below 1 -- possibly a sign of near-collinear feature columns; confirm.
w, loss = least_squares(y, processed_tx)

# Show the returned weights and loss.
print("Weights:\n", w)
print("\n")
print("Loss:\n", loss)

Weights:
 [  5.20696559e+00   9.63458006e-03  -2.54719226e-01  -2.63502969e-01
  -1.10181076e-03   2.18423835e-02   9.00537809e-02   4.83490442e-03
   2.82008766e-01  -2.81502573e-02  -3.29326782e+02  -1.88141152e-01
   1.18065031e-01   7.66172583e-02   6.39754849e+01  -7.79460340e-04
  -8.30656871e-04   6.30911711e+01  -8.61169078e-04   2.51791383e-03
   1.03659310e-01   9.33785709e-04  -4.70019042e-02   4.17575954e-02
  -4.75783451e-02   6.50726186e-04   1.88755821e-04  -3.66001839e-02
   1.55837347e-03  -1.74318742e-03   2.78984473e+02]


Loss:
 85102.3630404


## Test Ridge regression

In [6]:
from implementations import ridge_regression

# lambda_ is forwarded unchanged to ridge_regression
# (presumably the regularization strength -- confirm against implementations).
lambda_ = 0.1

# Like least_squares, this call takes no initial weights or iteration settings.
w, loss = ridge_regression(y, processed_tx, lambda_)

# Show the returned weights and loss.
print("Weights:\n", w)
print("\n")
print("Loss:\n", loss)

Weights:
 [-0.0280887  -0.01393268 -0.20013708 -0.07221326  0.03795443  0.01984449
  0.05542273 -0.01392809  0.12000181 -0.02879156  0.03820549 -0.08841178
  0.11451565  0.08757881  0.11868554  0.00115836  0.00229579  0.07669044
  0.00128682  0.00685682  0.02623058  0.00548248 -0.01622494  0.00603481
 -0.02317321  0.00365795  0.0062076  -0.05102699  0.00572182  0.00429357
  0.00127156]


Loss:
 0.349444846357


## Test Logistic regression

In [6]:
def convert_minus_one_to_zero(x):
    """Return 0 when ``x`` equals -1; otherwise return ``x`` unchanged."""
    return 0 if x == -1 else x
    
# Convert the values -1 to 0 in the vector y: needed for logistic regression.
# Done with a single vectorised np.where instead of calling a Python function
# once per label and round-tripping through a list; every value other than -1
# is kept as-is.
y_for_log = np.asarray(y)
y_for_log = np.where(y_for_log == -1, 0, y_for_log)
# Append a trailing axis so a 1-D label vector becomes a column: (N,) -> (N, 1).
y_for_log = y_for_log[:,np.newaxis]

(250000, 1)


In [7]:
from implementations import logistic_regression

# Start from all-zero weights: a column vector with one row per column of processed_tx.
initial_w = np.zeros((processed_tx.shape[1],1))
# max_iters and gamma are forwarded unchanged to logistic_regression
# (presumably iteration count and step size -- confirm against implementations).
max_iters = 50
gamma = 0.0001

# Uses y_for_log (labels with -1 mapped to 0, as a column) rather than y.
# NOTE(review): the recorded loss is not monotone across iterations, which
# would be consistent with stochastic updates; no seed is set in the visible
# cells -- confirm whether results are reproducible.
w, loss = logistic_regression(y_for_log, processed_tx, initial_w, max_iters, gamma)

# Show the returned weights and loss.
print("Weights:\n", w)
print("\n")
print("Loss:\n", loss)

Current iteration=0, the loss=69.31471805599453
Current iteration=10, the loss=66.42680182433267
Current iteration=20, the loss=65.70465981903608
Current iteration=30, the loss=57.144467051331944
Current iteration=40, the loss=64.97350005596627
Weights:
 [[-0.01228004]
 [-0.02064572]
 [-0.08523098]
 [-0.02281858]
 [ 0.03221598]
 [ 0.02011347]
 [ 0.01918041]
 [-0.02182487]
 [-0.01673321]
 [-0.01188065]
 [ 0.0173557 ]
 [-0.0493835 ]
 [ 0.0675026 ]
 [ 0.0680756 ]
 [ 0.03684481]
 [ 0.00338647]
 [ 0.00691451]
 [-0.02923728]
 [ 0.00594401]
 [ 0.00540324]
 [-0.00612073]
 [ 0.00827062]
 [ 0.0081356 ]
 [ 0.02767576]
 [-0.0028494 ]
 [ 0.00521155]
 [ 0.01035072]
 [-0.04254888]
 [ 0.01026975]
 [ 0.0126743 ]
 [ 0.01889566]]


Loss:
 62.0319607614


## Test Regularized logistic regression

In [12]:
from implementations import reg_logistic_regression

# Start from all-zero weights: a column vector with one row per column of processed_tx.
initial_w = np.zeros((processed_tx.shape[1],1))
# max_iters, gamma and lambda_ are forwarded unchanged to reg_logistic_regression
# (presumably iteration count, step size and regularization strength -- confirm
# against implementations).
max_iters = 5000
gamma = 0.001
lambda_ = 0.1

# Argument order differs from logistic_regression: lambda_ is passed before initial_w.
# The recorded output shows a progress line every 10 iterations, so with
# max_iters = 5000 this cell prints a long log.
w, loss = reg_logistic_regression(y_for_log, processed_tx, lambda_, initial_w, max_iters, gamma)

# Show the returned weights and loss.
print("Weights:\n", w)
print("\n")
print("Loss:\n", loss)

Current iteration=0, the loss=[[ 69.31471806]]
Current iteration=10, the loss=[[ 60.92563724]]
Current iteration=20, the loss=[[ 52.48830775]]
Current iteration=30, the loss=[[ 52.20305997]]
Current iteration=40, the loss=[[ 60.56446454]]
Current iteration=50, the loss=[[ 55.85920878]]
Current iteration=60, the loss=[[ 54.16333147]]
Current iteration=70, the loss=[[ 57.33694975]]
Current iteration=80, the loss=[[ 43.05055396]]
Current iteration=90, the loss=[[ 50.81126253]]
Current iteration=100, the loss=[[ 59.10704957]]
Current iteration=110, the loss=[[ 58.91900941]]
Current iteration=120, the loss=[[ 46.8373547]]
Current iteration=130, the loss=[[ 50.26477548]]
Current iteration=140, the loss=[[ 53.20376672]]
Current iteration=150, the loss=[[ 44.41899943]]
Current iteration=160, the loss=[[ 38.44757634]]
Current iteration=170, the loss=[[ 52.01632966]]
Current iteration=180, the loss=[[ 63.13479765]]
Current iteration=190, the loss=[[ 52.12464134]]
Current iteration=200, the loss=