In [216]:
import sys
sys.path.insert(0, '..')
# Useful starting lines
%matplotlib inline
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
import Helpers.helpers as helper
import Helpers.cleaner as cleaner
import numpy as np
import Helpers.implementations as imp


In [3]:
# Load the training features and labels through the project's
# load_clean_data helper.
train_x_path = '../../data/x_train.npy'
train_y_path = '../../data/y_train.npy'
x, y = helper.load_clean_data(train_x_path, train_y_path)


In [5]:
def toDeg(x, degree):
    """Expand ``x`` with its element-wise powers up to ``degree``.

    The result is ``x`` followed column-wise by ``x**2``, ``x**3``, ...,
    ``x**degree``. No constant column and no cross terms are added.

    Args:
        x: array of samples; a 1-D array is treated as a single column.
        degree: highest power to include. For ``degree < 2`` there is
            nothing to append and ``x`` itself is returned unchanged.

    Returns:
        Array whose columns are the powers of ``x`` in increasing order.
    """
    if degree < 2:
        return x
    # Build every power first and concatenate once, instead of re-copying the
    # growing matrix on each iteration.
    blocks = [x] + [np.power(x, p) for p in range(2, degree + 1)]
    return np.c_[tuple(blocks)]

def addConstant(x):
    """Return ``x`` with a column of ones prepended (the intercept term)."""
    n_rows = x.shape[0]
    bias_column = np.ones((n_rows, 1))
    return np.c_[bias_column, x]

In [7]:
# Degree-3 power expansion of the features, then a leading column of ones.
x_deg3 = toDeg(x, 3)
xpow = addConstant(x_deg3)


In [8]:
# Split the expanded data with a fixed seed so the split is repeatable.
# NOTE(review): 0.5 is presumably the train fraction - confirm in imp.split_data.
split = imp.split_data(xpow, y, 0.5, seed=9)
x_tr, y_tr, x_te, y_te = split


In [31]:
# Fit ridge regression on the x_tr / y_tr part of the split.
# NOTE(review): the third argument (0) is presumably the regularisation
# strength lambda - confirm against imp.ridge_regression.
loss,w = imp.ridge_regression(y_tr,x_tr,0)

In [32]:
# Predict labels for the x_te part of the split using the weights fitted above.
y_pred = helper.predict_labels(w,x_te)

In [33]:
# Per-sample correctness flags (1 where the prediction equals the label in
# y_te, else 0), computed with one vectorised comparison instead of a
# Python-level loop over every sample. ravel() keeps this correct whether the
# arrays are 1-D or single-column 2-D.
res = (np.ravel(y_pred) == np.ravel(y_te)).astype(int)

In [34]:
# Fraction of samples whose correctness flag is 1.
accuracy = res.sum() / len(res)
print(accuracy)

0.786678458803


In [172]:
# Small grid search over the ridge penalty and the polynomial degree, scored
# on a fixed-seed split (same split arguments for every combination).
for lambda_ in [0, 0.0001]:
    for degree in [4, 5, 6]:
        xpow = addConstant(toDeg(x, degree))
        x_tr, y_tr, x_te, y_te = imp.split_data(xpow, y, 0.5, seed=9)
        loss, w = imp.ridge_regression(y_tr, x_tr, lambda_)
        y_pred = helper.predict_labels(w, x_te)
        # Vectorised 0/1 correctness flags instead of a per-sample Python loop.
        res = (np.ravel(y_pred) == np.ravel(y_te)).astype(int)
        print("l= ", lambda_," d= ", degree, " score= ", res.sum()/len(res)," loss= ", loss)
        

l=  0  d=  4  score=  0.790930774797  loss=  0.076667316385
l=  0  d=  5  score=  0.792672795004  loss=  0.0757621348459
l=  0  d=  6  score=  0.789421023951  loss=  0.0768037450645
l=  0.0001  d=  4  score=  0.775315126989  loss=  0.0814069630877
l=  0.0001  d=  5  score=  0.778182760253  loss=  0.0803513763763
l=  0.0001  d=  6  score=  0.778933168958  loss=  0.0800476298695


In [17]:
# parameters
deg = 5

# Fit least squares on all of (x, y) using degree-`deg` features with an
# intercept column, then show the loss value the solver returned.
x_expanded = toDeg(x, deg)
xpow = addConstant(x_expanded)
loss, w = imp.least_squares(y, xpow)

print(loss)

0.0759081131882


In [208]:
def build_k_indices(y, k_fold, seed):
    """Shuffle the row indices of ``y`` and cut them into ``k_fold`` folds.

    Every fold holds ``int(len(y) / k_fold)`` indices; indices left over after
    the last full fold are not assigned to any fold. The global NumPy random
    state is re-seeded with ``seed``.

    Returns:
        Array with one row of indices per fold.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

In [226]:
def cross_validation(y, x, k_indices, k):
    """Fit least squares with fold ``k`` held out and score both parts.

    Args:
        y: labels, indexed by sample along axis 0.
        x: feature matrix, one row per sample.
        k_indices: one array of row indices per fold (e.g. the output of
            ``build_k_indices``).
        k: position in ``k_indices`` of the fold to hold out.

    Returns:
        ``(w, loss_tr, loss_te)``: the fitted weights, and the values of
        ``imp.compute_mse`` on the training rows and on the held-out fold.
    """
    test_idx = k_indices[k]

    # Every row that is not in the held-out fold is used for training. This
    # includes rows that belong to no fold at all.
    train_mask = np.ones(x.shape[0], dtype=bool)
    train_mask[test_idx] = False

    x_test, y_test = x[test_idx], y[test_idx]
    x_train, y_train = x[train_mask], y[train_mask]

    # The loss returned by the solver is not used; both losses are recomputed
    # below with the same function so train and test are directly comparable.
    _, w = imp.least_squares(y_train, x_train)
    loss_tr = imp.compute_mse(y_train, x_train, w)
    loss_te = imp.compute_mse(y_test, x_test, w)
    return w, loss_tr, loss_te

In [227]:
# -*- coding: utf-8 -*-
"""A function for plotting figures."""
import numpy as np
import matplotlib.pyplot as plt


def cross_validation_visualization(lambds, mse_tr, mse_te):
    """Plot the train and test error curves against ``lambds``.

    Both curves are drawn on the current axes with a logarithmic x axis, and
    the figure is then saved under the name "cross_validation".
    """
    curves = (
        (mse_tr, 'b', 'train error'),
        (mse_te, 'r', 'test error'),
    )
    for errors, colour, name in curves:
        plt.semilogx(lambds, errors, marker=".", color=colour, label=name)
    plt.xlabel("lambda")
    plt.ylabel("rmse")
    plt.title("cross validation")
    plt.legend(loc=2)
    plt.grid(True)
    plt.savefig("cross_validation")

In [232]:


def cross_validation_demo(degrees=(3, 4, 5, 6, 7), k_fold=4, seed=1):
    """Compare polynomial degrees with k-fold cross-validated least squares.

    Reads the module-level ``x`` and ``y``. For every degree, one line is
    printed with: the degree, the accuracy of the fold-averaged weights, the
    mean training error and the mean held-out error over the folds.

    Note that the accuracy is measured on all of ``x``, i.e. it includes rows
    the models were fitted on; only the last printed column is a held-out
    score.

    Args:
        degrees: polynomial degrees to evaluate.
        k_fold: number of folds.
        seed: seed passed to ``build_k_indices`` for the shuffle.
    """
    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)
    for degree in degrees:
        x_pow = addConstant(toDeg(x, degree))
        sum_tr = 0
        sum_te = 0
        sum_w = np.zeros(x_pow.shape[1])
        for k in range(k_fold):
            w, mse_tr, mse_te = cross_validation(y, x_pow, k_indices, k)
            # NOTE(review): sqrt(2 * mse) is an RMSE only if compute_mse
            # carries a 1/2 factor - confirm in Helpers.implementations.
            sum_tr += np.sqrt(2 * mse_tr)
            sum_te += np.sqrt(2 * mse_te)
            sum_w += w
        mean_w = sum_w / k_fold
        mean_error_tr = sum_tr / k_fold
        mean_error_te = sum_te / k_fold
        y_pred = helper.predict_labels(mean_w, x_pow)
        # Vectorised accuracy instead of a per-sample Python loop.
        accuracy = np.mean(np.ravel(y_pred) == np.ravel(y))
        print(degree, accuracy, mean_error_tr, mean_error_te)

cross_validation_demo()

3 0.786476502722 0.394872562311 0.395020028507
4 0.791305046968 0.391824220366 0.3919905432
5 0.793176610371 0.38961614845 0.389777307025
6 0.792408331361 0.390643949434 0.390843533442
7 0.778985782372 0.404186349607 0.404275017546


In [242]:
# parameters
degree = 5
k_fold = 4
seed = 7
k_indices = build_k_indices(y, k_fold, seed)

# Cross-validate least squares at the chosen degree, averaging the per-fold
# weights and errors.
x_pow = addConstant(toDeg(x, degree))
sum_te = 0
sum_tr = 0
sum_w = np.zeros(x_pow.shape[1])
for k in range(k_fold):
    w, mse_tr, mse_te = cross_validation(y, x_pow, k_indices, k)
    sum_tr += np.sqrt(2 * mse_tr)
    sum_te += np.sqrt(2 * mse_te)
    sum_w += w
mean_w = sum_w / k_fold
mean_error_tr = sum_tr / k_fold
mean_error_te = sum_te / k_fold

# Accuracy of the fold-averaged weights on all of x (this includes rows the
# models were fitted on). Vectorised instead of a per-sample Python loop.
y_pred = helper.predict_labels(mean_w, x_pow)
res = (np.ravel(y_pred) == np.ravel(y)).astype(int)
print(degree,res.mean(),mean_error_tr,mean_error_te)

5 0.793131942987 0.389613519902 0.389751710664


In [243]:
# Load the test features and labels through the same helper used for the
# training data.
test_x_path = '../../data/x_test.npy'
test_y_path = '../../data/y_test.npy'
x_test, y_test = helper.load_clean_data(test_x_path, test_y_path)

In [244]:
# Apply the same expansion to the test features as to the training features:
# powers up to `degree`, then a leading column of ones.
x_test_expanded = toDeg(x_test, degree)
xpow_test = addConstant(x_test_expanded)

In [248]:
# Column counts of the raw, expanded-train and expanded-test matrices, one per
# line; the last two have to match for the fitted weights to apply.
print(x.shape[1], x_pow.shape[1], xpow_test.shape[1], sep="\n")

19
96
96


In [249]:
# Predict with the fold-averaged weights, i.e. the model whose score was
# printed after the cross-validation loop. The bare `w` left over from that
# loop only holds the weights of the last fold.
y_pred = helper.predict_labels(mean_w, xpow_test)

In [250]:
# Convert the predictions with helper.changeYfromBinary before writing them.
# NOTE(review): y_pred is overwritten with its own converted value, so running
# this cell twice applies the conversion twice - re-run the prediction cell first.
y_pred = helper.changeYfromBinary(y_pred)

In [251]:
# Read the test CSV; of the three returned values only `ids` is used when the
# submission file is written.
test_csv_path = '../../data/test.csv'
a, b, ids = helper.load_csv_data(test_csv_path)

In [252]:
# Write the ids and their predicted labels to the submission CSV.
submission_path = '../../data/subm3.csv'
helper.create_csv_submission(ids, y_pred, submission_path)