# Feature Selection

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import math
from implementations import *
from proj1_helpers import *
from misc_helpers import *
from plot_functions import *
from ml_math import *
%load_ext autoreload
%autoreload 2

SyntaxError: invalid syntax (implementations.py, line 62)

In [None]:
# Path of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv' # TODO: download the training data and supply its path here
# Load the training set; y, tX and ids are reused by the cells below.
# (presumably labels, feature matrix and sample ids — confirm against load_csv_data)
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
# Normalized versions of the labels and features, stored under new names.
# NOTE(review): `normalize` comes from a star import; what it computes, and whether
# normalizing the labels is intended, is not visible here — confirm.
y_norm = normalize(y)
tX_norm = normalize(tX)

In [None]:
def build_k_indices(y, k_fold, seed):
    """Shuffle the row indices of y and cut them into k_fold equal folds.

    Args:
        y: array whose first dimension gives the number of rows.
        k_fold: number of folds to build.
        seed: value used to seed NumPy's global random generator.

    Returns:
        Array of shape (k_fold, rows // k_fold) holding shuffled row indices.
        Rows left over after the integer split are not assigned to any fold.
    """
    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)

    # Seeding the global generator makes the shuffle reproducible.
    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)

    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

def cross_validation(y, x, k_fold, solver = 'LS',lambda_ = 0):
    """Estimate train and test loss of a linear model with k-fold cross-validation.

    Args:
        y: target array, indexed along its first axis.
        x: feature matrix whose rows line up with y.
        k_fold: number of folds.
        solver: 'LS' for least squares or 'RR' for ridge regression.
        lambda_: ridge penalty; only used when solver == 'RR'.

    Returns:
        (mse_tr, mse_te, w): the train and test losses (from compute_MSE)
        averaged over the folds, and the weights fitted on the LAST fold only.

    Raises:
        ValueError: if solver is neither 'LS' nor 'RR'.
    """
    # Validate up front so a typo fails before any fold is fitted.
    if solver not in ('LS', 'RR'):
        raise ValueError("Unknown solver %r: expected 'LS' or 'RR'" % (solver,))

    # Fixed seed: every call splits the rows the same way, so losses are comparable.
    seed = 1
    k_indices = build_k_indices(y, k_fold, seed)

    mse_tr = 0
    mse_te = 0
    for k in range(k_fold):
        # Fold k is held out for testing; the remaining folds form the training set.
        test_indices = k_indices[k]
        train_indices = np.delete(k_indices, k, 0).flatten()
        x_tr, y_tr = x[train_indices], y[train_indices]
        x_te, y_te = x[test_indices], y[test_indices]

        # The loss returned by the solver is ignored; both losses are recomputed below.
        if solver == 'LS':
            w, _ = least_squares(y_tr, x_tr)
        else:
            w, _ = ridge_regression(y_tr, x_tr, lambda_)

        # Accumulate the running mean over the folds.
        mse_tr += compute_MSE(y_tr, x_tr, w)/k_fold
        mse_te += compute_MSE(y_te, x_te, w)/k_fold

    return mse_tr, mse_te, w

# Do some crazy feature selection here

In [None]:
# Baseline: 4-fold cross-validated least squares (default solver) on the raw features and labels.
cross_validation(y,tX,4)

In [None]:
# Same 4-fold least-squares baseline, on the normalized features and labels.
cross_validation(y_norm,tX_norm,4)

In [None]:
def build_poly(x, degree, linear = False):
    """Polynomial basis expansion of x for powers 0 up to degree.

    Args:
        x: either a 2-D array of shape (N, D) or a 1-D array of length N.
        degree: highest power to generate.
        linear: set to True to treat x as a single 1-D feature vector.
            A 1-D x is now handled this way automatically, so the flag is
            only kept for backward compatibility.

    Returns:
        For a 1-D x: array of shape (N, degree + 1) whose column j is x**j.
        For a 2-D x: a column of ones followed by x**1, x**2, ..., x**degree
        stacked side by side, i.e. shape (N, 1 + D * degree).
    """
    x = np.asarray(x)

    if linear or x.ndim == 1:
        # Single feature: one column per power, including the x**0 bias column.
        m = np.zeros((len(x), degree+1))
        for j in range(degree+1):
            m[:,j] = x**j
        return m

    # Feature matrix: bias column first, then one block of D columns per power.
    blocks = [np.ones((x.shape[0], 1))]
    blocks.extend(x**i for i in range(1, degree+1))
    # One concatenate instead of re-copying the growing array on every iteration.
    return np.concatenate(blocks, axis=1)

The constant-only model below gives the noise level. Every feature that we add should beat this error.

In [None]:
# Constant-only model: at degree 0 build_poly keeps just the column of ones.
tX0 = build_poly(tX,0)
print(cross_validation(y,tX0,4))
# Constant-only model again, this time built from the normalized features. At degree 0
# the matrix is only a column of ones, so it does not depend on the feature values.
tX0 = build_poly(tX_norm,0)
print(cross_validation(y,tX0,4))

In [None]:
#Test every degree for every feature
# For each single feature and each polynomial degree 0..10, run 5-fold least-squares
# cross-validation and remember, per feature, the degree with the lowest test loss.
n,p = np.shape(tX)
loss = []  # not used in this cell
good_feat = []  # (feature index, degree) pairs whose test loss is below 0.45
min_loss = np.ones(p)*10000  # best test loss per feature; 10000 acts as a "nothing found yet" sentinel
min_deg = np.zeros(p)  # degree that achieved min_loss (float array; cast with int() where it is used)
for deg in [0,1,2,3,4,5,6,7,8,9,10]:
    for i in range(p):
        # linear=True: expand the single 1-D column i into the powers 0..deg
        tX_ = build_poly(tX[:,i],deg, linear=True)
        loss_tr,loss_te,w = cross_validation(y,tX_,5)
        if loss_te<min_loss[i]:
            min_loss[i] = loss_te
            min_deg[i] = deg
        # NOTE(review): this 0.45 cut-off differs from the 0.43 applied to min_loss
        # in the next cell — confirm which threshold is intended.
        if loss_te < 0.45:
            good_feat.append((i,deg))


In [None]:
# Keep the features whose best cross-validated test loss is below 0.43.
# np.where returns a tuple; the index array is its first element.
id_min_loss = np.where(min_loss < 0.43)
id_min_loss

In [None]:
# Build the training design matrix: a bias column followed by the polynomial
# expansion, at its best degree, of every selected feature.
tX0 = build_poly(tX,0)
for i in  id_min_loss[0]:
    deg = int(min_deg[i])
    # NOTE(review): with linear=True build_poly also emits the x**0 column, so every
    # appended block repeats the all-ones column already present in tX0 — confirm
    # this is intended (any change must be mirrored when building the test matrix).
    tX0 = np.append(tX0, build_poly(tX[:,i], deg, linear=True),1)
tX0

In [None]:
def ridge_regression(y, tx, lambda_):
    """Fit ridge regression by solving the regularized normal equations.

    Solves (tx^T tx + 2 * N * lambda_ * I) w = tx^T y, where N is the number
    of rows of tx.

    Args:
        y: target values.
        tx: feature matrix with one row per sample.
        lambda_: regularization strength.

    Returns:
        (w, loss): the fitted weights and the unpenalized loss
        1 / (2 * N) * e @ e, with e = y - tx @ w.
    """
    shape = np.shape(tx)
    n_features = shape[1]
    n_samples = shape[0]

    # Regularized normal equations: left-hand matrix and right-hand side.
    gram = tx.transpose() @ tx
    penalty = 2 * n_samples * lambda_ * np.identity(n_features)
    rhs = tx.transpose() @ y
    w = np.linalg.solve(gram + penalty, rhs)

    # The reported loss leaves out the penalty term.
    residual = y - tx @ w
    scale = 1 / (2 * len(y))
    loss = (scale * residual) @ residual
    return w, loss

In [None]:
# Sweep the ridge penalty over 30 log-spaced values in [1e-10, 1] and keep the
# weights from the penalty with the lowest 5-fold test loss.
lambdas = np.logspace(-10, 0, 30)
rmse_tr = []  # train losses as returned by cross_validation (no square root is taken here)
rmse_te = []  # test losses as returned by cross_validation
mse_te_min = 10000  # sentinel: any smaller test loss replaces it
w0 = np.ones((p))  # placeholder; replaced once a test loss below the sentinel is seen
for lambda_ in lambdas:
    mse_tr, mse_te, w = cross_validation(y,tX0,5,solver = 'RR',lambda_ = lambda_)
    rmse_tr = np.append(rmse_tr,mse_tr)
    rmse_te = np.append(rmse_te,mse_te)
    if mse_te < mse_te_min:
        # w holds the weights fitted on the last fold only (cross_validation returns those)
        w0 = w
        mse_te_min = mse_te
        

In [None]:
# Visualize the train and test losses across the lambda sweep
# (helper comes from a star import — presumably draws the two curves; confirm).
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

# Test for AIcrowd

In [None]:
# Path of the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = 'data/test.csv' # TODO: download the test data and supply its path here
# The first value returned by load_csv_data is discarded here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Build the test design matrix with the same columns as the training matrix tX0:
# a bias column, then the polynomial expansion of each selected feature.
# Normalization is kept commented out — tX0 is built from the raw tX, so
# normalizing only the test set would be inconsistent:
#tX_test = normalize(tX_test)
tX_test_0 = build_poly(tX_test,0)
for i in  id_min_loss[0]:
    deg = int(min_deg[i])
    tX_test_0 = np.append(tX_test_0, build_poly(tX_test[:,i], deg, linear=True),1)
# Shape check: the number of columns has to match the length of w0.
np.shape(tX_test_0)

In [None]:
# Show the weights kept from the lambda sweep.
w0

In [None]:
# NOTE(review): the file name says "norm", but the test features above are built
# from the raw tX_test — confirm the name is still accurate.
OUTPUT_PATH = 'result/feature_selection_norm.csv' # TODO: fill in desired name of output file for submission
# Predict test labels with the selected weights and write the submission file.
y_pred = predict_labels(w0, tX_test_0)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

# More advanced

Let's start from Remi's conclusion, which is to keep only the following features:\
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 (the id in the feature array is one less)\
First, drop the features that contain undefined values. We will use:\
2, 3, 6, 7, 9, 10, 11, 13

Let's see, for each feature, which degree is the best.

In [None]:
# Candidate features from the list above.
# NOTE(review): the text above says the id in the feature array is one less than
# the listed number — confirm whether these values are ids or column indices.
good_feat = [2,3,6,7,9,10,11,13]

# Cross-validated ridge test loss for every (feature, degree) pair.
# NOTE(review): lambda_ is whatever value an earlier cell left in the namespace
# (the last penalty of the sweep) — set it explicitly if another value is wanted.
degree_losses = {}
for feat in good_feat:
    for deg in range(10):
        # build_poly needs the degree, and a single 1-D column needs linear=True.
        tX_feat = build_poly(tX[:,feat], deg, linear=True)
        # Index 1 of the returned tuple is the mean test loss.
        degree_losses[(feat, deg)] = cross_validation(y, tX_feat, 5,solver = 'RR',lambda_ = lambda_)[1]

# Degree with the lowest test loss for each feature.
best_degree = {feat: min(range(10), key=lambda d: degree_losses[(feat, d)])
               for feat in good_feat}
best_degree

Now let's see whether there is a $1/x$ relation.