In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import collections
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# Star-import of the project helpers; load_csv_data used below presumably comes from here.
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
# Bind the three returned values as class labels (y), feature matrix (tX) and event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
from implementations import *

## Do your crazy machine learning thing here :) ...

### Testing functions

In [None]:
# GD: smoke test of least_squares_GD, starting from an all-zero weight vector.
max_iters, gamma = 50, 0.1
initial_w = np.zeros((tX.shape[1],))
w, loss = least_squares_GD(y, tX, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

In [None]:
# SGD: smoke test of least_squares_SGD, starting from an all-zero weight vector.
max_iters, gamma = 20, 1e-4
initial_w = np.zeros((tX.shape[1],))
w, loss = least_squares_SGD(y, tX, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

In [None]:
# LS: smoke test of least_squares on the full feature matrix; prints the
# returned weights and loss.
w,loss = least_squares(y,tX)
print(f'w:{w}\nloss:{loss}')

In [None]:
# RR: smoke test of ridge_regression with a fixed regularisation strength;
# prints the returned weights and loss.
lambda_=0.1
w,loss = ridge_regression(y,tX,lambda_)
print(f'w:{w}\nloss:{loss}')

In [None]:
# LR: smoke test of logistic_regression, starting from an all-zero weight vector.
# NOTE(review): labels are still the raw values from load_csv_data at this point
# (the [0, 1] rescaling happens in a later cell) -- confirm the helper expects that.
max_iters, gamma = 300, 1e-9
initial_w = np.zeros((tX.shape[1],))
w, loss = logistic_regression(y, tX, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

In [None]:
# RLR: smoke test of reg_logistic_regression, starting from an all-zero weight vector.
max_iters, gamma, lambda_ = 300, 1e-9, 0.01
initial_w = np.zeros((tX.shape[1],))
w, loss = reg_logistic_regression(y, tX, lambda_, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

### Cleaning dataset

### Subsample


In [4]:
# Keep every N-th event (rows 0, N, 2N, ...) of both the labels and the features.
# Caution: y and tX are overwritten, so re-running this cell subsamples again.
N = 3
y, tX = y[::N], tX[::N]
print(f'tX shape:{tX.shape} y shape:{y.shape}')

tX shape:(83334, 30) y shape:(83334,)


#### Transform labels from {-1, 1} to {0, 1}

In [5]:
# Disabled experiment, kept for reference: tX[:,:][tX[:,:] == -999] = 0
#
# Min-max rescale the labels onto [0, 1], since this is a binary prediction task.
# The bounds are computed once with NumPy reductions instead of repeated builtin
# min()/max() calls, which walk the array element by element in Python.
# Re-running the cell is harmless: labels already in [0, 1] map onto themselves.
y_low, y_high = np.min(y), np.max(y)
if y_high == y_low:
    # A constant label vector would otherwise turn into NaNs via 0/0.
    raise ValueError('cannot rescale labels: all values are identical')
y = (y - y_low) / (y_high - y_low)


### Split dataset

In [6]:
# Partition features and labels into sub-datasets; later cells index tX_list and
# y_list by the same group number.
# NOTE(review): the printed shapes show four groups with different column counts.
# Presumably there is one group per value of feature column 22, which is the
# column used as the group key at prediction time -- confirm in split_data.
tX_list,y_list=split_data(tX, y)

tX_0 shape: (33203, 19)
y_0 shape: (33203,)
tX_1 shape: (25881, 22)
y_1 shape: (25881,)
tX_2 shape: (16823, 29)
y_2 shape: (16823,)
tX_3 shape: (7427, 29)
y_3 shape: (7427,)


In [None]:
# Display the first row of the first sub-dataset as a quick sanity check.
tX_list[0][0,:]

### Remove columns containing over 39% of NULL values

In [None]:
# Drop every feature column whose share of missing values (encoded as -999)
# is above 39%.
#
# The per-column share is computed with one vectorised comparison instead of a
# Python-level Counter per column. The threshold is applied to the exact
# fraction; rounding is used for display only, so a column at e.g. 39.4% is no
# longer let through because it rounds down to 0.39.
#
# NOTE(review): this cell edits tX only. Sub-datasets already produced by
# split_data (tX_list) are not affected, and removing columns shifts the index
# of every later column -- check any code that addresses columns by position.
null_fractions = np.mean(tX == -999, axis=0)
for null_percentage in null_fractions:
    print(f'NULL percentage is: {round(float(null_percentage), 2)}')
remove_features = [i for i, fraction in enumerate(null_fractions) if fraction > 0.39]
tX = np.delete(tX, remove_features, 1)

### Removing outliers

In [None]:
# Outlier clipping, done in place per sub-dataset and per feature: any value
# further than k interquartile ranges below the first quartile or above the
# third quartile is replaced by that feature's median.
for j, x in enumerate(tX_list):
    k = 1
    for i in range(x.shape[1]):
        q1, q2, q3 = np.percentile(x[:, i], [25, 50, 75])
        spread = k * (q3 - q1)
        # Basic slicing yields a view, so writing to it updates tX_list[j].
        column = tX_list[j][:, i]
        column[(column < q1 - spread) | (column > q3 + spread)] = q2

### Normalization of features


In [None]:
# Replace each sub-dataset by the result of standardize().
# Caution: the list is overwritten, so re-running standardizes the output again.
for i in range(len(tX_list)):
    tX_list[i] = standardize(tX_list[i])


In [None]:
# Display the first row of the last sub-dataset to eyeball the preprocessing result.
tX_list[3][0,:]

### Remove less influential features

In [None]:
# Fit least squares on the full (unsplit) data and keep only the weights whose
# magnitude is at least `threshold`; shapes are printed before and after.
# NOTE(review): only the weight vector w is filtered. No column is removed from
# tX or tX_list, so no feature is actually dropped for the later cells.
threshold=1e-3
w,loss = least_squares(y,tX)
print(f'w:{w}\nloss:{loss}')
print(f'w:{w}\n\tshape:{w.shape}')
# Boolean mask keeps entries with |w| >= threshold.
w=w[np.abs(w)>=threshold]
print(f'w:{w}\n\tshape:{w.shape}')


### Logistic Cross Validation - Searching best Degree

In [7]:
# Grid search for logistic regression, run separately on every sub-dataset.
# The grid covers polynomial degree x step size gamma x interaction terms; each
# point is scored with apply_cross_validation_logistic and the combination with
# the smallest test loss is stored in tX_results_list_logistic.
#
# The winner is chosen with np.nanargmin rather than np.argmin. np.argmin
# returns the index of the first NaN, so one diverged run (plausible for the
# largest step sizes in the grid) would have been reported as the best setting.
# np.nanargmin ignores NaNs and raises ValueError if every entry is NaN.
degrees = np.arange(3)
k_fold = 4
max_iters = 100
gammas = np.logspace(-10, 0, 10)
interactions = [False, True]
tX_results_list_logistic = []
for h, x in enumerate(tX_list):
    # Loss grids are indexed as [degree, gamma, interaction].
    loss_tr_list = np.zeros((len(degrees), len(gammas), len(interactions)))
    loss_te_list = np.zeros((len(degrees), len(gammas), len(interactions)))
    for i, D in enumerate(degrees):
        print(i)
        for k, interaction in enumerate(interactions):
            # The expansion depends only on (D, interaction), so build it once
            # and reuse it for every gamma.
            phi_x = build_poly(x, D, interaction)
            for j, gamma in enumerate(gammas):
                # Compute the losses with cross-validation.
                loss_tr, loss_te = apply_cross_validation_logistic(y_list[h], phi_x, k_fold, max_iters, gamma, 1)
                loss_tr_list[i, j, k] = loss_tr
                loss_te_list[i, j, k] = loss_te
    D_best_index, gamma_best_index, interaction_index = np.unravel_index(np.nanargmin(loss_te_list), loss_te_list.shape)
    gamma_best = gammas[gamma_best_index]
    D_best_logistic = degrees[D_best_index]
    interaction_logistic = interactions[interaction_index]
    print(f'tX_{h} Best degree logistic: {D_best_logistic}, best gamma logistic:{gamma_best}, interaction:{interaction_logistic}')
    tX_results_list_logistic.append({'D_best': D_best_logistic, 'gamma_best': gamma_best, 'interaction': interaction_logistic})

0
1
2
tX_0 Best degree logistic: 2, best gamma logistic:2.782559402207126e-06, interaction:True
0
1
2
tX_1 Best degree logistic: 2, best gamma logistic:2.782559402207126e-06, interaction:True
0
1
2
tX_2 Best degree logistic: 1, best gamma logistic:3.5938136638046256e-05, interaction:True
0
1
2
tX_3 Best degree logistic: 2, best gamma logistic:3.5938136638046256e-05, interaction:True


### Cross Validation Ridge Regression - Best Degree

In [8]:
# Grid search for ridge regression, run separately on every sub-dataset.
# The grid covers polynomial degree x lambda x interaction terms; the combination
# with the smallest test RMSE is stored in tX_results_list_ridge.
# NOTE(review): phi_x is already expanded to degree D and D is also passed to
# apply_cross_validation -- confirm the helper does not expand a second time.
degrees = np.arange(3)
lambdas = np.logspace(-10, 0, 10)
k_fold = 4
interactions = [False, True]
tX_results_list_ridge = []
for h, x in enumerate(tX_list):
    # Error grids are indexed as [degree, lambda, interaction].
    grid_shape = (len(degrees), len(lambdas), len(interactions))
    rmse_tr_list = np.zeros(grid_shape)
    rmse_te_list = np.zeros(grid_shape)
    for i, D in enumerate(degrees):
        print(f'Degree:{D}')
        for k, interaction in enumerate(interactions):
            phi_x = build_poly(x, D, interaction)
            for j, lambda_ in enumerate(lambdas):
                # Cross-validated train/test error for this grid point.
                rmse_tr_list[i, j, k], rmse_te_list[i, j, k] = apply_cross_validation(
                    y_list[h], phi_x, k_fold, D, lambda_, 1
                )
    best_flat_index = np.argmin(rmse_te_list)
    D_best_index, lambda_best_index, interaction_index = np.unravel_index(best_flat_index, rmse_te_list.shape)
    D_best_ridge = degrees[D_best_index]
    lambda_best_ridge = lambdas[lambda_best_index]
    interaction_ridge = interactions[interaction_index]
    print(f'tX_{h} Best degree ridge:{D_best_ridge}, best lambda_ ridge:{lambda_best_ridge}, interactions: {interaction_ridge}')
    tX_results_list_ridge.append(
        dict(D_best=D_best_ridge, lambda_best=lambda_best_ridge, interaction=interaction_ridge)
    )

Degree:0
Degree:1
Degree:2
tX_0 Best degree ridge:2, best lambda_ ridge:3.5938136638046256e-05, interactions: True
Degree:0
Degree:1
Degree:2
tX_1 Best degree ridge:2, best lambda_ ridge:0.0004641588833612782, interactions: True
Degree:0
Degree:1
Degree:2
tX_2 Best degree ridge:2, best lambda_ ridge:0.0004641588833612782, interactions: True
Degree:0
Degree:1
Degree:2
tX_3 Best degree ridge:2, best lambda_ ridge:0.07742636826811278, interactions: True


### Testing accuracy (ridge-regression vs logistic)

In [9]:
# Refit ridge and logistic regression on every sub-dataset with the
# hyper-parameters selected by the grid searches, keep the fitted weights for
# the submission cells, and print both accuracies.
# The accuracies are computed on the same rows the models were fitted on, so
# they are training accuracies rather than held-out estimates.
weights_ridge = []
weights_logistic = []
for i, x in enumerate(tX_list):
    ridge_cfg = tX_results_list_ridge[i]
    D_best_ridge = ridge_cfg['D_best']
    interaction_ridge = ridge_cfg['interaction']
    lambda_best_ridge = ridge_cfg['lambda_best']

    logistic_cfg = tX_results_list_logistic[i]
    D_best_logistic = logistic_cfg['D_best']
    interaction_logistic = logistic_cfg['interaction']
    gamma_best = logistic_cfg['gamma_best']

    # Map the [0, 1] labels back to [-1, 1] for scoring.
    y_act = 2 * y_list[i] - 1

    # Ridge
    phi_x_ridge = build_poly(x, D_best_ridge, interaction_ridge)
    w_ridge, _ = ridge_regression(y_list[i], phi_x_ridge, lambda_best_ridge)
    weights_ridge.append(w_ridge)
    y_pred_ridge = predict_labels(w_ridge, phi_x_ridge)
    accuracy_ridge = accuracy(y_pred_ridge, y_act)

    # Logistic
    phi_x_logistic = build_poly(x, D_best_logistic, interaction_logistic)
    w_initial = np.zeros((phi_x_logistic.shape[1],))
    max_iters = 100
    w_logistic, loss_logistic = logistic_regression(
        y_list[i], phi_x_logistic, w_initial, max_iters, gamma_best
    )
    weights_logistic.append(w_logistic)
    y_pred_logistic = predict_labels(w_logistic, phi_x_logistic, logistic=True)
    accuracy_logistic = accuracy(y_pred_logistic, y_act)

    print(f'tX_{i} -- Accuracy ridge:{accuracy_ridge} Accuracy logistic:{accuracy_logistic}')

tX_0 -- Accuracy ridge:0.8383880974610728 Accuracy logistic:0.8376652712104328
tX_1 -- Accuracy ridge:0.8036783740968277 Accuracy logistic:0.7983076388083923
tX_2 -- Accuracy ridge:0.8403970754324437 Accuracy logistic:0.8301135350413125
tX_3 -- Accuracy ridge:0.8388312912346843 Accuracy logistic:0.8490642251245456


## Generate predictions and save output in csv format for submission:

In [10]:
DATA_TEST_PATH = '../data/test.csv'
# Same loader as for the training file; the first returned value is discarded here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)


In [11]:
#dummy predict for obtaining y
w_dummy=np.zeros(tX_test.shape[1])
y_pred=predict_labels(w_dummy,tX_test)
tX_test_list,_=split_data(tX_test,y_pred,ignore_y=True)

tX_0 shape: (227458, 19)
tX_1 shape: (175338, 22)
tX_2 shape: (114648, 29)
tX_3 shape: (50794, 29)


In [12]:
# Predict each test sub-dataset with its own ridge model and write the result
# back into y_pred at the rows of that group (rows where feature column 22 == i).
# NOTE(review): assumes split_data keeps the original row order inside each
# group; otherwise predictions land on the wrong rows -- confirm in split_data.
# NOTE(review): no outlier clipping or standardization is applied to the test
# features in this cell; if those steps are run on the training data, the test
# data needs the same treatment.
for i, best_ridge in enumerate(tX_results_list_ridge):
    D_best_ridge = best_ridge['D_best']
    interaction_ridge = best_ridge['interaction']
    lambda_best_ridge = best_ridge['lambda_best']

    tX_test_ridge = build_poly(tX_test_list[i], D_best_ridge, interaction_ridge)
    weights = weights_ridge[i]
    group_predictions = predict_labels(weights, tX_test_ridge)
    y_pred[tX_test[:, 22] == i] = group_predictions

OUTPUT_PATH = 'output_ridge.csv'  # submission file for the ridge models
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# Predict each test sub-dataset with its own logistic model and write the result
# back into y_pred at the rows of that group (rows where feature column 22 == i).
# y_pred is reused between submission cells; rows not matched by any group keep
# whatever value they held before this cell ran.
for i, best_logistic in enumerate(tX_results_list_logistic):
    D_best_logistic = best_logistic['D_best']
    interaction_logistic = best_logistic['interaction']
    gamma_best = best_logistic['gamma_best']

    tX_test_logistic = build_poly(tX_test_list[i], D_best_logistic, interaction_logistic)
    weights = weights_logistic[i]
    group_predictions = predict_labels(weights, tX_test_logistic, logistic=True)
    y_pred[tX_test[:, 22] == i] = group_predictions

OUTPUT_PATH = 'output_logistic.csv'  # submission file for the logistic models
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)