In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import collections
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# y: class labels, tX: feature matrix, ids: event ids (see the section heading above).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
from implementations import *

## Do your crazy machine learning thing here :) ...

### Testing functions

In [None]:
# GD: smoke-test `least_squares_GD` on the full training matrix,
# starting from an all-zero weight vector (one weight per column of tX).
initial_w = np.zeros(tX.shape[1])
max_iters = 50
gamma = 1e-1
w, loss = least_squares_GD(y, tX, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

In [None]:
# SGD: smoke-test `least_squares_SGD` on the full training matrix,
# starting from an all-zero weight vector (one weight per column of tX).
initial_w = np.zeros(tX.shape[1])
max_iters = 20
gamma = 1e-4
w, loss = least_squares_SGD(y, tX, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

In [None]:
# LS: smoke-test `least_squares` on the raw training data and show the result.
w, loss = least_squares(y, tX)
print(f'w:{w}\nloss:{loss}')

In [None]:
# RR: smoke-test `ridge_regression` with a fixed regularisation strength.
lambda_ = 0.1
w, loss = ridge_regression(y, tX, lambda_)
print(f'w:{w}\nloss:{loss}')

In [None]:
# LR: smoke-test `logistic_regression` from an all-zero weight vector.
# NOTE(review): at this point y has not been rescaled yet (the [-1,1] -> [0,1]
# transform happens in a later cell) — confirm the helper accepts these labels.
initial_w = np.zeros(tX.shape[1])
max_iters = 300
gamma = 1e-9
w, loss = logistic_regression(y, tX, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

In [None]:
# RLR: smoke-test `reg_logistic_regression` from an all-zero weight vector.
# NOTE(review): at this point y has not been rescaled yet (the [-1,1] -> [0,1]
# transform happens in a later cell) — confirm the helper accepts these labels.
initial_w = np.zeros(tX.shape[1])
max_iters = 300
gamma = 1e-9
lambda_ = 0.01
w, loss = reg_logistic_regression(y, tX, lambda_, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

### Cleaning dataset

### Subsample


In [None]:
# Keep every N-th sample, starting at index 1, for both labels and features.
# Caution: y and tX are overwritten, so re-running this cell subsamples again.
N = 4
y, tX = y[1::N], tX[1::N]

#### Transform [-1,1] into [0,1]

In [None]:
# Min-max rescale the labels so they lie in [0,1] instead of [-1,1]: the task is a
# binary prediction and this encoding fits the structure of the logistic regression.
# The extrema are computed once with NumPy reductions; the builtin min()/max() would
# iterate over the array element by element in Python, and did so three times.
y_min, y_max = np.min(y), np.max(y)
y = (y - y_min)/(y_max - y_min)


### Remove columns containing over 39% of NULL values

In [None]:
# Remove every column whose fraction of missing values (encoded as -999) is
# strictly greater than `percentage`.
# Caution: tX is overwritten and `remove_features` is reused later to drop the same
# columns from the test matrix, so this cell must run exactly once per data load.
percentage = 0.39

# Fraction of -999 entries in each column, computed in one vectorised pass.
null_fractions = np.mean(tX == -999, axis=0)
for null_fraction in null_fractions:
    # Rounding is for display only.
    null_percentage = round(float(null_fraction), 2)
    print(f'NULL percentage is: {null_percentage}')

# Compare the *unrounded* fraction against the threshold: rounding first would,
# for example, turn 0.394 into 0.39 and wrongly keep that column.
remove_features = np.flatnonzero(null_fractions > percentage).tolist()
tX = np.delete(tX, remove_features, 1)

### Removing outliers

In [None]:
# Replace outliers, column by column, with that column's median.
# A value is an outlier when it lies more than k inter-quartile ranges below the
# first quartile or above the third quartile.
# Note: any -999 entries still present in the kept columns take part in the
# percentile computation.
k = 1
for i in range(tX.shape[1]):
    column = tX[:, i]  # view into tX, so the assignments below modify tX in place
    q1, q2, q3 = np.percentile(column, [25, 50, 75])
    spread = k*(q3-q1)
    column[column < q1 - spread] = q2
    column[column > q3 + spread] = q2

### Normalization of features


In [None]:
# Feature scaling: standardize the training features with the project's
# `standardize` helper and overwrite tX with the result.
# (presumably zero mean / unit variance per feature — TODO confirm against the helper)
tX = standardize(tX)

### Remove less influential features

In [None]:
# Fit least squares on the cleaned data and keep only the weights whose magnitude
# reaches `threshold`.
# NOTE(review): only the weight vector `w` is filtered here; tX itself is left
# untouched, so no feature is actually removed by this cell.
threshold = 1e-3
w, loss = least_squares(y, tX)
print(f'w:{w}\nloss:{loss}')
print(f'w:{w}\n\tshape:{w.shape}')
significant = np.abs(w) >= threshold
w = w[significant]
print(f'w:{w}\n\tshape:{w.shape}')


### Logistic Cross Validation - Searching best Degree

In [None]:
# Grid search for logistic regression over polynomial degree, step size (gamma) and
# whether interaction terms are added, scored by the cross-validated test loss.
degrees=np.arange(3)
k_fold=4
max_iters=100
gammas=np.logspace(-10,0,10)
interactions=[False,True]
# Loss tensors indexed as [degree, gamma, interaction].
loss_tr_list=np.zeros((len(degrees),len(gammas),len(interactions)))
loss_te_list=np.zeros((len(degrees),len(gammas),len(interactions)))

for i,D in enumerate(degrees):
    print(i)
    for k,interaction in enumerate(interactions):
        # The feature expansion only depends on (D, interaction): build it once per pair.
        phi_x=build_poly(tX, D, interaction)
        for j,gamma in enumerate(gammas):
            #compute loss with cross-validation
            loss_tr, loss_te=apply_cross_validation_logistic(y,phi_x,k_fold,max_iters,gamma,1)
            loss_tr_list[i,j,k]=loss_tr
            loss_te_list[i,j,k]=loss_te
# np.argmin returns the position of the first NaN when one is present, so a single
# diverged run (plausible for the largest step sizes) would be selected as "best".
# np.nanargmin ignores NaN entries and picks the smallest valid loss instead.
D_best_index, gamma_best_index, interaction_index=np.unravel_index(np.nanargmin(loss_te_list),loss_te_list.shape)
gamma_best=gammas[gamma_best_index]
D_best_logistic=degrees[D_best_index]
interaction_logistic=interactions[interaction_index]
print(f'Best degree logistic: {D_best_logistic}, best gamma logistic:{gamma_best}, interaction:{interaction_logistic}')

### Cross Validation Ridge Regression - Best Degree

In [None]:
# Grid search for ridge regression over polynomial degree, regularisation strength
# (lambda_) and whether interaction terms are added, scored by the cross-validated
# test RMSE.
degrees=np.arange(3)
lambdas=np.logspace(-10,0,10)
k_fold=4
interactions=[False,True]
# RMSE tensors indexed as [degree, lambda, interaction].
rmse_tr_list=np.zeros((len(degrees),len(lambdas),len(interactions)))
rmse_te_list=np.zeros((len(degrees),len(lambdas),len(interactions)))
for i,D in enumerate(degrees):
    print(f'Degree:{D}')
    for k,interaction in enumerate(interactions):
        # The feature expansion only depends on (D, interaction): build it once per pair.
        phi_x=build_poly(tX, D, interaction)
        for j,lambda_ in enumerate(lambdas):
            #compute loss with cross-validation
            rmse_tr, rmse_te=apply_cross_validation(y,phi_x,k_fold,D,lambda_,1)
            rmse_tr_list[i,j,k]=rmse_tr
            rmse_te_list[i,j,k]=rmse_te
# np.argmin returns the position of the first NaN when one is present; np.nanargmin
# skips NaN entries so a numerically failed configuration is never picked as "best".
D_best_index,lambda_best_index,interaction_index=np.unravel_index(np.nanargmin(rmse_te_list),rmse_te_list.shape)
D_best_ridge=degrees[D_best_index]
lambda_best_ridge=lambdas[lambda_best_index]
interaction_ridge=interactions[interaction_index]
print(f'Best degree ridge:{D_best_ridge}, best lambda_ ridge:{lambda_best_ridge}, interactions: {interaction_ridge}')

### Testing accuracy (ridge-regression vs logistic)

In [None]:
# Map the [0,1] labels back to the [-1,1] encoding used for scoring.
y_act = 2*y - 1

# --- Ridge regression with the hyper-parameters selected by cross-validation ---
# Note: the accuracy below is measured on the same samples the model is fitted on.
# NOTE(review): the model is fitted on the [0,1] labels but scored against y_act —
# confirm that predict_labels thresholds accordingly.
phi_tX_ridge = build_poly(tX, D_best_ridge, interaction_ridge)
w_ridge, _ = ridge_regression(y, phi_tX_ridge, lambda_best_ridge)
y_pred_ridge = predict_labels(w_ridge, phi_tX_ridge)
accuracy_ridge = accuracy(y_pred_ridge, y_act)

# --- Logistic regression with the hyper-parameters selected by cross-validation ---
phi_tX_logistic = build_poly(tX, D_best_logistic, interaction_logistic)
w_initial = np.zeros(phi_tX_logistic.shape[1])
max_iters = 100
w_logistic, loss_logistic = logistic_regression(
    y, phi_tX_logistic, w_initial, max_iters, gamma_best
)
y_pred_logistic = predict_labels(w_logistic, phi_tX_logistic, logistic=True)
accuracy_logistic = accuracy(y_pred_logistic, y_act)

print(f'Accuracy ridge:{accuracy_ridge} Accuracy logistic:{accuracy_logistic}')

## Generate predictions and save output in csv format for submission:

In [None]:
# Load the test set and run it through the same preprocessing stages as the
# training features: column removal -> outlier replacement -> standardization.
DATA_TEST_PATH = '../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Drop the columns that were removed from the training matrix.
tX_test=np.delete(tX_test,remove_features,1)

# Outlier replacement was previously skipped for the test set, so extreme values
# (including any remaining -999 sentinels) reached the model although it was trained
# on data where they had been replaced by the column median. Apply the same rule
# (k = 1 inter-quartile range beyond the quartiles) here.
# The quartiles come from the test set itself, mirroring how `standardize` is applied
# to the test set on its own below; reusing training statistics would be preferable
# but they are not kept by the training cells.
k_outlier = 1
q1_test, q2_test, q3_test = np.percentile(tX_test, [25, 50, 75], axis=0)
iqr_test = q3_test - q1_test
outlier_mask = (tX_test < q1_test - k_outlier*iqr_test) | (tX_test > q3_test + k_outlier*iqr_test)
# q2_test has one entry per column and broadcasts across the rows.
tX_test = np.where(outlier_mask, q2_test, tX_test)

tX_test= standardize(tX_test)

In [None]:
# Ridge submission: expand the test features exactly like the training features,
# predict with the fitted ridge weights and write the CSV.
OUTPUT_PATH = 'output_ridge.csv'
weights = w_ridge
tX_test_ridge = build_poly(tX_test, D_best_ridge, interaction_ridge)
y_pred = predict_labels(weights, tX_test_ridge)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# Logistic submission: expand the test features exactly like the training features,
# predict with the fitted logistic weights and write the CSV.
OUTPUT_PATH = 'output_logistic.csv'
weights = w_logistic
tX_test_logistic = build_poly(tX_test, D_best_logistic, interaction_logistic)
y_pred = predict_labels(weights, tX_test_logistic, logistic=True)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)