In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import collections
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *
# Path of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# Unpack the three values returned by load_csv_data: per the heading above these
# are the class labels (y), the feature matrix (tX) and the event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
from implementations import *

## Do your crazy machine learning thing here :) ...

### Testing functions

In [None]:
# GD: smoke-test `least_squares_GD`, starting from an all-zero weight vector
# that has one entry per column of tX.
max_iters = 50
gamma = 0.1
initial_w = np.zeros(tX.shape[1])
w, loss = least_squares_GD(y, tX, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

In [None]:
# SGD: smoke-test `least_squares_SGD`, starting from an all-zero weight vector
# that has one entry per column of tX.
max_iters = 20
gamma = 0.0001
initial_w = np.zeros(tX.shape[1])
w, loss = least_squares_SGD(y, tX, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

In [None]:
# LS: smoke-test `least_squares` on y and tX and show the returned
# weights and loss.
w, loss = least_squares(y, tX)
print(f'w:{w}\nloss:{loss}')

In [None]:
# RR: smoke-test `ridge_regression` with a fixed regularisation strength.
lambda_ = 0.1
w, loss = ridge_regression(y, tX, lambda_)
print(f'w:{w}\nloss:{loss}')

In [None]:
# LR: smoke-test `logistic_regression`, starting from an all-zero weight
# vector that has one entry per column of tX.
max_iters = 1000
gamma = 1e-8
initial_w = np.zeros(tX.shape[1])
w, loss = logistic_regression(y, tX, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

In [None]:
# RLR: smoke-test `reg_logistic_regression`, starting from an all-zero weight
# vector that has one entry per column of tX.
lambda_ = 0.01
max_iters = 300
gamma = 1e-9
initial_w = np.zeros(tX.shape[1])
w, loss = reg_logistic_regression(y, tX, lambda_, initial_w, max_iters, gamma)
print(f'w:{w}\nloss:{loss}')

### Cleaning dataset

#### Rescale the labels from [-1,1] to [0,1]

In [None]:
# Min-max rescale the labels so the smallest value maps to 0 and the largest
# to 1 (the prediction task is binary).
# np.min / np.max reduce the array in compiled code and are computed once each;
# the builtin min() / max() would walk the array element by element in Python.
# The -999 entries of tX are deliberately left untouched in this cell.
y_min, y_max = np.min(y), np.max(y)
y = (y - y_min) / (y_max - y_min)


### Remove columns containing over 50% of NULL values

In [None]:
# Drop every feature column in which more than half of the entries equal -999,
# the value this notebook counts as NULL.
#
# The *unrounded* fraction is compared against the threshold: rounding to two
# decimals first would turn e.g. 0.504 into 0.5 and keep a column that is over
# 50% NULL. Rounding is applied only to the printed value.
#
# NOTE: this cell overwrites tX and remove_features (which is reused later for
# the test set), so re-running it on the already-reduced matrix changes
# remove_features. Run it once per kernel session.
NULL_VALUE = -999
NULL_THRESHOLD = 0.5

# Fraction of NULL entries per column, computed in one vectorized pass.
null_fractions = np.mean(tX == NULL_VALUE, axis=0)
for null_fraction in null_fractions:
    null_percentage = round(float(null_fraction), 2)
    print(f'NULL percentage is: {null_percentage}')

# Plain list of column indices, in ascending order.
remove_features = [int(i) for i in np.flatnonzero(null_fractions > NULL_THRESHOLD)]
tX = np.delete(tX, remove_features, 1)

### Removing outliers

In [None]:
# Replace outliers column by column with that column's median.
# A value is an outlier when it lies more than k interquartile ranges below
# the first quartile or above the third quartile.
# NOTE(review): any -999 entries still present in tX take part in the
# percentile computation -- confirm that is intended.
k = 1  # width of the accepted band, in multiples of the interquartile range
for i in range(tX.shape[1]):
    col = tX[:, i]  # view into tX: the assignment below writes through
    q1, q2, q3 = np.percentile(col, [25, 50, 75])
    iqr = q3 - q1
    # Both bounds are tested against the untouched column. This matches two
    # sequential replacements because the median always lies inside the band.
    outliers = (col < q1 - k * iqr) | (col > q3 + k * iqr)
    col[outliers] = q2


### Normalization of features

In [None]:
# Replace tX with the first value returned by `standardize`; the other two
# returned values are discarded.
# NOTE(review): presumably those are the column means and standard deviations.
# If so, consider keeping them so the test set can be scaled with the training
# statistics -- confirm against the helper's definition.
tX, _, _ = standardize(tX)


### Logistic Cross Validation - Searching best Degree

In [None]:
# Cross-validate logistic regression once per candidate polynomial degree and
# keep the degree whose second returned score (stored as rmse_te) is smallest.
degrees = np.arange(6)
k_fold = 5
max_iters = 50
gamma = 1e-8
rmse_tr_list = []
rmse_te_list = []
for D in degrees:
    # The trailing positional argument (1) is passed through as in the original
    # call -- presumably a seed; confirm against the helper's signature.
    rmse_tr, rmse_te = apply_cross_validation_logistic(y, tX, k_fold, D, max_iters, gamma, 1)
    rmse_tr_list.append(rmse_tr)
    rmse_te_list.append(rmse_te)
print(rmse_te_list)
# argmin already returns a position in `degrees`, so index exactly once.
# Indexing `degrees` twice is only correct while degrees == arange(n).
D_best_index = np.argmin(rmse_te_list)
D_best = degrees[D_best_index]
    

In [None]:
# Display the currently selected polynomial degree.
D_best

### Cross Validation Ridge Regression - Best Degree and Lambda

In [None]:
# Grid search: cross-validate ridge regression for every (degree, lambda) pair
# and remember the pair with the smallest value in rmse_te_list.
degrees = np.arange(7)
lambdas = np.logspace(-6, 0, 20)
k_fold = 5
grid_shape = (len(degrees), len(lambdas))
rmse_tr_list = np.zeros(grid_shape)
rmse_te_list = np.zeros(grid_shape)
# np.ndindex walks the grid row by row: all lambdas for the first degree,
# then all lambdas for the next one, and so on.
for degree_pos, lambda_pos in np.ndindex(*grid_shape):
    D = degrees[degree_pos]
    lambda_ = lambdas[lambda_pos]
    # The trailing positional argument (1) is passed through unchanged --
    # presumably a seed; confirm against the helper's signature.
    rmse_tr, rmse_te = apply_cross_validation(y, tX, k_fold, D, lambda_, 1)
    rmse_tr_list[degree_pos, lambda_pos] = rmse_tr
    rmse_te_list[degree_pos, lambda_pos] = rmse_te
# Turn the flat argmin position back into (row, column) grid coordinates.
best_flat_pos = rmse_te_list.argmin()
D_best_index, lambda_best_index = np.unravel_index(best_flat_pos, rmse_te_list.shape)
D_best = degrees[D_best_index]
lambda_best = lambdas[lambda_best_index]
print(f'degree:{D_best} lambda_:{lambda_best}')

### Testing ridge-regression with best Degree

In [None]:
# Fit ridge regression on the degree-D_best polynomial expansion of tX.
# D_best and lambda_best were selected together by the grid search, so the
# matching lambda is used here. A hard-coded value would pair D_best with a
# regularisation strength it was never validated against.
phi_tX = build_poly(tX, D_best)
lambda_ = lambda_best
w, loss = ridge_regression(y, phi_tX, lambda_)
print(f'w:{w}\nloss:{loss}')

## Generate predictions and save output in CSV format for submission:

In [None]:
# Load the test set and push it through the feature pipeline: drop the columns
# listed in remove_features, standardize, then expand to degree D_best.
# NOTE(review): `standardize` is called on the test matrix itself, and the
# outlier replacement applied to tX is not repeated here -- confirm that this
# train/test asymmetry is intended.
DATA_TEST_PATH = '../data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
tX_test = np.delete(tX_test, remove_features, 1)
tX_test, _, _ = standardize(tX_test)
tX_test = build_poly(tX_test, D_best)

In [None]:
# Sanity check: display the shape of the preprocessed test matrix.
tX_test.shape

In [None]:
# Predict labels for the test set with the most recently fitted weights `w`
# and write them to the submission file.
# NOTE(review): y was rescaled to [0, 1] before `w` was fitted -- confirm that
# the decision threshold inside predict_labels matches that encoding.
OUTPUT_PATH = 'output4'  # TODO: fill in desired name of output file for submission
weights = w
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)