In [1]:
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [2]:
from proj1_helpers import *
from helpers import *
from imp_functions import *
from implementations import *

In [17]:
y, x, ids = load_csv_data('train.csv')

In [4]:
def preprocess_and_call_log_reg(x,y,max_iters=1000,gamma=0.001):

    x,_ = remove_outliers(x)
    x, mean_x, std_x = normalize(x)
    
    x = build_model_data(x)
    
    y = convert_label(y,0)
    initial_w = np.zeros((x.shape[1]))
    
    x_tr, x_val, y_tr, y_val = split_data(x, y, ratio = 0.8, myseed =7) #split the data for validation
    
    weight,_ = logistic_regression(y_tr, x_tr, initial_w, max_iters, gamma)

    y_tr_pred = predict(x_tr,weight)

    
    y_val_pred = predict(x_val,weight)
    
    print('Training Accuracy : ', np.sum(y_tr_pred==convert_label(y_tr,-1))/y_tr.shape[0])
    print('Validation Accuracy : ',np.sum(y_val_pred == convert_label(y_val,-1))/y_val.shape[0] )

In [5]:
preprocess_and_call_log_reg(x,y)

Training Accuracy :  0.711245
Validation Accuracy :  0.70742


In [6]:
def preprocess_and_call_regularized_log_reg(x,y,max_iters=1000,gamma=0.001,lambda_=0.005):

    x,_ = remove_outliers(x)

    x, mean_x, std_x = normalize(x)
    
    x, _ = remove_high_correlation(x, threshold=0.9, visualisation=False)
    x, _ = remove_low_correlation_with_label(x,y, threshold=0.005)
    
    x = build_model_data(x)
    
    y = convert_label(y,0)
    initial_w = np.zeros((x.shape[1]))
    
    x_tr, x_val, y_tr, y_val = split_data(x, y, ratio = 0.8, myseed =7) #split the data for validation
    
    weight,_ = reg_logistic_regression(y_tr, x_tr, lambda_, initial_w, max_iters, gamma)

    y_tr_pred = predict(x_tr,weight)

    
    y_val_pred = predict(x_val,weight)
    
    print('Training Accuracy : ', np.sum(y_tr_pred==convert_label(y_tr,-1))/y_tr.shape[0])
    print('Validation Accuracy : ',np.sum(y_val_pred == convert_label(y_val,-1))/y_val.shape[0] )

In [7]:
preprocess_and_call_regularized_log_reg(x,y,max_iters=1000,gamma=0.001,lambda_=0.05)

Removing columns:  [21 29]
Removing columns:  [14 15 17 18 23 24 26 27]
Training Accuracy :  0.71369
Validation Accuracy :  0.71026


In [8]:
def batch_gradient_descent(
        y, tx, initial_w, batch_size, max_iters, gamma, lambda_):
    """Batch gradient descent algorithm with variable learning rate"""

    w = initial_w
    losses = []
    for i in range(max_iters):
        if i > 150 and i <= 300:
            gamma = 0.05
        elif i > 300 and i <= 700 :
            gamma = 0.01
        elif i > 700 and i <=1000 :
            gamma = 0.001
        elif i > 1000 and i < 1300 :
            gamma = 0.0005
        elif i > 1300 and i <= 1600 :
            gamma = 0.0001
        elif i > 1600 and  i<=1800:
            gamma = 0.00005
        elif i > 1800:
            gamma = 0.00001

        for minibatch_y,minibatch_tx in batch_iter(y,tx,batch_size):
            
            loss, grad =  reg_logistic_gradient(minibatch_y, minibatch_tx, lambda_,initial_w)
            w = w - gamma * grad
            losses.append(loss)
            
        avg_loss = np.mean(losses)
        
    return w,losses

In [9]:
def batch_GD_NLL_Loss(x,y,batch_size,max_iters=1000,gamma=0.001,lambda_=0.005):

    x,_ = remove_outliers(x)

    x, mean_x, std_x = normalize(x)
    
    x, _ = remove_high_correlation(x, threshold=0.9, visualisation=False)
    x, _ = remove_low_correlation_with_label(x,y, threshold=0.005)
    
    x = build_poly(x,2)
    
    y = convert_label(y,0)
    initial_w = np.zeros((x.shape[1]))
    
    x_tr, x_val, y_tr, y_val = split_data(x, y, ratio = 0.8, myseed =7) #split the data for validation
    
    weight,_ = batch_gradient_descent(y_tr, x_tr, initial_w,batch_size, max_iters, gamma,lambda_)

    y_tr_pred = predict(x_tr,weight)

    
    y_val_pred = predict(x_val,weight)
    
    print('Training Accuracy : ', np.sum(y_tr_pred==convert_label(y_tr,-1))/y_tr.shape[0])
    print('Validation Accuracy : ',np.sum(y_val_pred == convert_label(y_val,-1))/y_val.shape[0] )

In [10]:
batch_GD_NLL_Loss(x,y,256,max_iters=2000,gamma=0.1,lambda_= 0.005)

Removing columns:  [21 29]
Removing columns:  [14 15 17 18 23 24 26 27]
Training Accuracy :  0.69437
Validation Accuracy :  0.69058


In [18]:
def preprocess_and_call_least_squares(x,y):

    x,_ = remove_outliers(x)
    x, mean_x, std_x = normalize(x)
    
    x = build_model_data(x)
    
    initial_w = np.zeros((x.shape[1]))
    
    x_tr, x_val, y_tr, y_val = split_data(x, y, ratio = 0.8, myseed =7) #split the data for validation
    
    weight,_ = least_squares(y_tr, x_tr)

    y_tr_pred = predict(x_tr,weight)

    
    y_val_pred = predict(x_val,weight)
    
    print('Training Accuracy : ', np.sum(y_tr_pred==y_tr)/y_tr.shape[0])
    print('Validation Accuracy : ',np.sum(y_val_pred == y_val)/y_val.shape[0])

In [19]:
preprocess_and_call_least_squares(x,y)

Training Accuracy :  0.74526
Validation Accuracy :  0.74006


In [20]:
def preprocess_and_call_least_squares_GD(x,y,max_iters=1000,gamma=0.001):

    x,_ = remove_outliers(x)
    x, mean_x, std_x = normalize(x)
    
    x = build_model_data(x)
    
    initial_w = np.zeros((x.shape[1]))
    
    x_tr, x_val, y_tr, y_val = split_data(x, y, ratio = 0.8, myseed =7) #split the data for validation
    
    weight,_ = least_squares_GD(y_tr, x_tr, initial_w, max_iters, gamma)

    y_tr_pred = predict(x_tr,weight)

    
    y_val_pred = predict(x_val,weight)
    
    print('Training Accuracy : ', np.sum(y_tr_pred==y_tr)/y_tr.shape[0])
    print('Validation Accuracy : ',np.sum(y_val_pred == y_val)/y_val.shape[0])

In [21]:
preprocess_and_call_least_squares_GD(x,y,max_iters=1000,gamma=0.001)

Training Accuracy :  0.72622
Validation Accuracy :  0.72128


In [22]:
def preprocess_and_call_least_squares_SGD(x,y,max_iters=1000,gamma=0.001):

    x,_ = remove_outliers(x)
    x, mean_x, std_x = normalize(x)
    
    x = build_model_data(x)
    
    initial_w = np.zeros((x.shape[1]))
    
    x_tr, x_val, y_tr, y_val = split_data(x, y, ratio = 0.8, myseed =7) #split the data for validation
    
    weight,_ = least_squares_GD(y_tr, x_tr, initial_w, max_iters, gamma)

    y_tr_pred = predict(x_tr,weight)

    
    y_val_pred = predict(x_val,weight)
    
    print('Training Accuracy : ', np.sum(y_tr_pred==y_tr)/y_tr.shape[0])
    print('Validation Accuracy : ',np.sum(y_val_pred == y_val)/y_val.shape[0])

In [23]:
preprocess_and_call_least_squares_SGD(x,y,max_iters=1000,gamma=0.001)

Training Accuracy :  0.72622
Validation Accuracy :  0.72128
