In [180]:
# Useful starting lines
import numpy as np
from proj1_helpers import *
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt

from costs import compute_mse, compute_loss
# from plots import *
from helpers import *
# from grid_search import *
# import datetime
# from ridge_regression import *
# from gradient_descent import *
# from stochastic_gradient_descent import *
from implementations import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [182]:
# Load the complete training file (no sub-sampling).
DATA_PATH_TRAIN = '../data/train.csv'
(y_train, data_train, ids_tr) = load_csv_data(
    DATA_PATH_TRAIN,
    sub_sample=False,
)

In [183]:
# Load the complete test file; the first returned value is not used here.
DATA_PATH_TEST = '../data/test.csv'
(_, data_test, ids_te) = load_csv_data(
    DATA_PATH_TEST,
    sub_sample=False,
)

In [184]:
# Sanity check: show the train shape, then the test shape, one per line.
print(data_train.shape, data_test.shape, sep='\n')

(250000, 30)
(568238, 30)


### Features pre-processing

In [186]:
# Pre-process train and test features together. The results are indexed per
# subset by the cells below (datasets_train[i] / datasets_test[i]).
datasets_train, datasets_test = preprocess_datasets(
    data_train,
    data_test,
    y_train=y_train,
)

Features standardized for subset : 0
Features standardized for subset : 1
Features standardized for subset : 2


# 1. Least Squares Gradient Descent


In [189]:
def cross_validation(y, x, k_indices, k, method, max_iters = 0, gamma = 0, lambda_ = 0):
    """Train `method` on every fold except the k-th and evaluate it on the k-th.

    Args:
        y: labels, indexed with the sample indices stored in `k_indices`.
        x: feature matrix whose rows are indexed the same way as `y`.
        k_indices: 2-D array; row i holds the sample indices of fold i.
        k: index of the fold held out for evaluation.
        method: the training function to run. Supported values are
            least_squares_GD, least_squares_SGD, logistic_regression,
            least_squares, ridge_regression and reg_logistic_regression.
        max_iters: forwarded to the iterative methods (GD, SGD, logistic,
            regularized logistic).
        gamma: step size forwarded to the same iterative methods.
        lambda_: forwarded to ridge_regression and reg_logistic_regression only.

    Returns:
        Tuple (accuracy_train, accuracy_test, w, loss_tr, loss_te): accuracies
        from compute_accuracy on the training folds and on the held-out fold,
        the fitted weights, and the rescaled compute_loss values on both splits.

    Raises:
        ValueError: if `method` is not one of the supported functions
            (previously this surfaced later as an UnboundLocalError on `w`).
    """

    # Fold k is held out; all remaining folds are flattened into one index vector.
    held_out = np.arange(k_indices.shape[0]) == k
    ind_test = k_indices[k]
    ind_train = k_indices[~held_out].reshape(-1)

    x_train = x[ind_train]
    y_train = y[ind_train]
    x_test = x[ind_test]
    y_test = y[ind_test]

    # form data  -- TODO: Change for augmented features
    poly_train = x_train
    poly_test = x_test

    # Each method returns (w, loss); only the weights are needed because the
    # losses are recomputed on both splits below.
    if method == least_squares_GD or method == least_squares_SGD or method == logistic_regression:
        initial_w = np.zeros([poly_train.shape[1]])
        w, _ = method(y = y_train, tx = poly_train, initial_w = initial_w, max_iters = max_iters, gamma = gamma)

    elif method == least_squares:
        w, _ = method(y = y_train, tx = poly_train)

    elif method == ridge_regression:
        w, _ = method(y = y_train, tx = poly_train, lambda_ = lambda_)

    elif method == reg_logistic_regression:
        initial_w = np.zeros([poly_train.shape[1]])
        w, _ = method(y = y_train, tx = poly_train, lambda_ = lambda_, initial_w = initial_w, max_iters = max_iters, gamma = gamma)

    else:
        raise ValueError("Unsupported method for cross_validation: {!r}".format(method))

    # Compute prediction for train and test
    y_pred_train = predict_labels(w, poly_train)
    y_pred_test = predict_labels(w, poly_test)

    # NOTE(review): the caller stores these values as RMSE. The leading factor 2
    # is kept as-is from the original code -- confirm it is intended given what
    # compute_loss returns.
    loss_tr = 2 * np.sqrt(2 * compute_loss(y_train, poly_train, w))
    loss_te = 2 * np.sqrt(2 * compute_loss(y_test, poly_test, w))

    # Compute accuracy for train and test
    accuracy_train = compute_accuracy(y_pred_train, y_train)
    accuracy_test = compute_accuracy(y_pred_test, y_test)

    return accuracy_train, accuracy_test, w, loss_tr, loss_te

In [194]:
from plots import cross_validation_visualization

# Model parameters
method = logistic_regression

max_iters = 150
seed = 1
k_fold = 8
# NOTE: lambda_ is only forwarded by cross_validation to ridge_regression and
# reg_logistic_regression; for the other methods this sweep repeats the same fit.
lambdas = np.logspace(-5, 0, 15)
gamma = 0.7

# When True, predictions are computed for the test subsets after cross-validation
# (and the submission cell writes them to disk). This flag was read below without
# being assigned in any earlier cell, which raised NameError on a fresh kernel.
# Set it to False to only run the cross-validation.
test = True

y_pred_final = np.zeros((len(ids_te),1))

# For each case (jet = 0, jet = 1, jet = 2,3)
for ind, subset_train in enumerate(datasets_train):
    print("\n\n For case jet = {} \n".format(ind))
    x = subset_train[0]
    y = subset_train[1]

    # split data in k fold
    k_indices = build_k_indices(y, k_fold, seed)

    # per-subset averages over the whole lambda sweep
    accuracy_train = []
    accuracy_test = []

    # one entry per lambda: mean over the k folds
    accuracy_train_lambda = []
    accuracy_test_lambda = []

    rmse_train_lambda = []
    rmse_test_lambda = []

    for lambda_ in lambdas:
        param_method = dict(max_iters = max_iters, gamma = gamma, lambda_ = lambda_)

        # one entry per fold for the current lambda
        accuracy_train_k = []
        accuracy_test_k = []
        rmse_train_k = []
        rmse_test_k = []
        w_k = []
        # cross validation
        for k in range(k_fold):
            accuracy_train_crt, accuracy_test_crt, w, rmse_tr_crt, rmse_te_crt = cross_validation(y, x, k_indices, k, method = method, **param_method)
            accuracy_train_k.append(accuracy_train_crt)
            accuracy_test_k.append(accuracy_test_crt)
            rmse_train_k.append(rmse_tr_crt)
            rmse_test_k.append(rmse_te_crt)
            w_k.append(w)

        print("\nSUBSET {} --- lambda_ = {} :: gamma = {}".format(ind, lambda_, gamma))
        print("Average test accuracy: {}".format(np.mean(accuracy_test_k)))
        # np.std is a standard deviation, so label it as such (was "Variance").
        print("Std test accuracy: {}".format(np.std(accuracy_test_k)))
        print("Min test accuracy: {}".format(np.min(accuracy_test_k)))
        print("Max test accuracy: {}\n".format(np.max(accuracy_test_k)))

        accuracy_train_lambda.append(np.mean(accuracy_train_k))
        accuracy_test_lambda.append(np.mean(accuracy_test_k))

        rmse_train_lambda.append(np.mean(rmse_train_k))
        rmse_test_lambda.append(np.mean(rmse_test_k))

    # Optional plot of the sweep for this subset:
    # cross_validation_visualization(lambdas, rmse_train_lambda, rmse_test_lambda)

    accuracy_train.append(np.mean(accuracy_train_lambda))
    accuracy_test.append(np.mean(accuracy_test_lambda))

    # ****************************************
    # ****** Predict for datasets_test *******
    # ****************************************
    if test == True:
        subset_test = datasets_test[ind][0]
        poly_test = subset_test

        # TODO: w_k[0] is the weight vector of fold 0 for the LAST lambda of the
        # sweep, not the best-performing model -- select the weights deliberately.
        y_pred_crt = predict_labels(w_k[0], poly_test)
        # datasets_test[ind][1] is used as the target positions in y_pred_final
        # (presumably the rows of this subset in the full test set -- confirm).
        np.put(y_pred_final, datasets_test[ind][1], y_pred_crt)




 For case jet = 0 


SUBSET 0 --- lambda_ = 1e-05 :: gamma = 0.7
Average test accuracy: 0.7428737288814157
Variance test accuracy: 0.0066308483540856175
Min test accuracy: 0.7286412042597485
Max test accuracy: 0.7525822723997118


SUBSET 0 --- lambda_ = 2.2758459260747865e-05 :: gamma = 0.7
Average test accuracy: 0.7445552085835536
Variance test accuracy: 0.0039154271709591765
Min test accuracy: 0.7388101529345824
Max test accuracy: 0.752902554247738


SUBSET 0 --- lambda_ = 5.1794746792312125e-05 :: gamma = 0.7
Average test accuracy: 0.741572583873809
Variance test accuracy: 0.01325764725198929
Min test accuracy: 0.7096645047641925
Max test accuracy: 0.7570662182720794


SUBSET 0 --- lambda_ = 0.00011787686347935866 :: gamma = 0.7
Average test accuracy: 0.7422832092241172
Variance test accuracy: 0.009613620057195205
Min test accuracy: 0.7183121146609016
Max test accuracy: 0.752902554247738


SUBSET 0 --- lambda_ = 0.0002682695795279727 :: gamma = 0.7
Average test accuracy: 0.7406517


SUBSET 2 --- lambda_ = 0.0031622776601683794 :: gamma = 0.7
Average test accuracy: 0.4890537112606154
Variance test accuracy: 0.08895961449520427
Min test accuracy: 0.3625234366383589
Max test accuracy: 0.6702327120326459


SUBSET 2 --- lambda_ = 0.0071968567300115215 :: gamma = 0.7
Average test accuracy: 0.6065953457593471
Variance test accuracy: 0.06564790752819066
Min test accuracy: 0.4468953347303408
Max test accuracy: 0.6733208337928752


SUBSET 2 --- lambda_ = 0.016378937069540647 :: gamma = 0.7
Average test accuracy: 0.49692566449762876
Variance test accuracy: 0.07250539818466127
Min test accuracy: 0.44336605271865004
Max test accuracy: 0.6654902393294364


SUBSET 2 --- lambda_ = 0.037275937203149416 :: gamma = 0.7
Average test accuracy: 0.5060659534575935
Variance test accuracy: 0.04766409999296237
Min test accuracy: 0.44799823535899413
Max test accuracy: 0.5577368479100033


SUBSET 2 --- lambda_ = 0.08483428982440726 :: gamma = 0.7
Average test accuracy: 0.5376916289842285
Va

In [None]:
# Write the final predictions to the CSV file matching the selected method.
# Nothing is written when `test` is not True or when `method` is not listed.
if test == True:
    # (training function, output file) pairs, scanned in order; first match wins.
    submission_targets = (
        (least_squares_GD, '../least_squares_GD.csv'),
        (least_squares_SGD, '../least_squares_SGD.csv'),
        (least_squares, '../least_squares.csv'),
        (ridge_regression, '../ridge_regression.csv'),
        (logistic_regression, '../logistic_regression.csv'),
        (reg_logistic_regression, '../reg_logistic_regression.csv'),
    )
    for candidate, csv_path in submission_targets:
        if method == candidate:
            create_csv_submission(ids_te, y_pred_final, csv_path)
            break
        