# Project 1: Machine Learning

In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import json
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from helper import *
from processing import *
from implementations import *
from feature_expansion import *
from crossvalidation import *
from metrics import f1_score, mse_loss

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import of the data
Load the training and test data, together with the list of feature (column) names.

In [5]:
# Load each split with the project's CSV loader; it returns three values per file,
# bound here to the label, feature and id variables (naming taken from this cell).
y_train, x_train, id_train = load_csv_data("../data/train.csv")
y_test , x_test , id_test  = load_csv_data("../data/test.csv")
# Optional sanity check of the loaded array shapes (currently disabled):
# print('train data shape: ', x_train.shape, y_train.shape)
# print('test  data shape: ', x_test.shape, y_test.shape)
# Read the entry stored under 'col_names' in the JSON file; `features` is what
# clean_data receives as its second argument in the following cells.
with open('../col_name.json', 'r') as file:
    features = json.load(file)['col_names']

# Preprocess data
We preprocess the data to obtain a clean, standardized dataset.

In [6]:
# Clean the raw training matrix (clean_data is given the column names in `features`),
# then pass the cleaned matrix through standardize.
# NOTE(review): the cleaning and standardization rules are defined in the project
# modules imported at the top of the notebook; they are not visible in this notebook.
x_train_cleaned = standardize(clean_data(x_train, features))

We then divide the dataset according to the `Pri_Jet_number` feature, which takes the values 0, 1, 2 or 3. Since very few samples have the value 3, we merge them with the samples whose value is 2, which gives us 3 subsets.

### Feature expansion
We now apply feature engineering to improve the results. We use root (of a given degree), polynomial, logarithmic and reciprocal transformations.

In [7]:
# Expand the cleaned, standardized features with the project's build_new_x helper.
# NOTE(review): running this cell emits a warning that points at
# `np.log(1 + log_column)`, and the next cell reports NaN entries in the result.
# Presumably the log is applied to values <= -1 after standardization — confirm
# inside build_new_x and decide how those entries should be handled.
x_train_finished = build_new_x(x_train_cleaned)

  log_column = np.log(1 + log_column)


In [8]:
# NOTE(review): `test_data` is built from x_train, not x_test, and is not used in
# this cell — confirm whether the test matrix was intended here.
test_data = clean_data(x_train, features)
# Count the NaN entries present in the feature-expanded training matrix.
print(np.isnan(x_train_finished).sum())

724309


# Cross-Validation Pipeline

Cross-validation pipeline: we use 5-fold cross-validation to choose the optimal hyperparameters.
Below is the list of models we use throughout this process.

1. least_squares (no parameter tuning needed.)
2. ridge LS 
3. mse_gd
4. mse_sgd
5. logistic
6. reg_logistic

We optimise the weights based on the MSE loss, but the model selection is based on the F1 score observed on the validation set.

To see the effect of data manipulation, we will try 3 sets of data.
1. cleaned data
2. standardized data
3. feature-engineered data

From this process, we expect the 3rd data set (feature-engineered data) to give the best result.

In [29]:
def cross_validation(y, x, k_indices, k, lambda_):
    """Fit ridge regression on every fold except fold ``k`` and score it on fold ``k``.

    Args:
        y: target array, indexed with the row indices stored in ``k_indices``.
        x: feature array, indexed the same way as ``y``.
        k_indices: 2-D array of row indices, one row per fold.
        k: index of the fold held out for validation.
        lambda_: penalty forwarded to ``ridge_regression``.

    Returns:
        A tuple ``(loss_train, loss_validation, f1_validation)`` where both
        losses are the square root of ``mse_loss`` and the F1 score is computed
        on the held-out fold only.
    """
    # Rows of the held-out fold, then the rows of all remaining folds flattened
    # into a single index vector.
    held_out_rows = k_indices[k]
    is_fit_fold = np.arange(k_indices.shape[0]) != k
    fit_rows = k_indices[is_fit_fold].reshape(-1)

    y_fit = y[fit_rows]
    y_held = y[held_out_rows]
    x_fit = x[fit_rows]
    x_held = x[held_out_rows]

    # Only the weights are needed; the loss returned by ridge_regression is dropped.
    weights, _unused_loss = ridge_regression(y_fit, x_fit, lambda_)

    # Root of the mean squared error on the fitting rows and on the held-out rows.
    rmse_fit = np.sqrt(mse_loss(y_fit, x_fit, weights))
    rmse_held = np.sqrt(mse_loss(y_held, x_held, weights))

    # Class predictions on the held-out fold drive the F1 score.
    held_pred = get_classification_pred(x_held, weights)
    return rmse_fit, rmse_held, f1_score(y_held, held_pred)

def best_degree_selection(y, x, k_fold, seed = 1, lambdas = None):
    """Select the ridge penalty with the highest mean validation F1 under k-fold CV.

    Despite its name, this function searches over the ridge penalty
    ``lambda_``; no polynomial degree is involved.

    Args:
        y: targets, forwarded to ``build_k_indices`` and ``cross_validation``.
        x: features, forwarded to ``cross_validation``.
        k_fold: number of folds.
        seed: seed forwarded to ``build_k_indices``.
        lambdas: optional candidate penalties (scalar or array-like). When
            omitted, the original grid ``np.logspace(-15, 0, 100)`` is used.

    Returns:
        A tuple ``(best_lambda, best_f1)``: the candidate with the highest mean
        validation F1 score and that score. The pair is also printed.

    Raises:
        ValueError: if ``lambdas`` is given but contains no candidate.
    """
    # Candidate penalties: keep the historical default grid unless overridden.
    if lambdas is None:
        lambdas = np.logspace(-15, 0, 100)
    lambdas = np.atleast_1d(lambdas)
    if lambdas.size == 0:
        raise ValueError("lambdas must contain at least one candidate value")

    # The same fold assignment is reused for every candidate so scores are comparable.
    k_indices = build_k_indices(y, k_fold, seed)

    f1_scores = []
    for lambda_ in lambdas:
        fold_results = [cross_validation(y, x, k_indices, k, lambda_) for k in range(k_fold)]
        # cross_validation returns (train loss, validation loss, validation F1);
        # only the F1 score (last entry) drives the selection.
        f1_scores.append(np.mean([fold[-1] for fold in fold_results]))

    optimum_idx = np.argmax(f1_scores)
    best_lambda, best_f1 = lambdas[optimum_idx], f1_scores[optimum_idx]
    print(f" Lambda : {best_lambda}, F1: {best_f1}")
    return best_lambda, best_f1

In [30]:
# Run the 5-fold search over the ridge penalty; the returned (best_lambda, best_f1)
# tuple is displayed as the cell output.
# NOTE(review): this passes the raw x_train, not x_train_cleaned or
# x_train_finished — confirm this is the intended data set.
best_degree_selection(y_train, x_train, 5)

 Lambda : 0.043287612810830614, F1: 0.5688146561392932


(0.043287612810830614, 0.5688146561392932)

# Learning algorithms

### Least squares

In [6]:
# Fit least squares on the cleaned, standardized training features.
# The `loss` returned by least_squares is not used in this cell; the printed value
# is recomputed from the fitted weights with compute_mse_loss.
weight, loss = least_squares(y_train, x_train_cleaned)
print(compute_mse_loss(y_train, x_train_cleaned, weight))

0.51004


### Least squares with ridge regression

In [7]:
# Ridge regression on the cleaned features with a fixed penalty of 10; prints the
# loss returned by ridge_regression.
# NOTE(review): the cross-validation cell earlier in the notebook reported a best
# lambda of about 0.043 (on x_train) — confirm that 10 is deliberate.
weight, loss = ridge_regression(y_train, x_train_cleaned, 10)
print(loss)

0.509928


### Least squares with gradient descent

In [8]:
# Gradient descent on the MSE loss, starting from an all-ones weight vector.
# The trailing arguments 100 and 1e-3 are presumably the iteration count and the
# step size — confirm against mean_squared_error_gd.
# NOTE(review): the length 31 is hard-coded; presumably it has to match the weight
# length expected for x_train_cleaned — verify if the cleaning step changes.
weight, loss = mean_squared_error_gd(y_train, x_train_cleaned, np.ones((31,)), 100, 1e-3)
# A bare last expression, so the final loss is shown as the cell output.
loss

1.215536

### Least squares with stochastic gradient descent

In [9]:
# Stochastic gradient descent on the MSE loss, starting from an all-ones weight vector.
# The trailing arguments 100 and 1e-3 are presumably the iteration count and the
# step size — confirm against mean_squared_error_sgd.
# NOTE(review): the length 31 is hard-coded; presumably it has to match the weight
# length expected for x_train_cleaned — verify if the cleaning step changes.
weight, loss = mean_squared_error_sgd(y_train, x_train_cleaned, np.ones((31,)), 100, 1e-3)
# A bare last expression, so the final loss is shown as the cell output.
loss

0.775736

### Logistic regression

In [10]:
# Logistic regression from an all-ones start; 1000 and 1e-3 are presumably the
# iteration count and the step size — confirm against logistic_regression.
# NOTE(review): the logged loss drops below zero, which a cross-entropy loss over
# labels in {0, 1} cannot do — check the label encoding of y_train and the loss
# implementation.
w, l = logistic_regression(y_train, x_train_cleaned, np.ones((31,)), 1000, 1e-3)

Current iteration=0, loss=1.995336631273058
Current iteration=100, loss=1.0491363661898327
Current iteration=200, loss=0.27006069549426187
Current iteration=300, loss=-0.37199300383762574
Current iteration=400, loss=-0.8974960829542113
Current iteration=500, loss=-1.3222079469218488
Current iteration=600, loss=-1.661546133055785
Current iteration=700, loss=-1.9325063200387396
Current iteration=800, loss=-2.15229341612981
Current iteration=900, loss=-2.335662630699634


In [11]:
# Regularized logistic regression from an all-ones start; 0.2 is presumably the
# penalty, and 1000 and 1e-3 the iteration count and the step size — confirm against
# reg_logistic_regression.
# NOTE(review): the logged loss drops below zero here as well, which a penalized
# cross-entropy loss over labels in {0, 1} cannot do — check the label encoding of
# y_train and the loss implementation.
w, l = reg_logistic_regression(y_train, x_train_cleaned, 0.2,  np.ones((31,)), 1000, 1e-3)

Current iteration=0, loss=1.995336631273058
Current iteration=100, loss=0.9989272803604264
Current iteration=200, loss=0.20757768961068648
Current iteration=300, loss=-0.41169620130994156
Current iteration=400, loss=-0.8829184065559729
Current iteration=500, loss=-1.2271743752375226
Current iteration=600, loss=-1.4682340429038723
Current iteration=700, loss=-1.632840431601386
Current iteration=800, loss=-1.745383871074444
Current iteration=900, loss=-1.8236927657875832
