# Project 1: Machine Learning

In [4]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import json
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from typing import Tuple
from helper import load_csv_data
from processing import *
from implementations import *
from feature_expansion import *
from crossvalidation import *
from metrics import f1_score, mse_loss

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import of the data
Import of the train and test data

In [5]:
# Read the training and the test CSV files; each call yields a (y, x, id) triple.
y_train, x_train, id_train = load_csv_data("../data/train.csv")
y_test, x_test, id_test = load_csv_data("../data/test.csv")

# The feature column names are kept in a separate JSON file under 'col_names'.
with open('../col_name.json', 'r') as file:
    col_name_payload = json.load(file)
features = col_name_payload['col_names']

# Preprocess data
We preprocess the data to obtain a clean dataset.

In [6]:
# Run clean_data on the raw training features (using the column names in
# `features`), then pass the result through standardize.
x_train_cleaned = standardize(clean_data(x_train, features))

In [7]:
# Take the feature names from the CSV header, skipping the first two columns.
# Only the header is needed, so max_rows=1 stops parsing after a single data
# row instead of type-inferring the whole training file.
# NOTE: this rebinds `features`, replacing the list loaded from col_name.json.
features = np.genfromtxt(
    "../data/train.csv",
    delimiter=',',
    encoding='UTF-8-sig',
    dtype=None,
    names=True,
    max_rows=1,
).dtype.names[2:]

We then divide the dataset according to the Pri_Jet_number feature, which can take the values 0, 1, 2 or 3. Since very few samples have the value 3, we merge them with the samples that have the value 2, which gives us 3 subsets.

### Feature expansion
We now apply feature engineering to improve our results. We use root transformations, polynomial transformations, logarithmic transformations and reciprocal transformations.

In [95]:
# Build the expanded feature matrix from the cleaned, standardized features.
x_train_finished = build_new_x(x_train_cleaned)

eoeoeoeoeoeoeoeoe!!!!!!!!!!!!!!!!!!!!!
0
0
0
0


In [8]:
# Split the raw training data into three (X, y) pairs.
# NOTE(review): presumably one pair per jet-number group (0, 1, and 2-3 merged),
# as described in the text above; confirm against pre_process_data.
X_0, y_0, X_1, y_1, X_23, y_23 = pre_process_data(x_train, y_train, features)

In [None]:
# Sanity check: print every row of X_0 that is neither made only of
# integer-valued entries nor made only of floating-point entries.
# The original condition had an unbalanced parenthesis (SyntaxError) and called
# `is_float()`, which does not exist on Python or NumPy floats.
for row in X_0:
    all_integral = all(float(value).is_integer() for value in row)
    all_floats = all(isinstance(value, (float, np.floating)) for value in row)
    if not (all_integral or all_floats):
        print(row)

# Cross-Validation Pipeline

Cross-validation pipeline: we use 5-fold cross-validation to choose the optimal hyperparameters.
Below is the list of models we use throughout this process.

1. least_squares (no parameter tuning needed.)
2. ridge LS 
3. mse_gd
4. mse_sgd
5. logistic
6. reg_logistic

We optimise the weights based on the MSE loss, but model selection is based on the F1 score observed on the validation set.

To see the effect of data manipulation, we will try 3 sets of data.
1. cleaned data
2. standardized data
3. feature-engineered data

We expect the third dataset (feature-engineered data) to give the best result.

In [None]:
def func(d):
    """Print each key of the mapping ``d`` together with the value stored under it."""
    for name, stored in d.items():
        print("key:", name, "Value:", stored)


# Driver's code
D = {'':1, 'b':2, 'c':3}
func(D)

In [33]:
# Call least_squares with keyword arguments unpacked from a dict, the same way
# the tuner below passes arguments to the models.
ls_dict = dict(tx=x_train, y=y_train)
least_squares(**ls_dict)

(array([ 8.03299588e-05, -7.20237523e-03, -6.05398451e-03, -5.47555763e-04,
        -1.93895382e-02,  4.73455547e-04, -2.60381502e-02,  3.25107737e-01,
        -3.80965362e-05, -2.72728860e+00, -2.21218488e-01,  9.50801264e-02,
         6.40374762e-02,  2.73554833e+00, -3.31802422e-04, -9.54328018e-04,
         2.74030502e+00, -5.34164915e-04,  9.73498603e-04,  3.69225052e-03,
         3.54487433e-04, -5.43344599e-04, -3.30448035e-01, -1.40800498e-03,
         8.31432880e-04,  1.02117272e-03, -1.68047416e-03, -5.83664815e-03,
        -1.11087998e-02,  2.72774855e+00]),
 0.33968680955669167)

In [94]:
from itertools import product

class HyperParameterTuner:
    """Grid-search hyperparameter tuner scored by k-fold cross-validated F1.

    The tuner slices ``x`` and ``y`` into ``num_folds`` folds, trains the
    selected model on all folds but one, and scores it on the held-out fold.
    Models are called with keyword arguments (``y``, ``tx`` plus the
    hyperparameters in ``hyp_params``) and must return ``(weights, loss)``.

    Args:
        x: feature matrix; rows are indexed by the fold indices.
        y: targets, indexed the same way as ``x``.
        model_name: one of 'least_squares', 'ridge', 'mse_gd', 'mse_sgd',
            'logistic', 'reg_logistic'. Any other name raises ``KeyError``.
        num_folds: number of cross-validation folds.
        num_seed: seed passed to ``np.random.seed`` before the folds are drawn.
        max_iter: value passed as ``max_iters`` to the iterative models.
    """

    def __init__(
        self,
        x: np.ndarray,
        y: np.ndarray,
        model_name: str,
        num_folds: int,
        num_seed: int = 0,
        max_iter: int = 1000,
    ):
        available_models = {
            'least_squares': least_squares,
            'ridge': ridge_regression,
            'mse_gd': mean_squared_error_gd,
            'mse_sgd': mean_squared_error_sgd,
            'logistic': logistic_regression,
            'reg_logistic': reg_logistic_regression,
        }

        self.x = x
        self.y = y
        self.model = available_models[model_name]
        self.model_name = model_name
        self.num_folds = num_folds
        self.num_seed = num_seed
        self.max_iter = max_iter

        # build k_indices
        np.random.seed(self.num_seed)
        self.build_k_indices()

        # Hyperparameter template for the chosen model; entries left at None
        # are filled in by the grid search in tune_().
        if model_name == 'least_squares':
            self.hyp_params = {}
        elif model_name == 'ridge':
            self.hyp_params = {'lambda_': None}
        else:
            # One initial weight per feature column. The previous code sized
            # this by len(y), i.e. by the number of samples.
            self.hyp_params = {
                'initial_w': np.ones(np.shape(x)[1]),
                'max_iters': self.max_iter,
                'gamma': None,
            }
            if model_name == 'reg_logistic':
                self.hyp_params['lambda_'] = None

    def tune_(self, lambdas=None, gammas=None) -> Tuple[list, float]:
        """Grid-search the model's hyperparameters, maximising the mean validation F1.

        Args:
            lambdas: candidate ``lambda_`` values; defaults to
                ``np.logspace(-15, 0, 100)``.
            gammas: candidate ``gamma`` values; defaults to
                ``np.linspace(0, 1, 100)``.

        Returns:
            * 'least_squares': array ``[fold index, F1]`` of the fold with the
              highest validation F1 (there is nothing to tune).
            * 'reg_logistic': ``((gamma, lambda_), mean F1)`` of the best pair.
            * 'ridge': ``(lambda_, mean F1)`` of the best value.
            * other models: ``(gamma, mean F1)`` of the best value.

        Raises:
            ValueError: if the grid that has to be searched is empty.
        """
        if lambdas is None:
            lambdas = np.logspace(-15, 0, 100)
        if gammas is None:
            gammas = np.linspace(0, 1, 100)

        if self.model_name == 'least_squares':
            results = np.array([
                [k, self.cross_validation_per_k(k, dict(self.hyp_params))[-1]]
                for k in range(self.num_folds)
            ])
            return results[np.argmax(results[:, -1])]

        if self.model_name == 'reg_logistic':
            names = ('gamma', 'lambda_')
            # product() returns a one-shot iterator; materialise it so the best
            # candidate can still be looked up by index after the search.
            candidates = list(product(gammas, lambdas))
        elif self.model_name == 'ridge':
            names = ('lambda_',)
            candidates = [(lambda_,) for lambda_ in lambdas]
        else:
            names = ('gamma',)
            candidates = [(gamma,) for gamma in gammas]

        if not candidates:
            raise ValueError("hyperparameter grid is empty")

        f1_scores = [
            self._mean_validation_f1(dict(zip(names, candidate)))
            for candidate in candidates
        ]
        optimum_idx = int(np.argmax(f1_scores))
        best = candidates[optimum_idx]
        best_params = best if len(names) > 1 else best[0]
        return best_params, f1_scores[optimum_idx]

    def _mean_validation_f1(self, overrides: dict):
        """Return the validation F1 averaged over all folds for one grid point."""
        params = {**self.hyp_params, **overrides}
        # One row per fold: (train loss, validation loss, validation F1).
        fold_results = np.array(
            [self.cross_validation_per_k(k, params) for k in range(self.num_folds)]
        )
        return fold_results.mean(axis=0)[-1]

    def cross_validation_per_k(self, k: int, params: dict):
        """Train on every fold except ``k`` and evaluate on fold ``k``.

        Args:
            k: index of the fold held out for validation.
            params: hyperparameters passed to the model as keyword arguments.
                The dict is not modified.

        Returns:
            ``(loss_trn, loss_val, f1_val)``: square root of ``mse_loss`` on the
            training and on the validation rows, and the F1 score of the
            predictions on the validation rows.
        """
        # get k'th subgroup in test, others in train
        te_indices = self.k_indices[k]
        tr_indices = np.delete(self.k_indices, k, axis=0).reshape(-1)

        # split the data based on train and validation indices
        y_trn, y_val = self.y[tr_indices], self.y[te_indices]
        x_trn, x_val = self.x[tr_indices], self.x[te_indices]

        # Work on a copy so the caller's dict (and self.hyp_params) never
        # accumulates the per-fold data.
        model_kwargs = dict(params)
        if 'initial_w' in model_kwargs:
            # Fresh starting point per fit, in case the model updates it in place.
            model_kwargs['initial_w'] = np.copy(model_kwargs['initial_w'])
        model_kwargs['tx'], model_kwargs['y'] = x_trn, y_trn

        # run the model
        w, _ = self.model(**model_kwargs)

        # calculate the loss for train and test data
        loss_trn = np.sqrt(mse_loss(y_trn, x_trn, w))
        loss_val = np.sqrt(mse_loss(y_val, x_val, w))

        # get validation f1-score
        y_pred = get_classification_pred(x_val, w)
        f1_val = f1_score(y_val, y_pred)
        return loss_trn, loss_val, f1_val

    def build_k_indices(self):
        """Draw the row indices of each fold and store them in ``self.k_indices``.

        The rows of ``self.y`` are shuffled with ``np.random.permutation`` and cut
        into ``self.num_folds`` consecutive groups of ``N // num_folds`` indices,
        giving an array of shape ``(num_folds, N // num_folds)``. When ``N`` is
        not a multiple of ``num_folds`` the remaining rows are left out of every
        fold.

        Raises:
            ValueError: if ``num_folds`` is smaller than 1 or larger than the
                number of rows.
        """
        num_row = self.y.shape[0]
        if self.num_folds < 1 or self.num_folds > num_row:
            raise ValueError(
                f"num_folds must be between 1 and {num_row}, got {self.num_folds}"
            )
        interval = num_row // self.num_folds
        indices = np.random.permutation(num_row)

        self.k_indices = np.array(
            [indices[k * interval: (k + 1) * interval] for k in range(self.num_folds)]
        )
   


In [86]:
# 5-fold tuner for plain least squares on the raw training data.
tuner = HyperParameterTuner(x_train, y_train, 'least_squares', 5)

In [87]:
# For 'least_squares' this returns [fold index, F1] of the best-scoring fold.
tuner.tune_()

array([2.        , 0.57177354])

# Learning algorithms

### Least squares

In [9]:
# Fit least squares on each subset (all fits first, as before).
(w_0, loss_0), (w_1, loss_1), (w_23, loss_23) = (
    least_squares(targets, inputs)
    for targets, inputs in ((y_0, X_0), (y_1, X_1), (y_23, X_23))
)

# Predict on the same rows each model was fitted on.
y_pred_0, y_pred_1, y_pred_23 = (
    get_classification_pred(inputs, weights)
    for inputs, weights in ((X_0, w_0), (X_1, w_1), (X_23, w_23))
)

# F1 score per subset, on the training rows.
for targets, predictions in ((y_0, y_pred_0), (y_1, y_pred_1), (y_23, y_pred_23)):
    print(f1_score(targets, predictions))

(99913, 19)
(77544, 23)
(72543, 30)
0.5592677345537758
0.530324932436679
0.6845379481959054


### Least squares with ridge regression

In [10]:
# Fit ridge regression (third argument 10) on each subset.
(w_0, loss_0), (w_1, loss_1), (w_23, loss_23) = (
    ridge_regression(targets, inputs, 10)
    for targets, inputs in ((y_0, X_0), (y_1, X_1), (y_23, X_23))
)

# Predict on the same rows each model was fitted on.
y_pred_0, y_pred_1, y_pred_23 = (
    get_classification_pred(inputs, weights)
    for inputs, weights in ((X_0, w_0), (X_1, w_1), (X_23, w_23))
)

# F1 score per subset, on the training rows.
for targets, predictions in ((y_0, y_pred_0), (y_1, y_pred_1), (y_23, y_pred_23)):
    print(f1_score(targets, predictions))

0.6259504278027698
0.5559456972792309
0.6526675851584398


### Least squares with gradient descent

In [11]:
# MSE gradient descent on each subset: 100 iterations, step size 1e-3.
# The initial weight vector is sized from the subset's column count rather than
# hard-coded (19 / 23 / 30), so the cell keeps working if preprocessing changes
# the number of features.
w_0, loss_0 = mean_squared_error_gd(y_0, X_0, np.ones(X_0.shape[1]), 100, 1e-3)
w_1, loss_1 = mean_squared_error_gd(y_1, X_1, np.ones(X_1.shape[1]), 100, 1e-3)
w_23, loss_23 = mean_squared_error_gd(y_23, X_23, np.ones(X_23.shape[1]), 100, 1e-3)

# Predict on the same rows each model was fitted on.
y_pred_0 = get_classification_pred(X_0, w_0)
y_pred_1 = get_classification_pred(X_1, w_1)
y_pred_23 = get_classification_pred(X_23, w_23)

# F1 score per subset, on the training rows.
print(f1_score(y_0, y_pred_0))
print(f1_score(y_1, y_pred_1))
print(f1_score(y_23, y_pred_23))

0.3424220782533286
0.4789386303865536
0.5330964394854666


### Least squares with stochastic gradient descent

In [132]:
# MSE stochastic gradient descent on each subset: 100 iterations, step size 1e-3.
# The initial weight vector is sized from the subset's column count rather than
# hard-coded (19 / 23 / 30).
# NOTE(review): the saved output of this cell is a ValueError (operands of shapes
# (99913,) and (99913,19) could not be broadcast). The cause is not visible in
# this notebook; check mean_squared_error_sgd.
w_0, loss_0 = mean_squared_error_sgd(y_0, X_0, np.ones(X_0.shape[1]), 100, 1e-3)
w_1, loss_1 = mean_squared_error_sgd(y_1, X_1, np.ones(X_1.shape[1]), 100, 1e-3)
w_23, loss_23 = mean_squared_error_sgd(y_23, X_23, np.ones(X_23.shape[1]), 100, 1e-3)

# Predict on the same rows each model was fitted on.
y_pred_0 = get_classification_pred(X_0, w_0)
y_pred_1 = get_classification_pred(X_1, w_1)
y_pred_23 = get_classification_pred(X_23, w_23)

# F1 score per subset, on the training rows.
print(f1_score(y_0, y_pred_0))
print(f1_score(y_1, y_pred_1))
print(f1_score(y_23, y_pred_23))

1


ValueError: operands could not be broadcast together with shapes (99913,) (99913,19) 

### Logistic regression

In [None]:
# Logistic regression on the full cleaned training set.
w, l = logistic_regression(y_train, x_train_cleaned, np.ones((31,)), 1000, 1e-3)

# Logistic regression on each subset. The per-subset fits previously called
# mean_squared_error_sgd (copied from the SGD cell), so this section reported
# SGD scores under the logistic-regression heading.
# NOTE(review): assumes the subset labels use the same encoding as y_train,
# which is already passed to logistic_regression above; confirm.
w_0, loss_0 = logistic_regression(y_0, X_0, np.ones(X_0.shape[1]), 100, 1e-3)
w_1, loss_1 = logistic_regression(y_1, X_1, np.ones(X_1.shape[1]), 100, 1e-3)
w_23, loss_23 = logistic_regression(y_23, X_23, np.ones(X_23.shape[1]), 100, 1e-3)

# Predict on the same rows each model was fitted on.
y_pred_0 = get_classification_pred(X_0, w_0)
y_pred_1 = get_classification_pred(X_1, w_1)
y_pred_23 = get_classification_pred(X_23, w_23)

# F1 score per subset, on the training rows.
print(f1_score(y_0, y_pred_0))
print(f1_score(y_1, y_pred_1))
print(f1_score(y_23, y_pred_23))

### Regularized logistic regression

In [None]:
# Regularized logistic regression on the full cleaned training set.
# NOTE(review): the initial weight length 31 is hard-coded; presumably it matches
# the column count of x_train_cleaned. Confirm.
w, l = reg_logistic_regression(y_train, x_train_cleaned, 0.2,  np.ones((31,)), 1000, 1e-3)

# Cross validation 

In [68]:
# 5-fold tuner for ridge regression on the first subset (X_0, y_0).
tuner = HyperParameterTuner(X_0, y_0, 'ridge', 5)

In [69]:
# For 'ridge' this returns (best lambda_, mean validation F1 at that lambda_).
tuner.tune_()

ended lambda 1e-15
ended lambda 1.1787686347935866e-14
ended lambda 1.389495494373136e-13
ended lambda 1.637893706954068e-12
ended lambda 1.9306977288832457e-11
ended lambda 2.275845926074791e-10
ended lambda 2.6826957952797275e-09
ended lambda 3.162277660168379e-08
ended lambda 3.727593720314938e-07
ended lambda 4.393970560760786e-06
ended lambda 5.179474679231202e-05
ended lambda 0.0006105402296585314
ended lambda 0.007196856730011528
ended lambda 0.08483428982440726
ended lambda 1.0


(0.0006105402296585314, 0.6605847438997582)