# Full ML Pipeline
This notebook contains the complete ML pipeline for Project 1: loading the data, tuning the hyperparameters, and making predictions on the test data.

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from helpers import *
from implementations import *

## Load the training data

In [2]:
# Read the training CSV; the result is unpacked here as labels, feature matrix and sample ids.
DATA_TRAIN_PATH = 'data/train.csv'
y_train, X_train, ids = load_csv_data(DATA_TRAIN_PATH)

## Load the testing data

In [3]:
# Read the test CSV with the same loader; the first returned value is discarded here.
DATA_TEST_PATH = 'data/test.csv' 
_, X_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [4]:
X_train.shape, X_test.shape

((250000, 30), (568238, 30))

Since we selected regularized logistic regression as the model best suited to our task, all of the following steps are based on this model.

## Preprocessing and feature engineering

In [7]:
# Split the training rows and their labels into three subsets ("zero", "one", "many").
(X_train_zero, y_train_zero,
 X_train_one, y_train_one,
 X_train_many, y_train_many) = split_by_jet_num(DATA_TRAIN_PATH, X_train, y_train)

# Split the test rows the same way, carrying the ids along in place of labels.
# NOTE(review): the test split is also given DATA_TRAIN_PATH; presumably only the
# column header is read from that file. Confirm DATA_TEST_PATH is not intended.
(X_test_zero, ids_test_zero,
 X_test_one, ids_test_one,
 X_test_many, ids_test_many) = split_by_jet_num(DATA_TRAIN_PATH, X_test, ids_test)

## Hyperparameter tuning
Here we tune the hyperparameters of the regularized logistic regression model, separately for each of the three subsets.

In [8]:
def tune_hyperparameters(X_train, y_train, X_test):
    """Search the regularized logistic regression hyperparameter grid for one subset.

    The inputs are first run through ``preprocess`` (imputable_th=0.5,
    encodable_th=1, switch_encoding=True). The preprocessed training data is
    then handed to ``reg_logistic_regression_cv`` together with the grid.

    Args:
        X_train: training features of the subset.
        y_train: training labels of the subset.
        X_test: test features of the subset; only forwarded to ``preprocess``,
            its transformed version is not used here.

    Returns:
        Whatever ``reg_logistic_regression_cv`` returns; the callers in this
        notebook unpack it as ``(metrics, best_params)``.
    """
    # preprocess yields five values; the transformed test set and the fourth
    # value are not needed for tuning.
    features, labels, _unused_test, _, continuous_cols = preprocess(
        X_train, y_train, X_test,
        imputable_th=0.5, encodable_th=1, switch_encoding=True,
    )

    # Key order is kept identical to the original grid definition.
    search_grid = dict(
        max_iters=1000,
        degree=[1, 2, 3, 4],
        lambda_=np.logspace(-3, 0, 4),
        gamma=[0.01, 0.1],
        # Wrapped in a list so the whole collection counts as a single grid value.
        cont_features=[continuous_cols],
    )
    return reg_logistic_regression_cv(labels, features, param_grid=search_grid)

In [9]:
# Tune on the "zero" subset; the result is unpacked as (metrics, best parameters).
# NOTE(review): the saved output of this cell shows a warning fragment from
# `precision = tp / (tp + fp)` - presumably a zero division for some grid point; verify.
metrics_zero, params_zero = tune_hyperparameters(X_train_zero, y_train_zero, X_test_zero)

  precision = tp / (tp + fp)


In [10]:
# Tune on the "one" subset; the result is unpacked as (metrics, best parameters).
metrics_one, params_one = tune_hyperparameters(X_train_one, y_train_one, X_test_one)

In [11]:
# Tune on the "many" subset; the result is unpacked as (metrics, best parameters).
metrics_many, params_many = tune_hyperparameters(X_train_many, y_train_many, X_test_many)

In [12]:
metrics_zero, params_zero

({'loss': 0.36910622237707635,
  'accuracy': 83.86430223592906,
  'f1_score': 0.6450241534035307},
 {'max_iters': 1000,
  'degree': 3,
  'lambda_': 0.001,
  'gamma': 0.1,
  'cont_features': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)})

In [13]:
metrics_one, params_one

({'loss': 0.4400778655095133,
  'accuracy': 80.07430997876858,
  'f1_score': 0.70956626219191},
 {'max_iters': 1000,
  'degree': 3,
  'lambda_': 0.001,
  'gamma': 0.1,
  'cont_features': (1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22)})

In [14]:
metrics_many, params_many

({'loss': 0.406439698330773,
  'accuracy': 82.17357640599704,
  'f1_score': 0.8025500180718208},
 {'max_iters': 1000,
  'degree': 3,
  'lambda_': 0.001,
  'gamma': 0.1,
  'cont_features': (1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29)})

## Run the model with tuned hyperparameters and make predictions
We can see that the tuned hyperparameter values are in fact the same for all 3 models, i.e. degree=3, lambda_=0.001, gamma=0.1.

In [16]:
def train_predict(X_train, y_train, X_test, max_iters=3000, degree=2, lambda_=0.01, gamma=0.01, imputable_th=1, encodable_th=0):
    """Fit regularized logistic regression on one subset and predict its test rows.

    Steps: ``preprocess`` train and test together, expand both with
    ``build_poly`` (same degree and continuous-feature selection), fit with
    ``reg_logistic_regression``, predict with ``predict_logistic`` and finally
    pass the predictions through ``replace_values(from_val=0, to_val=-1)``.

    Args:
        X_train: training features of the subset.
        y_train: training labels of the subset.
        X_test: test features of the subset.
        max_iters: forwarded to ``reg_logistic_regression``.
        degree: polynomial degree forwarded to ``build_poly``.
        lambda_: forwarded to ``reg_logistic_regression``.
        gamma: forwarded to ``reg_logistic_regression``.
        imputable_th: forwarded to ``preprocess``.
        encodable_th: forwarded to ``preprocess``.

    Returns:
        Tuple ``(y_pred, weights, loss)``: the post-processed test predictions
        and the two values returned by ``reg_logistic_regression``.
    """
    features_train, labels_train, features_test, _, continuous_cols = preprocess(
        X_train, y_train, X_test,
        imputable_th=imputable_th, encodable_th=encodable_th, switch_encoding=True,
    )

    # Train and test must receive the identical polynomial expansion.
    poly_options = {'degree': degree, 'cont_features': continuous_cols}

    weights, loss = reg_logistic_regression(
        labels_train,
        build_poly(features_train, **poly_options),
        max_iters=max_iters, lambda_=lambda_, gamma=gamma,
    )

    raw_predictions = predict_logistic(weights, build_poly(features_test, **poly_options))
    # Presumably converts the {0, 1} predictions to the {-1, 1} submission labels.
    y_pred = replace_values(raw_predictions, from_val=0, to_val=-1)
    return y_pred, weights, loss

In [17]:
# Final fit for the "zero" subset using its tuned gamma, degree and lambda_.
# NOTE(review): this fit uses max_iters=3000, imputable_th=1 and encodable_th=0, whereas
# tune_hyperparameters searched with max_iters=1000, imputable_th=0.5 and encodable_th=1.
# Confirm the tuned values are still appropriate under these different settings.
y_pred_zero, weights_zero, loss_zero = train_predict(X_train_zero, y_train_zero, X_test_zero, max_iters=3000, gamma=params_zero['gamma'],
                                                     degree=params_zero['degree'], lambda_=params_zero['lambda_'], imputable_th=1, encodable_th=0)

In [18]:
# Final fit for the "one" subset using its tuned gamma, degree and lambda_;
# max_iters and the preprocessing thresholds are set explicitly here.
y_pred_one, weights_one, loss_one = train_predict(X_train_one, y_train_one, X_test_one, max_iters=3000, gamma=params_one['gamma'],
                                                  degree=params_one['degree'], lambda_=params_one['lambda_'], imputable_th=1, encodable_th=0)

In [19]:
# Final fit for the "many" subset using its tuned gamma, degree and lambda_;
# max_iters and the preprocessing thresholds are set explicitly here.
y_pred_many, weights_many, loss_many = train_predict(X_train_many, y_train_many, X_test_many, max_iters=3000, gamma=params_many['gamma'],
                                                     degree=params_many['degree'], lambda_=params_many['lambda_'], imputable_th=1, encodable_th=0)

In [20]:
# Weight each subset's tuning accuracy by the number of training rows in that subset.
a, b, c = X_train_zero.shape[0], X_train_one.shape[0], X_train_many.shape[0]
weighted_accuracy_sum = (metrics_zero['accuracy']*a) + (metrics_one['accuracy']*b) + (metrics_many['accuracy']*c)
avg_accuracy = weighted_accuracy_sum/(a+b+c)

print(f"Average accuracy: {avg_accuracy}")

Average accuracy: 82.19813630204901


## Merge predictions and save output in CSV format for submission

In [21]:
from datetime import datetime

# Stack the per-subset predictions and ids in the same (zero, one, many) order so
# that row i of y_pred belongs to ids_submission[i].
y_pred = np.vstack([y_pred_zero, y_pred_one, y_pred_many])
# Use a new name instead of rebinding ids_test: the split cell reads ids_test
# together with X_test, so overwriting it with the reordered ids would silently
# misalign ids and rows if that cell were re-run afterwards.
ids_submission = np.hstack([ids_test_zero, ids_test_one, ids_test_many])
assert y_pred.shape[0] == ids_submission.shape[0], "predictions and ids differ in length"

method = 'reg_logistic_regression'
time = datetime.now().strftime('%Y%m%dH%H%M%S')
OUTPUT_PATH = f'submissions/submission_{method}_{time}'
create_csv_submission(ids_submission, y_pred, OUTPUT_PATH)