# Project 1: Machine Learning

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from helper import *
from processing import *
from implementations import *
from feature_expansion import *

# Import of the data
We load the training and test data sets.

In [60]:
# Load the training set. The three return values are unpacked as
# (y_train, x_train, id_train) — presumably labels, feature matrix and
# sample ids; confirm against load_csv_data.
y_train, x_train, id_train = load_csv_data("../data/train.csv")
# Sanity check: shapes of the feature matrix and of the label vector.
print(x_train.shape, y_train.shape)

(250000, 30) (250000,)


In [4]:
# Load the test set with the same loader and the same unpacking order as the
# training set (presumably labels, feature matrix and sample ids; confirm
# against load_csv_data).
y_test, x_test, id_test = load_csv_data("../data/test.csv")
# Sanity check: shapes of the feature matrix and of the label vector.
print(x_test.shape, y_test.shape)

(568238, 30) (568238,)


In [5]:
# Column names of the feature matrix, taken from the CSV header.
# `names=True` makes genfromtxt use the header line as field names; the `[2:]`
# slice drops the first two fields (presumably the id and label columns —
# confirm against the file).
# `max_rows=1` stops parsing after the first data row: only the header is
# needed here, so there is no reason to parse the whole file a second time.
features = np.genfromtxt("../data/train.csv",
              delimiter=',',
              encoding='UTF-8-sig',
              dtype=None,
              names=True,
              max_rows=1).dtype.names[2:]

In [31]:
# Show the feature names read from the CSV header.
print(features)

('DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot', 'DER_sum_pt', 'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality', 'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_jet_num', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_all_pt')


# Preprocess data
We preprocess the data to obtain a clean dataset.

In [99]:
# Clean the training features, then standardize the cleaned matrix.
# standardize returns three values; only the first one is kept here.
# NOTE(review): the two discarded values are presumably the statistics used
# for the standardization — if so, they would be needed to transform the test
# set consistently; confirm against standardize.
x_train_cleaned, _, _ = standardize(clean_data(x_train, features))

0
1
2
3
5
8
9
10
13
16
19
20
21
23
26


We then split the dataset according to the `PRI_jet_num` feature, which takes the values 0, 1, 2 or 3. Since only a small number of samples have the value 3, we merge them with the samples whose value is 2, which leaves us with three subsets.

In [33]:
# Shape check on the cleaned and standardized training matrix.
print(x_train_cleaned.shape)

(250000, 30)


### Feature expansion
We now perform feature engineering to improve our results. We apply root transformations, polynomial transformations, logarithmic transformations and reciprocal transformations.

In [96]:
# Expand the cleaned features with build_new_x.
x_train_finished = build_new_x(x_train_cleaned, features)
# Prepend a column of ones (bias term) at column index 0.
# The scalar 1.0 is broadcast over the rows of the matrix being extended, so
# the column length always matches x_train_finished itself instead of relying
# on the row count of the separate raw x_train array.
x_train_finished = np.insert(x_train_finished, 0, 1.0, axis=1)

[ 1.07398883  0.09429484  0.61533879 -0.17287391 -0.47835472  0.91872062
 -0.45853853 -0.45404716 -0.01547287  1.7430674  -0.4707713  -0.47287028
 -0.48636696 -0.12030909 -0.47714724 -0.4843244   0.09396758 -0.46297347
 -0.51586555 -0.29876766 -0.49174983  2.43113838 -0.46605423  0.27236973
 -0.46436151 -0.48361346  0.03117869 -0.47463072 -0.51655392  0.79217235]
(250000, 0)
1
2
3
(250000, 87)


In [97]:
# Inspect the first row of the x_train matrix.
print(x_train[0])

[ 1.38470e+02  5.16550e+01  9.78270e+01  2.79800e+01  9.10000e-01
  1.24711e+02  2.66600e+00  3.06400e+00  4.19280e+01  1.97760e+02
  1.58200e+00  1.39600e+00  2.00000e-01  3.26380e+01  1.01700e+00
  3.81000e-01  5.16260e+01  2.27300e+00 -2.41400e+00  1.68240e+01
 -2.77000e-01  2.58733e+02  2.00000e+00  6.74350e+01  2.15000e+00
  4.44000e-01  4.60620e+01  1.24000e+00 -2.47500e+00  1.13497e+02]


In [None]:
# Check whether clean_data leaves any NaN in the training features.
# (Despite its name, test_data holds the cleaned *training* matrix.)
test_data = clean_data(x_train, features)
# Printing the full boolean mask is not informative: numpy elides the middle
# of large arrays, so NaNs outside the displayed corners would go unnoticed.
# Report the number of NaN entries instead (0 means the data is NaN-free).
print(np.isnan(test_data).sum())

In [61]:
# Build the per-subset training data from the raw features and labels.
# The six return values are unpacked as feature-matrix / label pairs named
# X_0, X_1 and X_23 — presumably the PRI_jet_num == 0, == 1 and in {2, 3}
# subsets mentioned in the preprocessing notes; confirm against
# pre_process_data.
X_0, y_0, X_1, y_1, X_23, y_23 = pre_process_data(x_train, y_train, features)

18
22
29
18 99913


In [56]:
# Shape of the X_1 subset's feature matrix.
print(X_1.shape)

(77544, 155)


# Learning algorithms

### Least squares

In [6]:
# Fit with least_squares; the call returns a (weights, loss) pair.
weight, loss = least_squares(y_train, x_train_cleaned)
# The MSE is recomputed from the fitted weights here rather than printing the
# `loss` value returned above.
print(compute_mse_loss(y_train, x_train_cleaned, weight))

0.51004


### Least squares with ridge regression

In [7]:
# Fit with ridge_regression; the third argument (10) is presumably the
# regularization strength — confirm against ridge_regression.
weight, loss = ridge_regression(y_train, x_train_cleaned, 10)
# Print the loss returned by the fit.
print(loss)

0.509928


### Least squares with gradient descent

In [8]:
# Gradient descent on the MSE loss.
# Arguments after the data: the initial weights, then 100 and 1e-3 —
# presumably the number of iterations and the step size (confirm against
# mean_squared_error_gd).
# The initial weight vector is sized from the matrix actually passed in rather
# than a hard-coded 31, so it stays valid whatever the current number of
# columns is (assumes the solver uses the matrix as-is, without appending its
# own bias column — TODO confirm).
weight, loss = mean_squared_error_gd(
    y_train, x_train_cleaned, np.ones(x_train_cleaned.shape[1]), 100, 1e-3
)
loss

1.215536

### Least squares with stochastic gradient descent

In [9]:
# Stochastic gradient descent on the MSE loss.
# Arguments after the data: the initial weights, then 100 and 1e-3 —
# presumably the number of iterations and the step size (confirm against
# mean_squared_error_sgd).
# The initial weight vector is sized from the matrix actually passed in rather
# than a hard-coded 31, so it stays valid whatever the current number of
# columns is (assumes the solver uses the matrix as-is, without appending its
# own bias column — TODO confirm).
# NOTE(review): no random seed is set in this cell, so the result may differ
# between runs if the solver samples with a global RNG — verify.
weight, loss = mean_squared_error_sgd(
    y_train, x_train_cleaned, np.ones(x_train_cleaned.shape[1]), 100, 1e-3
)
loss

0.775736

### Logistic regression

In [10]:
# Logistic regression.
# Arguments after the data: the initial weights, then 1000 and 1e-3 —
# presumably the number of iterations and the step size (confirm against
# logistic_regression).
# The initial weight vector is sized from the matrix actually passed in rather
# than a hard-coded 31, so it stays valid whatever the current number of
# columns is (assumes the solver uses the matrix as-is, without appending its
# own bias column — TODO confirm).
# NOTE(review): the recorded loss for this cell becomes negative. A logistic
# negative log-likelihood is non-negative for labels in {0, 1}, so either the
# labels use another encoding (e.g. {-1, 1}) or the loss is computed
# differently — verify before trusting these weights.
w, l = logistic_regression(
    y_train, x_train_cleaned, np.ones(x_train_cleaned.shape[1]), 1000, 1e-3
)

Current iteration=0, loss=1.995336631273058
Current iteration=100, loss=1.0491363661898327
Current iteration=200, loss=0.27006069549426187
Current iteration=300, loss=-0.37199300383762574
Current iteration=400, loss=-0.8974960829542113
Current iteration=500, loss=-1.3222079469218488
Current iteration=600, loss=-1.661546133055785
Current iteration=700, loss=-1.9325063200387396
Current iteration=800, loss=-2.15229341612981
Current iteration=900, loss=-2.335662630699634


In [11]:
# Regularized logistic regression.
# Arguments after the data: 0.2 (presumably the regularization strength), the
# initial weights, then 1000 and 1e-3 — presumably the number of iterations
# and the step size (confirm against reg_logistic_regression).
# The initial weight vector is sized from the matrix actually passed in rather
# than a hard-coded 31, so it stays valid whatever the current number of
# columns is (assumes the solver uses the matrix as-is, without appending its
# own bias column — TODO confirm).
# NOTE(review): the recorded loss for this cell becomes negative. A logistic
# negative log-likelihood with a non-negative penalty cannot be negative for
# labels in {0, 1}, so either the labels use another encoding (e.g. {-1, 1})
# or the loss is computed differently — verify before trusting these weights.
w, l = reg_logistic_regression(
    y_train, x_train_cleaned, 0.2, np.ones(x_train_cleaned.shape[1]), 1000, 1e-3
)

Current iteration=0, loss=1.995336631273058
Current iteration=100, loss=0.9989272803604264
Current iteration=200, loss=0.20757768961068648
Current iteration=300, loss=-0.41169620130994156
Current iteration=400, loss=-0.8829184065559729
Current iteration=500, loss=-1.2271743752375226
Current iteration=600, loss=-1.4682340429038723
Current iteration=700, loss=-1.632840431601386
Current iteration=800, loss=-1.745383871074444
Current iteration=900, loss=-1.8236927657875832
