In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#download the data
!wget http://opendata.cern.ch/record/328/files/atlas-higgs-challenge-2014-v2.csv.gz
!gunzip atlas-higgs-challenge-2014-v2.csv.gz

In [3]:
#Read the data (the input file contains ~820k different events)
df = pd.read_csv('atlas-higgs-challenge-2014-v2.csv')

In [9]:
#remove unnecessary variables
df = df.drop('EventId',axis=1).drop('Weight',axis=1).drop('KaggleSet',axis=1).drop('KaggleWeight',axis=1)

Define training, validation and test sets, at the beginning use small dataset for model selection

In [15]:
n_training = 100
#training_df = df[:500000]
#valid_df = df[500000:650000]
#test_df = df[650000:]
small_df = df[:n_training].replace({'s':1., 'b':0.}).copy()            #use 10k to practice and build the model

It is usefull to use [fastai](https://github.com/fastai) libraries to construct a model

In [17]:
from fastai.tabular import TabularDataBunch

convert the data to fastai an object which will be read by the fastai trainer, see [fastai.tabular](https://docs.fast.ai/tabular.data.html) for more info
```python
TabularDataBunch.from_df(path, df:DataFrame, dep_var:str, valid_idx:Collection[int], 
                         procs:Optional[Collection[TabularProc]]=None, cat_names:OptStrList=None, 
                         cont_names:OptStrList=None, classes:Collection[T_co]=None, test_df=None, 
                         bs:int=64, val_bs:int=None, num_workers:int=4, 
                         dl_tfms:Optional[Collection[Callable]]=None, device:device=None, 
                         collate_fn:Callable='data_collate', no_check:bool=False) → DataBunch
```
The `TabularDataBunch.from_df` requires to specify indeces for validation set, let's use 20% of the training data for that propose

In [18]:
valid_idx = range(0, int(0.2*n_training)) #indeces of events used for validation 
data = TabularDataBunch.from_df('.', small_df, valid_idx = valid_idx, dep_var='Label', bs=len(small_df)-len(valid_idx))

Let's construct a simple NN, for the propose we will use 4-Layer NN with H = [100,50,10]
![4-Layer NN](images/4L_NN.png)

In [19]:
import torch
import torch.nn as nn
#torch.cuda.set_device(1)

In [20]:
input_dim = small_df.shape[1] - 1 # dim = 31 - 'label' = 30
H = [100, 50, 10]             # dimentions of the hidden layers
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.layer1 = nn.Linear(input_dim,H[0])
        self.layer2 = nn.Linear(H[0],H[1])
        self.layer3 = nn.Linear(H[1],H[2])
        self.layer4 = nn.Linear(H[2],2)
        self.activation = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
        
    def forward(self, categories, x):
        #note: we dont use categories, but we need to set it up in the model
        print(x.shape)
        out  = self.layer1(x)
        out  = self.activation(out)
        out  = self.layer2(out)
        out  = self.activation(out)
        out  = self.layer3(out)
        out  = self.activation(out)
        out  = self.layer4(out)
        out  = self.sigmoid(out)
        
        return x

Use additional libraries:
- [basic_train](https://docs.fast.ai/basic_train.html#Learner): to define Learner (to train the data)
- [mectirs](https://docs.fast.ai/metrics.html#accuracy): the evaluation metric (accuracy)

In [21]:
from fastai.basic_train import Learner
from fastai.metrics import accuracy

In [22]:
net = Net()
learn = Learner(data, net, loss_func = nn.CrossEntropyLoss())

All set!! we can start to train our model!

In [None]:
learn.fit(epochs = 2, lr = 5e-3)

epoch,train_loss,valid_loss


In [14]:
X,y = next(iter(data.train_dl))

In [17]:
X[1].shape

torch.Size([80, 30])

In [33]:
learn.lr_find()

torch.Size([80, 30])


RuntimeError: CUDA error: out of memory

In [13]:
!nvidia-smi

TabularDataBunch;

Train: LabelList (80 items)
x: TabularList
DER_mass_vis 41.7650; DER_met_phi_centrality -1.1780; PRI_jet_subleading_eta -999.0000; DER_pt_tot 18.4370; PRI_lep_phi 1.8690; DER_sum_pt 57.1570; PRI_lep_eta 0.3410; PRI_tau_pt 27.4530; PRI_met_phi -1.1690; PRI_met 67.9090; PRI_jet_leading_eta -999.0000; DER_mass_MMC -999.0000; DER_pt_ratio_lep_tau 1.0820; DER_deltaeta_jet_jet -999.0000; DER_lep_eta_centrality -999.0000; PRI_jet_leading_pt -999.0000; DER_pt_h 18.4370; PRI_tau_eta 1.5800; PRI_tau_phi 2.5100; PRI_jet_num 0.0000; PRI_jet_all_pt 0.0000; PRI_jet_leading_phi -999.0000; DER_deltar_tau_lep 1.3950; DER_mass_jet_jet -999.0000; DER_mass_transverse_met_lep 89.7050; PRI_jet_subleading_pt -999.0000; PRI_lep_pt 29.7040; PRI_jet_subleading_phi -999.0000; DER_prodeta_jet_jet -999.0000; PRI_met_sumet 225.1390; ,DER_mass_vis 60.2310; DER_met_phi_centrality -0.8000; PRI_jet_subleading_eta -999.0000; DER_pt_tot 25.1560; PRI_lep_phi 1.3350; DER_sum_pt 64.8330; PRI_lep_eta 1.161