In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import lightgbm as lgb

In [5]:
DATA_DIR = '/home/jupyter/DREAM'
df = pd.read_csv(os.path.join(DATA_DIR, 'data.csv'))
print(df.shape)
df.head()

(138, 13)


Unnamed: 0,eventId,clusterId,PrimaryDecayMode,VecShowerEnergy,S_sum,C_sum,S_rad_mean,C_rad_mean,S_hot,C_hot,CoverS,dist2charge,label
0,0,0,0,31313.811557,27515.965,30114.676,2.837071,2.04982,0.110429,0.228488,1.094444,0.0,0
1,1,1,1,22119.663201,2722.9253,,3.355176,,0.165529,,,0.0,1
2,2,2,3,7815.112261,7369.4688,6288.264,2.286271,2.166214,0.086934,0.104079,0.853286,,2
3,2,3,3,2634.673219,2338.272,2582.522,1.868708,1.387169,0.185439,0.253425,1.104457,,2
4,4,6,2,27610.212415,14253.621,11683.265,8.165252,4.762796,0.049337,0.059803,0.81967,0.0,3


In [6]:
# train test split by eventId!
id_train, id_test = train_test_split(df.eventId.values,
                                     test_size=0.3, 
                                     random_state=42,
                                     stratify=df.PrimaryDecayMode.values)
print(id_train.shape, id_test.shape)

(96,) (42,)


In [7]:
excl_columns = ['PrimaryDecayMode', 'clusterId', 'VecShowerEnergy', 'label']

# train dataset
df_train = df[df.eventId.isin(id_train)].drop(['eventId'], axis=1)
y_train = df_train.label.values
z_train = df_train.VecShowerEnergy.values
df_train.drop(excl_columns, inplace=True, axis=1)
X_train = df_train.values
train_data = lgb.Dataset(X_train, label=y_train)

# test data set
df_test = df[df.eventId.isin(id_test)].drop(['eventId'], axis=1)
y_test = df_test.label.values
z_test = df_test.VecShowerEnergy.values
id_test = df_test.clusterId.values.reshape([-1,1])
df_test.drop(excl_columns, inplace=True, axis=1)
X_test = df_test.values
test_data = lgb.Dataset(X_test, label=y_test)

### Plot features

In [10]:
# xtitle = df_train.columns
# fig, ax = plt.subplots(4, 2, figsize=(8,12))
# for i in range(X_train.shape[1]):
#     for j in np.unique(y_train):
#         h = ax[i//2][i%2].hist(X_train[:,i][y_train==j], histtype='step')
#         ax[i//2][i%2].set_xlabel(xtitle[i])
#         ax[i//2][i%2].set_ylabel(f"Events") # / {binwidth}")

# Classification

(Boosted) decision trees 'unaffected' by feature scaling

### Training

In [17]:
params = {
    'num_class': 4,
    'application': 'multiclass',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_leaves': [15, 31]
}

In [18]:
cv_result_lgb = lgb.cv(params,
                       train_data,
                       num_boost_round=500,
                       early_stopping_rounds=20,
                       verbose_eval=1)

LightGBMError: Parameter num_leaves should be of type int, got "15,31"

In [16]:
cv_result_lgb

{'multi_logloss-mean': [0.9284776420252975,
  0.8564313519727802,
  0.7972013590115125,
  0.7447093702571576,
  0.6991945199692097,
  0.6581018780041115,
  0.6213252671415194,
  0.5892546965529647,
  0.5603987633320238,
  0.5327267239491723,
  0.5073869389818829,
  0.4859987141975185,
  0.4650393417843321,
  0.44597545280754874,
  0.42669906836073246,
  0.41087067326016397,
  0.3952066150267491,
  0.38196334030146684,
  0.36854722999742584,
  0.357794621409008,
  0.3478381021171591,
  0.3391471548734564,
  0.33206122042449093,
  0.32350559230023407,
  0.3167358570873281,
  0.3126235267122798,
  0.30686423969353765,
  0.30241862264224756,
  0.2999917002885145,
  0.29639105887992984,
  0.294093568439395,
  0.29093249804616916,
  0.2895357850930946,
  0.28813000223586827,
  0.28653389093927095,
  0.2854013928737992,
  0.2820408477909185],
 'multi_logloss-stdv': [0.057235396693610864,
  0.05649985680525976,
  0.060777508344323224,
  0.060035042860987986,
  0.06425117508708601,
  0.06753701

### Testing

In [15]:
y_pred = np.argmax(cv_result_lgb.predict(X_test), axis=1).reshape([-1,1])

AttributeError: 'dict' object has no attribute 'predict'

In [91]:
confusion_matrix(y_test, y_pred)

array([[ 33,   0,   0,   2],
       [  0,  10,   0,   4],
       [  0,   0, 154,   0],
       [  0,   0,   0,  18]])

In [99]:
df_new = pd.DataFrame(np.hstack((id_test, y_pred)), columns=['clusterId', 'predictedLabel'])
df_merged = df.merge(df_new, how='inner', on='clusterId')
df_merged.head()

Unnamed: 0,eventId,clusterId,PrimaryDecayMode,VecShowerEnergy,S_sum,S_rad_mean,S_hot,C_sum,C_rad_mean,C_hot,dist2charge,label,predictedLabel
0,48,117,4,27145.152914,21314.076172,2.100422,0.109939,23976.773438,1.978385,0.13685,38.708239,3,3
1,61,158,4,2803.146925,1603.018311,1.664667,0.10884,1857.292969,1.490503,0.152381,66.105149,3,3
2,61,161,4,10922.210278,8050.244141,1.966331,0.115087,8720.432617,2.006596,0.110548,50.164319,3,3
3,62,166,4,10361.45942,7399.282227,1.627021,0.202144,8136.710938,1.680261,0.186957,68.603123,3,3
4,62,165,4,7246.74764,1847.577271,0.756551,0.535721,272.964966,0.877988,0.571399,0.0,4,4


## Regression

### Training

### Testing

## Decay classification

In [175]:
def decayClassification(x):
    counts = x.predictedLabel.value_counts()
    mode = np.nan
    if 1 in counts.keys():
        mode = 1
    elif 2 in counts.keys():
        mode = 2
    elif 4 in counts.keys():
        mode = 3
        if 3 in counts.keys():
            if counts[3] == 2:
                mode = 4
            elif counts[3] == 4:
                mode = 5
    x['predictedPrimaryDecayMode'] = mode
    return x

In [180]:
df_merged = df_merged.groupby('eventId').apply(decayClassification)

In [181]:
confusion_matrix = pd.crosstab(df_merged.PrimaryDecayMode, df_merged.predictedPrimaryDecayMode, rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)

Predicted  1.0  2.0  3.0  4.0  5.0
Actual                            
1           33    0    2    0    0
2            0   10    4    0    0
3            0    0    3    0    0
4            0    0    7   18    0
5            0    0   16    0    5
