# Preprocess dataset
Preprocess the dataset in order to help the UML model to better understand the differences between different jets.  

We do the following:
- Normalize energy: let $E_p$ be the particle energy, $E_j$ the corresponding jet energy, then $E_n=\frac{E_p-E[E_p]}{E_j}$
- Momenta w.r.t. Jet: Rotate the particle 3-momentum s.t. the Jet 3-momentum points up
- Delta angles: using the Rodrigues' rotation formula, rotate the angles s.t. the Jet points up

In [8]:
import pandas as pd
import numpy as np

## Read data

In [9]:
jet_df_path = '../data/jet_df.csv'
particle_df_path = '../data/particle_df.csv'

jet_df = pd.read_csv(jet_df_path).set_index('jetID')
particle_df = pd.read_csv(particle_df_path).set_index('jetID', append=True, drop=False)
preprocessed_df = particle_df.reset_index(drop=True)

jet_df

Unnamed: 0_level_0,eventID,nParticles,jetArea,jetPx,jetPy,jetPz,jetE,jetPolarPx,jetPolarPy,jetPolarPz,jetPolarE,jetPhi,jetTheta
jetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,23,2.004635,-241.374752,0.754266,219.719568,330.240926,241.375931,0.816488,3.138468,50.200008,3.138468,0.832331
1,0,41,1.974715,178.037584,-5.547558,536.195118,566.155630,178.123993,1.821678,-0.031149,36.039368,-0.031149,0.320730
2,1,26,2.004635,157.606353,198.511810,-237.355351,348.306921,253.469330,-0.835698,0.899762,27.082243,0.899762,2.323376
3,1,20,2.024582,-141.065390,-96.179119,-883.349503,900.496654,170.733322,-2.345976,-2.543183,37.921139,-2.543183,2.950667
4,2,62,2.044528,56.939189,187.775177,-313.881867,372.725501,196.218216,-1.248802,1.276378,43.599136,1.276378,2.582897
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5846,3664,27,1.974715,-151.225081,90.369100,-323.509105,370.442555,176.169235,-1.367961,2.602954,39.165623,2.602954,2.642938
5847,3665,31,2.034555,-213.921806,-120.123401,638.003886,685.286867,245.340927,1.683915,-2.629941,48.754082,-2.629941,0.367112
5848,3666,12,1.994662,107.221678,-166.477443,936.815281,958.434796,198.018250,2.258261,-0.998607,41.990013,-0.998607,0.208308
5849,3667,31,2.024582,228.747313,-138.803456,1020.375888,1056.391140,267.566315,2.048469,-0.545403,56.599957,-0.545403,0.256449


## Energy

In [10]:
preprocessed_df['particleE'] = ((particle_df['particleE'] - particle_df['particleE'].groupby(particle_df['jetID']).transform('mean')) / jet_df['jetE']).reset_index(drop=True)
preprocessed_df

Unnamed: 0,eventID,jetID,particleType,particleVx,particleVy,particleVz,particlePx,particlePy,particlePz,particleE,particlePolarPx,particlePolarPy,particlePolarPz,particlePolarE,particlePhi,particleTheta
0,0,0,0,0.000000,0.000000,0.000000,-115.595071,5.513218,107.093643,0.433873,115.726471,0.827630,3.093935,2.347607e-01,3.093935,0.824122
1,0,0,0,0.000000,0.000000,0.000000,-83.072377,4.831796,75.798599,0.297262,83.212776,0.816948,3.083494,5.078805e-01,3.083494,0.831991
2,0,0,-211,-0.981025,1.422285,-33.456345,-11.168506,-8.774579,9.043395,0.007404,14.203125,0.600055,-2.475661,1.395264e-01,-2.475661,1.003814
3,0,0,130,0.073932,0.089866,-2.399344,-8.233158,-1.087632,6.647210,-0.011373,8.304688,0.732994,-3.010249,-1.192093e-07,-3.010249,0.895801
4,0,0,-211,0.073905,0.089409,-2.399101,-8.048296,0.478376,6.097900,-0.012971,8.062500,0.698202,3.082224,1.395264e-01,3.082224,0.923257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168053,3667,5850,130,0.074860,0.090503,0.155008,-0.182099,0.969869,-4.018967,-0.146314,0.986816,-2.112186,1.756392,0.000000e+00,1.756392,2.900816
168054,3667,5850,130,0.074860,0.090503,0.155008,-0.081700,0.617667,-1.643846,-0.149106,0.623047,-1.697440,1.702304,0.000000e+00,1.702304,2.779304
168055,3667,5850,-211,-3.592961,-1.674459,-4.924807,-0.173333,0.360385,-1.548881,-0.149285,0.399902,-2.063478,2.019101,1.395264e-01,2.019101,2.888923
168056,3667,5850,-211,-3.274410,-1.363759,-0.610807,-0.159524,0.365642,-1.511836,-0.149327,0.398926,-2.042421,1.982186,1.395264e-01,1.982186,2.883604


## Angles

In [11]:
preprocessed_df['particlePhi'] = (particle_df['particlePhi'] - jet_df['jetPhi']).reset_index(drop=True)
preprocessed_df['particleTheta'] = (particle_df['particleTheta'] - jet_df['jetTheta']).reset_index(drop=True)
preprocessed_df

Unnamed: 0,eventID,jetID,particleType,particleVx,particleVy,particleVz,particlePx,particlePy,particlePz,particleE,particlePolarPx,particlePolarPy,particlePolarPz,particlePolarE,particlePhi,particleTheta
0,0,0,0,0.000000,0.000000,0.000000,-115.595071,5.513218,107.093643,0.433873,115.726471,0.827630,3.093935,2.347607e-01,-0.044533,-0.008209
1,0,0,0,0.000000,0.000000,0.000000,-83.072377,4.831796,75.798599,0.297262,83.212776,0.816948,3.083494,5.078805e-01,-0.054973,-0.000340
2,0,0,-211,-0.981025,1.422285,-33.456345,-11.168506,-8.774579,9.043395,0.007404,14.203125,0.600055,-2.475661,1.395264e-01,-5.614129,0.171484
3,0,0,130,0.073932,0.089866,-2.399344,-8.233158,-1.087632,6.647210,-0.011373,8.304688,0.732994,-3.010249,-1.192093e-07,-6.148717,0.063471
4,0,0,-211,0.073905,0.089409,-2.399101,-8.048296,0.478376,6.097900,-0.012971,8.062500,0.698202,3.082224,1.395264e-01,-0.056243,0.090926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168053,3667,5850,130,0.074860,0.090503,0.155008,-0.182099,0.969869,-4.018967,-0.146314,0.986816,-2.112186,1.756392,0.000000e+00,-0.618198,-0.003610
168054,3667,5850,130,0.074860,0.090503,0.155008,-0.081700,0.617667,-1.643846,-0.149106,0.623047,-1.697440,1.702304,0.000000e+00,-0.672285,-0.125122
168055,3667,5850,-211,-3.592961,-1.674459,-4.924807,-0.173333,0.360385,-1.548881,-0.149285,0.399902,-2.063478,2.019101,1.395264e-01,-0.355489,-0.015503
168056,3667,5850,-211,-3.274410,-1.363759,-0.610807,-0.159524,0.365642,-1.511836,-0.149327,0.398926,-2.042421,1.982186,1.395264e-01,-0.392404,-0.020821


## 3-Momenta

In [12]:
def project(mod, theta, phi):
    x = mod*np.sin(theta)*np.cos(phi)
    y = mod*np.sin(theta)*np.sin(phi)
    z = mod*np.cos(theta)
    return x, y, z

In [13]:
momenta_mod = ((particle_df['particlePx']**2 + particle_df['particlePy']**2 + particle_df['particlePz']**2)**0.5).reset_index(drop=True)
preprocessed_df['particlePx'], preprocessed_df['particlePy'], preprocessed_df['particlePz'] = project(momenta_mod, preprocessed_df['particleTheta'], preprocessed_df['particlePhi'])
preprocessed_df

Unnamed: 0,eventID,jetID,particleType,particleVx,particleVy,particleVz,particlePx,particlePy,particlePz,particleE,particlePolarPx,particlePolarPy,particlePolarPz,particlePolarE,particlePhi,particleTheta
0,0,0,0,0.000000,0.000000,0.000000,-1.292998,0.057620,157.670509,0.433873,115.726471,0.827630,3.093935,2.347607e-01,-0.044533,-0.008209
1,0,0,0,0.000000,0.000000,0.000000,-0.038184,0.002101,112.560172,0.297262,83.212776,0.816948,3.083494,5.078805e-01,-0.054973,-0.000340
2,0,0,-211,-0.981025,1.422285,-33.456345,2.253819,1.782139,16.590842,0.007404,14.203125,0.600055,-2.475661,1.395264e-01,-5.614129,0.171484
3,0,0,130,0.073932,0.089866,-2.399344,0.668614,0.090453,10.615932,-0.011373,8.304688,0.732994,-3.010249,-1.192093e-07,-6.148717,0.063471
4,0,0,-211,0.073905,0.089409,-2.399101,0.916440,-0.051598,10.067063,-0.012971,8.062500,0.698202,3.082224,1.395264e-01,-0.056243,0.090926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168053,3667,5850,130,0.074860,0.090503,0.155008,-0.012173,0.008658,4.138318,-0.146314,0.986816,-2.112186,1.756392,0.000000e+00,-0.618198,-0.003610
168054,3667,5850,130,0.074860,0.090503,0.155008,-0.171647,0.136628,1.744216,-0.149106,0.623047,-1.697440,1.702304,0.000000e+00,-0.672285,-0.125122
168055,3667,5850,-211,-3.592961,-1.674459,-4.924807,-0.023248,0.008631,1.599481,-0.149285,0.399902,-2.063478,2.019101,1.395264e-01,-0.355489,-0.015503
168056,3667,5850,-211,-3.274410,-1.363759,-0.610807,-0.030079,0.012449,1.563243,-0.149327,0.398926,-2.042421,1.982186,1.395264e-01,-0.392404,-0.020821


## Save

In [16]:
ofname = particle_df_path.split('.csv')[0] + '_preprocessed.csv'
preprocessed_df.to_csv(ofname, index=False)