In [1]:
import pandas as pd
import numpy as np
import os
import sys

In [2]:
col_dirs = os.listdir('Collisions/')
print(col_dirs)

list_of_df = []
for col_run in col_dirs:
    list_of_df.append(pd.read_csv('Collisions/'+col_run+'/'+col_run+'_final.csv'))
    
collisions_data = pd.concat(list_of_df)
collisions_data = collisions_data.reset_index(drop=True)

['320917', '322179', '324980', '322022', '324878', '321305', '324293', '324970', '321457', '321475']


In [3]:
collisions_data.head(5)

Unnamed: 0,run,chamber,wheel,sector,station,avg_cluster_size,occupancy,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,lumisections,type,avg_efficiency,fid_eff_ch_level
0,320917,W-2_RB1in_S01,-2,1,1,1.49793,59592,0.00822,1.00361,1.50333,1925,Col,45.593319,78.73167
1,320917,W-2_RB1out_S01,-2,1,1,1.47164,53391,0.00995,1.00312,1.47624,1925,Col,51.693338,85.7043
2,320917,W-2_RB2in_S01,-2,1,2,1.56267,66112,0.010613,1.00334,1.5679,1925,Col,73.506504,89.979511
3,320917,W-2_RB2out_S01,-2,1,2,1.48085,58812,0.013395,1.00285,1.48508,1925,Col,78.429471,92.924316
4,320917,W-2_RB3+_S01,-2,1,3,1.49392,27509,0.0082,1.0012,1.4957,1925,Col,86.473099,91.954258


# Analyzing the records with zero fiducial efficiency

In [4]:
zero_fid_rows = collisions_data.loc[collisions_data['fid_eff_ch_level'] == 0]

# Calculating the percentage of the zero-efficiency valued records in the dataset
data_size = collisions_data.shape[0]
zero_data_size = zero_fid_rows.shape[0]
zero_percentage = round((zero_data_size/data_size)*100,2)
print("Dataset for all the chambers contains approximately "+str(zero_percentage)+"% of records with 0 fiducial efficiency\n\n")

# Records with zero-valued fid efficiency
print(zero_fid_rows)

Dataset for all the chambers contains approximately 8.19% of records with 0 fiducial efficiency


         run         chamber  wheel  sector  station  avg_cluster_size  \
7     320917    W-2_RB4-_S01     -2       1        4           0.00000   
12    320917    W-2_RB3+_S02     -2       2        3           0.00000   
21    320917    W-2_RB3-_S03     -2       3        3           0.00000   
33    320917    W-2_RB4-_S04     -2       4        4           0.00000   
41    320917    W-2_RB4-_S05     -2       5        4           0.00000   
...      ...             ...    ...     ...      ...               ...   
4756  321475   W+2_RB2in_S07      2       7        2           2.03058   
4757  321475  W+2_RB2out_S07      2       7        2           1.86599   
4768  321475    W+2_RB4+_S08      2       8        4           0.00000   
4790  321475    W+2_RB3-_S11      2      11        3           0.00000   
4797  321475    W+2_RB3-_S12      2      12        3           0.00000   

      occupan

### 1. Looking at incorrect zero efficiency valued records

In [5]:
# Collecting records with fiducial efficiency equal to 0 when some 
# other attributes contains non-zero values. Meaning there are
# some discrepancies in efficiency computation since the chamber
# contains metrics that would otherwise result in fiducial efficiency
# greater than 0.

incorrect_zero_efficiency = zero_fid_rows[(zero_fid_rows['avg_cluster_size'] != 0) | (zero_fid_rows['occupancy'] != 0) | (zero_fid_rows['avg_bx_dist'] != 0) | (zero_fid_rows['avg_no_of_clusters'] != 0) | (zero_fid_rows['avg_multiplicity'] != 0)]
incorrect_zero_efficiency.head(5)

Unnamed: 0,run,chamber,wheel,sector,station,avg_cluster_size,occupancy,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,lumisections,type,avg_efficiency,fid_eff_ch_level
436,320917,W+2_RB2in_S07,2,7,2,1.63335,65522,0.011467,1.00323,1.63862,1925,Col,0.0,0.0
437,320917,W+2_RB2out_S07,2,7,2,1.52535,54368,0.012064,1.00228,1.52882,1925,Col,0.0,0.0
916,322179,W+2_RB2in_S07,2,7,2,2.02692,88465,0.009463,1.00361,2.03424,1788,Col,0.0,0.0
917,322179,W+2_RB2out_S07,2,7,2,1.87182,73817,0.008901,1.00254,1.87658,1788,Col,0.0,0.0
1396,324980,W+2_RB2in_S07,2,7,2,2.03772,97301,0.008984,1.00353,2.04491,2325,Col,0.0,0.0


In [6]:
# As can be seen, almost all of the errors that occurred are
# computed of the chamber in the exact same location. It can be
# concluded that computation of fiducial efficiency is incorrect 
# when computing efficiency for Wheel +2, Sector 7, Station 2 IN and OUT
# chambers. Thus, needs further investigation, as all of these chambers
# for all the runs (20 chambers in total) contains incorrect values.

# The values should be probably removed when applying machine learning 
# models as they are incorrect. However,
# all the other records containing 0 fid efficiency seems to be appropriate,
# as all the other metrics are also zero.
print(" Chambers of incorrectly computed fiducial efficiency:")
print(incorrect_zero_efficiency['chamber'].values)



# There is also this one chamber which seems to have incorrect
# data assigned to him, such as values of 41 for avg_cluster size,
# occupancy, avg_multiplicity and so on. Can say that it's an 
# noise in the data that has to be removed as well for further 
# modeling. 
print("\n\n An outlier chamber containing fiducial efficiency of 0:")
incorrect_zero_efficiency[incorrect_zero_efficiency['chamber']=='W-1_RB2out_S07']

 Chambers of incorrectly computed fiducial efficiency:
['W+2_RB2in_S07' 'W+2_RB2out_S07' 'W+2_RB2in_S07' 'W+2_RB2out_S07'
 'W+2_RB2in_S07' 'W+2_RB2out_S07' 'W+2_RB2in_S07' 'W+2_RB2out_S07'
 'W-1_RB2out_S07' 'W+2_RB2in_S07' 'W+2_RB2out_S07' 'W+2_RB2in_S07'
 'W+2_RB2out_S07' 'W+2_RB2in_S07' 'W+2_RB2out_S07' 'W+2_RB2in_S07'
 'W+2_RB2out_S07' 'W+2_RB2in_S07' 'W+2_RB2out_S07' 'W+2_RB2in_S07'
 'W+2_RB2out_S07']


 An outlier chamber containing fiducial efficiency of 0:


Unnamed: 0,run,chamber,wheel,sector,station,avg_cluster_size,occupancy,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,lumisections,type,avg_efficiency,fid_eff_ch_level
2069,324878,W-1_RB2out_S07,-1,7,2,41.0,41,1.0,1.0,41.0,1800,Col,0.0,0.0


### 2. Looking at possibly correct zero efficiency valued records

In [7]:
# Collecting records with fiducial efficiency equal to 0 when all the 
# other attributes (avg_cluster_size, occupancy, etc.) also contains
# zero values. Meaning the fiducial efficiency of zero for such chambers
# should be correct.

correct_zero_efficiency = zero_fid_rows[(zero_fid_rows['avg_cluster_size'] == 0) & (zero_fid_rows['occupancy'] == 0) & (zero_fid_rows['avg_bx_dist'] == 0) & (zero_fid_rows['avg_no_of_clusters'] == 0) & (zero_fid_rows['avg_multiplicity'] == 0)]
correct_zero_efficiency.head(5)

Unnamed: 0,run,chamber,wheel,sector,station,avg_cluster_size,occupancy,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,lumisections,type,avg_efficiency,fid_eff_ch_level
7,320917,W-2_RB4-_S01,-2,1,4,0.0,0,0.0,0.0,0.0,1925,Col,0.0,0.0
12,320917,W-2_RB3+_S02,-2,2,3,0.0,0,0.0,0.0,0.0,1925,Col,0.0,0.0
21,320917,W-2_RB3-_S03,-2,3,3,0.0,0,0.0,0.0,0.0,1925,Col,0.0,0.0
33,320917,W-2_RB4-_S04,-2,4,4,0.0,0,0.0,0.0,0.0,1925,Col,0.0,0.0
41,320917,W-2_RB4-_S05,-2,5,4,0.0,0,0.0,0.0,0.0,1925,Col,0.0,0.0


In [8]:
# It can be clearly seen that chambers in the outer layers
# (station 4 or 3) tend to have more zero valued fiducial
# efficiency rather than chambers in the first two Stations
# (Probably reasonable? as the detectors are further away
# and it's safe to assume they might be detecting less muons)
print("Stations with zero efficiency distribution:")
print(correct_zero_efficiency['station'].value_counts())

# There is no strong correlation between 0 fiducial efficiency
# and Sector numbers. However, Sectors 1 and 7 at the top
# seems reasonable based on their perpendicular position
# in the detector, receiving less cosmic muons?
# (however, what about sector 4 with most 0 efficiency records??)
print("\nSectors with zero efficiency distribution:")
print(correct_zero_efficiency['sector'].value_counts())


# No strong correlation between wheel and 0 fiducial efficiency
# whatsoever, except for the negative side of the barrel
# containing more 0-efficiency chambers.
print("\n Wheels with zero efficiency distribution:")
print(correct_zero_efficiency['wheel'].value_counts())

Stations with zero efficiency distribution:
4    252
3    103
2     17
Name: station, dtype: int64

Sectors with zero efficiency distribution:
4     93
1     60
7     55
8     40
5     32
2     30
12    30
11    20
3     12
Name: sector, dtype: int64

 Wheels with zero efficiency distribution:
-1    165
-2     85
 0     52
 1     40
 2     30
Name: wheel, dtype: int64


### 3. Conclusion on zero efficiency valued records

Assuming that the records analyzed in 1. are damaged and they cannot be used for further computations it is safe to believe that the chamber will have the fiducial efficiency of 0 if and only if all the other attributes of such chamber contain 0 values, as inspected above in 2.


Having this in mind, we might consider dropping all the records containing zero fiducial efficiency value as they won't provide us with any relevant information when calculating the efficiency of a chamber, since only the records containing all the attributes (avg_cluster_size, occupancy, avg_bx_dist, avg_no_of_clusters, avg_multiplicity) of value zero will have a final fiducial efficiency equal to 0. 

Moreover, the data suggests that some actual fiducial efficiency values start at 40 (see cell below)

In [9]:
# Looking at the histograms in EDA_Collisions we can see that the dataset
# contains almost no records that have fiducial efficiency greater that 0 
# and less than 40. To make sure, we check the exsiting data:

eff_0_to_40 = collisions_data[(collisions_data['fid_eff_ch_level']>0)&(collisions_data['fid_eff_ch_level']<40)]
print(eff_0_to_40)
print("\n There are",eff_0_to_40.shape[0],"records containing fid efficiency between 0 and 40 in the dataset")

# As we can see, no fiducial efficiency values between 0 and 40 exists for this dataset.

Empty DataFrame
Columns: [run, chamber, wheel, sector, station, avg_cluster_size, occupancy, avg_bx_dist, avg_no_of_clusters, avg_multiplicity, lumisections, type, avg_efficiency, fid_eff_ch_level]
Index: []

 There are 0 records containing fid efficiency between 0 and 40 in the dataset


# -------- Problem formulation --------

As the problem itself suggests, predicting the efficiency of a chamber makes it a regression problem. As for regression model, it tends to be more difficult to minimize the objective function of such model. Therefore, it is decided to firstly try to solve a classification problem, splitting the dataset into N classes based on the fiducial efficiency into N equal sized buckets and trying out multiple classification models to see what output can be expected. 

Later on, the data will be used to train the regression models to be able to predict the numerical value for fiducial efficiency for a single chamber. 

Such model would help to quickly approximate the fiducial efficiency of a chamber after a Collision or a Cosmic run based on the initial metrics collected from the detectors (such as avg_cluster_size, occupancy, etc.). Also, the fiducial efficiency could be approximated for the chambers that were incorrectly assigned with the fiducial efficiency of 0 (such as the records analyzed in 1.) or if the 'correct' zero-valued records (analyzed in 2.) appears to be a discrepancy as well, such chambers could be assigned by a fiducial efficiency approximation of such model.

Modeling decisions will be based on the data analysis performed in **EDA_Collisions.ipynb** notebook.

# Prepare data for classification task (assign labels, shuffle, remove unnecessary columns, etc.)

The data will have to be split into N classes (buckets) based on their fiducial efficiency values to have an equal distribution for each class

As mentioned above, the zero valued records will have to be removed so that the model could use only the most correct data to be able to predict the efficiency as best as possible.

In [10]:
# testing qcut function (zero values are not yet removed, might need to discuss)

bins = pd.qcut(collisions_data['fid_eff_ch_level'],3)
print(bins.value_counts())

(-0.001, 89.891]    1600
(89.891, 97.075]    1600
(97.075, 100.0]     1600
Name: fid_eff_ch_level, dtype: int64
