In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.svm import SVC
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None 

In [39]:
col_dirs = os.listdir('Collisions/')
print(col_dirs)

list_of_df = []
for col_run in col_dirs:
    list_of_df.append(pd.read_csv('Collisions/'+col_run+'/'+col_run+'_final2.csv'))
    
collisions_data = pd.concat(list_of_df)
collisions_data = collisions_data.reset_index(drop=True)
print(collisions_data.shape)

['325022', '321887', '322088', '321917', '322332', '325117', '323474', '324764', '322407', '324245', '322633', '324209', '321908', '324747', '323700', '324729', '325170', '321961', '322319', '321818', '323524', '325172', '322179', '322605', '325000', '322381', '323693', '325057', '324202', '323857', '323725', '321975', '322348', '321774', '321834', '323775', '321988', '321777', '321780', '324998', '322324', '321879', '321755', '324980', '323997', '323526', '323493', '324772', '322022', '324237', '324835', '324318', '322204', '322079', '325101', '322355', '323696', '324997', '323473', '324878', '321833', '321730', '324021', '323488', '322106', '324769', '321732', '323778', '324022', '321990', '323727', '324791', '324897', '321973', '323790', '322356', '323841', '324841', '321933', '321712', '323487', '321813', '324315', '322625', '323983', '325099', '324293', '324970', '322068', '325001', '321760', '322617', '321909', '321781', '321831', '322431', '323755', '324205', '325159', '321758',

In [40]:
collisions_data.head(5)

Unnamed: 0,run,chamber,wheel,sector,station,avg_cluster_size,occupancy,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,lumisections,type,avg_efficiency,contains_zero_roll,rolls_count,fid_eff_ch_level
0,325022,W-2_RB1in_S01,-2,1,1,1.92231,81335,0.005341,1.00396,1.92993,1588,Col,52.692854,0,2,96.813076
1,325022,W-2_RB1out_S01,-2,1,1,1.85157,71452,0.007152,1.0033,1.85768,1588,Col,57.521912,0,2,97.891937
2,325022,W-2_RB2in_S01,-2,1,2,2.00157,85179,0.00752,1.00368,2.00894,1588,Col,79.87812,0,2,98.281658
3,325022,W-2_RB2out_S01,-2,1,2,1.85402,73117,0.011106,1.00323,1.86001,1588,Col,82.926298,0,3,98.574773
4,325022,W-2_RB3+_S01,-2,1,3,1.86561,34053,0.003506,1.00115,1.86776,1588,Col,91.554276,0,2,98.225697


# Analyzing the records with zero fiducial efficiency

In [41]:
zero_fid_rows = collisions_data.loc[collisions_data['fid_eff_ch_level'] == 0]

# Calculating the percentage of the zero-efficiency valued records in the dataset
data_size = collisions_data.shape[0]
zero_data_size = zero_fid_rows.shape[0]
zero_percentage = round((zero_data_size/data_size)*100,2)
print("Dataset for all the chambers contains approximately "+str(zero_percentage)+"% of records with 0 fiducial efficiency\n\n")

# Records with zero-valued fid efficiency
print(zero_fid_rows)

Dataset for all the chambers contains approximately 8.4% of records with 0 fiducial efficiency


          run         chamber  wheel  sector  station  avg_cluster_size  \
7      325022    W-2_RB4-_S01     -2       1        4           0.00000   
12     325022    W-2_RB3+_S02     -2       2        3           0.00000   
21     325022    W-2_RB3-_S03     -2       3        3           0.00000   
28     325022    W-2_RB3+_S04     -2       4        3           0.00000   
33     325022    W-2_RB4-_S04     -2       4        4           0.00000   
...       ...             ...    ...     ...      ...               ...   
63316  322322   W+2_RB2in_S07      2       7        2           2.02445   
63317  322322  W+2_RB2out_S07      2       7        2           1.86836   
63328  322322    W+2_RB4+_S08      2       8        4           0.00000   
63350  322322    W+2_RB3-_S11      2      11        3           0.00000   
63357  322322    W+2_RB3-_S12      2      12        3           0.00000   

  

### 1. Looking at incorrect zero efficiency valued records

In [42]:
# Collecting records with fiducial efficiency equal to 0 when some 
# other attributes contains non-zero values. Meaning there are
# some discrepancies in efficiency computation since the chamber
# contains metrics that would otherwise result in fiducial efficiency
# greater than 0.

incorrect_zero_efficiency = zero_fid_rows[(zero_fid_rows['avg_cluster_size'] != 0) | (zero_fid_rows['occupancy'] != 0) | (zero_fid_rows['avg_bx_dist'] != 0) | (zero_fid_rows['avg_no_of_clusters'] != 0) | (zero_fid_rows['avg_multiplicity'] != 0)]
incorrect_zero_efficiency.head(50)

Unnamed: 0,run,chamber,wheel,sector,station,avg_cluster_size,occupancy,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,lumisections,type,avg_efficiency,contains_zero_roll,rolls_count,fid_eff_ch_level
436,325022,W+2_RB2in_S07,2,7,2,2.03245,81745,0.008553,1.00397,2.04051,1588,Col,0.0,1,2,0.0
437,325022,W+2_RB2out_S07,2,7,2,1.88529,68061,0.009363,1.00253,1.89006,1588,Col,0.0,1,3,0.0
916,321887,W+2_RB2in_S07,2,7,2,2.01219,52160,0.010184,1.00372,2.01967,951,Col,0.0,1,2,0.0
917,321887,W+2_RB2out_S07,2,7,2,1.87155,43770,0.008979,1.00227,1.8758,951,Col,0.0,1,3,0.0
1396,322088,W+2_RB2in_S07,2,7,2,2.04925,32997,0.009067,1.00362,2.05666,604,Col,0.0,1,2,0.0
1397,322088,W+2_RB2out_S07,2,7,2,1.87113,27036,0.007613,1.00243,1.87568,604,Col,0.0,1,3,0.0
1876,321917,W+2_RB2in_S07,2,7,2,1.99279,49459,0.072364,1.00409,2.00093,808,Col,0.0,1,2,0.0
1877,321917,W+2_RB2out_S07,2,7,2,1.87386,41787,0.077623,1.0027,1.87891,808,Col,0.0,1,3,0.0
2069,322332,W-1_RB2out_S07,-1,7,2,7.0,7,1.0,1.0,7.0,1079,Col,0.0,1,2,0.0
2356,322332,W+2_RB2in_S07,2,7,2,2.01266,59285,0.010083,1.00354,2.01979,1079,Col,0.0,1,2,0.0


In [43]:
# As can be seen, almost all of the errors that occurred are
# computed of the chamber in the exact same location. It can be
# concluded that computation of fiducial efficiency is incorrect 
# when computing efficiency for Wheel +2, Sector 7, Station 2 IN and OUT
# chambers. Thus, needs further investigation, as all of these chambers
# for all the runs (20 chambers in total) contains incorrect values.

# The values should be probably removed when applying machine learning 
# models as they are incorrect. However,
# all the other records containing 0 fid efficiency seems to be appropriate,
# as all the other metrics are also zero.
print(" Chambers of incorrectly computed fiducial efficiency:")
print(incorrect_zero_efficiency['chamber'].value_counts())



# There are also few chambers that seem to have incorrect
# data assigned to them, such as values of 41 for avg_cluster size,
# occupancy, avg_multiplicity and so on. Can say that it's an 
# noise in the data that has to be removed as well for further 
# modeling. 
print("\n\n The outlier chambers containing fiducial efficiency of 0:")
incorrect_zero_efficiency[(incorrect_zero_efficiency['wheel'] != 2) | (incorrect_zero_efficiency['sector'] != 7) | (incorrect_zero_efficiency['station'] != 2)]

 Chambers of incorrectly computed fiducial efficiency:
W+2_RB2in_S07     132
W+2_RB2out_S07    132
W-1_RB2out_S07      8
W-2_RB4-_S05        3
W+2_RB4+_S01        2
W-2_RB4+_S01        2
W+2_RB4-_S01        1
W-1_RB4-_S01        1
W-2_RB3+_S04        1
Name: chamber, dtype: int64


 The outlier chambers containing fiducial efficiency of 0:


Unnamed: 0,run,chamber,wheel,sector,station,avg_cluster_size,occupancy,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,lumisections,type,avg_efficiency,contains_zero_roll,rolls_count,fid_eff_ch_level
2069,322332,W-1_RB2out_S07,-1,7,2,7.0,7,1.0,1.0,7.0,1079,Col,0.0,1,2,0.0
6268,324747,W-2_RB3+_S04,-2,4,3,1.0,1,0.0,1.0,1.0,1116,Col,0.0,1,2,0.0
7206,324729,W-2_RB4+_S01,-2,1,4,1.85693,1220,0.006088,1.0,1.85693,209,Col,50.0,1,2,0.0
7590,324729,W+2_RB4+_S01,2,1,4,1.65364,931,0.007105,1.0,1.65364,209,Col,66.666666,1,2,0.0
13481,324202,W-2_RB4-_S05,-2,5,4,1.0,9,1.22222,1.0,1.0,238,Col,0.0,1,2,0.0
14069,323857,W-1_RB2out_S07,-1,7,2,6.0,6,0.0,1.0,6.0,369,Col,0.0,1,2,0.0
26069,325101,W-1_RB2out_S07,-1,7,2,1.0,1,1.0,1.0,1.0,483,Col,0.0,1,2,0.0
28469,324878,W-1_RB2out_S07,-1,7,2,41.0,41,1.0,1.0,41.0,1800,Col,0.0,1,2,0.0
30869,322106,W-1_RB2out_S07,-1,7,2,1.0,1,0.0,1.0,1.0,876,Col,0.0,1,2,0.0
35189,321973,W-1_RB2out_S07,-1,7,2,3.0,3,3.0,1.0,3.0,1244,Col,0.0,1,2,0.0


### 2. Looking at possibly correct zero efficiency valued records

In [44]:
# Collecting records with fiducial efficiency equal to 0 when all the 
# other attributes (avg_cluster_size, occupancy, etc.) also contains
# zero values. Meaning the fiducial efficiency of zero for such chambers
# should be correct.

correct_zero_efficiency = zero_fid_rows[(zero_fid_rows['avg_cluster_size'] == 0) & (zero_fid_rows['occupancy'] == 0) & (zero_fid_rows['avg_bx_dist'] == 0) & (zero_fid_rows['avg_no_of_clusters'] == 0) & (zero_fid_rows['avg_multiplicity'] == 0)]
correct_zero_efficiency.head(5)

Unnamed: 0,run,chamber,wheel,sector,station,avg_cluster_size,occupancy,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,lumisections,type,avg_efficiency,contains_zero_roll,rolls_count,fid_eff_ch_level
7,325022,W-2_RB4-_S01,-2,1,4,0.0,0,0.0,0.0,0.0,1588,Col,0.0,1,2,0.0
12,325022,W-2_RB3+_S02,-2,2,3,0.0,0,0.0,0.0,0.0,1588,Col,0.0,1,2,0.0
21,325022,W-2_RB3-_S03,-2,3,3,0.0,0,0.0,0.0,0.0,1588,Col,0.0,1,2,0.0
28,325022,W-2_RB3+_S04,-2,4,3,0.0,0,0.0,0.0,0.0,1588,Col,0.0,1,2,0.0
33,325022,W-2_RB4-_S04,-2,4,4,0.0,0,0.0,0.0,0.0,1588,Col,0.0,1,2,0.0


In [45]:
# It can be clearly seen that chambers in the outer layers
# (station 4 or 3) tend to have more zero valued fiducial
# efficiency rather than chambers in the first two Stations
# (Probably reasonable? as the detectors are further away
# and it's safe to assume they might be detecting less muons)
print("Stations with zero efficiency distribution:")
print(correct_zero_efficiency['station'].value_counts())

# There is no strong correlation between 0 fiducial efficiency
# and Sector numbers. However, Sectors 1 and 7 at the top
# seems reasonable based on their perpendicular position
# in the detector, receiving less cosmic muons?
# (however, what about sector 4 with most 0 efficiency records??)
print("\nSectors with zero efficiency distribution:")
print(correct_zero_efficiency['sector'].value_counts())


# No strong correlation between wheel and 0 fiducial efficiency
# whatsoever, except for the negative side of the barrel
# containing more 0-efficiency chambers.
print("\n Wheels with zero efficiency distribution:")
print(correct_zero_efficiency['wheel'].value_counts())

Stations with zero efficiency distribution:
4    3360
3    1395
2     261
1      24
Name: station, dtype: int64

Sectors with zero efficiency distribution:
4     1215
1      807
7      759
8      552
5      416
12     412
2      412
11     278
3      148
9       24
10      17
Name: sector, dtype: int64

 Wheels with zero efficiency distribution:
-1    2210
-2    1103
 0     708
 1     622
 2     397
Name: wheel, dtype: int64


### 3. Conclusion on zero efficiency valued records

Assuming that the records analyzed in 1. are damaged and they cannot be used for further computations it is safe to believe that the chamber will have the fiducial efficiency of 0 if and only if all the other attributes of such chamber contain 0 values, as inspected above in 2.


Having this in mind, we might consider dropping all the records containing zero fiducial efficiency value as they won't provide us with any relevant information when calculating the efficiency of a chamber, since only the records containing all the attributes (avg_cluster_size, occupancy, avg_bx_dist, avg_no_of_clusters, avg_multiplicity) of value zero will have a final fiducial efficiency equal to 0. 

Moreover, the data suggests that some actual fiducial efficiency values start at 40 (see cell below), thus, it is possible to assume that the 'correct' records with fiducial efficiency of 0 might be also discrepancies of the detectors, since the other fiducial efficiency values are nowhere close to 0 (closest one to be over 40).

In [46]:
# Looking at the histograms in EDA_Collisions we can see that the dataset
# contains almost no records that have fiducial efficiency greater that 0 
# and less than 40. To make sure, we check the exsiting data:


eff_0_to_40 = collisions_data[(collisions_data['fid_eff_ch_level']>0)&(collisions_data['fid_eff_ch_level']<40)]

print(collisions_data[(collisions_data['fid_eff_ch_level']>=80)&(collisions_data['fid_eff_ch_level']<90)].shape)

print("\n There are",eff_0_to_40.shape[0],"records containing fid efficiency between 0 and 40 in the dataset")
eff_0_to_40.head(5)

# As we can see, there are only 50 records with fid_eff between 0 and 40 for this dataset.

(3765, 16)

 There are 50 records containing fid efficiency between 0 and 40 in the dataset


Unnamed: 0,run,chamber,wheel,sector,station,avg_cluster_size,occupancy,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,lumisections,type,avg_efficiency,contains_zero_roll,rolls_count,fid_eff_ch_level
921,321887,W+2_RB4-_S07,2,7,4,1.81956,20077,0.010332,1.001,1.82137,951,Col,56.666666,1,2,25.0
4361,324245,W-2_RB4-_S05,-2,5,4,1.03333,62,0.183333,1.0,1.03333,1683,Col,1.0,0,2,0.627335
5295,324209,W-2_RB4-_S02,-2,2,4,1.71717,1020,0.005051,1.0,1.71717,557,Col,0.0,1,2,37.5
5321,324209,W-2_RB4-_S05,-2,5,4,0.0,0,0.0,0.0,0.0,557,Col,0.0,1,2,0.5
6761,323700,W-2_RB4-_S05,-2,5,4,1.0,2,0.5,1.0,1.0,446,Col,0.0,1,2,5.977383


# -------- Problem formulation --------

As the problem itself suggests, predicting the efficiency of a chamber makes it a regression problem. As for regression model, it tends to be more difficult to minimize the objective function of such model. Therefore, it is decided to firstly try to solve a classification problem, splitting the dataset into N classes based on the fiducial efficiency into N equal sized buckets and trying out multiple classification models to see what output can be expected. 

Later on, the data will be used to train the regression models to be able to predict the numerical value for fiducial efficiency for a single chamber. 

Such model would help to quickly approximate the fiducial efficiency of a chamber after a Collision or a Cosmic run based on the initial metrics collected from the detectors (such as avg_cluster_size, occupancy, etc.). Also, the fiducial efficiency could be approximated for the chambers that were incorrectly assigned with the fiducial efficiency of 0 (such as the records analyzed in 1.) or if the 'correct' zero-valued records (analyzed in 2.) appears to be a discrepancy as well, such chambers could be assigned by a fiducial efficiency approximation of such model.

Modeling decisions will be based on the data analysis performed in **EDA_Collisions.ipynb** notebook.

# Prepare data for classification task (drop unnecessary columns, assign labels, standardize, shuffle, split into train/test/validate.)

Some attributes in the dataset are not useful for training the model. Such attributes would be:
* run (run number should not be an indicator for the efficiency computation)
* chamber (chamber name will be removed as all the chamber location data is stored within wheel, sector, station attributes)
* type (all the records used are Collisions type, therefore, the column has to be dropped)
* avg_efficiency (this is an efficiency retrieved from DQM system for the chambers, however, the project's aim is to compute a fiducial efficiency, thus, the column will have to be dropped)

The data will have to be split into N classes (buckets) based on their fiducial efficiency values to have an equal distribution for each class. 
For this case we'll try to use the existing data and split it into equal sized buckets based on the efficiency for current data. However, if there are some more substantial ranges of efficiency that could be split into, for example, weak efficiency, medium efficiency, and strong efficiency classes, we could augment data for each range to have an equal distribution. However, at the moment, it will be split into 3 classes by dividing existing data into equally distributed 3 buckets (see below, it gets us ranges (40.7, 91.2] for weak efficiency, (91.2, 97,2] for medium efficiency and (97.2, 100] for strong efficiency). The classes will be assigned with labes 0, 1 and 2 respectively.

As mentioned above, the zero valued records will have to be removed so that the model could use only the most correct data to be able to predict the efficiency as best as possible.

In [47]:
# print(collisions_data['contains_zero_roll'].value_counts())

### Dropping unnecessary columns and filtering records with non zero efficiency

In [48]:
# Try change occupancy and lumisection to occupancy_per_ls

collisions_data['occupancy_per_LS'] = collisions_data['occupancy'] / collisions_data['lumisections']

# Unnecessary columns are dropped. Copy in shaped_data for classification,
# collisions_data for regression.

print('original shape:',collisions_data.shape)
shaped_data = collisions_data.copy()
shaped_data.drop(['run', 'chamber', 'type', 'contains_zero_roll', 'rolls_count', 'avg_efficiency', 'occupancy', 'lumisections'], axis=1, inplace=True)

collisions_data.drop(['run', 'chamber', 'type', 'contains_zero_roll', 'rolls_count', 'avg_efficiency', 'occupancy', 'lumisections'], axis=1, inplace=True)
# shaped_data.drop(['run', 'chamber', 'type', 'contains_zero_roll', 'rolls_count', 'avg_efficiency'], axis=1, inplace=True)

print('collisions shape:', collisions_data.shape)
print('dropped shape:',shaped_data.shape)
shaped_data.head(5)

original shape: (63360, 17)
collisions shape: (63360, 9)
dropped shape: (63360, 9)


Unnamed: 0,wheel,sector,station,avg_cluster_size,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,fid_eff_ch_level,occupancy_per_LS
0,-2,1,1,1.92231,0.005341,1.00396,1.92993,96.813076,51.218514
1,-2,1,1,1.85157,0.007152,1.0033,1.85768,97.891937,44.994962
2,-2,1,2,2.00157,0.00752,1.00368,2.00894,98.281658,53.639169
3,-2,1,2,1.85402,0.011106,1.00323,1.86001,98.574773,46.043451
4,-2,1,3,1.86561,0.003506,1.00115,1.86776,98.225697,21.443955


In [49]:
# The data with unnecessary columns removed will now be filtered to
# keep only records that contain the fiducial efficiency greater than 0.

data_with_eff = shaped_data.loc[shaped_data['fid_eff_ch_level'] > 0]
collisions_data = collisions_data.loc[collisions_data['fid_eff_ch_level']>0]
print(data_with_eff.shape)


# Get the range of the bins to split the data into three classes based
# on their fiducial efficiency values.

bins = pd.qcut(data_with_eff['fid_eff_ch_level'],3)
print(bins.value_counts())
interval_values = bins.cat.categories.right
print('Class splitting interval values: ',interval_values[0], interval_values[1], interval_values[2])

(58038, 9)
(0.033299999999999996, 96.173]    19346
(96.173, 97.582]                  19346
(97.582, 100.0]                   19346
Name: fid_eff_ch_level, dtype: int64
Class splitting interval values:  96.173 97.582 100.0


### Labels based on the efficiency value are added

In [50]:
# Creating a label column for the dataset, splitting into 3 equally distributed
# classes. label 0 - if efficiency is less than or equal to 91.22
# label 1 - if efficiency is greater than 91.22 and less than or equal to 97.22
# label 2 - if efficiency is greather than 97.22

def assign_label(fid_eff_val):
    if fid_eff_val <= interval_values[0]:
        return 0
    elif fid_eff_val <= interval_values[1]:
        return 1
    else:
        return 2
    
data_with_eff['label'] = data_with_eff['fid_eff_ch_level'].apply(lambda x: assign_label(x))

In [51]:
print(data_with_eff.shape)
print(data_with_eff['label'].value_counts())
data_with_eff.head(5)

(58038, 10)
2    19364
0    19346
1    19328
Name: label, dtype: int64


Unnamed: 0,wheel,sector,station,avg_cluster_size,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,fid_eff_ch_level,occupancy_per_LS,label
0,-2,1,1,1.92231,0.005341,1.00396,1.92993,96.813076,51.218514,1
1,-2,1,1,1.85157,0.007152,1.0033,1.85768,97.891937,44.994962,2
2,-2,1,2,2.00157,0.00752,1.00368,2.00894,98.281658,53.639169,2
3,-2,1,2,1.85402,0.011106,1.00323,1.86001,98.574773,46.043451,2
4,-2,1,3,1.86561,0.003506,1.00115,1.86776,98.225697,21.443955,2


### Shuffle and  split data into Train/Validate/Test datasets

In [52]:
# Firstly, the column with fid_eff_ch_level will be dropped 
# since we're firstly training a classifier to predict one of the
# three classes that were assgined as labels 0, 1 and 2.
data_with_eff.drop(['fid_eff_ch_level'], axis=1, inplace=True)

# Also, shuffle the rows in the dataset
data_with_eff = shuffle(data_with_eff)
collisions_data = shuffle(collisions_data)

data_with_eff.head(5)

Unnamed: 0,wheel,sector,station,avg_cluster_size,avg_bx_dist,avg_no_of_clusters,avg_multiplicity,occupancy_per_LS,label
21228,-1,2,3,1.68322,0.00323,1.00054,1.68413,24.287379,2
51635,0,11,2,1.99369,0.019703,1.00415,2.00197,97.376712,1
39444,-2,11,2,1.98764,0.007503,1.00428,1.99616,67.55,2
5713,2,6,4,1.82219,0.003263,1.0,1.82219,2.005386,2
14754,1,9,1,2.15355,0.007346,1.00527,2.16491,113.673469,1


In [65]:
print(collisions_data.shape)
print(collisions_data.head(2))
col_list = collisions_data.columns
print(col_list)
print(data_with_eff.columns)

(58038, 9)
       wheel  sector  station  avg_cluster_size  avg_bx_dist  \
13955     -2       5        1           1.92674     0.007761   
60377      1      12        1           1.98227     0.009277   

       avg_no_of_clusters  avg_multiplicity  fid_eff_ch_level  \
13955             1.00271           1.93197         96.897438   
60377             1.00498           1.99215         98.139870   

       occupancy_per_LS  
13955         44.403794  
60377         52.624912  
Index(['wheel', 'sector', 'station', 'avg_cluster_size', 'avg_bx_dist',
       'avg_no_of_clusters', 'avg_multiplicity', 'fid_eff_ch_level',
       'occupancy_per_LS'],
      dtype='object')
Index(['wheel', 'sector', 'station', 'avg_cluster_size', 'avg_bx_dist',
       'avg_no_of_clusters', 'avg_multiplicity', 'occupancy_per_LS', 'label'],
      dtype='object')


In [71]:
# Separate the data into attributes dataFrame and
# labels dataFrame. X_data and Y_data

y_data = data_with_eff.iloc[:,-1:]
x_data = data_with_eff.iloc[:, 0:8]

y_data_reg = collisions_data.loc[:, collisions_data.columns == 'fid_eff_ch_level']
x_data_reg = collisions_data.loc[:, collisions_data.columns != 'fid_eff_ch_level']
x_data.head(2)

# Split the data into Train/Validate/Test datasets.
# Train dataset will be used to train models, the data
# the models will be learning from. Validation set to be used
# for tuning the models, for finding the best hyperparameters. 
# Whereas Test dataset will be used for final evaluation once the
# models are optimized the most.

train_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=1-train_ratio, random_state=1)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=1)

print("Dataset is split into Train/Validate/Test subsets of size:")
print("train - %.2f%%" % ((x_train.shape[0]/(x_train.shape[0]+x_val.shape[0]+x_test.shape[0]))*100))
print("validate - %.2f%%" % ((x_val.shape[0]/(x_train.shape[0]+x_val.shape[0]+x_test.shape[0]))*100))
print("test - %.2f%%" % ((x_test.shape[0]/(x_train.shape[0]+x_val.shape[0]+x_test.shape[0]))*100))


Dataset is split into Train/Validate/Test subsets of size:
train - 70.00%
validate - 15.00%
test - 15.00%


### Standardize data

In [72]:
# The data will be standardized so that the different
# attributes in the dataset will be within the same scale.
# So that one attribute would not contain more weight 
# than other attribute.

# Standardization transforms data such that its distribution
# whill have mean value of 0 and a standard deviation of 1. 
# It is performed feature/column wise. 
# x = (x-μ)/σ

# validation and test data will have to be standardized before testing
# by using mean and standard deviation of Train dataset.

#define scaler
scaler = preprocessing.StandardScaler().fit(x_train)
print(scaler.mean_)

# transform train data
x_train = scaler.transform(x_train)


# In practice, the scaler is stored in the preprocessing pipeline
# and new data arriving to the model will be scaled before
# sent into model black box to get prediction. However, right now,
# the validation data will be scaled at the same time.

x_val = scaler.transform(x_val)

[3.99990154e-02 6.45896716e+00 2.40658691e+00 1.92033808e+00
 1.44601569e-02 1.00275420e+00 1.92586350e+00 6.08027293e+01]


# Models

## Classification

In [73]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [86]:
# First test run is performed on a lazy ML model that 
# finds the K neareast neighbors in the dataset for a given
# sample. The nearest neighbors are computed by calculating
# Euclidean distance between the validation sample and 
# samples in train dataset.

# n_neighbors = 1 means that the class will be assigned with 
# respect to the closest existing smaple in the train dataset.
model = KNeighborsClassifier(n_neighbors=7)
model.fit(x_train,y_train.values.ravel())

knn_pred = model.predict(x_val)
knn_accuracy = accuracy_score(y_val, knn_pred)

# An initial run on such model gives us an accuracy of 
# over 75%, meaning, more than 3 out of 4 of the validation
# samples are classified correctly based on their metrics
print("KNN Accuracy: ", knn_accuracy)

KNN Accuracy:  0.791982540776476


In [87]:
incorrect_classes = {'label0':0, 'label1':0, 'label2':0}
count = 0
y_val_list = y_val['label'].tolist()

for i in range(len(y_val_list)):
    true_y = y_val_list[i]
    pred_y = knn_pred[i]
    if true_y != pred_y:
        count += 1
        if true_y == 0:
            incorrect_classes['label0'] += 1
        elif true_y == 1:
            incorrect_classes['label1'] += 1
        else:
            incorrect_classes['label2'] += 1

print("Number of misclassified samples:")
print(count)

print("\nNumber of missclasified samples by true label:")
print(incorrect_classes)

# It is quite obvious that the most missclassfied chambers are
# of label 1, since this class is in between two classes in
# in a very small range. Therefore, even classes 1 and 2 might
# be very similar based on their metrics and efficiency. 

# It is clear that label 0 was wrongly classified least times
# as it contains quite a large range of efficiency, thus, it should
# be easily predicting great part of its data. Because some records
# should be further away from the label 1 and label 2 data.

Number of misclassified samples:
1811

Number of missclasified samples by true label:
{'label0': 396, 'label1': 812, 'label2': 603}


In [76]:
# SVM Classifier
from sklearn.model_selection import cross_val_score
from sklearn import svm

clf = svm.SVC()
scores = cross_val_score(clf, x_train, y_train.values.ravel(), cv=10)
clf = clf.fit(x_train, y_train.values.ravel())

print("SVM cross-validation scores:",scores)
print("Expected classification accuracy on test set is:",(sum(scores)/len(scores)))

# clf = SVC(kernel='linear')
# clf = clf.fit(x_train, y_train.values.ravel())

SVM cross-validation scores: [0.56706867 0.59365001 0.56362294 0.56682254 0.57174502 0.58651243
 0.57483998 0.57410143 0.56819301 0.57040867]
Expected classification accuracy on test set is: 0.573696469187355


In [77]:

svm_predict = clf.predict(x_val)
svm_accuracy = accuracy_score(y_val, svm_predict)
print(svm_accuracy)

0.5721341603491845


## Regression