In [1]:
import os
import json
import pandas as pd
from utils import transform_histo_to_columns, BadJSON

### Two important steps
The original dataset contains a column named histo, which is actually a string holding a list of numbers.

Example "[1,2,3,4,5,5]"

The other step applies to Pixel-only histograms. We have an antiGoldenJSON file for Pixel, which contains the Lumisections where Pixel was BAD.

* The first step is to transform that column into N columns called bin_0, ..., bin_100, ..., bin_N
* The second step is to create a label column called good_pixel, which records whether Pixel was good or bad

In [2]:
# Pixel "bad" JSON wrapper; its is_good method is applied to every row further
# down to build the good_pixel label column.
bad_json = BadJSON("/home/mantydze/data/bad_pixel.json")

# Location where datasets are stored
# NOTE(review): the original line carried a commented-out "/perHistoMerged"
# suffix, presumably an alternative input directory -- confirm whether it is
# still needed.
original_location = "/home/mantydze/data/ZeroBias2017B"
massaged_location = "/home/mantydze/data/ZeroBias2017B/massaged"

# exist_ok=True keeps the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(massaged_location, exist_ok=True)
    
# Histogram names to be massaged
# Barrel layers are numbered 1..4; disks are numbered -3..-1 and +1..+3 (the
# sign is part of the name, hence the "%+d" format).
_layers = range(1, 5)
_disks = (-3, -2, -1, 1, 2, 3)

cipxl = ["chargeInner_PXLayer_%d" % layer for layer in _layers]
copxl = ["chargeOuter_PXLayer_%d" % layer for layer in _layers]
spxl = ["size_PXLayer_%d" % layer for layer in _layers]
spxd = ["size_PXDisk_%+d" % disk for disk in _disks]
cpxd = ["charge_PXDisk_%+d" % disk for disk in _disks]

# Full list, in the order the files are processed
hnames = [name for group in (cipxl, copxl, spxl, spxd, cpxd) for name in group]

In [3]:
# For every histogram name: read its CSV, add the good_pixel label, expand the
# histo column into bin columns, and write the result to the massaged folder.
for hname in hnames:
    csv_name = "%s.csv" % hname
    df_path = os.path.join(original_location, csv_name)

    if os.path.isfile(df_path):
        print(df_path)

        df = pd.read_csv(df_path)

        # Row-wise label: bad_json.is_good is called once per row (axis=1)
        df = df.assign(good_pixel=df.apply(bad_json.is_good, axis=1))

        # Convert histo (list stored as a string) into columns bin_0 ... bin_N
        df = transform_histo_to_columns(df)

        new_df_path = os.path.join(massaged_location, csv_name)
        print(new_df_path)
        df.to_csv(new_df_path, index=False)
    else:
        # Missing inputs are reported and skipped rather than aborting the run
        print(df_path, "does not exist")

/home/mantydze/data/ZeroBias2017B/chargeInner_PXLayer_1.csv
/home/mantydze/data/ZeroBias2017B/massaged/chargeInner_PXLayer_1.csv
/home/mantydze/data/ZeroBias2017B/chargeInner_PXLayer_2.csv
/home/mantydze/data/ZeroBias2017B/massaged/chargeInner_PXLayer_2.csv
/home/mantydze/data/ZeroBias2017B/chargeInner_PXLayer_3.csv
/home/mantydze/data/ZeroBias2017B/massaged/chargeInner_PXLayer_3.csv
/home/mantydze/data/ZeroBias2017B/chargeInner_PXLayer_4.csv
/home/mantydze/data/ZeroBias2017B/massaged/chargeInner_PXLayer_4.csv
/home/mantydze/data/ZeroBias2017B/chargeOuter_PXLayer_1.csv
/home/mantydze/data/ZeroBias2017B/massaged/chargeOuter_PXLayer_1.csv
/home/mantydze/data/ZeroBias2017B/chargeOuter_PXLayer_2.csv
/home/mantydze/data/ZeroBias2017B/massaged/chargeOuter_PXLayer_2.csv
/home/mantydze/data/ZeroBias2017B/chargeOuter_PXLayer_3.csv
/home/mantydze/data/ZeroBias2017B/massaged/chargeOuter_PXLayer_3.csv
/home/mantydze/data/ZeroBias2017B/chargeOuter_PXLayer_4.csv
/home/mantydze/data/ZeroBias2017B/mas