In [1]:
import os 
import pandas as pd
from label_utils import GoldenJSON

### Transform original dataset into smaller datasets per Histogram

Original dataset consists of N big dataframe files:
* ZeroBias_2017B_DataFrame_1DAnd2D_1.txt
* ZeroBias_2017B_DataFrame_1DAnd2D_2.txt
* ZeroBias_2017B_DataFrame_1DAnd2D_3.txt
* ...
* ZeroBias_2017B_DataFrame_1DAnd2D_99.txt

That makes it difficult to extract information and to work with only one histogram.

The following code transforms the original files into smaller files, one set per histogram.

Transformation is done in two steps:
1. Read every original dataset file, extract the data of each histogram and save it into a dedicated folder (one per histogram). The result is one folder per histogram, each containing that histogram's data from the original dataset as small files.
2. Merge small files into a single dataset file per Histogram.



In [2]:
# Folder that holds the big original dataframe files
original_location = "DataFrame_1Dand2D_ZeroBias2017D/DF2017D"

# Both output folders live under original_location:
#   perhisto_location        - intermediate small files, one sub-folder per histogram
#   perhisto_merged_location - one merged file per histogram
perhisto_location, perhisto_merged_location = (
    os.path.join(original_location, subfolder)
    for subfolder in ("perHisto", "perHistoMerged")
)

In [3]:
# Toggle: rename columns of the original files while splitting
do_rename = True

# Mapping of original column name -> new (lower-case) column name
new_column_names = dict(
    fromlumi="lumi",
    fromrun="run",
    Entries="entries",
    Xmax="xmax",
    Xmin="xmin",
    Xbins="xbins",
    Ymax="ymax",
    Ymin="ymin",
    Ybins="ybins",
)

In [4]:
# Should labels be applied to the merged dataframes
# (when True, the merge step adds a "good" column computed row by row with gj.is_good)
do_labels = True

# gj is only created when labelling is enabled; the merge step reads it only
# under the same do_labels flag, so it is never needed when the flag is False.
if do_labels:
    # NOTE(review): presumably the certification ("golden") JSON matching this dataset - confirm
    gj = GoldenJSON("Cert_294927-306462_13TeV_PromptReco_Collisions17_JSON.txt")

In [5]:
# Step 1 Split
#
# For every original dataframe file, write one CSV chunk per histogram to
#   <perhisto_location>/<hname>/<df_index>_<hname>.csv
# where df_index is the position of the file in the sorted directory listing.

for df_index, df_entry in enumerate(sorted(os.listdir(original_location))):
    df_path = os.path.join(original_location, df_entry)

    # Skip anything that is not a regular file (the output folders are created
    # under original_location). Skipped entries still consume an index value.
    if not os.path.isfile(df_path):
        continue

    print(df_path)

    df = pd.read_csv(df_path)

    if do_rename:
        # Rename columns names to keep consistency
        df = df.rename(columns=new_column_names)

    # Drop garbage
    df = df.drop(["fromrun.1", "fromlumi.1", "hname.1"], axis=1)

    # Split the frame once instead of re-scanning it for every histogram name.
    # sort=False keeps the groups in order of first appearance and rows keep
    # their original order inside each group.
    # NOTE: rows whose hname is missing (NaN) do not form a group and are skipped.
    histo_groups = df.groupby("hname", sort=False)
    print("Unique histograms:", histo_groups.ngroups)

    for hname, df_histo in histo_groups:
        # One folder per histogram; exist_ok makes re-runs safe
        hdir = os.path.join(perhisto_location, hname)
        os.makedirs(hdir, exist_ok=True)

        hpath = os.path.join(hdir, ("%s_%s.csv" % (df_index, hname)))

        # df_histo contains only data for one histo
        df_histo.to_csv(hpath, index=False)

DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_1.txt
Unique histograms: 182
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_10.txt
Unique histograms: 182
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_11.txt
Unique histograms: 182
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_12.txt
Unique histograms: 182
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_13.txt
Unique histograms: 182
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_14.txt
Unique histograms: 182
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_15.txt
Unique histograms: 182
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_16.txt
Unique histograms: 182
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_17.txt
Unique histograms: 182
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/ZeroBias_2017D_DataFrame_1DAnd2D_1

In [6]:
# Step 2 Merge

# Make sure the folder for merged files exists; exist_ok replaces the separate
# "exists?" check and makes re-running the cell safe.
os.makedirs(perhisto_merged_location, exist_ok=True)
    
def merge(histogram_name):
    """
        Merge all small files of one histogram into a single CSV file.

        Reads every regular file found in
        ``<perhisto_location>/<histogram_name>``, concatenates them into one
        dataframe, adds a ``good`` column via ``gj.is_good`` when ``do_labels``
        is set, and writes the result to
        ``<perhisto_merged_location>/<histogram_name>.csv``.

        Nothing is written when the histogram folder does not exist or
        contains no files.

        Parameters
        ----------
        histogram_name : str
            histogram name is equal to a folder name containing small files

        Returns
        -------
        None
    """

    dir_path = os.path.join(perhisto_location, histogram_name)

    if not os.path.isdir(dir_path):
        return

    print(dir_path)

    # Read files in directory (sub-directories are ignored).
    # NOTE(review): chunk names start with an integer index, so this
    # lexicographic sort yields 0, 10, 11, ..., 1, 20, ...; rows in the merged
    # file follow that order - confirm that downstream code does not rely on
    # row order.
    chunk_paths = [
        os.path.join(dir_path, df_entry)
        for df_entry in sorted(os.listdir(dir_path))
    ]
    dfs = [
        pd.read_csv(chunk_path)
        for chunk_path in chunk_paths
        if os.path.isfile(chunk_path)
    ]

    # pd.concat raises ValueError on an empty list; skip empty folders so one
    # of them does not abort the merge of all remaining histograms.
    if not dfs:
        print("No files to merge in", dir_path)
        return

    # Create big dataframe from multiple small chunks
    df = pd.concat(dfs, ignore_index=True)

    if do_labels:
        # Add label (gj.is_good is called once per row)
        df["good"] = df.apply(gj.is_good, axis=1)

    # Save
    df_filepath = os.path.join(perhisto_merged_location, ("%s.csv" % histogram_name))
    print(df_filepath)
    df.to_csv(df_filepath, index=False)
    

# Merge every entry found under perhisto_location, in sorted name order;
# merge() itself returns early for entries that are not directories.
for histogram_name in sorted(os.listdir(perhisto_location)):
    merge(histogram_name)

DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/NormalizedHitResiduals_TEC__wheel__1
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/NormalizedHitResiduals_TEC__wheel__1.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/NormalizedHitResiduals_TEC__wheel__2
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/NormalizedHitResiduals_TEC__wheel__2.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/NormalizedHitResiduals_TEC__wheel__3
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/NormalizedHitResiduals_TEC__wheel__3.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/NormalizedHitResiduals_TEC__wheel__4
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/NormalizedHitResiduals_TEC__wheel__4.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/NormalizedHitResiduals_TEC__wheel__5
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/NormalizedHitResiduals_TEC__wheel__5.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/NormalizedHitResiduals_TEC__wheel__6
Data

DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_ClusterPosition2D__OnTrack__TOB__layer__6.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/Summary_ClusterStoNCorr__OnTrack__TEC__MINUS__wheel__1
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_ClusterStoNCorr__OnTrack__TEC__MINUS__wheel__1.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/Summary_ClusterStoNCorr__OnTrack__TEC__MINUS__wheel__2
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_ClusterStoNCorr__OnTrack__TEC__MINUS__wheel__2.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/Summary_ClusterStoNCorr__OnTrack__TEC__MINUS__wheel__3
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_ClusterStoNCorr__OnTrack__TEC__MINUS__wheel__3.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/Summary_ClusterStoNCorr__OnTrack__TEC__MINUS__wheel__4
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_ClusterStoNCorr__OnTrack__TEC__MINUS__wheel__4.csv
DataFrame_1Dand2D_Z

DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_TotalNumberOfDigis__TEC__MINUS__wheel__6.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/Summary_TotalNumberOfDigis__TEC__MINUS__wheel__7
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_TotalNumberOfDigis__TEC__MINUS__wheel__7.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/Summary_TotalNumberOfDigis__TEC__MINUS__wheel__8
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_TotalNumberOfDigis__TEC__MINUS__wheel__8.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/Summary_TotalNumberOfDigis__TEC__MINUS__wheel__9
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_TotalNumberOfDigis__TEC__MINUS__wheel__9.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/Summary_TotalNumberOfDigis__TEC__PLUS__wheel__1
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/Summary_TotalNumberOfDigis__TEC__PLUS__wheel__1.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/Summary_TotalNumberOf

DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/chargeOuter_PXLayer_1.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/chargeOuter_PXLayer_2
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/chargeOuter_PXLayer_2.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/chargeOuter_PXLayer_3
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/chargeOuter_PXLayer_3.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/chargeOuter_PXLayer_4
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/chargeOuter_PXLayer_4.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/charge_PXDisk_+1
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/charge_PXDisk_+1.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/charge_PXDisk_+2
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/charge_PXDisk_+2.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/charge_PXDisk_+3
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/charge_PXDisk_+3.csv
DataFrame_1Dand2D_ZeroBias2017D/DF201

DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/size_PXDisk_+2
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/size_PXDisk_+2.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/size_PXDisk_+3
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/size_PXDisk_+3.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/size_PXDisk_-1
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/size_PXDisk_-1.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/size_PXDisk_-2
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/size_PXDisk_-2.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/size_PXDisk_-3
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/size_PXDisk_-3.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/size_PXLayer_1
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/size_PXLayer_1.csv
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHisto/size_PXLayer_2
DataFrame_1Dand2D_ZeroBias2017D/DF2017D/perHistoMerged/size_PXLayer_2.csv
DataFrame_1Dand2D_ZeroBias2017D/DF