# Integration of Cuts into DataHandler:

The `DataHandler` class should be instantiated with a `cuts` object. The `cuts` object would be a `JSON` file that has the following structure.

## Cuts JSON Structure:

```json
name (dtype -> string (e.g. "cut1")):
    {
        function: dtype -> string (e.g. "trimDF"),
        term: dtype -> string (e.g. "clusterEta"),
        operation: dtype -> string (e.g. ">"),
        value: dtype -> float/int (e.g. 0.8, 2),

    }
```

The `JSON` file can contain as many of such cuts as required. 

**IMPORTANT**
1. An empty `JSON` file would imply no cuts.
2. Cuts will be processed in the order they are presented in the `JSON` file.

## Cuts Available

The following cuts are available within the program.

`trimDF`, `trimAbsDF`, `summedEnergyCut`

You can define your own custom cuts in `Cuts.py` file.

---

## DataHandler.py

In [1]:
import pandas as pd
import csv
import numpy as np
from Cuts import *

def normalizeDF(df):
    df_norm = df.subtract(df.mean())
    df_norm = df_norm.divide(df.std())
    return df_norm


class DataHandler:
    def __init__(self, path2test, path2train, path2list_inputs, cuts_dir):
        self.path2test = path2test
        self.path2train = path2train
        self.list_inputs = self.getListInputs(path2list_inputs)
        
        self.train_raw = self.getAllTrainDF()
        self.test_raw = self.getAllTestDF()
        self.allColumns = self.train_raw.columns
        
        self.train_all_normed = normalizeDF(self.train_raw)
        self.test_all_normed = normalizeDF(self.test_raw)
        
        self.train_df = self.getTrainingData()
        self.test_df = self.getTestingData()
        self.target_df = self.train_raw["cluster_ENG_CALIB_TOT"]
        
        self.processCuts(cuts_dir, self.train_all_normed)
    
    def processCuts(self, cuts_dir, df):
        cuts_df = pd.read_json(cuts_dir)
        if len(cuts_df) == 0:
            print("No cuts applied...")
            return
        else:
            c = Cuts(cuts_df, df)
            df = c.getCuts(df)
            self.train_df = self.getTrainingDataFromCutDF(df)
            self.test_df = self.getTestingDataFromCutDF(df)
            self.target_df = self.getTargetDatafromCutDF(df, self.train_raw)
            
    def getTargetDatafromCutDF(self, df, test_df):
        indices = list(df.index.values)
        df = test_df.iloc[indices]
        return df["cluster_ENG_CALIB_TOT"]
    
    def getTestingDataFromCutDF(self, df):
        test_df = df[self.list_inputs]
        normed_df = normalizeDF(test_df)
        return normed_df

    def getTrainingDataFromCutDF(self, df):
        train_df = df[self.list_inputs]
        normed_df = normalizeDF(train_df)
        return normed_df
        
    def getListInputs(self, path2inputs): #gets list inputs data
        with open(path2inputs, newline='') as f:
            reader = csv.reader(f)
            data = list(reader)
        data = list(np.array(data).flatten())
        return data
    
    def getAllTestDF(self): #gets testing data frame
        test_all = pd.read_csv(self.path2test)
        return test_all

    def getAllTrainDF(self):#gets training data frame
        train_all = pd.read_csv(self.path2train)
        return train_all

    def getAllDataColumns(self): #gets the names of the data columns in the data frame
        columns = pd.read_csv(self.path2test, nrows=1)
        return columns

    def getTestingData(self):#gets list inputs testing data from normalized data frame
        test_df = self.test_raw[self.list_inputs]
        normed_df = normalizeDF(test_df)
        return normed_df

    def getTrainingData(self):#gets list inputs training data from normalized data frame
        train_df = self.train_raw[self.list_inputs]
        normed_df = normalizeDF(train_df)
        return normed_df

## Cuts.py

In [94]:
import pandas as pd
import numpy as np

class Cuts:
    
    def __init__(self, cuts_df, df):
        self.cuts_df = cuts_df
        self.df = df
        
    def getCuts(self, df):
        for col in self.cuts_df.columns:
            data = self.cuts_df[col].values
            fname = data[0]
            term = data[1]
            operation = data[2]
            value = data[3]
            df = getattr(self, fname)(df, term, value, operation, col) #selects the corresponding func from the json
            # and executes
            print("Size of dataframe: {}\n".format(len(df)))
        return df
    
    def summedEnergyCut(self, df,term,value,operation, col):
        print("Applying Cut: {}".format(col, term, operation, value))
        df.sort_values(by=['eventNumber'],inplace=True)
        cname = "sum"+term
        df[cname] = value*df.groupby('eventNumber')[term].transform('sum')

        if operation == "<":
            df = df[df[term] < df[cname]]
        elif operation == ">":
            df = df[df[term] > df[cname]]
        elif operation == "==":
            df = df[df[term] == df[cname]]

        return df

    def trimDF(self, df,term,value,operation, col):
        print("Applying Cut: {}".format(col, term, operation, value))
        
        if operation == "<":
            df = df[df[term] < value]
        elif operation == ">":
            df = df[df[term] > value]
        elif operation == "==":
            df = df[df[term] == value]
        
        return df

    def trimAbsDF(self, df,term,value,operation, col):
        print("Applying Cut: {}".format(col, term, operation, value))
        
        if operation == "<":
            df = df[abs(df[term]) < value]
        elif operation == ">":
            df = df[abs(df[term]) > value]
        elif operation == "==":
            df = df[abs(df[term]) == value]
            
        return df

## Testing:

In [95]:
cuts_json_dir = 'cuts/basic_cuts.json'
no_cuts_json_dir = 'cuts/example_cut1.json'

test_csv_dir = "data/test.csv"
train_csv_dir = "data/train.csv"
inputs_dir = "inputs/FelixInputs.csv"

In [96]:
cut_data = DataHandler(test_csv_dir, train_csv_dir, inputs_dir, cuts_json_dir)

Applying Cut: summedEnergyCut
Size of dataframe: 993972

Applying Cut: etaCut
Size of dataframe: 511149



In [6]:
no_cut_data = DataHandler(test_csv_dir, train_csv_dir, inputs_dir, no_cuts_json_dir)

No cuts applied...


In [7]:
nctrain_df = no_cut_data.train_df
nctrain_all_df = no_cut_data.train_raw
nctest_df = no_cut_data.test_df
nctarget_df = no_cut_data.target_df

In [98]:
ctrain_df = cut_data.train_df
ctrain_all_df = cut_data.train_raw
ctest_df = cut_data.test_df
ctarget_df = cut_data.target_df

In [99]:
len(ctrain_df), len(ctarget_df)

(511149, 511149)

In [155]:
len(nctrain_df), len(nctarget_df)

(2629411, 2629411)

In [93]:
def summedEnergyCut(df,term,value,operation, col):
    print("Applying Cut: {}".format(col, term, operation, value))
    df.sort_values(by=['eventNumber'],inplace=True)
    cname = "sum"+term
    df[cname] = value*df.groupby('eventNumber')[term].transform('sum')

    if operation == "<":
        df = df[df[term] < df[cname]]
    elif operation == ">":
        df = df[df[term] > df[cname]]
    elif operation == "==":
        df = df[df[term] == df[cname]]
    
    return df