In [1]:
import json
import pandas as pd
from pathlib import Path

In [2]:
def get_data(df, meta, verbose=False, cmap_name="Oranges", show_images=True):
    """Select the rows of ``df`` described by a list of run/lumi selections.

    Each entry of ``meta`` is a dict with a mandatory ``"run_number"`` key and
    optional ``"include"`` / ``"exclude"`` lists that are matched against the
    ``"lumi"`` column:

    * neither key: every row of that run is selected;
    * ``"include"`` only: rows of that run whose ``lumi`` is in the list;
    * ``"exclude"`` only: rows of that run whose ``lumi`` is not in the list;
    * both: rows of that run whose ``lumi`` is in ``include`` and not in
      ``exclude``.

    The selections of all entries are OR-ed together.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; must provide ``"run"`` and ``"lumi"`` columns.
    meta : iterable of dict
        Selection entries as described above.
    verbose : bool, default False
        When true, print the value counts of every per-entry mask and of the
        accumulated mask.
    cmap_name, show_images
        Kept so existing callers keep working; this function does not use
        them.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows. When ``meta`` is empty, a zero-row frame
        with the columns of ``df``.
    """
    condition = None

    for row in meta:
        # Every entry is restricted to its own run; the lumi filters only
        # narrow that selection further. All masks are built from ``df``
        # itself, never from a frame living in the notebook's global scope.
        sub_condition = df["run"] == row["run_number"]

        if "include" in row:
            sub_condition = sub_condition & df["lumi"].isin(row["include"])

        if "exclude" in row:
            sub_condition = sub_condition & ~df["lumi"].isin(row["exclude"])

        if verbose:
            print("Sub")
            print(sub_condition.value_counts())

        if condition is None:
            condition = sub_condition
        else:
            condition = condition | sub_condition

        if verbose:
            print("Condition")
            print(condition.value_counts())

    if condition is None:
        # No selection entries: return an empty frame that still carries the
        # columns (and dtypes) of ``df`` rather than a NaN-filled copy of it.
        return df.iloc[0:0].copy()

    return df[condition].copy()

In [3]:
# "Medium" selection. The "good"/"bad" entries use the row format read by
# get_data: a "run_number" plus an optional "include"/"exclude" list that is
# matched against the "lumi" column.
meta_medium = {
    "dset_path": "../data_raw/chargeInner_PXLayer_1.csv",
    "dset_name": "ZeroBias",
    "dset_era": "2017B",
    "histo_name": "chargeInner_PXLayer_1",

    # Runs labelled good, minus the listed lumi values.
    "good": [
        {"run_number": 297050, "exclude": [420, 421, 424, 426, 429]},
        {"run_number": 297056, "exclude": [31, 34, 36, 40]},
        {"run_number": 297178, "exclude": [675, 678, 680, 683, 1266, 1378]},
    ],

    # Whole runs first, then the lumi values excluded from the good runs above.
    "bad": [
        *(
            {"run_number": run}
            for run in (
                297047, 297048, 297049,
                297170, 297171,
                297281, 297284,
                297286, 297664,
                297671, 297672,
            )
        ),
        {"run_number": 297050, "include": [420, 421, 424, 426, 429]},
        {"run_number": 297056, "include": [31, 34, 36, 40]},
        {"run_number": 297178, "include": [675, 678, 680, 683, 1266, 1378]},
    ],
}

In [4]:
# Same content as meta_medium with one additional "good" entry
# (run 297675 without lumi 242). Entries use the row format read by get_data.
meta_medium_p = {
    "dset_path": "../data_raw/chargeInner_PXLayer_1.csv",
    "dset_name": "ZeroBias",
    "dset_era": "2017B",
    "histo_name": "chargeInner_PXLayer_1",

    # Runs labelled good, minus the listed lumi values.
    "good": [
        {"run_number": 297050, "exclude": [420, 421, 424, 426, 429]},
        {"run_number": 297056, "exclude": [31, 34, 36, 40]},
        {"run_number": 297178, "exclude": [675, 678, 680, 683, 1266, 1378]},
        {"run_number": 297675, "exclude": [242]},
    ],

    # Whole runs first, then the lumi values excluded from the good runs above.
    "bad": [
        *(
            {"run_number": run}
            for run in (
                297047, 297048, 297049,
                297170, 297171,
                297281, 297284,
                297286, 297664,
                297671, 297672,
            )
        ),
        {"run_number": 297050, "include": [420, 421, 424, 426, 429]},
        {"run_number": 297056, "include": [31, 34, 36, 40]},
        {"run_number": 297178, "include": [675, 678, 680, 683, 1266, 1378]},
    ],
}

In [5]:
# Single: one good run (297050) with its excluded lumi values labelled bad,
# plus one whole bad run. Entries use the row format read by get_data.
meta_single = {
    "dset_path": "../data_raw/chargeInner_PXLayer_1.csv",
    "dset_name": "ZeroBias",
    "dset_era": "2017B",
    "histo_name": "chargeInner_PXLayer_1",

    "good": [
        {"run_number": 297050, "exclude": [420, 421, 424, 426, 429]},
    ],
    "bad": [
        {"run_number": 297050, "include": [420, 421, 424, 426, 429]},
        {"run_number": 297047},
    ],
}

In [6]:
# Small: one good run (297050) against three whole bad runs and the lumi
# values excluded from the good run. Entries use the row format read by
# get_data.
meta_small = {
    "dset_path": "../data_raw/chargeInner_PXLayer_1.csv",
    "dset_name": "ZeroBias",
    "dset_era": "2017B",
    "histo_name": "chargeInner_PXLayer_1",

    "good": [
        {"run_number": 297050, "exclude": [420, 421, 424, 426, 429]},
    ],
    "bad": [
        *({"run_number": run} for run in (297047, 297048, 297049)),
        {"run_number": 297050, "include": [420, 421, 424, 426, 429]},
    ],
}

### Train & Test data

In [9]:
def make_df(meta, save_dir):
    """Write the train/test CSVs and the meta JSON for one selection.

    Steps, in order:

    1. read ``meta["dset_path"]``, drop rows with ``entries == 0`` and a fixed
       set of leftover columns (missing ones are ignored);
    2. select ``meta["good"]`` and ``meta["bad"]`` rows through ``get_data``
       and label them ``y = 1`` and ``y = 0``;
    3. save their concatenation as ``train.csv``;
    4. save every row whose run does not appear in the good or bad selection
       as ``test.csv``;
    5. dump ``meta`` to ``meta.json`` and plot the train frame to
       ``df_train.jpg`` via ``df_plot``.

    Parameters
    ----------
    meta : dict
        Needs the keys ``"dset_path"``, ``"good"`` and ``"bad"``; must be
        JSON-serialisable.
    save_dir : str or path-like
        Output directory; created (with parents) when missing.
    """
    print(save_dir)
    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load the raw table and strip empty rows plus leftover columns.
    df_all = pd.read_csv(meta["dset_path"])
    empty_rows = df_all.loc[df_all.entries == 0].index
    df_all = df_all.drop(index=empty_rows)
    leftover_columns = ["Unnamed: 0", "Unnamed: 0.1", "fromrun.1", "fromlumi.1", "hname.1"]
    df_all = df_all.drop(columns=leftover_columns, errors="ignore")
    print("df_orig", df_all.shape)

    # Labelled selections: y = 1 for good, y = 0 for bad.
    df_good = get_data(df_all, meta["good"], cmap_name="Greens", show_images=False)
    df_good["y"] = 1
    print("df_good", df_good.shape)

    df_bad = get_data(df_all, meta["bad"], cmap_name="Reds", show_images=False)
    df_bad["y"] = 0
    print("df_bad", df_bad.shape)

    # Train set = good rows followed by bad rows, with a fresh index.
    df_train = pd.concat([df_good, df_bad], ignore_index=True)
    df_train.to_csv(out_dir / "train.csv")
    print("df_train", df_train.shape)

    # Test set: every run that contributed no row to the labelled selections.
    handpicked_runs = pd.concat([df_good["run"], df_bad["run"]]).unique()
    in_handpicked_run = df_all["run"].isin(handpicked_runs)
    df_test = df_all[~in_handpicked_run].copy()
    df_test.to_csv(out_dir / "test.csv")
    print("df_test", df_test.shape)

    # Keep the selection next to the data it produced.
    with open(out_dir / 'meta.json', 'w') as meta_file:
        json.dump(meta, meta_file)

    # NOTE(review): df_plot is not defined in the visible part of this
    # notebook -- confirm it is in scope before a Restart & Run All.
    df_plot(df_train, save_path=out_dir / "df_train.jpg")
#     do_tsne(df_train, save_path=save_dir.joinpath("tsne_df_train.jpg"))

#     df_test['y'] = 2
#     do_tsne(pd.concat([df_train, df_test], ignore_index=True), save_path=save_dir.joinpath("tsne_df.jpg"))

# Build every dataset variant; each call writes its files under save_dir.
make_df(meta=meta_small, save_dir="../data/small")
make_df(meta=meta_medium, save_dir="../data/medium")
make_df(meta=meta_medium_p, save_dir="../data/medium_p")
# make_df(meta_single, "../data/single")

../data/medium
df_orig (27208, 115)
df_good (2324, 116)
df_bad (312, 116)
df_train (2636, 116)
df_test (24572, 115)


In [8]:
# df_orig = pd.read_csv("../data_raw/chargeInner_PXLayer_1.csv")
# df_orig.drop(df_orig[df_orig.entries == 0].index, inplace=True)
# df_orig.drop(["Unnamed: 0", "Unnamed: 0.1", "fromrun.1", "fromlumi.1", "hname.1"], axis=1, inplace=True, errors="ignore")
# do_tsne(df_orig, save_path="../data_raw/df.jpg", ignore_y=True)