# Inspecting the trained Random Forest model

In [2]:
import os, glob, numpy as np, pandas as pd
import matplotlib.pyplot as plt
import time
# Notebook-wide matplotlib defaults: figure size [25, 15] and font size 45.
plt.rcParams.update({'figure.figsize': [25, 15], 'font.size': 45})

In [3]:
from joblib import load
import time

# Load the persisted forest and report how long deserialisation took.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model_path = "rf_10est_5depth.joblib"
init = time.time()
rf = load(model_path)
elapsed = time.time() - init
print("[Info] Loaded models in {:.2f} seconds".format(elapsed))

[Info] Loaded models in 0.51 seconds


In [4]:
from sklearn.tree import plot_tree, export_text, export_graphviz
import matplotlib.pyplot as plt
import graphviz

In [5]:
# Short display names passed as `feature_names` to the tree exporters below.
# Presumably listed in the same column order the model was trained with -- TODO confirm.
my_features = [
    # outer region
    "NPE_outer",
    "NAS_outer",
    "MeanNPEActive_outer",
    "SpatlVar_outer",
    "SpatRange_outer",
    # inner region
    "NPE_inner",
    "NAS_inner",
    "MeanNPEActive_inner",
    "SpatlVar_inner",
    "SpatRange_inner",
    # inner + outer
    "NPE_tot",
    "NAS_tot",
    "MeanNPEActive_tot",
]

In [16]:
# Write a plain-text rendering of every tree of the forest to tree<i>.txt.
for i, tree in enumerate(rf.estimators_):
    # Render first so that no file is touched if the export fails.
    rendered = export_text(tree, feature_names=my_features)
    with open(f'tree{i}.txt', 'w') as f:
        f.write(rendered)

In [7]:
# Write a Graphviz .dot description of every tree of the forest to tree<i>.dot.
graphviz_options = dict(
    feature_names=my_features,
    class_names=["Ar39", "Mu"],
    filled=True,
    rounded=True,
    special_characters=True,
)
for i, estimator in enumerate(rf.estimators_):
    export_graphviz(estimator, out_file=f"tree{i}.dot", **graphviz_options)

In [40]:
# Estimator 0 of the forest, re-expressed as a plain Python function.
def estimator0(x):
    """Classify one feature row with a hand-written copy of tree 0.

    Args:
        x: indexable of feature values; indices 0-5, 8-10 and 12 are read.
            Positions presumably follow the order of `my_features`
            (x[0] = PE outer, x[1] = active slices outer, ...) -- TODO confirm.

    Returns:
        0 or 1. Elsewhere in this notebook y=0 labels Ar39 and y=1 labels muons.
    """
    if x[1] <= 5.5:
        if x[0] <= 7.5:
            # Both children split on x[10]; only the cut value differs.
            cut = 5.50 if x[8] <= -0.50 else 15.5
            return 0 if x[10] <= cut else 1
        if x[9] <= 0:
            return 1
        return 0 if x[12] <= 2.65 else 1

    if x[0] <= 10.5:
        if x[5] <= 0.50:
            if x[2] <= 1.21:
                return 1 if x[3] <= 8.06 else 0
            return 1
        return 0 if x[4] <= 16.5 else 1
    return 1

In [41]:
def estimator1(x):
    """Classify one feature row with a hand-written copy of tree 1.

    Args:
        x: indexable of feature values; indices 0, 1, 3, 4, 7 and 10 are read.
            Positions presumably follow the order of `my_features` -- TODO confirm.

    Returns:
        0 or 1. Elsewhere in this notebook y=0 labels Ar39 and y=1 labels muons.
    """
    # Written as `not (a <= t)` rather than `a > t` so that NaN inputs still
    # end up on the same side as in the nested if/else form.
    if not (x[4] <= 17.50):
        return 1

    if x[1] <= 5.50:
        if x[7] <= 0.00:
            return 0 if x[10] <= 6.50 else 1
        if x[10] <= 15.50:
            return 0
        return 1 if x[3] <= 3.86 else 0

    if not (x[0] <= 11.50):
        return 1
    if x[7] <= 0.00:
        return 0 if x[0] <= 7.50 else 1
    return 0

In [42]:
def estimator2(x):
    """Classify one feature row with a hand-written copy of tree 2.

    Args:
        x: indexable of feature values; indices 0-4, 7-10 and 12 are read.
            Positions presumably follow the order of `my_features` -- TODO confirm.

    Returns:
        0 or 1. Elsewhere in this notebook y=0 labels Ar39 and y=1 labels muons.
    """
    if x[1] <= 5.50:
        if x[0] <= 7.50:
            if x[0] <= 4.50:
                if x[7] <= 0.00:
                    return 0 if x[4] <= 17.50 else 1
                return 0
            if x[9] <= 0.00:
                return 0 if x[2] <= 1.32 else 1
            return 0
        if x[9] <= 0.00:
            return 1
        return 0 if x[10] <= 15.50 else 1

    # `not (a <= t)` keeps NaN on the same side as the original else-branch.
    if not (x[10] <= 13.50):
        return 1
    if x[8] <= -0.50:
        if x[2] <= 1.17:
            return 0 if x[12] <= 1.15 else 1
        return 1
    return 0 if x[3] <= 33.69 else 1

In [43]:
def estimator3(x):
    """Classify one feature row with a hand-written copy of tree 3.

    NOTE(review): the last cell of this notebook reports that these predictions
    differ from `rf.estimators_[3].predict` for at least one row. The cut values
    here look like the 2-decimal rendering of the exported tree, so rounding of
    the thresholds is a likely cause -- TODO confirm against the fitted tree.

    Args:
        x: indexable of feature values; indices 0-3, 6, 7, 10 and 12 are read.
            Positions presumably follow the order of `my_features` -- TODO confirm.

    Returns:
        0 or 1. Elsewhere in this notebook y=0 labels Ar39 and y=1 labels muons.
    """
    if x[10] <= 12.50:
        # `not (a <= t)` keeps NaN on the same side as the original else-branch.
        if not (x[7] <= 0.00):
            return 0
        if not (x[2] <= 1.27):
            return 1
        if x[0] <= 4.50:
            return 0 if x[3] <= 43.38 else 1
        return 0 if x[1] <= 4.50 else 1

    if x[6] <= 2.50:
        if x[2] <= 1.84:
            return 0 if x[0] <= 11.50 else 1
        if x[10] <= 15.50:
            return 1 if x[6] <= 1.50 else 0
        return 1

    if x[12] <= 1.88:
        return 0 if x[0] <= 11.50 else 1
    return 0 if x[1] <= 4.50 else 1

# load data

In [19]:
def load_dataframe_from_files(dirin, fileprefix, max_files=100):
    """Read the CSV files matching a glob pattern and stack them into one DataFrame.

    Files are read in sorted path order so that the row order (and the subset
    chosen when more than `max_files` match) is reproducible across runs and
    file systems.

    Args:
        dirin: directory that is searched.
        fileprefix: glob pattern joined onto `dirin` (e.g. "Snapshot1Ar39*").
        max_files: upper bound on the number of files read. The first match is
            always read, even for `max_files` < 1.

    Returns:
        A DataFrame holding the rows of every file read, concatenated in order.
        Each file keeps its own row index (the index is not reset).

    Raises:
        FileNotFoundError: if no file matches the pattern.
    """
    import glob
    pattern = os.path.join(dirin, fileprefix)
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError("No files match pattern: {}".format(pattern))

    # Same selection rule as before: first match plus files[1:max_files].
    selected = [files[0], *files[1:max_files]]
    print("[Info] Loading {} files wt prefix:\n{}".format(len(selected), fileprefix))

    frames = []
    for position, path in enumerate(selected):
        if position > 0:
            # One progress dot per file after the first.
            print(".", end='')
        frames.append(pd.read_csv(path, comment='#', index_col=False))
    print("")
    # Concatenate once; growing a DataFrame inside the loop is quadratic.
    return pd.concat(frames)

In [20]:
# Number of consecutive columns treated as the inner and the outer region
# when the per-row features are computed.
n_inner_slices = 12
n_outer_slices = 20

In [21]:
# Load Ar39: one DataFrame per snapshot file pattern (Snapshot1 ... Snapshot5).
init = time.time()
dirin = os.path.join("..", "..", "Data", "OutputProcessing", "UnseenTestData_11_10_2020", "Ar39", "Ar39_Snapshots")

ar39_patterns = ("Snapshot1Ar39*", "Snapshot2Ar39*", "Snapshot3Ar39*", "Snapshot4Ar39*", "Snapshot5Ar39*")
ar39_frames = []
for fileprefix in ar39_patterns:
    ar39_frames.append(load_dataframe_from_files(dirin, fileprefix))
dfall1ar39, dfall2ar39, dfall3ar39, dfall4ar39, dfall5ar39 = ar39_frames

print("[Info] Loaded all 1, 2, 3, 4, 5 Ar39 Pileups in {:.2f} seconds".format(time.time() - init))

# Load Muons: a single file pattern from the muon snapshot directory.
init = time.time()
dirin = os.path.join("..", "..", "Data", "OutputProcessing", "UnseenTestData_11_10_2020", "Muons", "Muons_Snapshots")

fileprefix = "SnapshotMuons*"
dfallmu = load_dataframe_from_files(dirin, fileprefix)
print("[Info] Loaded all Muons in {:.2f} seconds".format(time.time() - init))

[Info] Loading 100 files wt prefix:
Snapshot1Ar39*
...................................................................................................
[Info] Loading 100 files wt prefix:
Snapshot2Ar39*
...................................................................................................
[Info] Loading 100 files wt prefix:
Snapshot3Ar39*
...................................................................................................
[Info] Loading 100 files wt prefix:
Snapshot4Ar39*
...................................................................................................
[Info] Loading 100 files wt prefix:
Snapshot5Ar39*
...................................................................................................
[Info] Loaded all 1, 2, 3, 4, 5 Ar39 Pileups in 15.36 seconds
[Info] Loading 100 files wt prefix:
SnapshotMuons*
...................................................................................................
[Info] Loaded all Muons in 0.79 seconds

In [22]:
# prepare data
# "group" numbers the source of each row (0 = muons, 1..5 = Ar39 snapshot sets);
# "y" is the binary target (1 = muon, 0 = Ar39).
labelled_frames = [dfallmu, dfall1ar39, dfall2ar39, dfall3ar39, dfall4ar39, dfall5ar39]
for group_id, frame in enumerate(labelled_frames):
    frame["group"] = group_id
    frame["y"] = 1 if frame is dfallmu else 0

# All Ar39 sets stacked row-wise into one frame.
dfallar39 = pd.concat(labelled_frames[1:], axis=0)

In [23]:
# Create Features
def pe_detected(row):
    """Return the sum of all entries of `row` (a pandas Series)."""
    values = row.to_numpy()
    return values.sum()

def nr_active_slices(row):
    """Return how many entries of `row` (a pandas Series) are non-zero."""
    values = row.to_numpy()
    return np.flatnonzero(values).size

def mean_npe(row):
    """Return the arithmetic mean over all entries of `row`, zeros included."""
    values = row.to_numpy()
    return values.mean()

def mean_npe_active(row):
    """Return the mean over the non-zero entries of `row`.

    Returns -1 when `row` has no non-zero entry.
    """
    values = row.to_numpy()
    active_idx = np.flatnonzero(values)
    if active_idx.size == 0:
        return -1
    return np.mean(values[active_idx])

def std_npe(row):
    """Return the population standard deviation over all entries of `row`."""
    values = row.to_numpy()
    return values.std()

def std_npe_active(row):
    """Return the population standard deviation over the non-zero entries of `row`.

    Returns -1 when `row` has no non-zero entry.
    """
    values = row.to_numpy()
    active_idx = np.flatnonzero(values)
    if active_idx.size == 0:
        return -1
    return np.std(values[active_idx])

def range_detections(row):
    """Return the number of positions spanned from the first to the last non-zero entry.

    Both end positions are counted, so a single non-zero entry gives 1.
    Returns -1 when `row` has no non-zero entry.
    """
    active_idx = np.flatnonzero(row.to_numpy())
    if active_idx.size == 0:
        return -1
    first, last = active_idx[0], active_idx[-1]
    return last - first + 1

def spatial_var(row):
    """Return the variance of entry positions, each weighted by its value.

    Every position with a value > 0 is repeated `value` times and the
    population variance of the resulting list of positions is returned.
    Returns -1 when `row` has no positive entry. The positive values are used
    as repeat counts, so they are expected to be integers.
    """
    values = row.to_numpy()
    hit = values > 0
    ids = np.repeat(np.flatnonzero(hit), values[hit])
    if ids.shape[0] == 0:
        return -1
    return np.var(ids)

def spatial_std(row):
    """Return the standard deviation of entry positions, each weighted by its value.

    Every position with a value > 0 is repeated `value` times and the
    population standard deviation of the resulting list of positions is
    returned. Returns -1 when `row` has no positive entry. The positive values
    are used as repeat counts, so they are expected to be integers.
    """
    values = row.to_numpy()
    hit = values > 0
    ids = np.repeat(np.flatnonzero(hit), values[hit])
    if ids.shape[0] == 0:
        return -1
    return np.std(ids)

In [24]:
# features for rforest
# Columns (and their order) fed to the model and to the hand-written estimators.
my_features = ["PEDetected_outer", "NActiveSlices_outer", "MeanNPEActive_outer", "SpatialVar_outer", "SpatialRange_outer", 
               "PEDetected_inner", "NActiveSlices_inner", "MeanNPEActive_inner", "SpatialVar_inner", "SpatialRange_inner",
               "PEDetected_tot", "NActiveSlices_tot", "MeanNPEActive_tot"]

# (column-name stem, per-row function) pairs, listed in the order the columns are appended.
_slice_feature_fns = [
    ("PEDetected", pe_detected),
    ("NActiveSlices", nr_active_slices),
    ("MeanNPEActive", mean_npe_active),
    ("StdNPEActive", std_npe_active),
    ("SpatialRange", range_detections),
    ("SpatialVar", spatial_var),
    ("SpatialStd", spatial_std),
]


def _add_slice_features(df, label, first_slice_col=3):
    """Append the inner, outer and combined per-row features to `df`.

    Args:
        df: frame whose columns [first_slice_col, first_slice_col + n_inner_slices)
            are the inner readings, directly followed by n_outer_slices outer
            readings. New columns are added to `df` in place.
        label: name used in the timing messages ("Mu", "Ar39").
        first_slice_col: positional index of the first inner column. The default
            of 3 mirrors the offset that was hard-coded here before -- what the
            first three columns hold is not visible in this notebook.

    Returns:
        A new frame: `df` with the feature columns added and every NaN replaced
        by -1 (e.g. MeanNPEActive_tot when no slice is active).
    """
    inner_stop = first_slice_col + n_inner_slices
    # Slice each region once; appended columns land after these positions.
    regions = [
        ("Inner", "inner", df.iloc[:, first_slice_col:inner_stop]),
        ("Outer", "outer", df.iloc[:, inner_stop:inner_stop + n_outer_slices]),
    ]
    for title, suffix, readings in regions:
        init = time.time()
        for stem, feature_fn in _slice_feature_fns:
            df["{}_{}".format(stem, suffix)] = readings.apply(feature_fn, axis=1)
        print("Compute {} {} features: {:.3f} sec".format(label, title, time.time() - init))

    init = time.time()
    df["PEDetected_tot"] = df["PEDetected_inner"] + df["PEDetected_outer"]
    df["NActiveSlices_tot"] = df["NActiveSlices_inner"] + df["NActiveSlices_outer"]
    # 0 / 0 gives NaN for rows without any active slice; fillna maps it to -1.
    df["MeanNPEActive_tot"] = df["PEDetected_tot"] / df["NActiveSlices_tot"]
    df = df.fillna(-1)
    print("Compute combined features: {:.3f} sec".format(time.time() - init))
    return df


dfallmu = _add_slice_features(dfallmu, "Mu")
dfallar39 = _add_slice_features(dfallar39, "Ar39")

Compute Mu Inner features: 2.898 sec
Compute Mu Outer features: 3.520 sec
Compute combined features: 0.008 sec
Compute Ar39 Inner features: 190.339 sec
Compute Ar39 Outer features: 248.847 sec
Compute combined features: 0.132 sec


In [25]:
# Single evaluation table: all Ar39 rows first, then the muon rows.
data = pd.concat((dfallar39, dfallmu), axis="index")

# predictions

In [32]:
# Feature matrix (columns in `my_features` order) and target vector as numpy arrays.
X_rf = np.array(data.loc[:, my_features])
y_rf = np.array(data.loc[:, "y"])

In [63]:
# Reference predictions: the first four fitted trees of the forest, timed one by one.
n_compared = 4
for id_estimator in range(n_compared):
    dt = rf.estimators_[id_estimator]
    init = time.time()
    predictions = dt.predict(X_rf)
    data[f'y_original_est{id_estimator}_pred'] = predictions
    print("[Info] Estimator {} predict in {:.3f} seconds".format(id_estimator, time.time() - init))

[Info] Estimator 0 predict in 0.098 seconds
[Info] Estimator 1 predict in 0.059 seconds
[Info] Estimator 2 predict in 0.064 seconds
[Info] Estimator 3 predict in 0.059 seconds


In [59]:
# Predictions of the hand-written tree functions, applied row by row and timed one by one.
est_fn = [estimator0, estimator1, estimator2, estimator3]
for id_estimator, dt_fn in enumerate(est_fn):
    init = time.time()
    data[f'y_est{id_estimator}_pred'] = np.apply_along_axis(dt_fn, 1, X_rf)
    print("[Info] Estimator {} predict in {:.3f} seconds".format(id_estimator, time.time() - init))

[Info] Estimator 0 predict in 4.094 seconds
[Info] Estimator 1 predict in 4.067 seconds
[Info] Estimator 2 predict in 4.334 seconds
[Info] Estimator 3 predict in 4.029 seconds


In [77]:
# True when the hand-written estimator 3 disagrees with the fitted tree on at least one row.
est3_mismatch = data["y_original_est3_pred"] != data["y_est3_pred"]
any(est3_mismatch)

True