In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc

from sklearn.ensemble import RandomForestClassifier

In [3]:
class RandomForestTrain:
    """Cross-validated random-forest evaluation on one histogram CSV.

    Typical use: ``load_data(filename)`` followed by ``train_eval()``.
    Per-fold scores are appended to ``accs`` (accuracy) and ``aucs``
    (area under the ROC curve), each rounded to 3 decimals.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default None
        Forwarded to both ``StratifiedKFold`` and ``RandomForestClassifier``.
        Pass an int to make fold assignment and forests reproducible.
        ``None`` keeps the previous (unseeded) behaviour.
    n_splits : int, default 10
        Number of stratified folds.
    n_jobs : int, default 6
        Number of parallel jobs handed to ``RandomForestClassifier``.
    """

    def __init__(self, random_state=None, n_splits=10, n_jobs=6):
        self.random_state = random_state
        self.n_splits = n_splits
        self.n_jobs = n_jobs

        # Per-fold scores. train_eval() appends to these, so calling it
        # more than once on the same instance accumulates results.
        self.accs = []
        self.aucs = []

    def load_data(self, filename):
        """Read ``filename`` and prepare features ``X`` and labels ``y``.

        Sets ``self.df`` (rows with ``entries == 0`` removed), ``self.X``
        (selected bin columns divided row-wise by ``entries``) and
        ``self.y`` (the ``good_pixel`` column). Prints the frame shape
        before and after the empty rows are removed.
        """
        df = pd.read_csv(filename)
        print(df.shape)

        # Filter list of columns which will be used for training
        bin_cols = [col for col in df.columns if 'bin_' in col]

        # remove first and last values as those are over/under flows
        # (relies on the column order of the CSV)
        bin_cols = bin_cols[1:-1]

        # Drop empty rows. A boolean mask builds a new frame instead of
        # mutating in place, and also avoids dividing by zero below.
        df = df[df.entries != 0]
        print(df.shape)
        self.df = df

        # Normalization, divide every bin value by total entries
        self.X = df.loc[:, bin_cols].div(df.entries, axis=0)

        self.y = df["good_pixel"]

    def train_eval(self, verbose=True):
        """Run stratified k-fold cross-validation with a random forest.

        For each fold a fresh ``RandomForestClassifier`` is fitted on the
        training split and scored on the held-out split. Accuracy and ROC
        AUC (from the probability of the second class) are rounded to
        3 decimals and appended to ``self.accs`` / ``self.aucs``.

        Parameters
        ----------
        verbose : bool, default True
            When true, print the confusion matrix and the scores per fold.
        """
        skf = StratifiedKFold(
            n_splits=self.n_splits,
            shuffle=True,
            random_state=self.random_state,
        )

        for train_index, test_index in skf.split(self.X, self.y):
            X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
            y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]

            model = RandomForestClassifier(
                n_jobs=self.n_jobs,
                random_state=self.random_state,
            ).fit(X_train, y_train)
            y_pred = model.predict(X_test)

            if verbose:
                print(confusion_matrix(y_test, y_pred))

            acc_score = round(accuracy_score(y_test, y_pred), 3)

            # Column 1 of predict_proba is the second entry of model.classes_.
            y_probas = model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_probas)
            auc_score = round(auc(fpr, tpr), 3)

            if verbose:
                print("ACC", acc_score, "AUC", auc_score)

            self.accs.append(acc_score)
            self.aucs.append(auc_score)

In [4]:
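# # Class balance (disabled helper, kept for reference)
# # NOTE: `y` below is not defined at notebook scope; if re-enabled, pass the
# # label column instead (e.g. the `y` attribute of a RandomForestTrain
# # instance after `load_data`).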
# def print_unique(values):
#     unique, counts = np.unique(values, return_counts=True)

#     for cls, cnt in zip(unique, counts):
#         print("Class [%d] Count [%d]" % (cls, cnt))
        
# print_unique(y)

In [5]:
# Histogram names to be trained
# Histogram name groups: barrel layers are numbered 1-4, endcap disks -3..+3
# (no disk 0). Each list is generated from its prefix rather than typed out.
cipxl = ["chargeInner_PXLayer_%d" % layer for layer in (1, 2, 3, 4)]
copxl = ["chargeOuter_PXLayer_%d" % layer for layer in (1, 2, 3, 4)]
spxl = ["size_PXLayer_%d" % layer for layer in (1, 2, 3, 4)]
spxd = ["size_PXDisk_%s" % disk for disk in ("-3", "-2", "-1", "+1", "+2", "+3")]
cpxd = ["charge_PXDisk_%s" % disk for disk in ("-3", "-2", "-1", "+1", "+2", "+3")]

# Full list of histograms to train on, in group order.
hnames = [*cipxl, *copxl, *spxl, *spxd, *cpxd]

In [6]:
# Seed NumPy's global RNG so a fresh "Restart & Run All" reproduces the same
# folds and forests: scikit-learn estimators and splitters created with
# random_state=None draw from this global generator.
SEED = 42
np.random.seed(SEED)

# Directory holding one pre-processed CSV per histogram name.
DATA_DIR = "/home/mantydze/data/ZeroBias2017B/massaged"

# hname -> per-fold scores plus their mean/std, all rounded to 3 decimals.
results = {}

for index, hname in enumerate(hnames):
    print(index+1, "/", len(hnames), hname)

    filename = "{data_dir}/{hname}.csv".format(data_dir=DATA_DIR, hname=hname)

    # A fresh trainer per histogram so fold scores do not accumulate
    # across histograms.
    rft = RandomForestTrain()
    rft.load_data(filename)
    rft.train_eval(verbose=False)

    results[hname] = {
        "accs": rft.accs,
        "aucs": rft.aucs,
        "acc_mean": round(np.mean(rft.accs), 3),
        "acc_std": round(np.std(rft.accs), 3),
        "auc_mean": round(np.mean(rft.aucs), 3),
        "auc_std": round(np.std(rft.aucs), 3)
    }

    print("Mean ACC", results[hname]["acc_mean"], "std", results[hname]["acc_std"])
    print("Mean AUC", results[hname]["auc_mean"], "std", results[hname]["auc_std"])
    print()

1 / 24 chargeInner_PXLayer_1
(28335, 120)
(27208, 120)
Mean ACC 0.987 std 0.002
Mean AUC 0.976 std 0.006

2 / 24 chargeInner_PXLayer_2
(28335, 120)
(27208, 120)
Mean ACC 0.986 std 0.002
Mean AUC 0.958 std 0.009

3 / 24 chargeInner_PXLayer_3
(28335, 120)
(27208, 120)
Mean ACC 0.964 std 0.003
Mean AUC 0.829 std 0.018

4 / 24 chargeInner_PXLayer_4
(28335, 120)
(27208, 120)
Mean ACC 0.965 std 0.003
Mean AUC 0.788 std 0.03

5 / 24 chargeOuter_PXLayer_1
(28335, 120)
(27208, 120)
Mean ACC 0.986 std 0.002
Mean AUC 0.974 std 0.008

6 / 24 chargeOuter_PXLayer_2
(28335, 120)
(27208, 120)
Mean ACC 0.986 std 0.002
Mean AUC 0.96 std 0.013

7 / 24 chargeOuter_PXLayer_3
(28335, 120)
(27208, 120)
Mean ACC 0.964 std 0.001
Mean AUC 0.811 std 0.026

8 / 24 chargeOuter_PXLayer_4
(28335, 120)
(27208, 120)
Mean ACC 0.965 std 0.002
Mean AUC 0.785 std 0.02

9 / 24 size_PXLayer_1
(28335, 50)
(27208, 50)
Mean ACC 0.991 std 0.002
Mean AUC 0.988 std 0.005

10 / 24 size_PXLayer_2
(28335, 50)
(27208, 50)
Mean ACC 0.

In [7]:
# Summary table: one row per histogram name (the keys of `results`),
# one column per key of that histogram's result dict.
df = pd.DataFrame.from_dict(results, orient='index')

In [8]:
# Render the summary table as this cell's output.
df

Unnamed: 0,accs,aucs,acc_mean,acc_std,auc_mean,auc_std
chargeInner_PXLayer_1,"[0.983, 0.989, 0.987, 0.988, 0.988, 0.992, 0.9...","[0.962, 0.979, 0.975, 0.983, 0.982, 0.98, 0.97...",0.987,0.002,0.976,0.006
chargeInner_PXLayer_2,"[0.986, 0.984, 0.983, 0.987, 0.985, 0.989, 0.9...","[0.957, 0.952, 0.957, 0.962, 0.971, 0.965, 0.9...",0.986,0.002,0.958,0.009
chargeInner_PXLayer_3,"[0.965, 0.966, 0.969, 0.963, 0.964, 0.957, 0.9...","[0.834, 0.847, 0.854, 0.862, 0.825, 0.811, 0.8...",0.964,0.003,0.829,0.018
chargeInner_PXLayer_4,"[0.96, 0.964, 0.967, 0.963, 0.964, 0.968, 0.96...","[0.742, 0.787, 0.815, 0.759, 0.759, 0.802, 0.7...",0.965,0.003,0.788,0.03
chargeOuter_PXLayer_1,"[0.985, 0.985, 0.984, 0.989, 0.985, 0.985, 0.9...","[0.984, 0.971, 0.965, 0.981, 0.962, 0.968, 0.9...",0.986,0.002,0.974,0.008
chargeOuter_PXLayer_2,"[0.984, 0.985, 0.986, 0.984, 0.985, 0.99, 0.98...","[0.953, 0.963, 0.967, 0.946, 0.935, 0.974, 0.9...",0.986,0.002,0.96,0.013
chargeOuter_PXLayer_3,"[0.961, 0.963, 0.964, 0.963, 0.964, 0.965, 0.9...","[0.816, 0.845, 0.865, 0.821, 0.779, 0.776, 0.8...",0.964,0.001,0.811,0.026
chargeOuter_PXLayer_4,"[0.963, 0.964, 0.964, 0.965, 0.965, 0.966, 0.9...","[0.804, 0.735, 0.773, 0.799, 0.795, 0.779, 0.7...",0.965,0.002,0.785,0.02
size_PXLayer_1,"[0.992, 0.993, 0.992, 0.994, 0.99, 0.988, 0.99...","[0.995, 0.986, 0.986, 0.991, 0.989, 0.98, 0.98...",0.991,0.002,0.988,0.005
size_PXLayer_2,"[0.982, 0.98, 0.986, 0.979, 0.979, 0.982, 0.97...","[0.914, 0.894, 0.921, 0.904, 0.887, 0.914, 0.8...",0.981,0.002,0.907,0.013


In [9]:
# Show the raw results dict: per-fold "accs"/"aucs" lists plus the rounded
# mean and std for every histogram.
results

{'chargeInner_PXLayer_1': {'accs': [0.983,
   0.989,
   0.987,
   0.988,
   0.988,
   0.992,
   0.985,
   0.986,
   0.989,
   0.987],
  'aucs': [0.962, 0.979, 0.975, 0.983, 0.982, 0.98, 0.978, 0.976, 0.979, 0.97],
  'acc_mean': 0.987,
  'acc_std': 0.002,
  'auc_mean': 0.976,
  'auc_std': 0.006},
 'chargeInner_PXLayer_2': {'accs': [0.986,
   0.984,
   0.983,
   0.987,
   0.985,
   0.989,
   0.988,
   0.984,
   0.99,
   0.988],
  'aucs': [0.957,
   0.952,
   0.957,
   0.962,
   0.971,
   0.965,
   0.965,
   0.939,
   0.962,
   0.948],
  'acc_mean': 0.986,
  'acc_std': 0.002,
  'auc_mean': 0.958,
  'auc_std': 0.009},
 'chargeInner_PXLayer_3': {'accs': [0.965,
   0.966,
   0.969,
   0.963,
   0.964,
   0.957,
   0.964,
   0.968,
   0.965,
   0.962],
  'aucs': [0.834,
   0.847,
   0.854,
   0.862,
   0.825,
   0.811,
   0.808,
   0.819,
   0.824,
   0.81],
  'acc_mean': 0.964,
  'acc_std': 0.003,
  'auc_mean': 0.829,
  'auc_std': 0.018},
 'chargeInner_PXLayer_4': {'accs': [0.96,
   0.964,
 