Merge pull request jemrobinson#11 from mickypaganini/master

Rebalancing + handle ftrain=1
mickypaganini · May 13, 2016 · ccebb61 · ccebb61
2 parents a68f375 + efaf599
commit ccebb61
Show file tree

Hide file tree

Showing 7 changed files with 96 additions and 42 deletions.
diff --git a/bbyy_jet_classifier/plotting/plot_inputs.py b/bbyy_jet_classifier/plotting/plot_inputs.py
@@ -1,13 +1,24 @@
+
 import logging
 import matplotlib.pyplot as plt
 import numpy as np
 import os
 import plot_atlas
 
-ROOT_2_LATEX = {"abs_eta_j": r"$|\eta_{j}|$", "abs_eta_jb": r"$|\eta_{jb}|$",
-                "Delta_eta_jb": r"$\Delta\eta_{jb}$", "Delta_phi_jb": r"$\Delta\phi_{jb}$",
-                "idx_by_mH": r"$m_{H}$ matching order", "idx_by_pT": r"$p_{T}$ order",
-                "m_jb": r"$m_{jb}$", "pT_j": r"$p_{T}^{j}$", "pT_jb": r"$p_{T}^{jb}$"}
+ROOT_2_LATEX = {
+    "abs_eta_j": r"$|\eta_{j}|$", 
+    "abs_eta_jb": r"$|\eta_{jb}|$",
+    "Delta_eta_jb": r"$\Delta\eta_{jb}$", 
+    "Delta_phi_jb": r"$\Delta\phi_{jb}$",
+    "idx_by_mH": r"$m_{H}$ matching order", 
+    "idx_by_pT": r"$p_{T}$ order",
+    "m_jb": r"$m_{jb}$", 
+    "pT_j": r"$p_{T}^{j}$", 
+    "pT_jb": r"$p_{T}^{jb}$",
+    "MV2c20_FCBE_70": r"MV2c20 FCBE(70%)", 
+    "MV2c20_FCBE_77": r"MV2c20 FCBE(77%)",
+    "MV2c20_FCBE_85": r"MV2c20 FCBE(85%)" 
+    }
 
 
 def input_distributions(classification_variables, training_data, test_data, directory):
@@ -34,35 +45,41 @@ def input_distributions(classification_variables, training_data, test_data, dire
         # -- Initialise figure and axes
         figure = plt.figure(figsize=(6, 6), dpi=100)
         axes = plt.axes()
-        bins = np.linspace(min([min(test_data['X'][:, i]), min(test_data['X'][:, i])]), max([max(test_data['X'][:, i]), max(test_data['X'][:, i])]), 50)
-        bin_centres = np.array([0.5 * (l + h) for l, h in zip(bins[:-1], bins[1:])])
+        try:
+            bins = np.linspace(min([min(training_data['X'][:, i]), min(test_data['X'][:, i])]), max([max(training_data['X'][:, i]), max(test_data['X'][:, i])]), 50)
+            bin_centres = np.array([0.5 * (l + h) for l, h in zip(bins[:-1], bins[1:])])
 
-        # -- Plot test data if available
-        test_data_correct = test_data['X'][test_data['y'] == 1][:, i]
-        test_data_incorrect = test_data['X'][test_data['y'] == 0][:, i]
-        if test_data_correct.size > 0:
-            y_1, _, _ = plt.hist(test_data_correct, bins=bins, weights=test_data['w'][test_data['y'] == 1] / float(sum(test_data['w'][test_data['y'] == 1])),
-                                 histtype="stepfilled", label="Correct (test)", color="blue", alpha=0.5)
-        if test_data_incorrect.size > 0:
-            y_2, _, _ = plt.hist(test_data_incorrect, bins=bins, weights=test_data['w'][test_data['y'] == 0] / float(sum(test_data['w'][test_data['y'] == 0])),
-                                 histtype="stepfilled", label="Incorrect (test)", color="red", alpha=0.5)
+            # -- Plot test data if available --> test data may not have a correct category?
+            X_test_correct = test_data['X'][test_data['y'] == 1][:, i]
+            X_test_incorrect = test_data['X'][test_data['y'] == 0][:, i]
+            if X_test_correct.size > 0:
+                y_1, _, _ = plt.hist(X_test_correct, bins=bins, weights=test_data['w'][test_data['y'] == 1] / float(sum(test_data['w'][test_data['y'] == 1])), histtype="stepfilled", label="Correct (test)", color="blue", alpha=0.5)
+            y_2, _, _ = plt.hist(X_test_incorrect, bins=bins, weights=test_data['w'][test_data['y'] == 0] / float(sum(test_data['w'][test_data['y'] == 0])), histtype="stepfilled", label="Incorrect (test)", color="red", alpha=0.5)
 
-        # -- Plot training data if available
-        training_data_correct = training_data['X'][training_data['y'] == 1][:, i]
-        training_data_incorrect = training_data['X'][training_data['y'] == 0][:, i]
-        if training_data_correct.size > 0:
-            _contents, _ = np.histogram(training_data_correct, bins=bins, weights=training_data['w'][training_data['y'] == 1] / float(sum(training_data['w'][training_data['y'] == 1])))
+            # -- Plot training data --> certainly available, otherwise we would be in the `except`
+            X_train_correct = training_data['X'][training_data['y'] == 1][:, i]
+            X_train_incorrect = training_data['X'][training_data['y'] == 0][:, i]
+            _contents, _ = np.histogram(X_train_correct, bins=bins, weights=training_data['w'][training_data['y'] == 1] / float(sum(training_data['w'][training_data['y'] == 1])))
             plt.scatter(bin_centres[np.nonzero(_contents)], _contents[np.nonzero(_contents)], label="Correct (train)", color="blue")
-        if training_data_incorrect.size > 0:
-            _contents, _ = np.histogram(training_data_incorrect, bins=bins, weights=training_data['w'][training_data['y'] == 0] / float(sum(training_data['w'][training_data['y'] == 0])))
+            _contents, _ = np.histogram(X_train_incorrect, bins=bins, weights=training_data['w'][training_data['y'] == 0] / float(sum(training_data['w'][training_data['y'] == 0])))
             plt.scatter(bin_centres[np.nonzero(_contents)], _contents[np.nonzero(_contents)], label="Incorrect (train)", color="red")
 
+        except IndexError:
+            non_empty = training_data if len(training_data['y']) > 0 else test_data
+            bins = np.linspace(min(non_empty['X'][:, i]), max(non_empty['X'][:, i]), 50)
+            y_1, _, _ = plt.hist(non_empty['X'][non_empty['y'] == 1][:, i], bins=bins, 
+                weights=non_empty['w'][non_empty['y'] == 1] / float(sum(non_empty['w'][non_empty['y'] == 1])), 
+                histtype="stepfilled", label="Correct", color="blue", alpha=0.5)
+            y_2, _, _ = plt.hist(non_empty['X'][non_empty['y'] == 0][:, i], bins=bins, 
+                weights=non_empty['w'][non_empty['y'] == 0] / float(sum(non_empty['w'][non_empty['y'] == 0])), 
+                histtype="stepfilled", label="Incorrect", color="red", alpha=0.5)
+
         # -- Plot legend/axes/etc.
         plt.legend(loc="upper right", fontsize=15)
         plt.xlabel(ROOT_2_LATEX[variable])
         plt.ylabel("Fraction of events")
         axes.set_xlim(min(bins), max(bins))
-        axes.set_ylim([0, max([1e-10, max(y_1 + y_2)])])
+        axes.set_ylim([0, max([1e-10, max(y_1 + y_2)])]) # ???
         plot_atlas.use_atlas_labels(axes)
 
         # -- Write figure and close plot to save memory

diff --git a/bbyy_jet_classifier/plotting/plot_roc.py b/bbyy_jet_classifier/plotting/plot_roc.py
@@ -40,13 +40,13 @@ def signal_eff_bkg_rejection(ML_strategy, mHmatch_test, pThigh_test, yhat_test,
     logging.getLogger("plotting.signal/bkg").info("Plotting signal efficiency and background rejection")
     plot_atlas.set_style()
     discrim_dict = add_curve(ML_strategy.name, "black", calculate_roc(test_data["y"], yhat_test, weights=test_data["w"]))
-    figure, axes = ROC_plotter(discrim_dict, min_eff=0.0, max_eff=1.0, min_rej=1, max_rej=10**4, logscale=True)
+    figure = ROC_plotter(discrim_dict, min_eff=0.0, max_eff=1.0, min_rej=1, max_rej=10**4, logscale=True)
 
     # -- Add ROC curves and efficiency points for old strategies
     plt.plot(eff_mH_signal, 1.0 / eff_mH_bkg, marker="o", color="r", label=r"Closest m$_{H}$", linewidth=0)  # add point for "mHmatch" strategy
     plt.plot(eff_pT_signal, 1.0 / eff_pT_bkg, marker="o", color="b", label=r"Highest p$_{T}$", linewidth=0)  # add point for "pThigh" strategy
     plt.legend()
-    plot_atlas.use_atlas_labels(axes)
+    plot_atlas.use_atlas_labels(plt.axes())
     figure.savefig(os.path.join(ML_strategy.output_directory, "ROC.pdf"))
 
     # -- Save out ROC curve as pickle for later comparison
@@ -59,19 +59,19 @@ def roc_comparison():
     ------------
         Quick script to load and compare ROC curves produced from different classifiers
     """
-    TMVABDT = cPickle.load(open(os.path.join("output", "RootTMVA", "pickle", "RootTMVA_ROC.pkl"), "rb"))
-    sklBDT = cPickle.load(open(os.path.join("output", "sklBDT", "pickle", "sklBDT_ROC.pkl"), "rb"))
+    TMVABDT = cPickle.load(open(os.path.join("output", "RootTMVA", "pickle", "root_tmva_ROC.pkl"), "rb"))
+    sklBDT = cPickle.load(open(os.path.join("output", "sklBDT", "pickle", "skl_BDT_ROC.pkl"), "rb"))
     dots = cPickle.load(open(os.path.join("output", "sklBDT", "pickle", "old_strategies_dict.pkl"), "rb"))
 
     sklBDT["color"] = "green"
     curves = {"sklBDT": sklBDT, "RootTMVA": TMVABDT}
 
     # -- Initialise figure and axes
     logging.getLogger("RunClassifier").info("Plotting")
-    figure, axes = ROC_plotter(curves, title=r"Performance of Second b-Jet Selection Strategies", min_eff=0.1, max_eff=1.0, ymax=1000, logscale=True)
+    figure = ROC_plotter(curves, title=r"Performance of Second b-Jet Selection Strategies", min_eff=0.1, max_eff=1.0, max_rej=10**4, logscale=True)
 
     plt.plot(dots["eff_mH_signal"], 1.0 / dots["eff_mH_bkg"], marker="o", color="r", label=r"Closest m$_{H}$", linewidth=0)  # add point for "mHmatch" strategy
     plt.plot(dots["eff_pT_signal"], 1.0 / dots["eff_pT_bkg"], marker="o", color="b", label=r"Highest p$_{T}$", linewidth=0)  # add point for "pThigh" strategy
     plt.legend()
-    plot_atlas.use_atlas_labels(axes)
+    plot_atlas.use_atlas_labels(plt.axes())
     figure.savefig(os.path.join("output", "ROCcomparison.pdf"))
diff --git a/bbyy_jet_classifier/process_data.py b/bbyy_jet_classifier/process_data.py
@@ -53,11 +53,22 @@ def load(input_filename, correct_treename, incorrect_treename, excluded_variable
     y = np.concatenate((np.ones(correct_recarray_feats.shape[0]), np.zeros(incorrect_recarray_feats.shape[0])))
     w = np.concatenate((correct_recarray["event_weight"], incorrect_recarray["event_weight"]))
     mHmatch = np.concatenate((correct_recarray["idx_by_mH"] == 0, incorrect_recarray["idx_by_mH"] == 0))
-    pThigh = np.concatenate((correct_recarray["idx_by_pT"] == 0, incorrect_recarray["idx_by_pT"] == 0))
+    pThigh = np.concatenate((correct_recarray["idx_by_pT"] == 0, incorrect_recarray["idx_by_pT"] == 0))    
 
     # -- Construct training and test datasets, automatically permuted
-    X_train, X_test, y_train, y_test, w_train, w_test, _, mHmatch_test, _, pThigh_test = \
-        train_test_split(X, y, w, mHmatch, pThigh, train_size=training_fraction)
+    if training_fraction == 1:
+        ix = range(len(y))
+        import random
+        random.shuffle(ix)
+        X_train, y_train, w_train = X[ix], y[ix], w[ix]
+        y_test = w_test = mHmatch_test = pThigh_test = np.array([])
+        X_test = np.array([[]])
+    else:
+        X_train, X_test, y_train, y_test, w_train, w_test, _, mHmatch_test, _, pThigh_test = \
+            train_test_split(X, y, w, mHmatch, pThigh, train_size=training_fraction)
+
+    # -- Balance training weights
+    w_train = balance_weights(y_train, w_train)
 
     # -- Put X, y and w into a dictionary to conveniently pass these objects around
     train_data = {'X': X_train, 'y': y_train, 'w': w_train}
@@ -92,3 +103,31 @@ def feature_selection(train_data, features, k):
 
     # -- Return names of top features
     logging.getLogger("RunClassifier").info("The {} most important features are {}".format(k, [f for (_, f) in sorted(zip(tf.scores_, features), reverse=True)][:k]))
+
+
+def balance_weights(y_train, w_train, targetN = 10000):
+    '''
+    Definition:
+    -----------
+        Function that rebalances the class weights
+        This is useful because we often train on datasets with very different quantities of signal and background
+        This allows us to bring the samples back to equal quantities of signal and background
+
+    Args:
+    -----
+        y_train = array of dim (# training examples) with target values
+        w_train = array of dim (# training examples) with the initial weights as extracted from the ntuple
+        targetN(optional, default to 10000) = target equal number of signal and background events
+
+    Returns:
+    --------
+        w_train = array of dim (# training examples) with the new rescaled weights
+    '''
+
+    for classID in np.unique(y_train):
+        w_train[y_train == classID] *= float(targetN) / float(np.sum(w_train[y_train == classID]))
+
+    return w_train
+
+
+
diff --git a/bbyy_jet_classifier/strategies/base_strategy.py b/bbyy_jet_classifier/strategies/base_strategy.py
@@ -5,8 +5,8 @@ class BaseStrategy(object):
     default_output_location = None
 
     def __init__(self, output_directory):
-        self.name = self.__module__.split(".")[-1].replace("_", " ")
-        self.output_directory = os.path.join(output_directory, self.default_output_subdir)  # output_directory if output_directory is not None else self.default_output_location
+        self.name = self.__module__.split(".")[-1] 
+        self.output_directory = os.path.join(output_directory, self.default_output_subdir)
         self.ensure_directory(self.output_directory)
 
     @staticmethod

diff --git a/bbyy_jet_classifier/strategies/skl_BDT.py b/bbyy_jet_classifier/strategies/skl_BDT.py
@@ -34,9 +34,6 @@ def train(self, train_data, classification_variables, variable_dict):
         classifier.fit(train_data['X'], train_data['y'], sample_weight=train_data['w'])
 
         # -- Dump output to pickle
-        self.ensure_directory("{}/pickle/".format(self.output_directory))
-        joblib.dump(classifier, "{}/pickle/sklBDT_clf.pkl".format(self.output_directory), protocol=cPickle.HIGHEST_PROTOCOL)
-
         self.ensure_directory(os.path.join(self.output_directory, "pickle"))
         joblib.dump(classifier, os.path.join(self.output_directory, "pickle", "sklBDT_clf.pkl"), protocol=cPickle.HIGHEST_PROTOCOL)
 
@@ -62,8 +59,8 @@ def test(self, data, classification_variables, process):
         logging.getLogger("sklBDT.test").info("Evaluating performance...")
 
         # -- Load scikit classifier
-        classifier = joblib.load("{}/pickle/sklBDT_clf.pkl".format(self.output_directory))
-
+        classifier = joblib.load(os.path.join(self.output_directory, 'pickle', 'sklBDT_clf.pkl'))
+        
         # -- Get classifier predictions
         yhat = classifier.predict_proba(data['X'])[:, 1]
 

diff --git a/run_classifier.py b/run_classifier.py
@@ -33,8 +33,8 @@
         process_data.load(args.input, args.correct_tree, args.incorrect_tree, args.exclude, args.ftrain)
 
     #-- Plot input distributions
-    strategies.BaseStrategy.ensure_directory(args.output + "/classification_variables")
-    plot_inputs.input_distributions(classification_variables, train_data, test_data, directory=args.output + "/classification_variables")
+    strategies.BaseStrategy.ensure_directory(os.path.join(args.output, "classification_variables"))
+    plot_inputs.input_distributions(classification_variables, train_data, test_data, directory=os.path.join(args.output, "classification_variables"))
 
     # -- Sequentially evaluate all the desired strategies on the same train/test sample
     for strategy_name in args.strategy:

diff --git a/viz/performance.py b/viz/performance.py
@@ -109,6 +109,7 @@ def ROC_plotter(curves, min_eff=0, max_eff=1, min_rej=1, max_rej=10**4, pp=False
         plt.plot(data["efficiency"][sel], data["rejection"][sel], "-", label=r"" + tagger, color=data["color"], **kwargs)
 
     # -- Plot legend/axes/etc.
+    plt.grid(b=True, which="both", alpha=0.5)
     plt.legend()
     plt.xlim(min_eff, max_eff)
     plt.ylim(min_rej, max_rej)
@@ -121,7 +122,7 @@ def ROC_plotter(curves, min_eff=0, max_eff=1, min_rej=1, max_rej=10**4, pp=False
     if pp:
         plt.savefig(figure)
     else:
-        return figure, axes
+        return figure
 
 
 def add_curve(name, color, curve_pair):