Skip to content

Commit

Permalink
Merge pull request jemrobinson#11 from mickypaganini/master
Browse files Browse the repository at this point in the history
Rebalancing + handle ftrain=1
  • Loading branch information
jemrobinson committed May 13, 2016
2 parents a68f375 + efaf599 commit ccebb61
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 42 deletions.
63 changes: 40 additions & 23 deletions bbyy_jet_classifier/plotting/plot_inputs.py
@@ -1,13 +1,24 @@

import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import plot_atlas

ROOT_2_LATEX = {"abs_eta_j": r"$|\eta_{j}|$", "abs_eta_jb": r"$|\eta_{jb}|$",
"Delta_eta_jb": r"$\Delta\eta_{jb}$", "Delta_phi_jb": r"$\Delta\phi_{jb}$",
"idx_by_mH": r"$m_{H}$ matching order", "idx_by_pT": r"$p_{T}$ order",
"m_jb": r"$m_{jb}$", "pT_j": r"$p_{T}^{j}$", "pT_jb": r"$p_{T}^{jb}$"}
ROOT_2_LATEX = {
"abs_eta_j": r"$|\eta_{j}|$",
"abs_eta_jb": r"$|\eta_{jb}|$",
"Delta_eta_jb": r"$\Delta\eta_{jb}$",
"Delta_phi_jb": r"$\Delta\phi_{jb}$",
"idx_by_mH": r"$m_{H}$ matching order",
"idx_by_pT": r"$p_{T}$ order",
"m_jb": r"$m_{jb}$",
"pT_j": r"$p_{T}^{j}$",
"pT_jb": r"$p_{T}^{jb}$",
"MV2c20_FCBE_70": r"MV2c20 FCBE(70%)",
"MV2c20_FCBE_77": r"MV2c20 FCBE(77%)",
"MV2c20_FCBE_85": r"MV2c20 FCBE(85%)"
}


def input_distributions(classification_variables, training_data, test_data, directory):
Expand All @@ -34,35 +45,41 @@ def input_distributions(classification_variables, training_data, test_data, dire
# -- Initialise figure and axes
figure = plt.figure(figsize=(6, 6), dpi=100)
axes = plt.axes()
bins = np.linspace(min([min(test_data['X'][:, i]), min(test_data['X'][:, i])]), max([max(test_data['X'][:, i]), max(test_data['X'][:, i])]), 50)
bin_centres = np.array([0.5 * (l + h) for l, h in zip(bins[:-1], bins[1:])])
try:
bins = np.linspace(min([min(training_data['X'][:, i]), min(test_data['X'][:, i])]), max([max(training_data['X'][:, i]), max(test_data['X'][:, i])]), 50)
bin_centres = np.array([0.5 * (l + h) for l, h in zip(bins[:-1], bins[1:])])

# -- Plot test data if available
test_data_correct = test_data['X'][test_data['y'] == 1][:, i]
test_data_incorrect = test_data['X'][test_data['y'] == 0][:, i]
if test_data_correct.size > 0:
y_1, _, _ = plt.hist(test_data_correct, bins=bins, weights=test_data['w'][test_data['y'] == 1] / float(sum(test_data['w'][test_data['y'] == 1])),
histtype="stepfilled", label="Correct (test)", color="blue", alpha=0.5)
if test_data_incorrect.size > 0:
y_2, _, _ = plt.hist(test_data_incorrect, bins=bins, weights=test_data['w'][test_data['y'] == 0] / float(sum(test_data['w'][test_data['y'] == 0])),
histtype="stepfilled", label="Incorrect (test)", color="red", alpha=0.5)
# -- Plot test data if available --> test data may not have a correct category?
X_test_correct = test_data['X'][test_data['y'] == 1][:, i]
X_test_incorrect = test_data['X'][test_data['y'] == 0][:, i]
if X_test_correct.size > 0:
y_1, _, _ = plt.hist(X_test_correct, bins=bins, weights=test_data['w'][test_data['y'] == 1] / float(sum(test_data['w'][test_data['y'] == 1])), histtype="stepfilled", label="Correct (test)", color="blue", alpha=0.5)
y_2, _, _ = plt.hist(X_test_incorrect, bins=bins, weights=test_data['w'][test_data['y'] == 0] / float(sum(test_data['w'][test_data['y'] == 0])), histtype="stepfilled", label="Incorrect (test)", color="red", alpha=0.5)

# -- Plot training data if available
training_data_correct = training_data['X'][training_data['y'] == 1][:, i]
training_data_incorrect = training_data['X'][training_data['y'] == 0][:, i]
if training_data_correct.size > 0:
_contents, _ = np.histogram(training_data_correct, bins=bins, weights=training_data['w'][training_data['y'] == 1] / float(sum(training_data['w'][training_data['y'] == 1])))
# -- Plot training data --> certainly available, otherwise we would be in the `except`
X_train_correct = training_data['X'][training_data['y'] == 1][:, i]
X_train_incorrect = training_data['X'][training_data['y'] == 0][:, i]
_contents, _ = np.histogram(X_train_correct, bins=bins, weights=training_data['w'][training_data['y'] == 1] / float(sum(training_data['w'][training_data['y'] == 1])))
plt.scatter(bin_centres[np.nonzero(_contents)], _contents[np.nonzero(_contents)], label="Correct (train)", color="blue")
if training_data_incorrect.size > 0:
_contents, _ = np.histogram(training_data_incorrect, bins=bins, weights=training_data['w'][training_data['y'] == 0] / float(sum(training_data['w'][training_data['y'] == 0])))
_contents, _ = np.histogram(X_train_incorrect, bins=bins, weights=training_data['w'][training_data['y'] == 0] / float(sum(training_data['w'][training_data['y'] == 0])))
plt.scatter(bin_centres[np.nonzero(_contents)], _contents[np.nonzero(_contents)], label="Incorrect (train)", color="red")

except IndexError:
non_empty = training_data if len(training_data['y']) > 0 else test_data
bins = np.linspace(min(non_empty['X'][:, i]), max(non_empty['X'][:, i]), 50)
y_1, _, _ = plt.hist(non_empty['X'][non_empty['y'] == 1][:, i], bins=bins,
weights=non_empty['w'][non_empty['y'] == 1] / float(sum(non_empty['w'][non_empty['y'] == 1])),
histtype="stepfilled", label="Correct", color="blue", alpha=0.5)
y_2, _, _ = plt.hist(non_empty['X'][non_empty['y'] == 0][:, i], bins=bins,
weights=non_empty['w'][non_empty['y'] == 0] / float(sum(non_empty['w'][non_empty['y'] == 0])),
histtype="stepfilled", label="Incorrect", color="red", alpha=0.5)

# -- Plot legend/axes/etc.
plt.legend(loc="upper right", fontsize=15)
plt.xlabel(ROOT_2_LATEX[variable])
plt.ylabel("Fraction of events")
axes.set_xlim(min(bins), max(bins))
axes.set_ylim([0, max([1e-10, max(y_1 + y_2)])])
axes.set_ylim([0, max([1e-10, max(y_1 + y_2)])]) # ???
plot_atlas.use_atlas_labels(axes)

# -- Write figure and close plot to save memory
Expand Down
12 changes: 6 additions & 6 deletions bbyy_jet_classifier/plotting/plot_roc.py
Expand Up @@ -40,13 +40,13 @@ def signal_eff_bkg_rejection(ML_strategy, mHmatch_test, pThigh_test, yhat_test,
logging.getLogger("plotting.signal/bkg").info("Plotting signal efficiency and background rejection")
plot_atlas.set_style()
discrim_dict = add_curve(ML_strategy.name, "black", calculate_roc(test_data["y"], yhat_test, weights=test_data["w"]))
figure, axes = ROC_plotter(discrim_dict, min_eff=0.0, max_eff=1.0, min_rej=1, max_rej=10**4, logscale=True)
figure = ROC_plotter(discrim_dict, min_eff=0.0, max_eff=1.0, min_rej=1, max_rej=10**4, logscale=True)

# -- Add ROC curves and efficiency points for old strategies
plt.plot(eff_mH_signal, 1.0 / eff_mH_bkg, marker="o", color="r", label=r"Closest m$_{H}$", linewidth=0) # add point for "mHmatch" strategy
plt.plot(eff_pT_signal, 1.0 / eff_pT_bkg, marker="o", color="b", label=r"Highest p$_{T}$", linewidth=0) # add point for "pThigh" strategy
plt.legend()
plot_atlas.use_atlas_labels(axes)
plot_atlas.use_atlas_labels(plt.axes())
figure.savefig(os.path.join(ML_strategy.output_directory, "ROC.pdf"))

# -- Save out ROC curve as pickle for later comparison
Expand All @@ -59,19 +59,19 @@ def roc_comparison():
------------
Quick script to load and compare ROC curves produced from different classifiers
"""
TMVABDT = cPickle.load(open(os.path.join("output", "RootTMVA", "pickle", "RootTMVA_ROC.pkl"), "rb"))
sklBDT = cPickle.load(open(os.path.join("output", "sklBDT", "pickle", "sklBDT_ROC.pkl"), "rb"))
TMVABDT = cPickle.load(open(os.path.join("output", "RootTMVA", "pickle", "root_tmva_ROC.pkl"), "rb"))
sklBDT = cPickle.load(open(os.path.join("output", "sklBDT", "pickle", "skl_BDT_ROC.pkl"), "rb"))
dots = cPickle.load(open(os.path.join("output", "sklBDT", "pickle", "old_strategies_dict.pkl"), "rb"))

sklBDT["color"] = "green"
curves = {"sklBDT": sklBDT, "RootTMVA": TMVABDT}

# -- Initialise figure and axes
logging.getLogger("RunClassifier").info("Plotting")
figure, axes = ROC_plotter(curves, title=r"Performance of Second b-Jet Selection Strategies", min_eff=0.1, max_eff=1.0, ymax=1000, logscale=True)
figure = ROC_plotter(curves, title=r"Performance of Second b-Jet Selection Strategies", min_eff=0.1, max_eff=1.0, max_rej=10**4, logscale=True)

plt.plot(dots["eff_mH_signal"], 1.0 / dots["eff_mH_bkg"], marker="o", color="r", label=r"Closest m$_{H}$", linewidth=0) # add point for "mHmatch" strategy
plt.plot(dots["eff_pT_signal"], 1.0 / dots["eff_pT_bkg"], marker="o", color="b", label=r"Highest p$_{T}$", linewidth=0) # add point for "pThigh" strategy
plt.legend()
plot_atlas.use_atlas_labels(axes)
plot_atlas.use_atlas_labels(plt.axes())
figure.savefig(os.path.join("output", "ROCcomparison.pdf"))
45 changes: 42 additions & 3 deletions bbyy_jet_classifier/process_data.py
Expand Up @@ -53,11 +53,22 @@ def load(input_filename, correct_treename, incorrect_treename, excluded_variable
y = np.concatenate((np.ones(correct_recarray_feats.shape[0]), np.zeros(incorrect_recarray_feats.shape[0])))
w = np.concatenate((correct_recarray["event_weight"], incorrect_recarray["event_weight"]))
mHmatch = np.concatenate((correct_recarray["idx_by_mH"] == 0, incorrect_recarray["idx_by_mH"] == 0))
pThigh = np.concatenate((correct_recarray["idx_by_pT"] == 0, incorrect_recarray["idx_by_pT"] == 0))
pThigh = np.concatenate((correct_recarray["idx_by_pT"] == 0, incorrect_recarray["idx_by_pT"] == 0))

# -- Construct training and test datasets, automatically permuted
X_train, X_test, y_train, y_test, w_train, w_test, _, mHmatch_test, _, pThigh_test = \
train_test_split(X, y, w, mHmatch, pThigh, train_size=training_fraction)
if training_fraction == 1:
ix = range(len(y))
import random
random.shuffle(ix)
X_train, y_train, w_train = X[ix], y[ix], w[ix]
y_test = w_test = mHmatch_test = pThigh_test = np.array([])
X_test = np.array([[]])
else:
X_train, X_test, y_train, y_test, w_train, w_test, _, mHmatch_test, _, pThigh_test = \
train_test_split(X, y, w, mHmatch, pThigh, train_size=training_fraction)

# -- Balance training weights
w_train = balance_weights(y_train, w_train)

# -- Put X, y and w into a dictionary to conveniently pass these objects around
train_data = {'X': X_train, 'y': y_train, 'w': w_train}
Expand Down Expand Up @@ -92,3 +103,31 @@ def feature_selection(train_data, features, k):

# -- Return names of top features
logging.getLogger("RunClassifier").info("The {} most important features are {}".format(k, [f for (_, f) in sorted(zip(tf.scores_, features), reverse=True)][:k]))


def balance_weights(y_train, w_train, targetN = 10000):
'''
Definition:
-----------
Function that rebalances the class weights
This is useful because we often train on datasets with very different quantities of signal and background
This allows us to bring the samples back to equal quantities of signal and background
Args:
-----
y_train = array of dim (# training examples) with target values
w_train = array of dim (# training examples) with the initial weights as extracted from the ntuple
targetN(optional, default to 10000) = target equal number of signal and background events
Returns:
--------
w_train = array of dim (# training examples) with the new rescaled weights
'''

for classID in np.unique(y_train):
w_train[y_train == classID] *= float(targetN) / float(np.sum(w_train[y_train == classID]))

return w_train



4 changes: 2 additions & 2 deletions bbyy_jet_classifier/strategies/base_strategy.py
Expand Up @@ -5,8 +5,8 @@ class BaseStrategy(object):
default_output_location = None

def __init__(self, output_directory):
self.name = self.__module__.split(".")[-1].replace("_", " ")
self.output_directory = os.path.join(output_directory, self.default_output_subdir) # output_directory if output_directory is not None else self.default_output_location
self.name = self.__module__.split(".")[-1]
self.output_directory = os.path.join(output_directory, self.default_output_subdir)
self.ensure_directory(self.output_directory)

@staticmethod
Expand Down
7 changes: 2 additions & 5 deletions bbyy_jet_classifier/strategies/skl_BDT.py
Expand Up @@ -34,9 +34,6 @@ def train(self, train_data, classification_variables, variable_dict):
classifier.fit(train_data['X'], train_data['y'], sample_weight=train_data['w'])

# -- Dump output to pickle
self.ensure_directory("{}/pickle/".format(self.output_directory))
joblib.dump(classifier, "{}/pickle/sklBDT_clf.pkl".format(self.output_directory), protocol=cPickle.HIGHEST_PROTOCOL)

self.ensure_directory(os.path.join(self.output_directory, "pickle"))
joblib.dump(classifier, os.path.join(self.output_directory, "pickle", "sklBDT_clf.pkl"), protocol=cPickle.HIGHEST_PROTOCOL)

Expand All @@ -62,8 +59,8 @@ def test(self, data, classification_variables, process):
logging.getLogger("sklBDT.test").info("Evaluating performance...")

# -- Load scikit classifier
classifier = joblib.load("{}/pickle/sklBDT_clf.pkl".format(self.output_directory))

classifier = joblib.load(os.path.join(self.output_directory, 'pickle', 'sklBDT_clf.pkl'))
# -- Get classifier predictions
yhat = classifier.predict_proba(data['X'])[:, 1]

Expand Down
4 changes: 2 additions & 2 deletions run_classifier.py
Expand Up @@ -33,8 +33,8 @@
process_data.load(args.input, args.correct_tree, args.incorrect_tree, args.exclude, args.ftrain)

#-- Plot input distributions
strategies.BaseStrategy.ensure_directory(args.output + "/classification_variables")
plot_inputs.input_distributions(classification_variables, train_data, test_data, directory=args.output + "/classification_variables")
strategies.BaseStrategy.ensure_directory(os.path.join(args.output, "classification_variables"))
plot_inputs.input_distributions(classification_variables, train_data, test_data, directory=os.path.join(args.output, "classification_variables"))

# -- Sequentially evaluate all the desired strategies on the same train/test sample
for strategy_name in args.strategy:
Expand Down
3 changes: 2 additions & 1 deletion viz/performance.py
Expand Up @@ -109,6 +109,7 @@ def ROC_plotter(curves, min_eff=0, max_eff=1, min_rej=1, max_rej=10**4, pp=False
plt.plot(data["efficiency"][sel], data["rejection"][sel], "-", label=r"" + tagger, color=data["color"], **kwargs)

# -- Plot legend/axes/etc.
plt.grid(b=True, which="both", alpha=0.5)
plt.legend()
plt.xlim(min_eff, max_eff)
plt.ylim(min_rej, max_rej)
Expand All @@ -121,7 +122,7 @@ def ROC_plotter(curves, min_eff=0, max_eff=1, min_rej=1, max_rej=10**4, pp=False
if pp:
plt.savefig(figure)
else:
return figure, axes
return figure


def add_curve(name, color, curve_pair):
Expand Down

0 comments on commit ccebb61

Please sign in to comment.