Add weight rebalancing for equal class representation in training

mickypaganini committed May 12, 2016
1 parent 07f950b commit efaf599
Showing 2 changed files with 34 additions and 6 deletions.
33 changes: 32 additions & 1 deletion bbyy_jet_classifier/process_data.py
@@ -53,7 +53,7 @@ def load(input_filename, correct_treename, incorrect_treename, excluded_variable
    y = np.concatenate((np.ones(correct_recarray_feats.shape[0]), np.zeros(incorrect_recarray_feats.shape[0])))
    w = np.concatenate((correct_recarray["event_weight"], incorrect_recarray["event_weight"]))
    mHmatch = np.concatenate((correct_recarray["idx_by_mH"] == 0, incorrect_recarray["idx_by_mH"] == 0))
    pThigh = np.concatenate((correct_recarray["idx_by_pT"] == 0, incorrect_recarray["idx_by_pT"] == 0))

    # -- Construct training and test datasets, automatically permuted
    if training_fraction == 1:
@@ -67,6 +67,9 @@ def load(input_filename, correct_treename, incorrect_treename, excluded_variable
        X_train, X_test, y_train, y_test, w_train, w_test, _, mHmatch_test, _, pThigh_test = \
            train_test_split(X, y, w, mHmatch, pThigh, train_size=training_fraction)

    # -- Balance training weights
    w_train = balance_weights(y_train, w_train)

    # -- Put X, y and w into a dictionary to conveniently pass these objects around
    train_data = {'X': X_train, 'y': y_train, 'w': w_train}
    test_data = {'X': X_test, 'y': y_test, 'w': w_test}
@@ -100,3 +103,31 @@ def feature_selection(train_data, features, k):

    # -- Return names of top features
    logging.getLogger("RunClassifier").info("The {} most important features are {}".format(k, [f for (_, f) in sorted(zip(tf.scores_, features), reverse=True)][:k]))


def balance_weights(y_train, w_train, targetN=10000):
    '''
    Definition:
    -----------
        Rebalance the class weights so that each class carries the same total weight.
        This is useful because we often train on datasets with very different quantities of signal and background;
        rescaling brings the two samples back to equal effective size.
    Args:
    -----
        y_train = array of dim (# training examples) with target values
        w_train = array of dim (# training examples) with the initial weights as extracted from the ntuple
        targetN (optional, defaults to 10000) = target sum of weights per class
    Returns:
    --------
        w_train = array of dim (# training examples) with the new rescaled weights
    '''
    # -- Scale each class's weights in place so that they sum to targetN
    for classID in np.unique(y_train):
        w_train[y_train == classID] *= float(targetN) / float(np.sum(w_train[y_train == classID]))

    return w_train
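
For context (this note is not part of the commit), a minimal sketch of how balance_weights behaves; the toy labels, weights, and targetN value are illustrative assumptions:

import numpy as np

y_toy = np.array([1., 1., 1., 0.])      # 3 "signal", 1 "background"
w_toy = np.array([0.5, 1.0, 1.5, 2.0])  # raw ntuple-style weights

w_bal = balance_weights(y_toy, w_toy.copy(), targetN=100)
# class 1: 0.5 + 1.0 + 1.5 = 3.0, scaled by 100/3.0 -> sums to 100.0
# class 0: 2.0, scaled by 100/2.0                   -> sums to 100.0

After the call, signal and background contribute equal total weight, so a classifier trained with sample_weight=w_bal sees balanced classes regardless of the raw event counts.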



7 changes: 2 additions & 5 deletions bbyy_jet_classifier/strategies/skl_BDT.py
@@ -34,9 +34,6 @@ def train(self, train_data, classification_variables, variable_dict):
        classifier.fit(train_data['X'], train_data['y'], sample_weight=train_data['w'])

        # -- Dump output to pickle
-       self.ensure_directory("{}/pickle/".format(self.output_directory))
-       joblib.dump(classifier, "{}/pickle/sklBDT_clf.pkl".format(self.output_directory), protocol=cPickle.HIGHEST_PROTOCOL)
+       self.ensure_directory(os.path.join(self.output_directory, "pickle"))
+       joblib.dump(classifier, os.path.join(self.output_directory, "pickle", "sklBDT_clf.pkl"), protocol=cPickle.HIGHEST_PROTOCOL)

@@ -62,8 +59,8 @@ def test(self, data, classification_variables, process):
        logging.getLogger("sklBDT.test").info("Evaluating performance...")

        # -- Load scikit classifier
-       classifier = joblib.load("{}/pickle/sklBDT_clf.pkl".format(self.output_directory))
+       classifier = joblib.load(os.path.join(self.output_directory, 'pickle', 'sklBDT_clf.pkl'))

        # -- Get classifier predictions
        yhat = classifier.predict_proba(data['X'])[:, 1]
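
For reference (not part of the commit), a minimal sketch of the dump/load round trip these hunks converge on; the output path, toy data, and classifier settings are illustrative assumptions:

import os
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib  # vendored in scikit-learn at the time; plain "import joblib" in newer versions

output_directory = "output"  # hypothetical path
pickle_dir = os.path.join(output_directory, "pickle")
if not os.path.exists(pickle_dir):  # stands in for self.ensure_directory
    os.makedirs(pickle_dir)

X = np.random.rand(100, 4)
y = np.random.randint(2, size=100)
classifier = GradientBoostingClassifier(n_estimators=10).fit(X, y)

joblib.dump(classifier, os.path.join(pickle_dir, "sklBDT_clf.pkl"))
classifier = joblib.load(os.path.join(pickle_dir, "sklBDT_clf.pkl"))
yhat = classifier.predict_proba(X)[:, 1]  # per-example probability of class 1

Building the path with os.path.join rather than "{}/pickle/".format(...) keeps the code portable across platforms, which is the point of these two hunks.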

