Add weight rebalancing for equal class representation in training

mickypaganini committed May 12, 2016
1 parent 07f950b commit efaf599
Showing 2 changed files with 34 additions and 6 deletions.
33 changes: 32 additions & 1 deletion bbyy_jet_classifier/process_data.py
@@ -53,7 +53,7 @@ def load(input_filename, correct_treename, incorrect_treename, excluded_variable
    y = np.concatenate((np.ones(correct_recarray_feats.shape[0]), np.zeros(incorrect_recarray_feats.shape[0])))
    w = np.concatenate((correct_recarray["event_weight"], incorrect_recarray["event_weight"]))
    mHmatch = np.concatenate((correct_recarray["idx_by_mH"] == 0, incorrect_recarray["idx_by_mH"] == 0))
    pThigh = np.concatenate((correct_recarray["idx_by_pT"] == 0, incorrect_recarray["idx_by_pT"] == 0))

    # -- Construct training and test datasets, automatically permuted
    if training_fraction == 1:
@@ -67,6 +67,9 @@ def load(input_filename, correct_treename, incorrect_treename, excluded_variable
        X_train, X_test, y_train, y_test, w_train, w_test, _, mHmatch_test, _, pThigh_test = \
            train_test_split(X, y, w, mHmatch, pThigh, train_size=training_fraction)

    # -- Balance training weights
    w_train = balance_weights(y_train, w_train)

    # -- Put X, y and w into a dictionary to conveniently pass these objects around
    train_data = {'X': X_train, 'y': y_train, 'w': w_train}
    test_data = {'X': X_test, 'y': y_test, 'w': w_test}
@@ -100,3 +103,31 @@ def feature_selection(train_data, features, k):

    # -- Return names of top features
    logging.getLogger("RunClassifier").info("The {} most important features are {}".format(k, [f for (_, f) in sorted(zip(tf.scores_, features), reverse=True)][:k]))


def balance_weights(y_train, w_train, targetN=10000):
    '''
    Definition:
    -----------
        Rebalance the class weights so that each class carries the same total weight.
        This is useful because we often train on datasets with very different quantities of signal and background;
        rescaling brings the two samples back to equal effective size.
    Args:
    -----
        y_train = array of dim (# training examples) with target values
        w_train = array of dim (# training examples) with the initial weights as extracted from the ntuple
        targetN (optional, defaults to 10000) = target sum of weights per class
    Returns:
    --------
        w_train = array of dim (# training examples) with the new rescaled weights
    '''
    # -- Scale each class's weights in place so that they sum to targetN
    for classID in np.unique(y_train):
        w_train[y_train == classID] *= float(targetN) / float(np.sum(w_train[y_train == classID]))

    return w_train
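
For context (this note is not part of the commit), a minimal sketch of how balance_weights behaves; the toy labels, weights, and targetN value are illustrative assumptions:

import numpy as np

y_toy = np.array([1., 1., 1., 0.])      # 3 "signal", 1 "background"
w_toy = np.array([0.5, 1.0, 1.5, 2.0])  # raw ntuple-style weights

w_bal = balance_weights(y_toy, w_toy.copy(), targetN=100)
# class 1: 0.5 + 1.0 + 1.5 = 3.0, scaled by 100/3.0 -> sums to 100.0
# class 0: 2.0, scaled by 100/2.0                   -> sums to 100.0

After the call, signal and background contribute equal total weight, so a classifier trained with sample_weight=w_bal sees balanced classes regardless of the raw event counts.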



7 changes: 2 additions & 5 deletions bbyy_jet_classifier/strategies/skl_BDT.py
@@ -34,9 +34,6 @@ def train(self, train_data, classification_variables, variable_dict):
        classifier.fit(train_data['X'], train_data['y'], sample_weight=train_data['w'])

        # -- Dump output to pickle
-       self.ensure_directory("{}/pickle/".format(self.output_directory))
-       joblib.dump(classifier, "{}/pickle/sklBDT_clf.pkl".format(self.output_directory), protocol=cPickle.HIGHEST_PROTOCOL)
+       self.ensure_directory(os.path.join(self.output_directory, "pickle"))
+       joblib.dump(classifier, os.path.join(self.output_directory, "pickle", "sklBDT_clf.pkl"), protocol=cPickle.HIGHEST_PROTOCOL)

@@ -62,8 +59,8 @@ def test(self, data, classification_variables, process):
        logging.getLogger("sklBDT.test").info("Evaluating performance...")

        # -- Load scikit classifier
-       classifier = joblib.load("{}/pickle/sklBDT_clf.pkl".format(self.output_directory))
+       classifier = joblib.load(os.path.join(self.output_directory, 'pickle', 'sklBDT_clf.pkl'))

        # -- Get classifier predictions
        yhat = classifier.predict_proba(data['X'])[:, 1]
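
For reference (not part of the commit), a minimal sketch of the dump/load round trip these hunks converge on; the output path, toy data, and classifier settings are illustrative assumptions:

import os
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib  # vendored in scikit-learn at the time; plain "import joblib" in newer versions

output_directory = "output"  # hypothetical path
pickle_dir = os.path.join(output_directory, "pickle")
if not os.path.exists(pickle_dir):  # stands in for self.ensure_directory
    os.makedirs(pickle_dir)

X = np.random.rand(100, 4)
y = np.random.randint(2, size=100)
classifier = GradientBoostingClassifier(n_estimators=10).fit(X, y)

joblib.dump(classifier, os.path.join(pickle_dir, "sklBDT_clf.pkl"))
classifier = joblib.load(os.path.join(pickle_dir, "sklBDT_clf.pkl"))
yhat = classifier.predict_proba(X)[:, 1]  # per-example probability of class 1

Building the path with os.path.join rather than "{}/pickle/".format(...) keeps the code portable across platforms, which is the point of these two hunks.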

