From d70d7abc0d51f01b5bdd43a1bd01ce624d2141d0 Mon Sep 17 00:00:00 2001
From: Michela Paganini <michela.paganini@cern.ch>
Date: Tue, 25 Oct 2016 00:06:15 -0700
Subject: [PATCH] Tune BDT settings, tag outputs by category, add requirements.txt

---
 bbyy_jet_classifier/plotting/plot_asimov.py |  5 +++--
 bbyy_jet_classifier/strategies/root_tmva.py |  2 +-
 bbyy_jet_classifier/strategies/skl_BDT.py   | 12 ++++++++----
 evaluate_event_performance.py               |  4 ++--
 requirements.txt                            |  6 ++++++
 5 files changed, 20 insertions(+), 9 deletions(-)
 create mode 100644 requirements.txt

diff --git a/bbyy_jet_classifier/plotting/plot_asimov.py b/bbyy_jet_classifier/plotting/plot_asimov.py
index adea08b..043ed6d 100644
--- a/bbyy_jet_classifier/plotting/plot_asimov.py
+++ b/bbyy_jet_classifier/plotting/plot_asimov.py
@@ -2,9 +2,10 @@
 import matplotlib
 import cPickle
 import numpy as np
+import os
 import plot_atlas
 
-def bdt_old_ratio(data, strategy, baseline_strategy, lower_bound):
+def bdt_old_ratio(data, category, strategy, baseline_strategy, lower_bound):
 
     plot_atlas.set_style()
     figure = plt.figure(figsize=(6, 6), dpi=100)
@@ -34,5 +35,5 @@ def bdt_old_ratio(data, strategy, baseline_strategy, lower_bound):
     plt.ylim(ymin=0.2, ymax=2.8)
 
     plt.legend(loc='upper left')
-    plt.savefig('threshold_ratio_{}.pdf'.format(strategy))
+    plt.savefig(os.path.join('output', 'threshold_ratio_{}_{}.pdf'.format(strategy, category)))
     plt.close(figure)
diff --git a/bbyy_jet_classifier/strategies/root_tmva.py b/bbyy_jet_classifier/strategies/root_tmva.py
index b228676..cc484fd 100644
--- a/bbyy_jet_classifier/strategies/root_tmva.py
+++ b/bbyy_jet_classifier/strategies/root_tmva.py
@@ -47,7 +47,7 @@ def train(self, train_data, classification_variables, variable_dict, sample_name
         #-- Define methods:
         # ["NTrees=200", "MinNodeSize=0.1", "MaxDepth=6", "BoostType=Grad", "SeparationType=GiniIndex",  "NegWeightTreatment=IgnoreNegWeightsInTraining"]
         factory.BookMethod(TMVA.Types.kBDT, "BDT", ":".join(
-            ["NTrees=300", "MinNodeSize=0.01", "MaxDepth=8", "BoostType=Grad", "SeparationType=GiniIndex",  "NegWeightTreatment=Pray"]
+            ["NTrees=300", "MinNodeSize=0.01", "MaxDepth=15", "BoostType=Grad", "SeparationType=GiniIndex", "NegWeightTreatment=IgnoreNegWeightsInTraining"]  # was "NegWeightTreatment=Pray"
         ))
 
         # -- Have we considered using a Fisher classifier?
diff --git a/bbyy_jet_classifier/strategies/skl_BDT.py b/bbyy_jet_classifier/strategies/skl_BDT.py
index 8d67ccc..b41f8b6 100644
--- a/bbyy_jet_classifier/strategies/skl_BDT.py
+++ b/bbyy_jet_classifier/strategies/skl_BDT.py
@@ -43,8 +43,12 @@ def train(self, train_data, classification_variables, variable_dict, sample_name
             fit_params = {"sample_weight":train_data["w"]}
             # Run grid search over provided ranges
             logging.getLogger("skl_BDT").info("Running grid search parameter optimisation...")
-            grid_search = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.2, min_samples_leaf=50, max_features="sqrt", subsample=0.8, random_state=10),
-                                       param_grid=parameters, fit_params=fit_params, scoring="roc_auc", n_jobs=1, iid=False, cv=3, verbose=1)
+            grid_search = GridSearchCV(
+                estimator=GradientBoostingClassifier(
+                    learning_rate=0.2, min_samples_leaf=50, max_features="sqrt", subsample=0.8, random_state=10
+                ),
+                param_grid=parameters, fit_params=fit_params, scoring="roc_auc", n_jobs=-1, iid=False, cv=3, verbose=1
+            )
             grid_search.fit(train_data["X"], train_data["y"])
             for param_name in parameters.keys():
                 if grid_search.best_params_[param_name] in [ parameters[param_name][0], parameters[param_name][-1] ]:
@@ -55,8 +59,8 @@ def train(self, train_data, classification_variables, variable_dict, sample_name
 
         else:
             classifier = GradientBoostingClassifier(
-                n_estimators=5, # was n_estimators=300
-                max_depth=6, # was max_depth=15
+                n_estimators=300, # restored to 300 (was temporarily n_estimators=5)
+                max_depth=10, # was max_depth=15
                 min_samples_leaf=40, # was min_samples_split=0.5 * len(train_data["y"])
                 verbose=1
                 )
diff --git a/evaluate_event_performance.py b/evaluate_event_performance.py
index a216abc..e0b1d5d 100755
--- a/evaluate_event_performance.py
+++ b/evaluate_event_performance.py
@@ -93,11 +93,11 @@ def main(strategy, category, lower_bound, intervals):
 
     # -- Write dictionary of Asimov significances to disk
     utils.ensure_directory(os.path.join("output", "pickles"))
-    with open(os.path.join("output", "pickles", "multi_proc_{}.pkl".format(strategy)), "wb") as f:
+    with open(os.path.join("output", "pickles", "multi_proc_{}_{}.pkl".format(strategy, category)), "wb") as f:
         cPickle.dump(asimov_dict, f)
 
     # -- Plot Z_BDT/Z_old for different threshold values
-    plot_asimov.bdt_old_ratio(asimov_dict, strategy, 'mHmatch', lower_bound)
+    plot_asimov.bdt_old_ratio(asimov_dict, category, strategy, 'mHmatch', lower_bound)
     
     # -- Print Asimov significance for different strategies and different samples in tabular form
     #    Each table corresponds to a different threshold value
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7df5f15
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+joblib==0.10.2
+matplotlib==1.5.2
+numpy==1.11.1
+root_numpy==4.4.0
+rootpy==0.8.1
+scikit_learn==0.18