In [1]:
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import catboost
import utils.train
import utils.sweight
import scipy.stats
from utils.evaluation import bootstrap_score
from utils.higgs import FEATURES
from sklearn.metrics import roc_auc_score
import pickle
import matplotlib.pyplot as plt

In [3]:
# Both splits live in the same HDF5 file under separate keys.
train_full = pd.read_hdf("data/higgs_split.hdf", key="train")
test = pd.read_hdf("data/higgs_split.hdf", key="test")

In [4]:
# Thread count handed to every CatBoost model ("thread_count").
CPU_COUNT = 20
# Number of boosting iterations for every CatBoost model ("iterations").
ITERATIONS = 1000

In [5]:
# Mass distributions passed to utils.sweight.invent_sWeights.
# Background: exponential with scale 4.
bck_mass_distr = scipy.stats.expon(scale=4)
# Signal: normal centred at 4 with unit standard deviation.
sig_mass_distr = scipy.stats.norm(loc=4, scale=1)

In [6]:
# Keyword arguments shared by every CatBoost model built in get_score.
common_catboost_params = dict(
    iterations=ITERATIONS,
    verbose=False,
    leaf_estimation_method="Gradient",
    thread_count=CPU_COUNT,
)

In [7]:
def get_score(train_fraction, seed):
    """Train four models on a random training subsample and predict on the test set.

    Despite the name, no metric is computed here: the function returns the raw
    test-set predictions of each model; scoring happens in a later cell.

    Parameters
    ----------
    train_fraction : float
        Fraction of ``train_full`` rows to draw, forwarded to
        ``DataFrame.sample(frac=...)``.
    seed : int
        Seed for NumPy's global random state. It is set before sampling, so it
        controls the row subsample and any later draw from the global NumPy RNG
        in this call (presumably including ``invent_sWeights`` -- confirm).

    Returns
    -------
    list
        Four results of ``utils.train.perdict_raw`` on the test features, in the
        order: likelihood model, constrained-MSE model, naive sWeight model,
        true-labels model.
    """
    np.random.seed(seed)
    train_sample = train_full.sample(frac=train_fraction)
    train_small_labels = train_sample.label.values
    train_small = train_sample.loc[:, FEATURES].values

    sWeights, mass_probas = utils.sweight.invent_sWeights(
        train_small_labels, sig_mass_distr, bck_mass_distr, return_probas=True)

    # NOTE(review): "HonestLikelihood" and "ConstrainedRegression" do not look
    # like stock CatBoost losses -- presumably a custom build is required.
    model_likelihood = catboost.CatBoostRegressor(
        loss_function="HonestLikelihood", **common_catboost_params)
    model_likelihood.fit(train_small, mass_probas)

    model_constrained_MSE = catboost.CatBoostRegressor(
        loss_function="ConstrainedRegression", **common_catboost_params)
    model_constrained_MSE.fit(train_small, sWeights)

    # The helper receives an unfitted classifier and returns the model to use.
    model_naive_sWeight = utils.train.train_on_sWeights_signal_vs_background_naive(
        train_small, sWeights, catboost.CatBoostClassifier(**common_catboost_params))

    # Reference model trained on the true labels.
    model_labels = catboost.CatBoostClassifier(**common_catboost_params).fit(
        train_small, train_small_labels)

    # Build the test feature matrix once instead of once per model.
    test_features = test.loc[:, FEATURES].values
    models = (model_likelihood, model_constrained_MSE, model_naive_sWeight, model_labels)
    # "perdict_raw" is spelled as in utils.train.
    return [utils.train.perdict_raw(model, test_features) for model in models]

In [8]:
# Fractions of the full training set to subsample: one per decade, 0.01% to 100%.
train_size_fractions = np.array([0.0001, 0.001, 0.01, 0.1, 1.0])

In [None]:
# This will take some time: every (fraction, seed) pair trains four models.
# The result is indexed as predictions[fraction][seed][model].
predictions = [
    [get_score(size_fraction, run_seed) for run_seed in range(10)]
    for size_fraction in train_size_fractions
]

In [None]:
# Turn the nested [fraction][seed][model] lists into one ndarray. The trailing
# axis is presumably the per-test-row prediction -- confirm against
# utils.train.perdict_raw.
predictions = np.array(predictions)

In [None]:
# Keep the raw predictions on disk so scoring can be redone without retraining.
np.save(file="Higgs_cb_predictions.npy", arr=predictions)

In [None]:
# Reduce the raw predictions to one (mean, variance) pair of ROC AUC values
# per training-set size and per model.
scores = []
for predictions_by_size in predictions:
    # Indexed as [seed][model]; each entry is whatever bootstrap_score returns
    # for that model's test predictions.
    scores_by_size = np.array([
        [
            bootstrap_score(test.label.values, predictions_by_model, roc_auc_score)
            for predictions_by_model in predictions_by_seed
        ]
        for predictions_by_seed in predictions_by_size
    ])
    # Pool over seeds (axis 0) and over axis 2 -- presumably the bootstrap
    # replicas returned by bootstrap_score -- leaving one value per model.
    var_scores = np.var(scores_by_size, axis=(0, 2))
    mean_scores = np.mean(scores_by_size, axis=(0, 2))
    scores.append([mean_scores, var_scores])

In [None]:
# Stack the per-size [mean, variance] pairs; axes are (train size, statistic, model).
scores = np.array(scores)

In [None]:
# Map each model name to its scores, transposed to (statistic, train size).
# The name order must match the model order returned by get_score.
scores_dict = {
    model_name: scores[:, :, model_index].T
    for model_index, model_name in enumerate(
        ("Likelihood", "Constrained MSE", "sWeight", "True labels"))
}

In [None]:
# Store the training-set sizes (in rows) next to the per-model score arrays.
with open("Higgs_scores.pkl", "wb") as scores_file:
    pickle.dump(
        [train_full.shape[0] * train_size_fractions, scores_dict],
        scores_file,
    )