In [1]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import scipy.stats

import catboost
import utils.train
import utils.sweight
from utils.evaluation import bootstrap_score
from utils.higgs import FEATURES, load_uci_higgs

In [2]:
# Load the HIGGS data from the gzipped CSV. The helper returns a pair that is
# unpacked into `train_full` and `test`; the split seed is pinned so the same
# partition is obtained on every run (presumably a random train/test split --
# see utils.higgs.load_uci_higgs to confirm).
train_full, test = load_uci_higgs("data/HIGGS.csv.gz", train_test_split_seed=123444)

In [3]:
# Value passed as the "iterations" parameter of every CatBoost model below.
ITERATIONS = 1000
# Value passed as CatBoost's "thread_count"; hard-coded for the machine this
# notebook was run on, so adjust it to the local core count.
CPU_COUNT = 20

In [4]:
# Centre of the signal peak; also handed to utils.train.train_cwola in
# get_score.
SIGNAL_PEAK_MIDDLE = 4

# Frozen distributions passed to utils.sweight.invent_sWeights:
#   signal     -- Gaussian centred on the peak with unit standard deviation
#   background -- exponential with scale 4 (scipy's default loc of 0)
sig_mass_distr = scipy.stats.norm(scale=1, loc=SIGNAL_PEAK_MIDDLE)
bck_mass_distr = scipy.stats.expon(scale=4)

In [5]:
# Keyword arguments unpacked into catboost.CatBoostClassifier(...) inside
# get_score, so every model in the sweep is built with the same settings.
common_catboost_params = {
    "iterations": ITERATIONS,  # taken from the config cell above
    "verbose": False,  # see the CatBoost parameter reference for exact semantics
    "leaf_estimation_method": "Gradient",
    "thread_count": CPU_COUNT  # taken from the config cell above
}

In [6]:
def get_score(train_fraction, seed):
    """Train a CWoLa model on a random subsample and predict on the test set.

    Args:
        train_fraction: fraction of the rows of ``train_full`` to draw for
            training (forwarded as ``frac`` to ``train_full.sample``).
        seed: value used to seed NumPy's global random generator before the
            subsample is drawn.

    Returns:
        A one-element list containing the result of
        ``utils.train.perdict_raw`` for the CWoLa model applied to the
        ``FEATURES`` columns of ``test``.
    """
    # Seed the global NumPy RNG. `sample` is called without an explicit
    # random_state, so the draw presumably comes from this global state.
    np.random.seed(seed)
    subsample = train_full.sample(frac=train_fraction)
    subsample_labels = subsample.label.values
    subsample_features = subsample.loc[:, FEATURES].values

    # Only the masses are used below. The sWeights and mass probabilities are
    # still requested so the helper is invoked exactly as before.
    _sweights, _mass_probas, masses = utils.sweight.invent_sWeights(
        subsample_labels, sig_mass_distr, bck_mass_distr,
        return_probas=True, return_masses=True)

    cwola_model = utils.train.train_cwola(
        subsample_features, masses, SIGNAL_PEAK_MIDDLE,
        catboost.CatBoostClassifier(**common_catboost_params))

    # One entry per trained model (currently just CWoLa).
    return [
        utils.train.perdict_raw(fitted_model, test.loc[:, FEATURES].values)
        for fitted_model in (cwola_model,)
    ]

In [7]:
# Fractions of the full training set to train on: one per decade, from
# 0.01% of the rows up to all of them.
train_size_fractions = np.array([0.0001, 0.001, 0.01, 0.1, 1.0])

In [8]:
# Slow cell: get_score trains one CatBoost model per call, and it is called
# for seeds 0..9 at every entry of train_size_fractions.
# The result is nested as predictions[fraction][seed][model].
predictions = [
    [get_score(fraction, seed) for seed in range(10)]
    for fraction in train_size_fractions
]

In [9]:
# Stack the nested lists into one ndarray whose leading axes are
# [fraction, seed, model]; any trailing axes come from whatever
# utils.train.perdict_raw returns (presumably one value per test row -- TODO confirm).
predictions = np.array(predictions)

In [10]:
# Write the stacked predictions to a .npy file; the path is relative, so it
# lands in the kernel's current working directory.
np.save("Higgs_cb_predictions_cwola.npy", predictions)

In [11]:
# For every training-set size, score each (seed, model) prediction with
# bootstrap_score using ROC AUC against the test labels, then reduce over
# axis 0 (seeds) and axis 2 (the values bootstrap_score returns per call,
# presumably one per bootstrap replica), leaving one number per model.
scores = []
for predictions_by_size in predictions:
    scores_by_size = np.array([
        [
            bootstrap_score(test.label.values, predictions_by_model, roc_auc_score)
            for predictions_by_model in predictions_by_seed
        ]
        for predictions_by_seed in predictions_by_size
    ])
    mean_scores = scores_by_size.mean(axis=(0, 2))
    # Despite the name, this holds the sample standard deviation (ddof=1),
    # not the variance.
    var_scores = scores_by_size.std(axis=(0, 2), ddof=1)
    scores.append([mean_scores, var_scores])

In [12]:
# Convert to an array indexed as [fraction, statistic, model], where
# statistic 0 is the mean score and statistic 1 is its standard deviation.
scores = np.array(scores)

In [13]:
# Map each model name to its slice of `scores`, transposed so that row 0
# holds the mean scores and row 1 the standard deviations, with one column
# per training-set size.
scores_dict = {
    model_name: scores[:, :, model_index].T
    for model_index, model_name in enumerate(("CWoLa",))
}

In [14]:
# Pickle a two-element list: the absolute training-set sizes (each fraction
# multiplied by the number of rows in train_full) and the per-model scores
# dictionary.
with open("Higgs_scores_cwola.pkl", "wb") as scores_file:
    pickle.dump([train_size_fractions*train_full.shape[0], scores_dict], scores_file)