In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from hep_ml.uboost import uBoostClassifier
import sys
sys.path.append('../fairboost')

from generate import generate_toys
from generate import show_variates
from plot import show_clf

# Show the data

In [None]:
# Draw the toy variates on a single square 5x5 inch panel.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
show_variates(ax, generate_toys)


# Train a classifier

## Normal BDT (gradient-boosted decision trees)

In [None]:
# Reference model: sklearn gradient boosting trained on 10000 toy events.
X, Y, Z = generate_toys(10000)

# sklearn estimators return themselves from fit, so build and train in one go
clf = GradientBoostingClassifier(n_estimators=100, verbose=True).fit(X, Y)

# visualise the trained classifier with the project plotting helper
show_clf(clf, generate_toys)

## Uniform boosting (uBoost)

In [None]:
# uBoost works on named columns, so request the toys in pandas form.
X, Y, Z = generate_toys(10000, pandas=True)

# train on x1/x2 while asking for a response uniform in z for label 1
uboost_options = dict(
    n_estimators=100,
    uniform_features=['z'],
    uniform_label=1,
    train_features=['x1', 'x2'],
)
clf = uBoostClassifier(**uboost_options)
clf.fit(X, Y)

# visualise the trained classifier (pandas flavour of the helper)
show_clf(clf, generate_toys, pandas=True)

# Custom classifier

In [None]:
from scipy.special import expit
from scipy.special import logit
from sklearn.ensemble._gb_losses import BinomialDeviance

from sklearn.metrics import roc_auc_score


class FairboostClassifier:
    """
    Minimal gradient-boosted binary classifier built from regression trees.

    The model starts from the log-odds of the positive-class fraction and
    adds `learning_rate`-scaled `DecisionTreeRegressor` trees, each fitted to
    the negative gradient of sklearn's `BinomialDeviance` loss.

    NOTE(review): `BinomialDeviance` comes from the private module
    `sklearn.ensemble._gb_losses`; confirm it exists in the installed
    scikit-learn version.

    Parameters
    ----------
    Z : auxiliary values supplied by the caller. Stored as `self.Z`; no
        method of this class reads it yet.
    n_estimators : number of boosting iterations (trees) built by `fit`.
    learning_rate : shrinkage applied to every tree's contribution.
    max_depth : maximum depth of each regression tree.
    """

    def __init__(self, Z, n_estimators=100, learning_rate=0.1, max_depth=3):

        self.Z = Z
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth

        self._loss = BinomialDeviance(2)
        self._estimators = []
        self._gammas = []  # currently unused

    def _raw_prediction(self, X):
        """
        Return the additive (log-odds) model output for X as an (n, 1) array.

        Requires `self.prior` to be set, i.e. `fit` must have been called.
        """

        # initial model TODO: this is loss function dependent
        raw_prediction = logit(self.prior) * np.ones(len(X))

        # add the shrunken response of every tree fitted so far
        for est in self._estimators:
            raw_prediction += self.learning_rate * est.predict(X)

        return raw_prediction.reshape(-1, 1)

    def fit(self, X, Y):
        """
        Fit `n_estimators` trees to the features X and binary labels Y.

        Any trees from a previous call are discarded first. Returns `self`.
        """

        # force type: the low-level `tree.tree_` object handed to the loss
        # below is fed X directly, so convert once up front
        X = np.array(X, dtype=np.float32)
        # a plain list would make `Y == 1` a scalar False and has no `.shape`
        Y = np.asarray(Y)

        # start from scratch so that a second fit() does not stack new trees
        # on top of the ensemble from the previous call
        self._estimators = []
        self._gammas = []

        # initial model: positive-class fraction, kept strictly inside (0, 1)
        # so that its logit stays finite when only one class is present
        eps = np.finfo(np.float32).eps
        self.prior = np.clip(np.sum(Y == 1) / len(Y), eps, 1 - eps)

        # running model output; at this point it only contains the prior.
        # update_terminal_regions() adds each new tree's contribution to this
        # array in place, so it never has to be rebuilt from all the trees.
        raw_prediction = self._raw_prediction(X)

        # every sample takes part in the leaf line search with unit weight
        sample_weight = np.ones(shape=Y.shape)
        sample_mask = np.ones(shape=Y.shape, dtype=bool)

        for _ in range(self.n_estimators):

            # gradient is evaluated on a copy, mirroring sklearn's own loop
            neg_grad = self._loss.negative_gradient(Y, raw_prediction.copy())

            # fit the new tree to the pseudo-residuals
            tree = DecisionTreeRegressor(max_depth=self.max_depth,
                                         criterion='friedman_mse')
            tree.fit(X, neg_grad)

            # line search for each leaf (done in the loss function method);
            # also updates raw_prediction in place with the new tree's output
            self._loss.update_terminal_regions(
                tree.tree_, X, Y, neg_grad, raw_prediction,
                sample_weight, sample_mask, self.learning_rate)

            self._estimators.append(tree)

        return self

    def predict_proba(self, X):
        """
        Return the class probabilities computed by the loss object from the
        raw model output for X (callers in this notebook read column 1).
        """

        # make raw predictions
        raw_prediction = self._raw_prediction(X)

        # turn them into a probability
        return self._loss._raw_prediction_to_proba(raw_prediction)

def compare_classifiers(N):
    """
    Train FairboostClassifier and sklearn's GradientBoostingClassifier on N
    toy events, print each model's ROC AUC on N independent toy events and
    display each model with `show_clf`.
    """

    # independent draws for training and for evaluation
    X, Y, Z = generate_toys(N)
    X_test, Y_test, Z_test = generate_toys(N)

    # the two contenders, paired with the label printed next to their score
    models = [
        ('Fairboost classifier:', FairboostClassifier(Z, n_estimators=100)),
        ('Sklearn classifier:', GradientBoostingClassifier(n_estimators=100)),
    ]

    # train both on the same sample (Fairboost first, then sklearn)
    for _, model in models:
        model.fit(X, Y)

    # ROC AUC of the positive-class probability on the held-out sample
    scores = [
        roc_auc_score(Y_test, model.predict_proba(X_test)[:, 1])
        for _, model in models
    ]

    # report the score and draw the response, one model after the other
    for (label, model), score in zip(models, scores):
        print(label, score)
        show_clf(model, generate_toys)


compare_classifiers(10000)


In [None]:
# Two independent toy samples of 10000 events: one to train on, one to evaluate.
X, Y, Z = generate_toys(10000)
X_test, Y_test, Z_test = generate_toys(10000)

# train the custom boosted classifier
clf = FairboostClassifier(Z, n_estimators=100)
clf.fit(X, Y)

# classifier response on the evaluation sample: keep column 1 of predict_proba
proba_test = clf.predict_proba(X_test)
F_test = proba_test[:, 1]

In [None]:
import scipy.optimize as spo

class PolynomialModel:
    """
    Polynomial regression y = sum_i coefficients[i] * x**i, fitted by
    numerically minimising the mean squared error.

    Works only for one dimensional X...

    Parameters
    ----------
    order : degree of the polynomial; the model has `order + 1` coefficients,
        stored lowest power first in `self.coefficients`.
    """

    def __init__(self, order=3):

        self.order = order
        self.coefficients = np.ones(shape=self.order+1)

    def fit(self, X, Y):
        """
        Fit the coefficients to the points (X, Y) and store them in
        `self.coefficients`.
        """

        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float)

        # prepare objective: mean squared error of the polynomial response
        def loss(coefficients):
            output = self._forward(X, *coefficients)
            return np.mean((Y-output)**2)

        # fit parameters; the starting point must have order + 1 entries,
        # otherwise the configured order is silently ignored
        res = spo.minimize(loss, x0=np.ones(self.order + 1))

        # save as coefficients
        self.coefficients = res.x

    def _forward(self, X, *coefficients):
        """
        Evaluate the polynomial with the given coefficients (lowest power
        first) at X and return an array of the same shape as X.
        """

        # accept lists and scalars as well as arrays
        X = np.asarray(X, dtype=float)

        # accumulate a_i * X**i term by term
        result = np.zeros(shape=X.shape)
        for i, a in enumerate(coefficients):
            result += a * X**i
        return result

    def predict(self, X):
        """Evaluate the polynomial at X using the stored coefficients."""
        return self._forward(X, *self.coefficients)

    def negative_gradient(self, X):
        """Placeholder: not implemented, always returns None."""
        pass

# only take the signal values
is_signal = Y_test == 1
x = X_test[is_signal]
f = F_test[is_signal]
z = Z_test[is_signal]

# create bins
nbins = 100
hist, edges = np.histogram(f, range=(0, 1), bins=nbins)
centres = 0.5 * (edges[:-1] + edges[1:])

# 1-based bin index of every event. Digitizing against the inner edges only
# puts f == 1.0 into the last bin, the same convention np.histogram uses
# (digitizing against all edges would give it index nbins + 1 and drop it).
idx = np.digitize(f, edges[1:-1]) + 1

# compute metrics over bins: mean of z and the standard error of that mean.
# Empty bins are stored as NaN (matplotlib leaves a gap there) instead of
# triggering "mean of empty slice" / divide-by-zero warnings.
bin_means = []
bin_stds = []
for i in range(1, nbins + 1):
    z_in_bin = z[idx == i]
    if len(z_in_bin) == 0:
        bin_means.append(np.nan)
        bin_stds.append(np.nan)
    else:
        bin_means.append(np.mean(z_in_bin))
        bin_stds.append(np.std(z_in_bin) / np.sqrt(len(z_in_bin)))

# fit the model: polynomial describing z as a function of classifier output
adv = PolynomialModel()
adv.fit(f, z)

# create model response on a regular grid for plotting
xs = np.linspace(0, 1, 100)
ys = adv.predict(xs)

# scatter of the events, binned profile with errors, and the fitted curve
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(f, z, alpha=0.1, color='darkblue')
ax.errorbar(centres, bin_means, xerr=0.5/nbins, yerr=bin_stds, color='orange')
ax.plot(xs, ys, color='red')
ax.set_xlabel('classifier output')
ax.set_ylabel('Z')
ax.set_xlim(0, 1)
plt.show()
