# Compare low-level, high-level, and combined (both) feature sets

In [None]:
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.ensemble import GradientBoostingClassifier

import sys
sys.path.insert(0, '../fairml')
import plotting
import generate as G
import models
import actions
import utils

# Generate the data

In [None]:
# Feature sets compared throughout the notebook.
var_sets = ['low', 'high', 'both']

# Per feature set: the X scaler, the Z scaler, and the convenience
# sample-generating function returned by G.generate_hmumu.
x_scaler, z_scaler, generate = {}, {}, {}
for v in var_sets:
    scalers_and_generator = G.generate_hmumu(features=v)
    x_scaler[v], z_scaler[v], generate[v] = scalers_and_generator

# Test data: one large sample per feature set, drawn a single time and reused
# for every evaluation below.
n_test_samples = 100000
X, Y, Z, W, Z_plot = {}, {}, {}, {}, {}
for v in var_sets:
    # balanced=False because the global weights are needed for testing
    test_sample = generate[v](n_test_samples, balanced=False)
    X[v], Y[v], Z[v], W[v] = test_sample
    # Z mapped back through the Z scaler (on a copy, Z[v] is left untouched);
    # presumably the unscaled values used for plotting -- confirm against usage.
    Z_plot[v] = z_scaler[v].inverse_transform(Z[v], copy=True)
    

# Create a scikit-learn benchmark for the performance comparison

In [None]:
n_train_samples = 10000

# Training samples, one balanced draw per feature set (the third returned
# value is not needed for the benchmark and is discarded).
X_train, Y_train, W_train = {}, {}, {}
for v in var_sets:
    X_train[v], Y_train[v], _, W_train[v] = generate[v](n_train_samples, balanced=True)

# Result containers, keyed by feature set, for the 100- and 400-estimator
# gradient-boosting benchmarks.
gbc100, preds100, gbc400, preds400 = {}, {}, {}, {}
fpr100, tpr100, fpr400, tpr400 = {}, {}, {}, {}

# (n_estimators, fitted models, test scores, false-positive rates, true-positive rates)
benchmark_configs = (
    (100, gbc100, preds100, fpr100, tpr100),
    (400, gbc400, preds400, fpr400, tpr400),
)

for v in var_sets:
    for n_trees, fitted_by_set, scores_by_set, fpr_by_set, tpr_by_set in benchmark_configs:
        # fit on the weighted training sample
        booster = GradientBoostingClassifier(n_estimators=n_trees)
        booster.fit(X_train[v], Y_train[v], sample_weight=W_train[v])
        fitted_by_set[v] = booster

        # probability of class 1 on the shared test sample
        scores_by_set[v] = booster.predict_proba(X[v])[:, 1]

        # weighted ROC curve on the test sample
        fpr_by_set[v], tpr_by_set[v], _ = roc_curve(Y[v], scores_by_set[v], sample_weight=W[v])

# Plot signal efficiency against background rejection (1 - FPR):
# dotted lines for 100 estimators, solid lines for 400.
fig, ax = plt.subplots(figsize=(5,5))
cols = {'low':utils.light_blue, 'high':utils.blue, 'both':utils.oxford_blue}
curve_styles = (
    ('GBC 100 ({})', ':', fpr100, tpr100),
    ('GBC 400 ({})', '-', fpr400, tpr400),
)
for v in var_sets:
    for label_template, line_style, fpr_by_set, tpr_by_set in curve_styles:
        ax.plot(
            1-fpr_by_set[v],
            tpr_by_set[v],
            color=cols[v],
            linestyle=line_style,
            label=label_template.format(v),
        )
ax.legend(loc='best')
ax.set_xlabel('Background rejection')
ax.set_ylabel('Signal efficiency')
plt.show()

# Keep the 400-estimator curves as the benchmarks for the performance comparison.
fprs, tprs, labels = {}, {}, {}
for v in var_sets:
    fprs[v] = fpr400[v]
    tprs[v] = tpr400[v]
    labels[v] = 'GBC400 ({})'.format(v)
benchmarks = (fprs, tprs, labels)


# Train neural nets on low-level, high-level, and both sets of features



In [None]:
# TensorFlow session shared by the training cell below, which passes `sess`
# explicitly to sess.run and actions.train.
sess = tf.InteractiveSession()
# Run counter: the training cell increments it on every execution and uses it
# to build the `name` prefix for its classifiers -- presumably so that
# re-running the cell does not collide with graph names from earlier runs.
ctr = 0

In [None]:
# Hyper-parameters for this training run.
n_samples = 1000
n_epochs = 10
learning_rate = 0.005
deep = True

# Bump the run counter and derive a fresh name prefix for this run's classifiers.
ctr += 1
name = 'name' + str(ctr)

# Input placeholders, one set per feature set. The feature dimension of x is
# taken from the test data; y, z and w are 1-D.
x_in, y_in, z_in, w_in, inputs = {}, {}, {}, {}, {}
for v in var_sets:
    x_in[v] = tf.placeholder(tf.float32, shape=(None, X[v].shape[1]), name='X'+v)
    y_in[v] = tf.placeholder(tf.float32, shape=(None, ), name='Y'+v)
    z_in[v] = tf.placeholder(tf.float32, shape=(None, ), name='Z'+v)
    w_in[v] = tf.placeholder(tf.float32, shape=(None, ), name='W'+v)
    inputs[v] = [x_in[v], y_in[v], z_in[v], w_in[v]]

# Create the classifier graph, loss, and optimisation step for each feature set.
clf_output, vars_D, loss_D, opt_D = {}, {}, {}, {}
for v in var_sets:
    clf_output[v], vars_D[v] = models.classifier(x_in[v], name+'_clf'+v, deep=deep)
    loss_D[v] = models.classifier_loss(clf_output[v], y_in[v], w_in[v])
    # each optimiser only updates the variables of its own classifier
    opt_D[v] = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_D[v], var_list=vars_D[v])

# Initialise the variables.
sess.run(tf.global_variables_initializer())

# Output location for the per-epoch plots. None of this depends on the epoch,
# so it is set up once here rather than on every iteration of the loop.
pname = 'VarsComparison'
dirn = 'media/plots/{}'.format(pname)
if not os.path.exists(dirn):
    os.makedirs(dirn)

# Train the classifiers, evaluating and plotting after every epoch.
for e in range(n_epochs):

    # report on progress
    if e%10 == 0:
        print('{}/{}'.format(e, n_epochs))

    # training step and ROC curve computation
    npreds, nfprs, ntprs, nlabels = {}, {}, {}, {}
    for v in var_sets:
        actions.train(sess, opt_D[v], loss_D[v], inputs[v], generate[v], n_samples, 1, None)
        # the classifier output is passed through a sigmoid before scoring
        # (presumably the network emits logits -- confirm in models.classifier)
        npreds[v] = utils.sigmoid(sess.run(clf_output[v], feed_dict={x_in[v]:X[v]}))
        nfprs[v], ntprs[v], _ = roc_curve(Y[v], npreds[v], sample_weight=W[v])
        nlabels[v] = 'NN {}'.format(v)

    nets = nfprs, ntprs, nlabels

    # plot the networks against the benchmarks; one file per epoch, with the
    # epoch index zero-padded to three digits
    path = '{d}/{n}_{c:03}.png'.format(d=dirn, n=pname, c=e)
    plotting.plot_var_sets(benchmarks, nets, path, batch=True)
    
    