# Example Unbinned Likelihood Fits

In [1]:
import tensorflow as tf
import numpy as np

import itertools # for fast looping
import time # for timing loop
# from iminuit import Minuit # http://iminuit.readthedocs.io/en/latest/installation.html
from scipy.optimize import minimize

# Show the package versions used
for package in [tf, np]:
    print('{} v{}'.format(package.__name__, package.__version__))

tensorflow v1.3.0
numpy v1.13.3


First we will define the function that we are going to sample from and then fit to

In [2]:
# This is probably not even close to how I should do things
import math
def my_formula(formula_lambda, *args):
    """
    Create and evaluate a function
    
    Args:
        formula_lambda: `lambda`
        *args: parameters to evaluate the formula with
    
    Returns:
        The formula passed evaluated at the parameters passed
    
    Example:
    >>> my_formula(lambda x, y: x * y, 1, 2)
    2
    >>> my_formula(lambda x, mu, sigma: \
                  math.exp(-1.0*(x - mu)*(x - mu)/math.sqrt(2*math.pi))/(sigma*math.sqrt(2*math.pi)),
                  0, 0, 1)
    0.3989422804014327
    """
    return formula_lambda(*args)

In [3]:
my_formula(lambda x, mu, sigma: \
           math.exp(-1.0*(x - mu)*(x - mu)/math.sqrt(2*math.pi))/(sigma*math.sqrt(2*math.pi)),
           0, 0, 1)

0.3989422804014327

However, let's just use TensorFlow's Normal distribution as it already exists

## Fit in TensorFlow

In [4]:
def sample_model(model, n_samples, TYPE=np.float32):
    """
    Sample the model n_samples times
    
    Args:
        model: `tf.distributions` The model
        n_samples: `int` The number of times the model is samples
    
    Returns:
        The sampled points and their values: x,y
        x: model.sample(n_samples)
        y: model.prob(x)
    """
    x = model.sample(n_samples)
    y = model.prob(x)
    
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        return sess.run(x), sess.run(y)

In [5]:
def normal_log(X, mu, sigma, TYPE=np.float32):
    """The log of Normal(X | mu, sigma)"""
    return -tf.log(tf.constant(np.sqrt(2 * np.pi), dtype=TYPE) * sigma) - \
        tf.pow(X - mu, 2) / (tf.constant(2, dtype=TYPE) * tf.pow(sigma, 2))

In [6]:
def nll(X, mu, sigma, TYPE=np.float32):
    """The NLL of Normal(X | mu, sigma)"""
    return -tf.reduce_sum(normal_log(X, mu, sigma, TYPE))

c.f. https://gist.github.com/ibab/45c3d886c182a1ea26d5

In [7]:
# pdf of Gaussian of variable x with mean mu and standard deviation sigma
mu = tf.Variable(0.)
sigma = tf.Variable(1.)
model_tf = tf.distributions.Normal(loc=mu, scale=sigma)

In [8]:
# My memory on my computer seems to fill up and stay full very fast by rerunning this
# so it seems that Python is not releasing anything

# MLE attempt
TYPE = np.float64 # This is required(?) for the fit to converge
# TYPE = np.float32

n_events = 1000000 # time of fit is very dependent on n_events
n_trials = 10

sess = tf.Session()

# def func(mu_, sigma_):
#     return sess.run(nll_, feed_dict={mu: mu_, sigma: sigma_})

def func(x):
    return sess.run(nll_, feed_dict={mu: x[0], sigma: x[1]})

mu_true = tf.Variable(0.5)
sigma_true = tf.Variable(1.5)
model_tf = tf.distributions.Normal(loc=mu_true, scale=sigma_true)

start_time = time.time()
for _ in itertools.repeat(None, n_trials):
    data = np.random.normal(0.5, 1.5, n_events)
#     data, _ = sample_model(model_tf, n_events) # this is wrong for some reason
    
    # Define data as a variable so that it will be cached
    X = tf.Variable(data, name='data')
    
    mu = tf.Variable(TYPE(1), name='mu')
    sigma = tf.Variable(TYPE(2), name='sigma')
    
    init = tf.global_variables_initializer()
    sess.run(init)
    
    nll_ = nll(X, mu, sigma, TYPE)
    
    # To guard against excessive output
    if n_trials > 1:
        print_level = 0
    else:
        print_level = 1
    
    ret = minimize(func, x0=[10, 10], bounds=[(-1, 100), (0.00001, 100)])
#     print(ret.x, ret.fun) # x is an array of fit values, fun is the value of the function passed
    
end_time = time.time()
time_duration = end_time - start_time
mean_fit_time = time_duration/n_trials

print("true mu = {}, true sigma = {}".format(sess.run(mu_true), sess.run(sigma_true)))
print("mu = {}, sigma = {}".format(ret.x[0], ret.x[1]))
      
sess.close()

print("\nfit {} points {} times in {} seconds".format(n_events, n_trials, time_duration))
print("The average fit time is {} seconds".format(mean_fit_time))

true mu = 0.5, true sigma = 1.5
mu = 0.4988746762836436, sigma = 1.5022315618086808

fit 1000000 points 10 times in 10.383291721343994 seconds
The average fit time is 1.0383291721343995 seconds


---

In [9]:
# Example
test_x, test_y = sample_model(model_tf, 50)

In [10]:
print(test_x)
print(test_y)

[-2.60797119  2.14600801 -0.38215697  0.08869576 -0.47034073 -0.09784842
  1.44825947  0.07784218 -1.56196523  0.62938935  0.87335515  1.11305213
  1.32686877 -2.9331851  -0.92176485  2.34848523  0.74615407 -0.99187303
  0.02834952  0.06921425  0.91782087  0.08715844  1.85106611 -0.64145851
 -1.97008777 -0.23839629 -0.67484021 -1.83718324  0.1749588  -0.11075425
  2.83260632  2.32785082  1.85071301  0.23090461  1.93667817 -2.0404954
 -1.59410286 -2.14791346 -0.33951831  0.32414171  1.0723927  -1.59547114
  0.18054867  0.67151153  1.65632772 -0.49255466  1.16626561 -1.24136126
  2.08359957 -0.76459765]
[ 0.12120605  0.25518087  0.25400105  0.19147547  0.25892249  0.20248026
  0.26479915  0.05258462  0.06306614  0.2659151   0.2575492   0.24161479
  0.07861906  0.08805612  0.11374611  0.2596983   0.25814852  0.26470211
  0.20939004  0.19198905  0.15916103  0.0249576   0.26557159  0.22997639
  0.26551747  0.25633875  0.19711402  0.01318751  0.26592603  0.26569301
  0.08894145  0.26344553  

## Fit in NumPy

In [None]:
TYPE = np.float64

mu_true = 0.5
sigma_true = 1.5

# model_np is the normal distribution from np

start_time = time.time()
for _ in itertools.repeat(None, n_trials):
    X = np.random.normal(mu_true, sigma_true, n_events).astype(TYPE)
    
    nll_ = nll(X, mu, sigma, TYPE)
    
    # To guard against excessive output
    if n_trials > 1:
        print_level = 0
    else:
        print_level = 1
    
    # Gilles example
    ret = minimize(func, x0=[10, 10], bounds=[(-1, 100), (0.00001, 100)])
#     print(ret.x, ret.fun) # x is an array of fit values, fun is the value of the function passed
    
end_time = time.time()
time_duration = end_time - start_time
mean_fit_time = time_duration/n_trials

print("true mu = {}, true sigma = {}".format(mu_true, sigma_true))
print("mu = {}, sigma = {}".format(ret.x[0], ret.x[1]))

print("\nfit {} points {} times in {} seconds".format(n_events, n_trials, time_duration))
print("The average fit time is {} seconds".format(mean_fit_time))

## Fit in HistFactory