In [None]:
import helper as hlp

import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as mpldates
import matplotlib.gridspec as gridspec
from matplotlib.colors import LogNorm
%matplotlib inline

import scipy.interpolate as sci
import scipy.optimize as sco
import scipy.stats as scs

import json
import datetime
import pickle
from astropy.time import Time as astrotime

import sklearn.neighbors as skn
import sklearn.model_selection as skms  # Newer version of grid_search

from corner_hist import corner_hist
from anapymods3.plots.general import split_axis, get_binmids, hist_marginalize

# Unit conversion constants shared by the cells below
hoursindays = 24.
secinday = 60. * 60. * hoursindays

# Load data

Load IC86 data from epinat, which should be the usual IC86-I (2011) PS sample, but pull corrected and OneWeights corrected by number of events generated.

In [None]:
# Load the three objects returned by the project helper. Judging by the names
# these are presumably experimental data, simulation and livetime; the exact
# structure is defined in helper.load_data -- TODO confirm there.
exp, mc, livetime = hlp.load_data()

# Get data livetime

Generate from good run list as stated here:
- http://icecube.wisc.edu/~coenders/html/build/html/ic86-bdt/muonL3.html
- https://wiki.icecube.wisc.edu/index.php/IC86_I_Point_Source_Analysis/Data_and_Simulation

It should be 332.61 days as stated by jefeintzeig and scoenders.
We create one bin per included run, with exactly that width.
Excluded runs are those with too high/low rate and without everything marked "good".

Livetime is a bit higher, because we used a newer runlist from iclive instead of the old non-json v1.4.
See side test for that comparison.

In [None]:
# Fetch the run list and convert it with the project helpers, then select the
# included runs. The second return value is kept under a private name because
# it is only printed here and not used further below.
# NOTE(review): the selection criteria live in helper.get_good_runs -- confirm
# they match the description in the text above.
run_list = hlp.get_run_list()
run_dict = hlp.get_run_dict(run_list)
inc_run_arr, _livetime = hlp.get_good_runs(run_dict)

# We don't use this livetime, but the ones from runlist v1.4
print("IC86-I livetime from iclive: ", _livetime)

# Bin BG according to runlist

Each run is one bin in the bg rate vs time plot.
The rate is normed to Hertz by dividing through the bin sizes in seconds.

In [None]:
# Store events in bins with run borders
exp_times = exp["timeMJD"]
start_mjd = inc_run_arr["start_mjd"]
stop_mjd = inc_run_arr["stop_mjd"]

tot = 0
evts_in_run = {}
# Event count per run, filled in run-list order so that it always stays
# aligned with start_mjd / stop_mjd (no reliance on dict ordering or on
# unique run IDs). The builtin float replaces the removed np.float alias.
h = np.zeros(len(start_mjd), dtype=float)
for i, (start, stop, runid) in enumerate(
        zip(start_mjd, stop_mjd, inc_run_arr["runID"])):
    # Half-open interval [start, stop) so no event is counted twice
    mask = (exp_times >= start) & (exp_times < stop)
    evts_in_run[runid] = exp[mask]
    h[i] = np.sum(mask)
    tot += np.sum(mask)

# Crosscheck, if we got all events and counted nothing double
print("Do we have all events? ", tot == len(exp))
print("  Events selected : ", tot)
print("  Events in exp   : ", len(exp))

# Bin mids of all runs, masked below together with the counts
binmids = 0.5 * (start_mjd + stop_mjd)

# Now remove the 120 runs with zero rate that come from the differences
# in the runlist. See side_test for more
m = (h > 0)
print("\nRuns with 0 events :", np.sum(~m))
print("Runtime in those runs: ", np.sum(inc_run_arr["stop_mjd"][~m] -
                                        inc_run_arr["start_mjd"][~m]))

# Remove all zero event runs (artifacts from new run list) and calc the rate
stop_mjd, start_mjd = stop_mjd[m], start_mjd[m]
h = h[m]
binmids = binmids[m]

print("\nHave all events after removing zero rates? ", np.sum(h) == len(exp))
print("  Events selected : ", int(np.sum(h)))
print("  Events in exp   : ", len(exp))

# Normalize to rate in Hz and calc yerrors for fitting later.
# From here on h holds rates, not counts.
run_secs = (stop_mjd - start_mjd) * secinday
h /= run_secs
# Poisson error sqrt(N) / T expressed via the rate: sqrt(rate) / sqrt(T)
yerr = np.sqrt(h) / np.sqrt(run_secs)

In [None]:
# Rate per run versus time, with a calendar axis on top
fig, ax = plt.subplots(1, 1)

# Horizontal error bars span the full run length
xerr = 0.5 * (stop_mjd - start_mjd)
ax.errorbar(binmids, h, xerr=xerr, yerr=yerr, fmt=",")

# Setup main axis
ax.set_xlim(start_mjd[0], stop_mjd[-1])
ax.set_ylim(0, None)
ax.set_xlabel("MJD")
ax.set_ylabel("Rate in Hz")
# Rotate bottom labels if needed
# def xlabels(x):
#     return ["{:5d}".format(int(xi)) for xi in x]
# ax.set_xticklabels(xlabels(ax.get_xticks()), rotation=60,
#                    horizontalalignment="right")

# Second xaxis on top with month and year: step from the first bin mid to the
# first day of each following month until the last bin mid is passed, then
# convert those dates back to MJD for the tick positions.
# http://stackoverflow.com/questions/22696662/ \
#   python-list-of-first-day-of-month-for-given-period
datetimes = astrotime(binmids, format="mjd").to_datetime()
dt, end = datetimes[0], datetimes[-1]
datetimes_ticks = []
while dt < end:
    # December rolls over to January of the next year
    if dt.month == 12:
        dt = datetime.datetime(dt.year + 1, 1, 1)
    else:
        dt = datetime.datetime(dt.year, dt.month + 1, 1)
    datetimes_ticks.append(dt)
mjd_ticks = astrotime(datetimes_ticks, format="datetime").mjd

# New axis on top, make sure, we use the same range
ax2 = ax.twiny()
ax2.set_xlim(ax.get_xlim())
ax2.set_xticks(mjd_ticks)
month_labels = [dtt.strftime("%b '%y") for dtt in datetimes_ticks]
ax2.set_xticklabels(month_labels, rotation=60, horizontalalignment="left")

fig.tight_layout()
plt.show()

# Time dependent rate function

**Note: I think it is unnecessary to use a time and declination dependent rate. The spatial part is injected from the data BG from KDE anyways. So we just need to have the rate to determine how much events we inject allsky.**

The rate is time dependent because of seasonal variation.
We take this variation into account by fitting a periodic function to the time resolved rate.

The data is built by calculating the rate in each run as seen before.
This rate is correctly normalized and smoothes local fluctuations.

### Periodic function with a weighted least squares fit

See side_test for comparison to spline fits.
The function is a simple sine, scalable by 4 parameters to fit the shape of the rates:

$$
    f(x) = a\cdot \sin(b\cdot(x - c)) + d
$$

The least squares loss function is

$$
    R = \sum_i (w_i(y_i - f(x_i)))^2
$$

Weights are the inverse standard deviations of the Poisson histogram errors:

$$
    w_i = \frac{1}{\sigma_i}
$$

Seed values are estimated from plot rate vs time.

- Period should be 365 days (MJD) because we have one year of data, so we choose $b_0 = 2\pi/365$.
- Amplitude is about $a_0=-0.0005$, because the sine seems to start with negative values.
- The x-offset is chosen as the first start date, to get the right order of magnitude.
- The y-axis intersection $d$ should be close to the weighted average, so we take this as a seed.

The bounds are motivated as follows (and if we don't hit them, it's OK to use them).

- Amplitude $a$ should be positive; this also resolves a degeneracy with the x-axis offset.
- The period $b$ should scatter around one year, a period larger than +-1 half a year is unphysical.
- The x-offset $c$ cannot be greater than the initial +- the period because we have a periodic function.
- The y-axis offset $d$ is arbitrarily constrained, but as seen from the plot it should not exceed 0.1. 

In [None]:
def f(x, args):
    """
    Shifted and scaled sine: a * sin(b * (x - c)) + d.

    Parameters
    ----------
    x : float or array-like
        Points at which the function is evaluated.
    args : sequence of 4 floats
        Parameters (a, b, c, d): amplitude, angular frequency, x-offset and
        y-offset.

    Returns
    -------
    Function values, same shape as x.
    """
    amp, freq, x_shift, y_shift = args
    phase = freq * (x - x_shift)
    return amp * np.sin(phase) + y_shift

def lstsq(pars, *args):
    """
    Weighted least squares loss sum((wi * (yi - fi))**2) for the sine model
    fi = a * sin(b * (xi - c)) + d.

    Parameters
    ----------
    pars : sequence
        Fit parameters (a, b, c, d) in this order.
    *args
        Fixed data in the order x-values, y-values, weights.

    Returns
    -------
    float
        Value of the loss function.
    """
    # Fixed data points and their weights
    x, y, w = (args[i] for i in range(3))
    # Parameters varied by the minimizer
    a, b, c, d = (pars[i] for i in range(4))
    # Model prediction at the data points
    model = a * np.sin(b * (x - c)) + d
    weighted_residuals = w * (y - model)
    return np.sum(weighted_residuals**2)

In [None]:
# Seed values from consideration above.
a0 = -0.0005
b0 = 2. * np.pi / 365.  # We could restrict the period to one yr exact.
c0 = np.amin(start_mjd)
# Inverse-variance weighted average of the rates as seed for the baseline
d0 = np.average(h, weights=1. / yerr**2)

x0 = [a0, b0, c0, d0]
# Bounds as explained above. The x-offset may move by one full seed period
# (in days) around its seed; b0 itself is an angular frequency and would pin
# the offset to within ~0.02 days.
period0 = 2. * np.pi / b0
bounds = [[None, None],
          [0.5 * b0, 1.5 * b0],
          [c0 - period0, c0 + period0],
          [0, 0.01]]
# x, y values, weights
args = (binmids, h, 1. / yerr)

res = sco.minimize(fun=lstsq, x0=x0, args=args, bounds=bounds)
if not res.success:
    # Don't silently continue with a non-converged fit
    print("WARNING: rate fit did not converge: ", res.message)
bf_pars = res.x

print("Amplitude   : {: 13.5f} in Hz".format(res.x[0]))
print("Period (d)  : {: 13.5f} in days".format(2 * np.pi / res.x[1]))
print("Offset (MJD): {: 13.5f} in MJD".format(res.x[2]))
print("Avg. rate   : {: 13.5f} in Hz".format(res.x[3]))

In [None]:
# Rate function: the sine model evaluated at the best fit parameters
def rate(t):
    """
    Evaluate the fitted sine model ``f`` at ``t`` using the parameters stored
    in the global fit result ``res``. The fit was done on rates in Hz versus
    run mid times in MJD.
    """
    best_fit = res.x
    return f(t, best_fit)

In [None]:
# Plot the per-run rates (vertical errors only) together with the fit.
# Uses the explicit axes interface; the previously assigned but unused
# `xerr` has been dropped.
fig, ax = plt.subplots(1, 1)
ax.errorbar(binmids, h, xerr=0, yerr=yerr, fmt=",")
ax.set_ylim(0, None)

# Plot fit on a fine grid spanning all runs
x = np.linspace(start_mjd[0], stop_mjd[-1], 1000)
y = rate(x)
ax.plot(x, y, zorder=5)

# Plot y shift dashed to see baseline or years average
ax.axhline(bf_pars[3], 0, 1, color="C1", ls="--", label="")

ax.set_xlim(start_mjd[0], stop_mjd[-1])
ax.set_xlabel("MJD")
ax.set_ylabel("Rate in Hz")
plt.show()

# Let's make the BG pdf

Proceeding to section 6.3.1 Randomized BG Injection, p. 113.
Mrichmann draws events by:

1. Get number of bg events to be injected from a poisson distribution with expectation values drawn from the previously build bg temporal distribution.
   $$
   P_{\langle n_B\rangle}(N_m) = \frac{\langle n_B\rangle^{N_m}}{N_m!}\cdot \exp(-\langle n_B\rangle)
   $$
2. These events are then drawn from a 3D pdf in energy proxy, zenith proxy and sigma proxy.
   He does it by dividing 10x10x10 bins, first selecting energy, then zenith in that energy bin, then sigma in that zenith bin.
   
Here we create a smooth PDF using a kernel density estimator and obtain a sample by running a MCMC chain to create a sample a priori.
The bandwidth is set globally and cross validated to be robust.

**Some note on `numpy.histogramdd`:**

The input must be an array with shape (nDim, len(data)).

Shape of h is the same as the number of bins in each dim: (50, 40, 10)
So the first dimension picks a single logE slice -> h[i].shape = (40, 10)
Second dim picks a dec slice -> h[:, i].shape = (50, 10)
3rd picks a sigma slice -> h[:, :, i].shape = (50, 40)

This is important: meshgrid repeats in second axis on first array xx.
For the second array, the first axis is repeated.
But h iterates over energy in the 1st axis. So if we don't transpose, we have the whole histogram flipped! Compare to the plot in mrichman's thesis (cos(zen)).

**Some notes on KDE:**

Sebastian has already made a tool for adaptive and asymmetric KDE.
1. The Kernel is the covariance matrix of the whole data set to regard different scales
    + Note: This may only be a problem, if one dim is spread with peaks, while the other is wide spread only. Then we cannot scale the Kernel to small to fit the peaks because the smooth dimension is preventing that.
2. Use Silvermans or Scotts rule as a first guess.
3. Run a second pass and vary the local bandwidth according to the first guess local density.

We could replace 1 and 2 by scaling the data with the inverse covariance and then using a cross validation to find the first guess bandwidth.
Then using a second pass to vary locally.

## 3D histogram of BG data
First we make a 3D histogram to better compare to mrichmann and to get an overview over the distribution.

In [None]:
# HANDTUNED scale parameter to "fit" KDE expectation to data...
# TODO: Use Adaptive kernel width and asymmetric gaus kernels
#       For sigma it might make sense to a take a restricted kernel [0, inf]
fac_logE = 1.5
fac_dec = 2.5
fac_sigma = 2.

logE = fac_logE * exp["logE"]
sigma = fac_sigma * np.rad2deg(exp["sigma"])
# np.cos(np.pi / 2. + exp["dec"]); dec is for {sin(dec), dec, cos(zen)}
dec = fac_dec * exp["dec"]

# Binning is rather arbitrary because we don't calc stuff with the hist
bins = [50, 50, 50]
# Range for sigma is picked by looking at the 1D distribution and cutting of
# the tail. This will be covered by the KDE tail anyway. Rest is default
r = [[np.amin(logE), np.amax(logE)],
     [np.amin(dec), np.amax(dec)],
     [0., fac_sigma * 5.]]

# One row per event, columns are (logE, dec, sigma)
sample = np.vstack((logE, dec, sigma)).T
# `density` replaces the `normed` keyword, which was removed from numpy.
# NOTE: from here on `h` is the 3D count histogram (no longer the run rates)
# and `bins` holds the bin edges per dimension instead of the bin numbers.
h, bins = np.histogramdd(sample=sample, bins=bins, range=r, density=False)

# Make bin mids for later use
mids = []
for b in bins:
    mids.append(0.5 * (b[:-1] + b[1:]))

# Make a nice corner plot
fig, ax = corner_hist(h, bins=bins,
                      label=["logE", "dec", "sigma deg"],
                      hist2D_args={"cmap": "Greys"},
                      hist_args={"color":"#353132"})

## Kernel Density Estimation

We use scikit learn's cross validation with a gaussian kernel to get the most robust bandwidth.
Then we integrate with the same binning as above and compare to the 3D histogram.

This section relies heavily on [Jake van der Plas examples for KDE](https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/).
More info on how KDE cross validation works can be found in [Modern Nonparametric Methods](http://www2.stat.duke.edu/~wjang/teaching/S05-293/lecture/ch6.pdf).

In [None]:
# KDE CV is running on cluster and pickles the GridSearchCV
fname = "./kde_cv/KDE_model_selector_20_exp_IC86_I_followup_2nd_pass.pickle"
# NOTE: unpickling executes arbitrary code, only load files created by
# ourselves. The handle is deliberately not called `f`, because that would
# overwrite the fit function `f` used by `rate()`.
with open(fname, "rb") as infile:
    model_selector = pickle.load(infile)

kde = model_selector.best_estimator_
bw = model_selector.best_params_["bandwidth"]
print("Best bandwidth : {:.3f}".format(bw))

# We maybe just want to stick with the slightly overfitting kernel to
# be as close as possible to data
OVERFIT = True
if OVERFIT:
    bw = 0.075
    kde = skn.KernelDensity(bandwidth=bw, kernel="gaussian", rtol=1e-8)

print("Used bandwidth : {:.3f}".format(bw))
# Estimate pdf for data sample with best model
kde.fit(sample)

# Generate some BG samples to compare to the original data hist.
# Use more statistics, histograms get normalized and we want the best estimate
# for the pdf
nsamples_kde = int(1e7)
bg_samples = kde.sample(n_samples=nsamples_kde)

# Make histogram with same binning as original data. `density` replaces the
# `normed` keyword, which was removed from numpy.
bg_h, bg_bins = np.histogramdd(sample=bg_samples, bins=bins, range=r,
                               density=True)

# The second dimension is the scaled declination, same as in the data plot
fig, ax = corner_hist(bg_h, bins=bg_bins,
                      label=["logE", "dec", "sigma deg"],
                      hist2D_args={"cmap": "Greys"},
                      hist_args={"color":"#353132"})

## Compare KDE to original data

Make a ratio histogram of the KDE sample and the original data sample.

### 2D marginalization

In [None]:
# Create 2D hists, by leaving out one parameter
xlabel = ["scaled " + s for s in ["logE", "logE", "dec"]]
ylabel = ["scaled " + s for s in ["dec", "sigma in °", "sigma in °"]]

for i, axes in enumerate([[0, 1], [0, 2], [1, 2]]):
    # Bin edges of the two dimensions shown in this projection
    _b = [bins[j] for j in axes]
    # `density` replaces the `normed` keyword, which was removed from numpy
    h_exp, b_exp = np.histogramdd(sample[:, axes], bins=_b, density=True)
    h_kde, b_kde = np.histogramdd(bg_samples[:, axes], bins=_b, density=True)

    # KDE is expectation, but sampled with much more events.
    # Weights would simply scale the total number of KDE events to match the
    # number of original events. That would be the mean for the poisson
    # distribution in each bin. So to get OK KDE expectation sqrt(n) errors
    # in each bin, we divide not by the number of drawn KDE but by the number
    # of original events.
    # Expected counts per bin are density * N * bin area, so the error on the
    # density is sqrt(density / (N * area)). The area is built from the edges
    # of the two axes of this projection and has the same shape as h_kde.
    bin_area = np.outer(np.diff(b_kde[0]), np.diff(b_kde[1]))
    norm_kde = len(exp) * bin_area
    sigma_kde = np.sqrt(h_kde / norm_kde)

    # Ratio and pull hists to estimate the KDE quality, left at zero where
    # the denominator vanishes
    has_data = (h_exp > 0.)
    ratio_h = np.zeros_like(h_exp)
    ratio_h[has_data] = h_kde[has_data] / h_exp[has_data]

    has_err = (sigma_kde > 0.)
    sigma_ratio_h = np.zeros_like(h_exp)
    sigma_ratio_h[has_err] = ((h_exp[has_err] - h_kde[has_err]) /
                              sigma_kde[has_err])

    # Bin mids and hist grid. meshgrid output is transposed with respect to
    # the histograms, hence the `.T` on all weights below.
    _b = b_exp
    m = get_binmids(_b)
    xx, yy = map(np.ravel, np.meshgrid(m[0], m[1]))

    # Big plot on the left and three right
    fig = plt.figure(figsize=(10, 6))
    gs = gridspec.GridSpec(3, 3)
    axl = fig.add_subplot(gs[:, :2])
    axrt = fig.add_subplot(gs[0, 2])
    axrc = fig.add_subplot(gs[1, 2])
    axrb = fig.add_subplot(gs[2, 2])

    # Steal space for colorbars
    caxl = split_axis(axl, "right")
    caxrt = split_axis(axrt, "left")
    caxrc = split_axis(axrc, "left")
    caxrb = split_axis(axrb, "left")

    # Unset top and center xticklabels as they are shared with the bottom plot
    axrt.set_xticklabels([])
    axrc.set_xticklabels([])

    # Left: Difference over KDE sigma
    # cbar_extr = max(np.amax(sigma_ratio_h),  # Center colormap to min/max
    #                         abs(np.amin(sigma_ratio_h)))
    _, _, _, imgl = axl.hist2d(xx, yy, bins=_b,
                               weights=sigma_ratio_h.T.ravel(),
                               cmap="seismic", vmax=5, vmin=-5)
    cbarl = plt.colorbar(cax=caxl, mappable=imgl)
    axl.set_xlabel(xlabel[i])
    axl.set_ylabel(ylabel[i])
    axl.set_title("(exp - kde) / sigma_kde")

    # Right top: Ratio
    _, _, _, imgrt = axrt.hist2d(xx, yy, bins=_b, weights=ratio_h.T.ravel(),
                                 cmap="seismic", vmax=2, vmin=0)
    cbarrt = plt.colorbar(cax=caxrt, mappable=imgrt)
    axrt.set_title("kde / exp")

    # Right center: Data hist
    _, _, _, imgrc = axrc.hist2d(xx, yy, bins=_b, weights=h_exp.T.ravel(),
                                 cmap="Greys", norm=LogNorm())
    cbarrc = plt.colorbar(cax=caxrc, mappable=imgrc)
    axrc.set_title("exp logscale")

    # Right bottom: KDE hist, same colorbar scale as on data
    _, _, _, imgrb = axrb.hist2d(xx, yy, bins=_b, weights=h_kde.T.ravel(),
                                 cmap="Greys", norm=LogNorm())
    # Take the color limits from the data image itself; the colorbar object
    # no longer offers get_clim in current matplotlib
    imgrb.set_clim(imgrc.get_clim())
    cbarrb = plt.colorbar(cax=caxrb, mappable=imgrb)
    axrb.set_title("kde logscale")

    # Set tick and label positions
    for cax in [caxrt, caxrc, caxrb]:
        cax.yaxis.set_label_position("right")
        cax.yaxis.tick_left()

    fig.tight_layout()
    plt.show()

### 1D marginalization

In [None]:
# Pseudo smooth marginalization is done by sampling many point from KDE an
# using a finely binned 1D histogram, so it looks smooth
xlabel = ["scaled " + s for s in ["logE", "dec", "sigma °"]]

for i, axes in enumerate([0, 1, 2]):
    # Bin edges of the dimension shown in this projection
    _b = bins[axes]
    # `density` replaces the `normed` keyword, which was removed from numpy
    h_exp, b_exp = np.histogram(sample[:, axes], bins=_b, density=True)
    h_kde, b_kde = np.histogram(bg_samples[:, axes], bins=_b, density=True)

#     h_exp, b_exp = hist_marginalize(h, bins, axes=axes)
#     h_kde, b_kde = hist_marginalize(bg_h, bg_bins, axes=axes)

    # KDE errorbars: expected counts per bin are density * N * bin width, so
    # the error on the density is sqrt(density / (N * width))
    norm_kde = len(exp) * np.diff(b_kde)
    sigma_kde = np.sqrt(h_kde / norm_kde)

    # Make 3 different diff/ratio hists to estimate KDE quality in
    # 1D marginalization. Ratio and pull stay zero where the denominator
    # vanishes.
    has_data = (h_exp > 0.)
    ratio_h = np.zeros_like(h_exp)
    ratio_h[has_data] = h_kde[has_data] / h_exp[has_data]

    diff_h = h_kde - h_exp

    has_err = (sigma_kde > 0.)
    sigma_ratio_h = np.zeros_like(h_exp)
    sigma_ratio_h[has_err] = ((h_exp[has_err] - h_kde[has_err]) /
                              sigma_kde[has_err])

    # Bin mids
    _b = b_exp
    m = get_binmids([_b])[0]

    # Plot both and the ration normed. Big plot on the left and three right
    fig = plt.figure(figsize=(10, 6))
    gs = gridspec.GridSpec(3, 3)
    axl = fig.add_subplot(gs[:, :2])
    axrt = fig.add_subplot(gs[0, 2])
    axrc = fig.add_subplot(gs[1, 2])
    axrb = fig.add_subplot(gs[2, 2])

    axrt.set_xticklabels([])
    axrc.set_xticklabels([])

    # Set ticks and labels right
    for ax in [axrt, axrc, axrb]:
        ax.yaxis.set_label_position("right")
        ax.yaxis.tick_right()

    # Limits
    for ax in [axl, axrt, axrc, axrb]:
        ax.set_xlim(_b[0], _b[-1])

    # Main plot:
    # Plot more dense to mimic a smooth curve
    __h, __b = np.histogram(bg_samples[:, i], bins=500,
                            range=[_b[0], _b[-1]], density=True)
    __m = get_binmids([__b])[0]
    axl.plot(__m, __h, lw=3, alpha=0.5)

    # Already binned values are drawn as step hists by using the bin mids as
    # data points and the bin contents as weights
    _ = axl.hist(m, bins=_b, weights=h_exp, label="exp", histtype="step",
                 lw=2, color="k")
    _ = axl.errorbar(m, h_kde, yerr=sigma_kde, fmt=",", color="r")
    _ = axl.hist(m, bins=_b, weights=h_kde, label="kde", histtype="step",
                 lw=2, color="r")

    axl.set_xlabel(xlabel[i])
    axl.legend(loc="upper right")

    # Top right: Difference
    _ = axrt.axhline(0, 0, 1, color="k", ls="-")
    _ = axrt.hlines([-.02, -.01, .01, .02], _b[0], _b[-1],
                    colors='#353132', linestyles='dashed')
    _ = axrt.hist(m, bins=_b, weights=diff_h, histtype="step", lw=2,
                  color="r")
    axrt.set_ylim(-.05, +.05)
    axrt.set_ylabel("kde - exp")

    # Center right: Ratio
    _ = axrc.axhline(1, 0, 1, color="k", ls="-")
    _ = axrc.hlines([0.8, 0.9, 1.1, 1.2], _b[0], _b[-1],
                    colors='#353132', linestyles='dashed')
    _ = axrc.hist(m, bins=_b, weights=ratio_h, histtype="step", lw=2,
                  color="r")
    axrc.set_ylim(.5, 1.5)
    axrc.set_ylabel("kde / exp")

    # Bottom right: Ratio of diff to sigma of expectation
    _ = axrb.axhline(0, 0, 1, color="k", ls="-")
    _ = axrb.hlines([-2, -1, 1, 2], _b[0], _b[-1],
                    colors='#353132', linestyles='dashed')
    _ = axrb.hist(m, bins=_b, weights=sigma_ratio_h, histtype="step", lw=2,
                  color="r")
    axrb.set_ylim(-3, +3)
    axrb.set_ylabel("(exp-kde)/sigma_kde")
    plt.show()

# Define the Likelihoods

Here we define our Likelihoods.
We are given a source event occurance (can be GRB, GW, HESE or anything else) at a given position in space and time.
We want to search for a significant contribution of other events, within a predefined region in time and space around the source events.
For this we need to derive the expected signal and background contributions in that frame.

The Likelihood that describes this scenario can be derived from counting statistics.
If we expect $n_S$ signal and $n_B$ background events in the given frame, then the probability of observing $N$ events is given by a poisson pdf:

$$
    P_\text{Poisson}(N\ |\ n_S + n_B) = \mathcal{L}(N | n_S, n_B) = \frac{(n_S + n_B)^{N}}{N!}\cdot \exp(-(n_S + n_B))
$$

We want to fit for the number of signal events $n_S$ in the frame.
But each event doesn't have the same probability of contributing to either signal or background, because we don't have that information on a per event basis.
So we include prior information on a per event basis to account for that.

$$
    \mathcal{L}(N | n_S, n_B) = \frac{(n_S + n_B)^{N}}{N!}\cdot \exp(-(n_S + n_B)) \cdot \prod_{i=1}^N P_i
$$

Also the simple poisson pdf above only has one parameter, the total number of events, which can be fit for.
So we need to resolve this degeneracy in $n_S$, $n_B$ by giving additional information.
For that we include a weighted combination of the probability for an event to be signal, denoted by the PDF $S_i$, and for it to be background, denoted by $B_i$.
Because the simple counting probabilities are $n_S / (n_S + n_B)$ to count a signal event and likewise $n_B / (n_S + n_B)$ to count a background event we construct the per event prior $P_i$ as:

$$
    P_i = \frac{n_S}{n_S + n_B}\cdot S_i + \frac{n_B}{n_S + n_B}\cdot B_i
        = \frac{n_S \cdot S_i + n_B \cdot B_i}{n_S + n_B}
$$

Note that for equal probabilities $S_i$ and $B_i$, we simply end up with the normal Poisson counting statistic.

Plugging that back into the likelihood we get:

$$
    \mathcal{L}(N | n_S, n_B) = \frac{(n_S + n_B)^{N}}{N!}\cdot \exp{(-(n_S + n_B))} \cdot \prod_{i=1}^N \frac{n_S \cdot S_i + n_B \cdot B_i}{n_S + n_B}
$$

Taking the natural logarithm to get the log-likelihood we arrive at:

$$
    \ln\mathcal{L}(N | n_S, n_B) = -(n_S + n_B) -\ln(N!) + \sum_{i=1}^N \ln((n_S + n_B) P_i)
$$

If we weight up $n_S$ then every events signal PDF is contributing a bit more than the background pdf.
So the fitter tries to find the combination of $n_S$ and $n_B$ that maximizes the likelihood.

To further simplify, we can use a measured and fixed background expectation rate $\langle n_B\rangle$ and fit only for the number of signal events.
Then we only fit for the number of signal events $n_S$.
The fixed background rate can be extracted from data by using the pdf of a larger timescale and average over that (or fit a function) to ensure that local fluctuations don't matter.

Then we end up with our full Likelihood (the denominator in $P_i$ cancels with the term from the poisson PDF):

$$
    \ln\mathcal{L}(N | n_S) = -(n_S + \langle n_B\rangle) -\ln(N!) + \sum_{i=1}^N \ln(n_S S_i + \langle n_B\rangle B_i)
$$

For the test statistic we want to test the hypothesis of having no signal $n_S=0$ vs. the alternative with a free parameter $n_S$:

$$
    \Lambda = \ln\frac{\mathcal{L}(\hat{n}_S)}{\mathcal{L}(n_S=0)}
            = \left[-(\hat{n}_S + \langle n_B\rangle) -\ln(N!) + \sum_{i=1}^N \ln(\hat{n}_S S_i + \langle n_B\rangle B_i)\right] - \left[-\langle n_B\rangle -\ln(N!) + \sum_{i=1}^N \ln(\langle n_B\rangle B_i)\right]
            = -\hat{n}_S + \sum_{i=1}^N \ln\left( \frac{\hat{n}_S S_i}{\langle n_B\rangle B_i} + 1 \right)
$$

The per event PDFs $S_i$ and $B_i$ can depend on arbitrary parameters.
The common choice here is to use a dependency on time, energy proxy and spatial proxy, which has the most separation power:

$$
    S_i(x_i, t_i, E_i) = S_T(t_i) \cdot S_S(x_i) \cdot S_E(E_i) \\ 
    B_i(x_i, t_i, E_i) = B_T(t_i) \cdot B_S(x_i) \cdot B_E(E_i) 
$$

Because the Likelihood only contains ratios of the PDF, we only have to construct functions of the signal to background ratio for each time, spatial and energy distribution.

For the energy PDFs $S_E, B_E$ we use a 2D representation in reconstructed energy and declination because this has the most separation power (see coenders & skylab models).
The spatial part $S_S, B_S$ only depends on the distance from source to event, not on the absolute position on the sphere.
The time part $S_T, B_T$ is equivalent to that, only using the distance in time between source event and event.

**Note: It seems that in mrichman's analysis only a 1D, energy-only PDF has been used. This lacks separation power when using both hemispheres, as in the southern sky the energy threshold is much higher.**

## Time PDF ratio

Background is uniformly distributed in the time window.
The signal distribution falls off gaussian-like at both edges, so its normalization is different.
So the ratio $S_T / B_T$ is simply the signal PDF divided by the uniform normalization $1 / (b - a)$ of the total time window $[a, b]$.

The signal PDF written out explicitly, where $t_0$ is the source event's time and $t$ the event's time:

$$
    N \cdot S_T(t, t_0) = \begin{cases}
                     \frac{1}{\sqrt{2\pi}\sigma_T}\exp\left(-\frac{(t-T_0)^2}{2\sigma_T^2}
                     \right)&\quad\mathrm{, if }\ t \in [a, T_0]\\                
                     \frac{1}{\sqrt{2\pi}\sigma_T}&\quad\mathrm{, if }\ t \in [T_0, T_1]\\
                     \frac{1}{\sqrt{2\pi}\sigma_T}\exp\left(-\frac{(t-T_1)^2}
                     {2\sigma_T^2}\right)&\quad\mathrm{, if }\ t \in [T_1, b]\\ 
                    0 &\quad\mathrm{, else}
                  \end{cases}
$$

where $a, b$ are the bounds of the total time window, $T_0, T_1$ are the part, in which the signal is assumed to be uniformly distributed in time and $\sigma_T$ is the width of the gaussian edges.
The gaussian width $\sigma_T$ is as wide as the interval $T_1-T_0$ but constrained to the nearest value in $[2, 30]$ seconds if the interval gets too large or too small.
The total normalization $N$ is given by integrating over $S_T$ in $[a, b]$, resulting in:

$$
    N = \Phi_{T_1}(b) - \Phi_{T_0}(a) + \frac{T_1-T_0}{\sqrt{2\pi}\sigma_T}
$$

where

$$
    \Phi_{\mu}(x) = \int_{-\infty}^{x}\frac{1}{\sqrt{2\pi}\sigma_T}
      \exp\left(-\frac{(t-\mu)^2}{2\sigma_T^2}\right)\mathrm{d}t
$$
is the CDF of the gaussian PDF centered at $\mu$ (the rising edge is centered at $T_0$, the falling edge at $T_1$).

The background PDF respectively is simply:

$$
    B_T(t, t_0) = \begin{cases}
                     \frac{1}{b-a}&\quad\mathrm{, if }\ t \in [a, b]\\ 
                    0 &\quad\mathrm{, else}
                  \end{cases}    
$$

To get finite support we truncate the gaussian edges at $n\cdot\sigma_T$.
Though arbitrarily introduced, the concrete value of the cutoff doesn't really matter (say 4, 5 or 6 sigma, etc.).

This is because in the LLH we get the product $\langle n_B \rangle B_i$.
A larger cutoff widens the time window, so the BG PDF value $1/(b-a)$ drops, but at the same time the number of expected BG events rises in the same linear fashion.
So as long as we choose a cutoff which ensures that $S \approx 0$ outside, we're good to go.

In [None]:
def time_soverb(t, t0, dt, nsig):
    """
    Time signal over background PDF.

    Signal and background PDFs are each normalized over seconds.
    Signal PDF has gaussian edges to smoothly let it fall of to zero, the
    stddev ``sig_t`` is dt when dt is in [2, 30]s, otherwise the nearest edge.

    To ensure finite support, the edges are truncated after nsig * sig_t.

    Parameters
    ----------
    t : float or array-like
        Times given in MJD for which we want to evaluate the ratio.
    t0 : float
        Time of the source event in MJD.
    dt : float
        Time window in seconds starting from t0 in which the signal pdf is
        assumed to be uniform. Must not be negative.
    nsig : float
        Clip the gaussian edges at nsig * sig_t.

    Returns
    -------
    ratio : ndarray
        Signal over background ratio for each time in t. Same shape as t,
        scalar input yields an array of shape (1,). Zero outside of the
        total time window [-nsig * sig_t, dt + nsig * sig_t] around t0.

    Raises
    ------
    ValueError
        If dt is negative.
    """
    if dt < 0:
        raise ValueError("dt must not be negative.")

    secinday = 24. * 60. * 60.

    # Accept lists and scalars too, mask indexing below needs >= 1D arrays
    t = np.atleast_1d(np.asarray(t, dtype=float))

    # Normalize relative to t0 in seconds (first multiply avoids rounding?)
    _t = t * secinday - t0 * secinday

    # Create signal PDF
    # Constrain sig_t to [2, 30]s regardless of uniform time window
    sig_t = np.clip(dt, 2, 30)
    sig_t_clip = nsig * sig_t
    gaus_norm = (np.sqrt(2 * np.pi) * sig_t)

    # Split in def regions gaus rising, uniform, gaus falling
    gr = (_t < 0) & (_t >= -sig_t_clip)
    gf = (_t > dt) & (_t <= dt + sig_t_clip)
    uni = (_t >= 0) & (_t <= dt)

    # Builtin float dtype (default), the np.float alias was removed from numpy
    pdf = np.zeros_like(_t)
    pdf[gr] = scs.norm.pdf(_t[gr], loc=0, scale=sig_t)
    pdf[gf] = scs.norm.pdf(_t[gf], loc=dt, scale=sig_t)
    # Connect smoothly with the gaussians: plateau at the gaussian peak value
    pdf[uni] = 1. / gaus_norm

    # Normalize signal distribtuion: area of both truncated gaussian halves
    # plus the area of the uniform plateau
    dcdf = (scs.norm.cdf(dt + sig_t_clip, loc=dt, scale=sig_t) -
            scs.norm.cdf(-sig_t_clip, loc=0., scale=sig_t))
    norm = dcdf + dt / gaus_norm
    pdf /= norm

    # Calculate the ratio, background is uniform in the total time window
    bg_pdf = 1. / (dt + 2 * sig_t_clip)
    ratio = pdf / bg_pdf
    return ratio

In [None]:
# S/B time ratio for several uniform window lengths, as in the paper
# Arbitrary source time taken from the data
t0 = start_mjd[100]
t0_sec = secinday * t0

# Uniform window lengths from t0 in seconds, gaussian edges clipped at 4 sigma
dts = [5, 50, 200]
nsig = 4

# Time grid in MJD around t0, wide enough for the largest window plus its
# clipped gaussian edges and 20% margin on both sides
max_dt = np.amax(dts)
clip = nsig * np.clip(max_dt, 2, 30)
plt_rng = np.array([-clip, max_dt + clip])
t = np.linspace(t0_sec + 1.2 * plt_rng[0],
                t0_sec + 1.2 * plt_rng[1], 1000) / secinday
# Same grid in seconds relative to t0, used as x-axis
_t = t * secinday - t0 * secinday

# Mark event time
plt.axvline(0, 0, 1, c="#353132", ls="--", lw=2)

colors = ["C0", "C3", "C2"]
for i, dt in enumerate(dts):
    # One S/B curve per uniform window length
    SoB = time_soverb(t, t0, dt, nsig)
    curve_label = r"$T_\mathrm{{uni}}$: {:>3d}s".format(dt)
    plt.plot(_t, SoB, lw=2, c=colors[i], label=curve_label)
    # Fill uniform part, might look nicely
    # fbtw = (_t > 0) & (_t < dt)
    # plt.fill_between(_t[fbtw], 0, SoB[fbtw], color="C7", alpha=0.1)

# Make it look like the paper plot, but with slightly extended borders, so
# nothing breaks outside the total time frame
plt.xlim(1.2 * plt_rng)
plt.ylim(0, 3)
plt.xlabel("t - t0 in sec")
plt.ylabel("S / B")
plt.legend(loc="upper right")
plt.grid(ls="--", lw=1)
plt.show()

## Energy-Space Pdf

In [None]:
# TODO skylab style

## Spatial Pdf

In [None]:
# TODO skylab style, but with Kent PDF