Here are some more side tests to clarify / justify details, that would clutter the main test notebook.

In [None]:
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.dates as mpldates
import matplotlib.gridspec as gridspec
from matplotlib.colors import LogNorm
%matplotlib inline

import scipy.interpolate as sci
import scipy.optimize as sco

import json
import datetime
import pickle
from astropy.time import Time as astrotime
from corner import corner

import sklearn.neighbors as skn
import sklearn.model_selection as skms  # Newer version of grid_search

from corner_hist import corner_hist
from anapymods3.plots.general import split_axis, get_binmids, hist_marginalize

# Load data

Load IC86 data from epinat, which should be the usual IC86-I (2011) PS sample, but pull-corrected and with OneWeights corrected by the number of generated events.

In [None]:
# Simulation (MC) and experimental data arrays of the IC86-I sample
mc = np.load("data/IC86_I_mc.npy")
exp = np.load("data/IC86_I_data.npy")

# Livetime in days: take the officially stated value rather than the one
# recomputed from the run list further down
livetime = 332.61

# Data livetime comparison to v1.4

Let's compare to the v1.4 list, as used by jfeintzig.
Oddly, we have 0.2 days less livetime than he had.
The number of runs is correct, though.

In [None]:
# For comparison, also parse the v1.4 run list.
# Should be: 1081 runs, with a total livetime of 332.61 days.
with open("data/Prelim_IC86-I_v1.4a.txt", 'r') as f:
    # Iterate the file object directly; no need to materialize readlines()
    data = [line.replace('\n', '') for line in f]

# Skip to beginning of run info (the first 73 lines are not run entries)
data = data[73:]

# Split at white space
data = [d.split() for d in data]

# Use the builtin int / float: the np.int / np.float aliases were removed
# from NumPy and raise an AttributeError on current versions.
# The text columns are stored as fixed-width byte strings ("|S" dtype).
dtype = [("runID", int), ("duration", float), ("IT", "|S2"),
         ("CONF", "|S7"), ("FLAG", "|S6")]
runlist = np.empty((len(data),), dtype=dtype)

# Columns 0, 3, 5, 6, 7 of each whitespace-split line are used
runlist["runID"] = np.array([int(d[0]) for d in data])
runlist["duration"] = np.array([float(d[3]) for d in data])
runlist["IT"] = np.array([d[5] for d in data])
runlist["CONF"] = np.array([d[6] for d in data])
runlist["FLAG"] = np.array([d[7] for d in data])

# Now filter: Include IT=IT, CONF=full, FLAG=GOOD, exclude strange rate runs
exclude_rate = [120028, 120029, 120030, 120087, 120156, 120157]
# The fields are bytes ("|S" dtype), so compare against bytes literals;
# a comparison against str does not select anything.
itgood = runlist["IT"] == b"IT"
confgood = runlist["CONF"] == b"full"
flaggood = runlist["FLAG"] == b"GOOD"
# np.isin is the maintained replacement for the deprecated np.in1d
ratebad = np.isin(runlist["runID"], exclude_rate)

include = itgood & confgood & flaggood & ~ratebad
runlist_inc = runlist[include]

# Get the livetime of the sample in days (durations are divided by 24,
# i.e. treated as hours)
hoursindays = 24.
_livetime = np.sum(runlist_inc["duration"]) / hoursindays

print("Total runs from v1.4     : ", len(runlist_inc))
print("Total livetime from v1.4 : ", _livetime)

# Time - Dec expectation

Instead of just using a background rate that depends on the event's time, we can make it 
dependent on the event's position too.

1. Technique is to use a spline fit to a histogram.
   Robust and easy to average out the unwanted small scale fluctuations, but bin dependent.
2. Technique is to use a 2D KDE.
   Use the same KDE technique to describe our background rate in 2 dimensions as we do with the per event PDF in 3D. Needs more work at the edges due to the hard cut in time. Again depends on the bandwidth, and the smoothing is more difficult to control.

In [None]:
# Event times (MJD field) and declinations of the experimental data
dec = exp["dec"]
time = exp["timeMJD"]

# Rescale the times to [0, 1] so that they match the scale of the other axis
t_first = np.amin(time)
t_span = np.amax(time) - t_first
time_norm = (time - t_first) / t_span

# One row per event: (normalized time, declination)
sample = np.vstack([time_norm, dec]).T
_ = corner(sample, bins=[20, 50], plot_datapoints=False, plot_contours=False)

The problem with the KDE is that time is bounded by a hard cut on both sides.
One way around this would be to mirror the times at the edges to ensure continuity and then cut off values outside the range when evaluating the PDF to get the background rate.
This is OK, because time is somewhat periodic.

Declination falls off to the sides, so there's no need to do that.

In [None]:
# Fit a kernel density estimate to the (normalized time, dec) sample
kde_settings = {"bandwidth": 0.05, "rtol": 1e-8}
kde = skn.KernelDensity(**kde_settings)
kde.fit(sample)

# Draw a large sample from the fitted KDE and inspect it in a corner plot
n_kde_draws = int(1e7)
kde_sample = kde.sample(n_kde_draws)
_ = corner(kde_sample, bins=[50, 50],
           plot_datapoints=False, plot_contours=False)

# Let's make the BG pdf

## Justify the sigma cut

Only a few higher energy events from the southern sky are excluded (see cut=10).
But really badly reconstructed events tend to have higher energies (see cut=90).
Still, it should be OK to remove those > 10 because they carry little spatial information.

In [None]:
# Show the events that lie above the sigma cut (cut value given in degrees)
sig_cut = 10
m = exp["sigma"] > np.deg2rad(sig_cut)

# Number and percentage of events above the cut, for the plot title
n_above_cut = np.sum(m)
pct_above_cut = n_above_cut / len(exp) * 100

# Energy vs. declination (converted to degrees) of the selected events
logE_above_cut = exp["logE"][m]
dec_deg_above_cut = np.rad2deg(exp["dec"][m])

_ = plt.hist2d(logE_above_cut, dec_deg_above_cut, bins=30, cmap="inferno")
plt.colorbar()
plt.title("Total Evts w sigma > {:d}°: {:d} ({:.3f}%)".format(
    sig_cut, n_above_cut, pct_above_cut))
plt.xlabel("logE")
plt.ylabel("dec in °")
plt.show()

# Show the skewed sigma distribution with the cut applied and mean vs median

## Test the marginalize_hist method.

It should be equivalent to use one of the following methods to create a 1D histogram from the original 3D data pdf in logE, dec and sigma:

1. Simply use the original 1D data in any variable, e.g. simply histogram logE
2. Create the complete 3D histogram and marginalize by summing over remaining dimensions.

When using unnormalized hists, 2. is simply summing up all other counts.

When using normalized hists, we need to sum with respect to the binwidths in the current dimension to keep the normalization intact.
This is only useful, when only the histogram is available and not the original sample.

We want to check whether both methods are equivalent.
As we can see, all ratios are one, so the methods are equal.

#### Helper Functions

In [None]:
def make_hist_ratio(h1, h2):
    """Return the element-wise ratio h1 / h2, with 0 where h2 is not > 0.

    Parameters
    ----------
    h1, h2 : array-like
        Histogram contents of identical shape (numerator, denominator).

    Returns
    -------
    ratio : ndarray
        Array with the shape of ``h1``. Entries where ``h2 > 0`` hold
        ``h1 / h2``; all other entries are 0. The dtype is the one of the
        true division, so integer counts yield a floating point ratio
        instead of being truncated to integers.
    """
    h1 = np.asarray(h1)
    h2 = np.asarray(h2)
    m = (h2 > 0)
    # Divide first, so that the output array can adopt the quotient's dtype.
    # np.zeros_like(h1) would inherit an integer dtype from integer counts
    # and silently truncate every ratio (e.g. 1 / 2 -> 0).
    quotient = h1[m] / h2[m]
    ratio = np.zeros(h1.shape, dtype=quotient.dtype)
    ratio[m] = quotient
    return ratio

### Unnormalized
First the unnormalized version. Simply sum over the other axes of the 3D hist.

In [None]:
# Plot each variable in a single plot and the ratios separately
fig, ((axtl, axtr), (axbl, axbr)) = plt.subplots(2, 2, figsize=(10, 8))


def _draw_margin_check(ax, values, n_bins, axis, hist3d, edges, mids,
                       style_1d, style_marg, cbar):
    """Overlay the direct 1D hist and the 3D marginal, add a ratio panel.

    Returns the 1D counts, the 1D bin edges, the marginalized counts and
    the ratio axis that was split off below ``ax``.
    """
    counts, edges_1d, _ = ax.hist(values, bins=n_bins, **style_1d)
    # Marginalize: sum the 3D counts over all axes except the shown one
    other_axes = tuple(i for i in range(hist3d.ndim) if i != axis)
    marginal = np.sum(hist3d, axis=other_axes)
    ax.hist(mids[axis], bins=edges[axis], weights=marginal, **style_marg)

    # Ratio plot below, with a reference line at 1
    ax_ratio = split_axis(ax, "bottom", "20%", cbar=cbar)
    ax_ratio.hist(mids[axis], edges[axis],
                  weights=make_hist_ratio(counts, marginal), **style_marg)
    ax_ratio.axhline(1, 0, 1, color="k")
    ax_ratio.set_ylim(0, 2)
    return counts, edges_1d, marginal, ax_ratio


# Keep only events with sigma <= 10°, because there are some outliers.
# Note that `dec` holds sin(dec) from here on and `sigma` is in degrees.
m = exp["sigma"] <= np.deg2rad(10)
logE = exp["logE"][m]
dec = np.sin(exp["dec"][m])
sigma = np.rad2deg(exp["sigma"][m])

logE_nbins, dec_nbins, sigma_nbins = 50, 40, 30

# Build the 3D hist from one (logE, dec, sigma) row per event
nbins = [logE_nbins, dec_nbins, sigma_nbins]
sample = np.vstack((logE, dec, sigma)).T
h, b = np.histogramdd(sample, bins=nbins)

# Bin mids for plotting (this rebinds `m`, the event mask is done with)
m = get_binmids(b)

# Shared hist styles: h1 for the direct 1D hists, h2 for the marginals
h1 = dict(lw=2, color="k", histtype="step")
h2 = dict(lw=2, color="r", histtype="step", alpha=0.5)

# logE: top left
logE_h, logE_b, logE_hm, axtl_sec = _draw_margin_check(
    axtl, logE, logE_nbins, 0, h, b, m, h1, h2, cbar=False)

# dec: bottom left
dec_h, dec_b, dec_hm, axbl_sec = _draw_margin_check(
    axbl, dec, dec_nbins, 1, h, b, m, h1, h2, cbar=None)

# sigma: top right
sigma_h, sigma_b, sigma_hm, axtr_sec = _draw_margin_check(
    axtr, sigma, sigma_nbins, 2, h, b, m, h1, h2, cbar=None)

# The fourth panel stays empty
axbr.set_visible(False)

fig.suptitle("Black: 1D, Red: Margin", fontsize=15);

### Normalized
Sum over the other axes of the 3D hist and multiply by bin widths.

In [None]:
# Plot each variable in a single plot and the ratios separately.
# Reuses `sample`, `nbins`, the 1D arrays (logE, dec, sigma), the *_nbins
# values and the h1 / h2 style dicts from the unnormalized comparison.
fig, [[axtl, axtr], [axbl, axbr]] = plt.subplots(2, 2, figsize=(10, 8))

# Now make it normalized. `density=True` replaces the `normed=True` keyword,
# which has been removed from both NumPy and matplotlib.
h, b = np.histogramdd(sample, bins=nbins, density=True)

# Get binmids for plotting
m = get_binmids(b)

# For each variable: the black hist is the normalized 1D hist of the data,
# the red one is the 3D hist marginalized over the two other axes.
# NOTE(review): [0] takes the first item returned by hist_marginalize,
# presumably the marginalized histogram -- confirm against its definition.

# logE
logE_h, logE_b, _ = axtl.hist(logE, bins=logE_nbins, density=True, **h1)
logE_hm = hist_marginalize(h=h, bins=b, axes=(1, 2))[0]
# The weights are already densities, so no normalization is requested here
_ = axtl.hist(m[0], bins=b[0], weights=logE_hm, **h2)
# Ratio plot below
axtl_sec = split_axis(axtl, "bottom", "20%", cbar=False)
axtl_sec.hist(m[0], b[0], weights=make_hist_ratio(logE_h, logE_hm), **h2)
axtl_sec.axhline(1, 0, 1, color="k")
axtl_sec.set_ylim(0, 2)

# dec
dec_h, dec_b, _ = axbl.hist(dec, bins=dec_nbins, density=True, **h1)
dec_hm = hist_marginalize(h=h, bins=b, axes=(0, 2))[0]
_ = axbl.hist(m[1], bins=b[1], weights=dec_hm, **h2)
# Ratio plot below
axbl_sec = split_axis(axbl, "bottom", "20%", cbar=None)
axbl_sec.hist(m[1], b[1], weights=make_hist_ratio(dec_h, dec_hm), **h2)
axbl_sec.axhline(1, 0, 1, color="k")
axbl_sec.set_ylim(0, 2)

# sigma
sigma_h, sigma_b, _ = axtr.hist(sigma, bins=sigma_nbins, density=True, **h1)
sigma_hm = hist_marginalize(h=h, bins=b, axes=(0, 1))[0]
_ = axtr.hist(m[2], bins=b[2], weights=sigma_hm, **h2)
# Ratio plot below
axtr_sec = split_axis(axtr, "bottom", "20%", cbar=None)
axtr_sec.hist(m[2], b[2], weights=make_hist_ratio(sigma_h, sigma_hm), **h2)
axtr_sec.axhline(1, 0, 1, color="k")
axtr_sec.set_ylim(0, 2)

# The fourth panel stays empty
axbr.set_visible(False)

fig.suptitle("Black: 1D, Red: Margin", fontsize=15);