In [None]:
import datetime
import json

import matplotlib.dates as mpldates
import matplotlib.pyplot as plt
import numpy
import numpy as np
import scipy.interpolate as sci
import scipy.optimize as sco
import sklearn.model_selection as skms  # Newer version of grid_search
import sklearn.neighbors as skn
from astropy.time import Time as astrotime

from corner_hist import corner_hist

%matplotlib inline

# Load data

Load the IC86 data from epinat. This should be the usual IC86-I (2011) PS sample, but pull-corrected and with the OneWeights corrected by the number of generated events.

In [None]:
# Experimental data and Monte Carlo simulation, read from .npy files.
# `exp` is accessed by field name further down in this notebook
# (e.g. "timeMJD", "logE", "dec", "sigma").
exp = np.load("data/IC86_I_data.npy")
mc = np.load("data/IC86_I_mc.npy")

# Use the officially stated livetime, not the ones from below
# NOTE(review): presumably given in days, matching the 332.61 days quoted in
# the text of this notebook -- confirm.
livetime = 332.61

# Get data livetime

Generate from good run list as stated here:
- http://icecube.wisc.edu/~coenders/html/build/html/ic86-bdt/muonL3.html
- https://wiki.icecube.wisc.edu/index.php/IC86_I_Point_Source_Analysis/Data_and_Simulation

It should be 332.61 days as stated by jfeintzeig and scoenders.
We create one bin per included run, with exactly that width.
Excluded runs are those with too high/low rate and without everything marked "good".

The livetime is a bit higher, because we used a newer run list from iclive instead of the old non-JSON v1.4.
See the cell further below for a script that parses that list.

In [None]:
# Grab the good run list from json. The context manager closes the file even
# if parsing fails.
with open('data/ic86-i-goodrunlist.json', 'r') as jsonFile:
    grlist = json.load(jsonFile)

# This is a list of dicts (one dict per run)
runs = grlist["runs"]
# This is a dict of arrays (all run values in an array per keyword).
# Columns are collected by key, so the result does not depend on every run
# dict listing its keys in the same order; a run that lacks a key raises a
# KeyError instead of silently misaligning the columns.
run_dict = {k: np.array([r[k] for r in runs]) for k in runs[0].keys()}

In [None]:
# Now compile runs as stated on jfeintzeigs page

# Transform the good start/stop timestamps to MJD floats
start_mjd = astrotime(run_dict["good_tstart"]).mjd
stop_mjd = astrotime(run_dict["good_tstop"]).mjd

# Create record array to apply mask, only keep start, stop and runID.
# The builtin float/int are used because the np.float/np.int aliases were
# removed in NumPy 1.24 (they were plain aliases of the builtins).
dtype = [("start_mjd", float), ("stop_mjd", float), ("runID", int)]
run_arr = np.array(list(zip(start_mjd, stop_mjd, run_dict["run"])), dtype=dtype)

# Note: The last 2 runs aren't included anyway, so he left them out in
# the reported run list. This fits here, as the other 4 runs are found
# in the list.
exclude_rate = [120028, 120029, 120030, 120087, 120156, 120157]
# Elementwise comparison on the flag arrays yields boolean masks
i3good = run_dict["good_i3"] == True  # noqa: E712
itgood = run_dict["good_it"] == True  # noqa: E712
# np.isin is the recommended replacement for the deprecated np.in1d
ratebad = np.isin(run_dict["run"], exclude_rate)

# Include if it & i3 good and rate is good
include = i3good & itgood & ~ratebad
inc_run_arr = run_arr[include]

# Get the total and per run livetimes in mjd (i.e. in days)
runtimes_mjd = inc_run_arr["stop_mjd"] - inc_run_arr["start_mjd"]
_livetime = np.sum(runtimes_mjd)

print("IC86-I livetime from iclive: ", _livetime)

Let's compare to the v1.4 list, as used by jfeintzeig.
Oddly, we get 0.2 days less livetime than he had.
The number of runs is correct, though.

In [None]:
# For comparison, also parse the v1.4 list
# Should be: 1081 runs, with a total livetime of 332.61 days.
with open("data/Prelim_IC86-I_v1.4a.txt", 'r') as f:
    # Iterate the file directly instead of materialising readlines() first
    data = [line.replace('\n', '') for line in f]

# Skip to beginning of run info
data = data[73:]

# Split at white space
data = [d.split() for d in data]

# Builtin int/float replace the np.int/np.float aliases removed in NumPy 1.24.
# The "S" fields store bytes, not str.
dtype = [("runID", int), ("duration", float), ("IT", "|S2"),
         ("CONF", "|S7"), ("FLAG", "|S6")]
runlist = np.empty((len(data),), dtype=dtype)

# Columns 0, 3, 5, 6, 7 of each row hold run ID, duration, IT, CONF and FLAG
runlist["runID"] = np.array([int(d[0]) for d in data])
runlist["duration"] = np.array([float(d[3]) for d in data])
runlist["IT"] = np.array([d[5] for d in data])
runlist["CONF"] = np.array([d[6] for d in data])
runlist["FLAG"] = np.array([d[7] for d in data])

# Now filter: Include IT=it, CONF=full, FLAG=GOOD, exclude strange rate runs
exclude_rate = [120028, 120029, 120030, 120087, 120156, 120157]
# The fields are byte strings ("S" dtype), so they must be compared against
# bytes literals; comparing against str never matches.
itgood = runlist["IT"] == b"IT"
confgood = runlist["CONF"] == b"full"
flaggood = runlist["FLAG"] == b"GOOD"
# np.isin is the recommended replacement for the deprecated np.in1d
ratebad = np.isin(runlist["runID"], exclude_rate)

include = itgood & confgood & flaggood & ~ratebad
runlist_inc = runlist[include]

# Get the livetime of the sample in days (durations are divided by 24)
hoursindays = 24.
_livetime = np.sum(runlist_inc["duration"]) / hoursindays

print("Total runs from v1.4     : ", len(runlist_inc))
print("Total livetime from v1.4 : ", _livetime)

# Bin BG according to runlist

Each run is one bin in the BG rate vs. time plot.
The rate is normalized to Hertz by dividing the counts by the bin widths in seconds.

In [None]:
# Sort the experimental events into one bin per included run, using the
# run start/stop times as bin borders
exp_times = exp["timeMJD"]
start_mjd = inc_run_arr["start_mjd"]
stop_mjd = inc_run_arr["stop_mjd"]

evts_in_run = {}
tot = 0
for runid, start, stop in zip(inc_run_arr["runID"], start_mjd, stop_mjd):
    # Half-open interval [start, stop) for each run
    mask = np.logical_and(exp_times >= start, exp_times < stop)
    evts_in_run[runid] = exp[mask]
    tot += mask.sum()

# Sanity check: every event should be assigned to exactly one run
print("Do we have all events? ", tot == len(exp))
print("  Events selected : ", tot)
print("  Events in exp   : ", len(exp))

In [None]:
# Create binmids and histogram values in each bin
binmids = 0.5 * (start_mjd + stop_mjd)
# Number of events per run, looked up by run ID so the counts stay aligned
# with binmids / start_mjd / stop_mjd by construction. The builtin float is
# used because the np.float alias was removed in NumPy 1.24.
h = np.array([len(evts_in_run[rid]) for rid in inc_run_arr["runID"]],
             dtype=float)

# Mask those with zero rate
m = h > 0.
binmids = binmids[m]
h = h[m]

# Create plot arrays: half the run length as x error, Poisson error in y
xerr = runtimes_mjd[m] / 2.
yerr = np.sqrt(h)

# Show in Hertz, so go from MJD days to seconds in bin widths
secsinday = 24. * 60. * 60
norm = (stop_mjd[m] - start_mjd[m]) * secsinday
h_norm = h / norm
# Poisson errors just get scaled
yerr_norm = yerr / norm

# Weights only for the weighted average: zero where the error is zero,
# 1 / yerr everywhere else.
# NOTE(review): the weights use the unnormalized count error (1 / yerr), not
# the inverse variance of the rate -- confirm this is intended.
nonzero_err = yerr_norm != 0
weights = np.zeros_like(yerr)
weights[nonzero_err] = 1 / yerr[nonzero_err]
def f(x, a, b, c):
    """Sine model with the baseline fixed to the weighted average rate.

    Parameters
    ----------
    x : array-like
        Positions at which the model is evaluated. The model uses this
        argument directly and no longer reads a module-level array.
    a, b, c : float
        Amplitude, angular frequency and offset along x of the sine.

    Returns
    -------
    array-like
        ``a * sin(b * (x - c))`` plus ``np.average(h_norm, weights=weights)``,
        where ``h_norm`` and ``weights`` are the module-level arrays.
    """
    return a * np.sin(b * (x - c)) + np.average(h_norm, weights=weights)
# Map the bin mids linearly onto [0, 1]
binmid_lo = binmids.min()
binmid_hi = binmids.max()
normed = (binmids - binmid_lo) / (binmid_hi - binmid_lo)

# Seed values, scaled from the handcrafted guess in the cell below
p0 = [-0.0005, 2 * np.pi, 0.1]

# Fit the sine model f to the rate. No weights are passed, because entries
# with zero counts were already thrown out. Also, with weights the fitted
# period came out at only half, despite the good seed values...
res = sco.curve_fit(f, normed, h_norm, p0=p0)
pars = res[0]

print("Best fit pars : ", pars)

In [None]:
# Plot like mrichman did on p. 113
# Note: Date plots are THE MOST DIFFICULT AND LEAST FUN THING TODO...
fig, ax = plt.subplots(1, 1)

# Show dates on x axis. The full datetimes are converted (not only the
# calendar date), so that several runs on the same day are not all drawn at
# midnight of that day.
datetimes = astrotime(binmids, format="mjd").to_datetime()
dates = mpldates.date2num(list(datetimes))

# Every month, first day
months = mpldates.MonthLocator(bymonth=np.arange(1, 13), bymonthday=1)
monthsFmt = mpldates.DateFormatter("%b %Y")
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(monthsFmt)

# xerr is given in days, which matches the unit of matplotlib date numbers
ax.errorbar(dates, h_norm, fmt=".", xerr=xerr, yerr=yerr_norm)
ax.set_xlabel("Date")
ax.set_ylabel("Rate in HZ")
ax.set_xlim(dates[0], dates[-1])
ax.set_ylim(0., None)

# Plot the fitted sine on a grid with one point per day
delta_days = (datetimes[-1] - datetimes[0]).days
xdatetimes = [datetimes[0] + datetime.timedelta(days=day)
              for day in range(delta_days)]
xtimes_mjd = astrotime(xdatetimes).mjd
# Same normalization as for the bin mids; `normed` is rebound to the dense
# grid before the model is evaluated on it
normed = (xtimes_mjd - binmids.min()) / (binmids.max() - binmids.min())
y = f(normed, *pars)

# Handcrafted seed trial & error
# s = [-0.0005, 2 * np.pi, 0.1]
# y = s[0] * np.sin(s[1] * (normed + s[2])) + np.average(h_norm, weights=weights)

# Convert back to mpl dates, keeping the time of day so the curve is drawn
# at the same positions it was evaluated at
xdates = mpldates.date2num(xdatetimes)
ax.plot(xdates, y, "r-", zorder=5)
# Dashed line: weighted average rate
ax.axhline(np.average(h_norm, weights=weights), 0, 1, color="k", ls="--", zorder=5)

# Autoprettify main xlabels
fig.autofmt_xdate(rotation=60)

# Show mjd on top
def ax2ticker(x):
    """Convert matplotlib date numbers to MJD values for the top axis labels."""
    dates = mpldates.num2date(x)
    mjd = astrotime(dates).mjd
    return mjd
# Twin axis with the same tick positions and bounds, but labelled in MJD
ax2 = ax.twiny()
ax2.set_xticks(ax.get_xticks())
ax2.set_xbound(ax.get_xbound())
ax2.set_xticklabels(ax2ticker(ax.get_xticks()),
                    rotation=60, horizontalalignment="left")
ax2.set_xlabel("MJD")


# Let's make the BG pdf

Proceeding to section 6.3.1 Randomized BG Injection, p. 113.
Mrichman draws events as follows:

1. Get the number of BG events to be injected from a Poisson distribution with the expectation value drawn from the previously built BG temporal distribution.
   $$
   P_{\langle n_B\rangle}(N_m) = \frac{\langle n_B\rangle^{N_m}}{N_m!}\cdot \exp(-\langle n_B\rangle)
   $$
2. These events are then drawn from a 3D pdf in energy proxy, zenith proxy and sigma proxy.
   He does it by dividing 10x10x10 bins, first selecting energy, then zenith in that energy bin, then sigma in that zenith bin.
   
Here we create a smooth PDF using a kernel density estimator and obtain a sample a priori by running an MCMC chain.
The bandwidth is set globally and cross validated to be robust.

**Some note on `numpy.histogramdd`:**

The input must be an array with shape (len(data), nDim), i.e. one row per event and one column per dimension.

Shape of h is the same as the number of bins in each dim: (50, 40, 20) for the final histogram below.
So the first dimension picks a single logE slice -> h[i].shape = (40, 20)
Second dim picks a dec slice -> h[:, i].shape = (50, 20)
3rd picks a sigma slice -> h[:, :, i].shape = (50, 40)

This is important: meshgrid repeats in second axis on first array xx.
For the second array, the first axis is repeated.
But h iterates over energy in the 1st axis. So if we don't transpose, we have the whole histogram flipped! Compare to the plot in mrichman's thesis (cos(zen)).

## 3D histogram
First we make a 3D histogram to better compare to mrichman and to get an overview of the distribution.

In [None]:
# First let's make the 3D histogram. We need to tune the bin sizes, so we
# have enough statistics, but don't lose too much resolution.
# Check with a difference histogram in the end.
# np.cos(np.pi / 2. + exp["dec"])  # cos(zenith)
# np.sin(exp["dec"])               # sin(dec)
# One row per event, columns: logE, sin(dec), sigma in degrees
sample = np.vstack((exp["logE"], np.sin(exp["dec"]), np.rad2deg(exp["sigma"]))).T

# Histogram range: full data extent in logE and sin(dec), 0 to 5 deg in sigma.
# The extent is taken directly from the sample instead of from the edges of a
# throw-away histogram; both give the data minimum and maximum.
r = [[sample[:, 0].min(), sample[:, 0].max()],
     [sample[:, 1].min(), sample[:, 1].max()],
     [0, 5]]

# density=True replaces the normed=True keyword, which was removed from
# np.histogramdd in NumPy 1.24.
h, bins = np.histogramdd(sample=sample, bins=[50, 40, 20], range=r, density=True)
(logE_bins, dec_bins, sigma_bins) = bins

fig, ax = corner_hist(h, bins=bins, label=["logE", "sin(dec)", "sigma deg"],
                      hist2D_args={"cmap": "Greys"}, hist_args={"color":"#353132"})


## Kernel Density Estimation

We use scikit-learn's cross validation with a Gaussian kernel to get the most robust bandwidth.
Then we integrate with the same binning as above and compare to the 3D histogram.

This section relies heavily on [Jake VanderPlas' examples for KDE](https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/).
More info on how KDE cross validation works can be found in [Modern Nonparametric Methods](http://www2.stat.duke.edu/~wjang/teaching/S05-293/lecture/ch6.pdf).

In [None]:
# Pick the KDE bandwidth with a 10-fold cross validation over a fixed grid.
# Our data has approximately equal scales in all dimensions, which is good
# because a single global bandwidth is used.
bandwidth_grid = {"bandwidth": np.arange(0.1, 2.1, 0.1)}
kde_estimator = skn.KernelDensity(kernel="gaussian", rtol=1e-8)
model_selector = skms.GridSearchCV(estimator=kde_estimator,
                                   param_grid=bandwidth_grid,
                                   cv=10)

# fit expects the data points with shape=(n_samples, n_features), which is
# the layout of the sample already passed to np.histogramdd
model_selector.fit(sample)