<center><strong><font size=+3>Applications of robust 2D median estimators to HERA data</font></strong></center>
<br><br>
<center><strong><font size=+2>Matyas Molnar and Bojan Nikolic</font><br></strong></center>
<br><center><strong><font size=+1>Astrophysics Group, Cavendish Laboratory, University of Cambridge</font></strong></center>

In [None]:
import os
import sys

import matplotlib as mpl
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
from scipy.stats.mstats import gmean as geometric_mean

from hera_cal.io import HERAData
from hera_cal.redcal import get_reds

from robstat.robstat import geometric_median, mardia_median, mv_median, tukey_median
from robstat.utils import DATAPATH

In [None]:
plt.rcParams['figure.figsize'] = (12, 8)
%matplotlib inline

In [None]:
# Switch for publication-quality output: when True, the default figure
# resolution is raised to 300 dpi. `mpl` comes from the notebook's import cell,
# so it is available regardless of the value of this flag.
plot_figs = False
if plot_figs:
    mpl.rcParams['figure.dpi'] = 300

### Load HERA visibility data

In [None]:
# Sample visibility file located under the robstat data directory
sample_data = os.path.join(DATAPATH, 'zen.2458098.43869.HH.OCRSA.uvh5')

hd = HERAData(sample_data)
data, flags, _ = hd.read()

# Redundant baseline groups; flat_bls lists every grouped baseline that has a
# key in the data, and bl_dict maps each of those baselines to its position
# along the last axis of the array built below.
reds = get_reds(hd.antpos, pols=hd.pols)
flat_bls = [bl for grp in reds for bl in grp if bl in data.keys()]
# Keep only the groups for which every baseline is present in the data
present_bls = set(flat_bls)
reds = [grp for grp in reds if set(grp).issubset(present_bls)]
bl_dict = {k: i for i, k in enumerate(flat_bls)}

# Attach the flags to the visibilities as masks so that flagged samples become
# NaN when the array is filled.
masked_vis = {k: np.ma.array(v, mask=flags[k], fill_value=np.nan)
              for k, v in data.items()}

# Stack the selected baselines into one (Nfreqs, Ntimes, len(flat_bls)) array.
# The last axis is sized by len(flat_bls) rather than hd.Nbls: the loop below
# writes exactly one column per entry of flat_bls, so any other size either
# leaves uninitialised, unmasked columns behind or overruns the array.
mdata = np.ma.empty((hd.Nfreqs, hd.Ntimes, len(flat_bls)), fill_value=np.nan,
                    dtype=complex)
for i, bl in enumerate(flat_bls):
    # NOTE(review): the transpose presumably turns (Ntimes, Nfreqs) into
    # (Nfreqs, Ntimes) - confirm against the HERAData layout
    mdata[..., i] = masked_vis[bl].transpose()

# Plain arrays used by the rest of the notebook: NaN-filled data + boolean flags
data = mdata.filled()
flags = mdata.mask

### Redundant averaging

In [None]:
# Work with the first redundant group and pull out its columns from the
# stacked data and flag arrays (baselines live on the last axis).
slct_bls = reds[0]
slct_bl_idxs = np.array([bl_dict[bl_key] for bl_key in slct_bls])
slct_data = data[:, :, slct_bl_idxs]
slct_flags = flags[:, :, slct_bl_idxs]
# Sanity check: every flagged sample, and nothing else, is NaN in the data
assert np.count_nonzero(slct_flags) == np.count_nonzero(np.isnan(slct_data))
print('Looking at baselines redundant to {}'.format(slct_bls[0]))

In [None]:
def flt_nan(x):
    """Return the entries of `x` that are not NaN."""
    return x[~np.isnan(x)]


def bad_med(x):
    """Combine the NaN-ignoring medians of the real and imaginary parts,
    each taken separately, into a single complex number."""
    return np.nanmedian(x.real) + np.nanmedian(x.imag)*1j


# Pick the cell of the first two axes whose scatter across the redundant
# baselines (last axis) is largest. The stacked array is laid out as
# (freq, time, baseline), so idxs holds (frequency index, time index).
idxs = np.unravel_index(
    np.nanargmax(np.nanstd(slct_data, axis=-1)),
    slct_data.shape[:2],
)
print('Selecting time / freq slice {}'.format(idxs))
slct_data_slice = slct_data[idxs]

# Location estimates for the selected slice. The geometric mean is given the
# NaN-filtered samples; the other estimators receive the slice as is.
sample_gmean = geometric_mean(flt_nan(slct_data_slice))
sample_gmed = geometric_median(slct_data_slice, weights=None)
sample_tmed = tukey_median(slct_data_slice)['barycenter']
sample_bmed = bad_med(slct_data_slice)

In [None]:
# (estimate, label) pairs for the location estimates computed above
med_ests = list(zip([sample_gmean, sample_gmed, sample_tmed, sample_bmed],
               ['Geometric Mean', 'Geometric Median', 'Tukey Median', 'Separate Median']))
for est, label in med_ests:
    # '.4f' prints four decimals; the former '4f' only set a minimum field
    # width of 4, which never affects fixed-point output.
    print('{:17s}: {:.4f}'.format(label, est))

In [None]:
fig, ax = plt.subplots(figsize=(6, 6), dpi=100)

# Visibilities of the selected slice in the complex plane
ax.scatter(slct_data_slice.real, slct_data_slice.imag, alpha=0.5)

# Overlay each location estimate with its own marker style and legend label
for est, marker, tag in (
    (sample_gmean, 'ro', 'gmean'),
    (sample_gmed, 'co', 'gmed'),
    (sample_tmed, 'yo', 'tmed'),
    (sample_bmed, 'bo', 'bmed'),
):
    ax.plot(est.real, est.imag, marker, label=tag)

# Label the panel with the first baseline of the redundant group
ax.annotate(format(slct_bls[0]), xy=(0.05, 0.05), xycoords='axes fraction')
ax.set(xlabel=r'$\mathfrak{Re} \; (V)$', ylabel=r'$\mathfrak{Im}(V)$')

ax.legend()
plt.show()

In [None]:
# TODO
# - find rmeds for all freqs and times separately
# - in aligned data, do this for time, and then for time & bl together
# - is there literature on the median of medians?