<center><strong><font size=+3>Speeding up RMD-clipping</font></strong></center>
<br><br>
<center><strong><font size=+2>Matyas Molnar and Bojan Nikolic</font><br></strong></center>
<br><center><strong><font size=+1>Astrophysics Group, Cavendish Laboratory, University of Cambridge</font></strong></center>

In [None]:
import os

import matplotlib as mpl
import matplotlib.patches as patches
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import special, stats
from sklearn.covariance import MinCovDet

from robstat.stdstat import mad_clip
from robstat.utils import DATAPATH, decomposeCArray, flt_nan

In [None]:
# Use the inline matplotlib backend for this notebook.
%matplotlib inline

In [None]:
# Default figure geometry and resolution for every plot in this notebook.
mpl.rc('figure', figsize=(5, 3), dpi=125)

# Render all text through LaTeX with the 'cm' serif face; the preamble pulls in
# the AMS packages needed for symbols such as \mathfrak in the axis labels.
mpl.rc('font', family='serif', serif=['cm'])
mpl.rc('text', usetex=True)
mpl.rc('text.latex', preamble=r'\usepackage{amssymb} \usepackage{amsmath}')

In [None]:
# Sample visibility archive located under the package's DATAPATH.
vis_file = os.path.join(DATAPATH, 'sample_vis_data.npz')
# np.load on an .npz returns an archive object; arrays are pulled out by key below.
vis_data = np.load(vis_file)

In [None]:
# Pull every array out of the archive into notebook-level names.
data = vis_data['data']
redg, pol, lsts, JDs, chans, freqs = (
    vis_data[key] for key in ('redg', 'pol', 'lsts', 'JDs', 'chans', 'freqs')
)

# Boolean mask, True wherever the visibility data are NaN.
flags = np.isnan(data)

In [None]:
# Clipping parameters used by both the RMD- and MAD-clipping cells.
sigma = 5.0  # clipping threshold, in units of normal standard deviations
min_N = 5  # minimum length of array to clip, below which no clipping is performed.

In [None]:
# Example slice: all entries along axis 0 at index (0, 0, 0) of the other axes.
eg_data = data[:, 0, 0, 0]

# NOTE(review): flt_nan presumably drops NaN entries and decomposeCArray
# presumably splits complex values into (real, imag) columns -- confirm in robstat.utils.
points = decomposeCArray(
    flt_nan(eg_data.flatten())
)

In [None]:
# Express the sigma threshold as a probability: the chance that a normal
# deviate falls between \mu - n*\sigma and \mu + n*\sigma.
chi2_p = special.erf(sigma / np.sqrt(2))

# Map that probability onto the chi^2 quantile, with one degree of freedom
# per column of `points`.
chi2_q = stats.chi2.ppf(chi2_p, df=points.shape[1])

print(
    'χ^2 quantile corresponding to {}σ (p = {:.7f}) is {:.7f}'.format(sigma, chi2_p, chi2_q)
)

In [None]:
# Fit the Minimum Covariance Determinant estimator with a fixed seed.
robust_cov = MinCovDet(random_state=0)
robust_cov.fit(points)

# Indices of the samples whose robust Mahalanobis value exceeds the chi^2
# threshold. The value is compared to chi2_q directly, and the plotting code
# takes its square root before contouring, i.e. it is treated as a squared distance.
rmd_outliers = np.flatnonzero(robust_cov.mahalanobis(points) > chi2_q)

In [None]:
%%timeit
# RMD-clipping speed
# Times one full MCD fit plus thresholding on the example slice.
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept in
# the notebook namespace, which would be why the same two statements are also
# run un-timed in the preceding cell -- confirm.
robust_cov = MinCovDet(random_state=0).fit(points)
rmd_outliers = np.where(robust_cov.mahalanobis(points) > chi2_q)[0]

In [None]:
%%timeit
# MAD-clipping speed, for comparison.
# Note that MAD-clipping can be vectorized, while RMD-clipping needs to be looped (see later)
# Clip the two columns of `points` independently; the second return value of
# mad_clip is used as the per-sample flag array.
_, f_r = mad_clip(points[:, 0], sigma=sigma, min_N=min_N)
_, f_i = mad_clip(points[:, 1], sigma=sigma, min_N=min_N)

# A sample counts as an outlier if it is flagged in either column.
mad_outliers = np.where(f_r + f_i)[0]

In [None]:
# get RMD ellipse parameters from covariance matrix
eig_vals, eig_vecs = np.linalg.eig(robust_cov.covariance_)

# Semi-axis lengths of the 1-sigma ellipse along each principal direction.
radii = np.sqrt(eig_vals)

# Eigenvector belonging to the largest eigenvalue. np.linalg.eig returns the
# eigenvectors as COLUMNS (eig_vecs[:, i] pairs with eig_vals[i]), so the
# column, not the row, must be selected.
lrg_ev = eig_vecs[:, np.argmax(eig_vals)]

# Orientation of the axis associated with eig_vals[0] (and hence radii[0]):
# for a symmetric matrix [[a, b], [b, c]] the eigenvector of eigenvalue l is
# proportional to (b, l - a), so its angle is arctan2(l - a, b).
alpha = np.arctan2(eig_vals[0] - robust_cov.covariance_[0][0], robust_cov.covariance_[0][1])

In [None]:
# Axis labels for the real and imaginary parts of the visibilities (LaTeX).
real_lab = r'$\mathfrak{Re} (V)$'
imag_lab = r'$\mathfrak{Im} (V)$'


fig, ax = plt.subplots(figsize=(7, 5))

# Translucent red background; the white ellipse added below covers the region
# inside the clipping threshold, so only the area outside it stays red.
ax.set_facecolor('red')
ax.patch.set_alpha(0.25)

# Threshold expressed as a distance (chi2_q is compared against the values
# returned by robust_cov.mahalanobis, whose square root is contoured below).
z = np.sqrt(chi2_q)
# Ellipse centred on the MCD location, with full widths 2*z*radii along the
# principal axes and rotated by alpha (converted from radians to degrees).
ellipse = patches.Ellipse(xy=robust_cov.location_, width=2*z*radii[0], height=2*z*radii[1], \
                          angle=alpha*180/np.pi, edgecolor='None', fc='white', lw=2, ls='--', 
                          zorder=0)
ax.add_patch(ellipse)

# Scatter the inliers (all points minus the RMD outliers), the outliers, and
# the robust location estimate.
inliers = np.delete(points, rmd_outliers, axis=0)
sns.scatterplot(x=inliers[:, 0], y=inliers[:, 1], ax=ax, label='Inliers', alpha=0.8)
sns.scatterplot(x=points[rmd_outliers, 0], y=points[rmd_outliers, 1], color='red', ax=ax, \
                label='Outliers', alpha=0.8, zorder=2)
sns.scatterplot(x=[robust_cov.location_[0]], y=[robust_cov.location_[1]], color='darkorange', \
                ax=ax, label='MCD location', marker='+', zorder=2)

# Create meshgrid of feature values
# The grid spans the axis limits as they stand after the scatter plots, so this
# must run after them. plt.xlim()/plt.ylim() read the current axes
# (presumably `ax`, the only axes created in this cell).
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 1001),
                     np.linspace(plt.ylim()[0], plt.ylim()[1], 1001))
zz = np.c_[xx.ravel(), yy.ravel()]

# Calculate the MCD based Mahalanobis distances
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
# Dashed, labelled contours of the square-rooted values over the grid.
robust_contour = ax.contour(xx, yy, np.sqrt(mahal_robust_cov), cmap=plt.cm.YlOrBr_r, \
                            linestyles='--', zorder=0)
ax.clabel(robust_contour, robust_contour.levels, inline=True, fontsize=10)
# Single solid red contour at the clipping threshold.
thresh_contour = ax.contour(xx, yy, np.sqrt(mahal_robust_cov), [np.sqrt(chi2_q)], colors='red', \
                            linewidths=2, zorder=0)

# Text boxes in the lower-right corner naming the contours and the threshold.
ax.annotate('Robust Mahalanobis Distance', xy=(0.62, 0.10), xycoords='axes fraction', \
            bbox=dict(boxstyle='round', facecolor='white'), size=10, color='darkorange')

ax.annotate(r'$\chi_{\mathrm{thresh}} =$'+' {0:.3f}'.format(np.sqrt(chi2_q)), xy=(0.62, 0.03), \
            xycoords='axes fraction', bbox=dict(boxstyle='round', facecolor='white'), size=10, color='red')

ax.set_xlabel(real_lab)
ax.set_ylabel(imag_lab)

# Keep the final axis limits of this figure in a notebook-level variable.
rmd_lims = [ax.get_xlim(), ax.get_ylim()]

ax.legend(loc='upper right')

fig.tight_layout()
plt.show()

### Run RMD for all data

In [None]:
import multiprocess as multiprocessing

# A shared ctypes buffer is required so that worker processes can fill in one
# numpy array in parallel.

def create_mp_array(arr):
    """Copy ``arr`` into a newly allocated shared ctypes buffer.

    Parameters
    ----------
    arr : numpy.ndarray
        Array whose dtype, shape and contents are copied.

    Returns
    -------
    shared_arr
        The ``multiprocessing.RawArray`` holding the data.
    new_arr : numpy.ndarray
        Array with the shape and dtype of ``arr`` backed by ``shared_arr``
        (same memory), so writes through either are visible through the other.
    """
    shared_arr = multiprocessing.RawArray(np.ctypeslib.as_ctypes_type(arr.dtype), int(np.prod(arr.shape)))
    new_arr = np.frombuffer(shared_arr, arr.dtype).reshape(arr.shape)  # shared_arr and new_arr the same memory
    new_arr[...] = arr
    return shared_arr, new_arr

def mp_init(shared_arr_, sharred_arr_shape_, sharred_arr_dtype_):
    """Pool initializer: store the shared buffer and its layout as globals.

    Parameters
    ----------
    shared_arr_
        Shared buffer, as returned by ``create_mp_array``.
    sharred_arr_shape_ : tuple of int
        Shape of the array stored in the buffer.
    sharred_arr_dtype_ : numpy.dtype
        Element type of the array stored in the buffer.

    Notes
    -----
    Some call sites pass the last two arguments as ``(dtype, shape)``. That
    order is detected and corrected here so that both orders keep working.
    """
    global shared_arr, sharred_arr_shape, sharred_arr_dtype
    if isinstance(sharred_arr_shape_, np.dtype):
        # arguments arrived as (dtype, shape): put them back in signature order
        sharred_arr_shape_, sharred_arr_dtype_ = sharred_arr_dtype_, sharred_arr_shape_
    shared_arr = shared_arr_
    sharred_arr_shape = tuple(sharred_arr_shape_)
    sharred_arr_dtype = np.dtype(sharred_arr_dtype_)

def mp_shared_view():
    """Return a numpy array over the buffer registered by ``mp_init`` (no copy)."""
    return np.frombuffer(shared_arr, sharred_arr_dtype).reshape(sharred_arr_shape)

def mp_iter(s):
    """RMD-clip one slice of ``data`` and write the result to the shared array.

    Parameters
    ----------
    s : sequence of three ints
        Index into axes 1-3 of the global ``data`` array; the slice
        ``data[:, s[0], s[1], s[2]]`` is processed.

    Notes
    -----
    Reads the globals ``data`` and ``chi2_q`` plus the buffer layout stored by
    ``mp_init``, rather than notebook globals that may not exist in a worker.
    Slices that are entirely NaN are skipped, leaving the shared array untouched.
    """
    d = data[:, s[0], s[1], s[2]]
    if np.isnan(d).all():
        return

    # positions along axis 0 that hold finite samples; results are written back there
    isfinite = np.isfinite(d).nonzero()[0]
    # NOTE(review): flt_nan presumably removes NaNs and decomposeCArray presumably
    # yields (real, imag) columns -- confirm in robstat.utils.
    d = decomposeCArray(flt_nan(d))
    robust_cov = MinCovDet(random_state=0).fit(d)
    outliers = robust_cov.mahalanobis(d) > chi2_q

    rmd_clip_f = mp_shared_view()
    rmd_clip_f[isfinite, s[0], s[1], s[2]] = outliers

In [None]:
# Cache file for the RMD-clipping flags; the expensive parallel run below only
# happens when it does not exist yet.
rmd_clip_f_fn = os.path.join(DATAPATH, 'rmd_clip_test.npz')

if not os.path.exists(rmd_clip_f_fn):

    # Start from an all-True flag array held in shared memory; workers overwrite
    # the entries of finite samples with their outlier decision.
    rmd_clip_f = np.ones_like(data, dtype=bool)
    d_shared, rmd_clip_f = create_mp_array(rmd_clip_f)
    # kept as notebook-level names because worker code may look them up by name
    dtype = rmd_clip_f.dtype
    shape = rmd_clip_f.shape

    # initargs must follow mp_init's positional order: (buffer, shape, dtype)
    m_pool = multiprocessing.Pool(multiprocessing.cpu_count(), initializer=mp_init,
                                  initargs=(d_shared, shape, dtype))
    try:
        # one task per index over axes 1..3 of data
        m_pool.map(mp_iter, np.ndindex(data.shape[1:]))
    finally:
        # always release the worker processes, even if a task raised
        m_pool.close()
        m_pool.join()

    # Entries never written by a worker are still True; XOR with the NaN mask
    # clears those that correspond to NaN samples.
    rmd_clip_f = rmd_clip_f ^ flags

    np.savez(rmd_clip_f_fn, flags=rmd_clip_f)

else:
    # close the archive once the array has been read
    with np.load(rmd_clip_f_fn) as cached:
        rmd_clip_f = cached['flags']

# apply min_N condition
# True where fewer than min_N un-flagged samples exist along axis 0, repeated
# along axis 0 so it has the same shape as the flag arrays.
mad_f_min_n = np.logical_not(flags).sum(axis=0) < min_N
mad_f_min_n = np.expand_dims(mad_f_min_n, axis=0)
mad_f_min_n = np.repeat(mad_f_min_n, flags.shape[0], axis=0)
rmd_clip_f[mad_f_min_n] = False

print('Number of data point flagged from RMD-clipping: {:,}'.format(rmd_clip_f.sum()))

### Run MAD-clipping for all data (for comparison)

In [None]:
# Cache file for the MAD-clipping flags; recomputed only when missing.
mad_clip_f_fn = os.path.join(DATAPATH, 'mad_clip_test.npz')

if not os.path.exists(mad_clip_f_fn):
    # Clip real and imaginary parts independently along axis 0; the second
    # return value of mad_clip is used as the flag array.
    _, f_r = mad_clip(data.real, axis=0, sigma=sigma)
    _, f_i = mad_clip(data.imag, axis=0, sigma=sigma)

    # flagged in either component, then XOR-ed with the NaN mask
    mad_clip_f = f_r + f_i
    mad_clip_f = mad_clip_f ^ flags

    np.savez(mad_clip_f_fn, flags=mad_clip_f)

else:
    # close the archive once the array has been read
    with np.load(mad_clip_f_fn) as cached:
        mad_clip_f = cached['flags']

# apply min_N condition
# (mad_f_min_n is the mask built in the RMD-clipping cell)
mad_clip_f[mad_f_min_n] = False

print('Number of data point flagged from MAD-clipping: {:,}'.format(mad_clip_f.sum()))