### Generating a training set
- The goal of this notebook is to generate a training set for an ML algorithm.
- The first approach will be simple.
    - Using the DQ as the label, train a binary classifier to find cosmic rays.

In [None]:
%matplotlib notebook
from astropy.io import fits
import pandas as pd
import numpy as np
import os
import glob
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
from astropy.visualization import SqrtStretch
from astropy.visualization import LogStretch, LinearStretch, ZScaleInterval
from astropy.visualization.mpl_normalize import ImageNormalize
from sklearn.preprocessing import StandardScaler
from scipy import ndimage
import sys
import skimage.segmentation as segment
from photutils.utils import random_cmap
plt.style.use('ggplot')

In [None]:
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on a
# machine that has this exact directory; consider deriving it relative to the notebook.
sys.path.append('/Users/nmiles/hst_cosmic_rays/lib/')
# Project-local modules, presumably resolved through the sys.path entry added above.
from CosmicRayLabel import CosmicRayLabel
from ComputeStats import ComputeStats

In [None]:
# FITS file used throughout this notebook; the path is relative to the working directory.
fname = './../data/jd4wemc8q_flt.fits'

In [None]:
# Labelling helper for this file; later cells read its `label` and `dq` attributes.
c = CosmicRayLabel(fname)

In [None]:
# Run the labelling step. Presumably this populates `c.label`, which is passed to
# ComputeStats below — TODO confirm against CosmicRayLabel.
c.generate_label()

In [None]:
# Pull the data arrays of extensions 1 and 4 while the file is open, then stack
# them along the first axis with extension 1 first.
with fits.open(fname) as hdu:
    sci2, sci1 = hdu[1].data, hdu[4].data
sci = np.concatenate((sci2, sci1), axis=0)

In [None]:
# Display the stacked science array (NumPy abbreviates the repr of large arrays).
sci

In [None]:
# Statistics helper built from the same file and the label held by `c`.
stats = ComputeStats(fname,c.label)

In [None]:
# `sizes` is used as a mapping below (read via .values() and indexed by key);
# presumably it holds one size per labelled object — TODO confirm against ComputeStats.
sizes = stats.compute_size()

In [None]:
# Inspect the computed sizes.
sizes

In [None]:
# Keys of the objects whose size exceeds 15. np.where yields 0-based positions into
# sizes.values(); adding 1 converts them to the 1-based keys used to index `sizes`.
# Both steps live in one statement so that re-running the cell is idempotent
# (previously a second cell rebound max_size_idx from its own earlier value).
# NOTE(review): assumes the keys of `sizes` are consecutive integers starting at 1,
# in insertion order — TODO confirm against ComputeStats.compute_size.
max_size_idx = np.where(np.asarray(list(sizes.values())) > 15.)[0] + 1

In [None]:
# Use the second entry of max_size_idx as the example object to inspect.
idx = max_size_idx[1]

In [None]:
# Size recorded under the selected key.
sizes[idx]

In [None]:
# Location entry for the selected object; idx is 1-based, so cr_locs is read at idx-1.
stats.cr_locs[idx-1]

In [None]:
# Cut the selected region out of the stacked science image; the cr_locs entry is
# used directly as the index into `sci`.
box_data = sci[stats.cr_locs[idx-1]]

In [None]:
# Dimensions of the cutout.
box_data.shape

In [None]:
# Flag cutout pixels brighter than four times the cutout median, then pair the
# first-axis and second-axis indices of those pixels into a list of tuples.
bright_mask = box_data > 4*np.median(box_data)
bright_idx = np.where(bright_mask)
coords = list(zip(bright_idx[0], bright_idx[1]))

In [None]:
# Matching cutout from `c.dq`, indexed with the same cr_locs entry as box_data.
box_data_dq = c.dq[stats.cr_locs[idx-1]]

In [None]:
def mk_patch(r, c='red'):
    """Return an unfilled 1x1 rectangle outlining a single pixel.

    Parameters
    ----------
    r : sequence
        Pixel position; r[0] is used as the y coordinate and r[1] as the
        x coordinate of the pixel centre.
    c : color, optional
        Passed to ``Rectangle`` as ``color``. Defaults to 'red'.

    Returns
    -------
    matplotlib.patches.Rectangle
        The rectangle; it is not attached to any Axes here.
    """
    y_center, x_center = r[0], r[1]
    # Rectangle is anchored at its corner, so shift by half a pixel to centre
    # the unit square on the pixel.
    anchor = (x_center - 0.5, y_center - 0.5)
    outline = patches.Rectangle(
        anchor,
        width=1,
        height=1,
        alpha=1.0,
        fill=False,
        linewidth=1.75,
        color=c,
    )
    return outline

In [None]:
# Intensity scaling for the science panel, derived from the full stacked image
# rather than from the cutout alone.
norm = ImageNormalize(sci, stretch=LogStretch(a=5.), interval=ZScaleInterval())
fig = plt.figure(figsize=(5,3))
ax1 = fig.add_subplot(1,2,1)
# Share both axes so panning/zooming one panel moves the other.
ax2 = fig.add_subplot(1,2,2,sharex=ax1, sharey=ax1)
ax1.imshow(box_data, cmap='gray', origin='lower', norm=norm)
ax2.imshow(box_data_dq, cmap='bone', interpolation='nearest', origin='lower')
# Title each panel so the figure can be read without the surrounding code.
ax1.set_title('SCI cutout')
ax2.set_title('DQ cutout')
for coord in coords:
    # An artist can belong to only one Axes, so build a separate patch per panel.
    patch1 = mk_patch(coord)
    patch2 = mk_patch(coord)
    ax1.add_patch(patch1)
    ax2.add_patch(patch2)

In [None]:
# Index arrays of the cutout pixels whose value is below -10.
np.where(box_data < -10)

In [None]:
# Total number of pixels in the cutout.
num_pix = box_data.size

In [None]:
# Column data for the per-pixel table: flattened science and DQ cutouts plus a
# 1-based pixel counter. The counter is built with np.arange so it is an integer
# index rather than the float array np.linspace would produce.
data = {
    'sci': box_data.flatten(),
    'dq': box_data_dq.flatten(),
    'pix': np.arange(1, num_pix + 1),
}

In [None]:
# Inspect the assembled column dictionary.
data

In [None]:
# Sanity check: the three columns must have the same length before building a DataFrame.
print(*(len(data[key]) for key in ('sci', 'dq', 'pix')))

In [None]:
# One row per cutout pixel, with columns 'sci', 'dq' and 'pix'.
df = pd.DataFrame(data)

In [None]:
# Science value against pixel counter, with each point coloured by its DQ value.
df.plot.scatter(x='pix', y='sci', c='dq', colormap='inferno_r', alpha=0.65)

In [None]:
# Small 2x5 integer array (1..10 in row-major order) used to check how flatten() orders elements.
d = np.arange(1, 11).reshape(2, 5)

In [None]:
# flatten() returns a 1-D copy in row-major order by default.
d.flatten()

### Generate a multi-page PDF of cosmic-ray cutouts to show what cosmic rays of different sizes look like

In [None]:
# NaN-ignoring mean and standard deviation over every value in `sizes`.
all_sizes = list(sizes.values())
avg_size, std_size = np.nanmean(all_sizes), np.nanstd(all_sizes)

In [None]:
# Display the mean and standard deviation of the sizes.
avg_size, std_size

In [None]:
# Positions (0-based, as returned by np.where) of the sizes greater than 5.
# Note that this rebinds max_size_idx, which an earlier cell set with a threshold of 15.
size_values = np.asarray(list(sizes.values()))
max_size_idx = np.where(size_values > 5.)