In [None]:
# If having depedency issues you may need to run this one time.
!pip install --upgrade 'google-cloud-bigquery[bqstorage,pandas]'

# PANOPTES Utils Data Explorer

The tools in the `panoptes.utils.data` modules are designed to help you easily find and start using PANOPTES data.


The module primarily offers an interface that allows you to search and find any observation metadata from the PANOPTES network. The module also offers convenient methods for downloading the raw data.

In [None]:
import os
import sys
import glob
from contextlib import suppress

import numpy as np
import pandas as pd

import holoviews as hv
from holoviews import opts
import hvplot.pandas
import seaborn as sb

from tqdm import tqdm

from google.cloud import bigquery
from astropy.coordinates import SkyCoord
from astropy.stats import sigma_clip

from panoptes.utils.images import fits as fits_utils
from panoptes.utils.images import crop_data
from panoptes.utils.logging import logger

from panoptes.pipeline.utils.metadata import search_observations, get_metadata
from panoptes.pipeline.utils import sources
from panoptes.pipeline.utils import processing
from panoptes.pipeline.utils.gcp.bigquery import get_bq_clients

from IPython.display import Image
from IPython.core.display import HTML 

# Use the bokeh backend for holoviews/hvplot output in this notebook.
hv.extension('bokeh')

# Set up the logger for notebook viewing
# NOTE(review): the enable/remove/add calls look like the loguru API — confirm.
# Turn on log records from the `panoptes` namespace.
logger.enable('panoptes')
# Presumably drops the handlers that are already attached, so that the stdout
# handler added below is the only one.
logger.remove()
# Log to stdout at INFO level with a compact `<LEVEL> message` format. The
# return value is bound to `_` so the notebook does not echo it.
_ = logger.add(sys.stdout, format='<{level}> {message}', level='INFO')

In [None]:
# Create the BigQuery clients once; they are passed to the catalog and
# point-source lookups further down.
bq_client, bqstorage_client = get_bq_clients()

In [None]:
# Holoviews styling options
# Registered as defaults for Image and Labels elements.
opts.defaults(
    # Images: viridis colormap, hover tool, 400x400 plot size.
    opts.Image(cmap='viridis', tools=['hover'], width=400, height=400),
    # Labels: small white text, left-aligned with a bottom baseline.
    opts.Labels(text_color='white', text_font_size='8pt', text_align='left', text_baseline='bottom'),
)

## Getting Started

We need three sets of data to work with:

- `observations_df`: Metadata for observations. Used to narrow down the list of `sequence_id`s and select which observations to analyze.
- `images_df`: Metadata for each individual image. Used for looking up FITS files. Has additional camera data that could be used but currently isn't.
- `point_sources`: Catalog and extracted information for stellar sources. Used for getting the flux data.

### Search for observations

We can search for observations in a variety of ways. Here we look up the coordinates of WASP-3 by name and search within a 20° radius (the default is 5°).

In [None]:
# Resolve the target name to sky coordinates.
target_coords = SkyCoord.from_name('Wasp 3')

# Search for matched observations around the target that have at least 10 images.
# NOTE(review): `radius` is presumably in degrees — confirm against `search_observations`.
observations_df = search_observations(
    coords=target_coords, 
    min_num_images=10, 
    radius=20,
    # Uncomment the next line to restrict the search to a single unit.
#     unit_id='PAN012',
    status='matched'
)

In [None]:
# Group by unit and field and sum the metrics.
# `numeric_only=True` restricts the sum to numeric columns; with pandas >= 2.0
# the default would also try to sum any non-numeric metadata columns.
field_sums = (
    observations_df
    .groupby(['unit_id', 'field_name'])
    .sum(numeric_only=True)
    .reset_index()
)
field_sums

In [None]:
# Look at the 20 observations with the most total exposure time.
observations_df.sort_values('total_minutes_exptime', ascending=False).head(20)

### Filter to needs

Not all of the search results will be relevant.  Here we see that we have a lot within our radius.  Let's get just M42.

In [None]:
# target_df = search_results.query('field_name == "M42"')

In [None]:
# print(f'Total minutes exptime: {target_df.total_minutes_exptime.sum()}')

## Select observation(s)

Initial processing is done on each observation separately. An observation is a nearly contiguous series of images taken from a single camera of a single field of view (FOV) in which there is no intentional movement of the mount, including a meridian flip. The same FOV imaged before and after a meridian flip would count as separate observations for each camera.

Observations are identified by their `sequence_id`, which is the id of the PANOPTES unit and the camera as well as a timestamp of when the observation sequence started.

Metadata is extracted from all of the images in the observation sequence, regardless of data quality.

### Get image metadata for observations

Lookup the image-level metadata for all images in the observation.

In [None]:
# The observation sequence that the following cells work on.
sequence_id = 'PAN001_14d3bd_20200526T082233'

In [None]:
# Look up the image-level metadata for the selected sequence and preview it.
images_df = get_metadata(sequence_id=sequence_id)
images_df.head()

We can do a quick spot check on the quality of each image by looking at the changing color temperature of the images. While these can vary considerably depending on the viewing conditions, any major deviations from the other images should stand out.

In [None]:
# Overlay a scatter plot on a line plot of the per-image color temperature.
(images_df.camera_colortemp.hvplot.scatter() * images_df.camera_colortemp.hvplot.line()) 

Here it looks like the last couple of frames have a significant change, so we look at the JPG of one of the later frames.

In [None]:
# Derive the JPG url from the FITS url of the tenth-from-last row and display it.
# NOTE(review): `iloc[-10]` needs at least ten rows in `images_df`.
jpg_url = images_df.iloc[-10].public_url.replace('.fits.fz', '.jpg')
Image(url=jpg_url, width=600)

In [None]:
# Example of plotting data
# images_df[['camera_blue_balance', 'camera_red_balance']].plot(marker='.')

### Get the actual images.

We now have a list of image metadata and want to download the FITS files for all those images that have been properly solved.

We can use the `getdata` utility function to get both the data and the header from a url.


In [None]:
# Which frames of the time-sorted sequence to keep: skip the first five.
frame_slice = slice(5, None)
# Alternative: keep every frame.
# frame_slice = slice(None, None)

In [None]:
# Sort by time, apply the frame selection, then keep only the rows that have a url.
fits_file_list = images_df.sort_values(by='time')[frame_slice].public_url.dropna()
len(fits_file_list)

In [None]:
# Root directory for the files this notebook downloads or generates.
# NOTE(review): absolute path tied to this environment — adjust when running elsewhere.
base_dir = '/home/jupyter/data'

In [None]:
# FITS files for this sequence are kept under <base_dir>/<sequence_id>/fits.
image_dir = f'{base_dir}/{sequence_id}/fits'

In [None]:
os.makedirs(image_dir, exist_ok=True)

fits_files = list()
    
for fits_file in tqdm(fits_file_list):
    base = os.path.basename(fits_file)
    unpacked = base.replace('.fz', '')

    if not os.path.exists(f'{image_dir}/{base}'):
        if not os.path.exists(f'{image_dir}/{unpacked}'):
            !wget -q {fits_file} -O {image_dir}/{base}
    
    # Unpack the file if packed version exists locally.
    if os.path.exists(f'{image_dir}/{base}'):
        fits_files.append(fits_utils.funpack(f'{image_dir}/{base}'))        

In [None]:
# Rebuild the list from the `.fits` files actually present in `image_dir`,
# sorted by path. The count therefore includes files left by earlier runs.
fits_files = sorted(glob.glob(f'{image_dir}/*.fits'))
f'{len(fits_files)} files downloaded'

### Plate-solve and catalog match

In [None]:
# Shape of the data array of the first image.
fits_utils.getdata(fits_files[0]).shape

In [None]:
# WCS of the first image; it is used below for the catalog star lookup.
wcs0 = fits_utils.getwcs(fits_files[0])
wcs0

#### Lookup catalog stars

This will use the WCS from the first image to find all the stars listed in the PANOPTES catalog. 

This can be a large number of stars and goes to $4 <= V_{mag} < 17$ so includes many sources that will not appear in an individual image.

In [None]:
catalog_stars_fn = f'{base_dir}/{sequence_id}/catalog-stars.parquet'

# Prefer the cached catalog; fall back to a fresh lookup (and cache the result)
# when the file cannot be read.
try:
    catalog_stars_df = pd.read_parquet(catalog_stars_fn).convert_dtypes()
except OSError:  # FileNotFoundError is a subclass of OSError.
    catalog_stars_df = sources.get_stars_from_wcs(
        wcs0,
        bq_client=bq_client,
        bqstorage_client=bqstorage_client,
    )
    catalog_stars_df.to_parquet(catalog_stars_fn, index=False)

In [None]:
# Inspect the catalog stars.
catalog_stars_df

In [None]:
# Cast the PICID column to a plain integer dtype.
catalog_stars_df.picid = catalog_stars_df.picid.astype('int')

In [None]:
# Histogram (no KDE curve) of the catalog V magnitudes; the trailing `;` hides the repr.
# NOTE(review): `distplot` is deprecated in seaborn >= 0.11 in favour of `histplot` —
# switch once the seaborn version of this environment is confirmed.
sb.distplot(catalog_stars_df.catalog_vmag, kde=False);

#### Lookup point sources in image

A catalog lookup is performed for the FOV of the images, as determined by the WCS of the first image. Each image from the sequence is run against `source-extractor`, with a liberal set of detection values.

In [None]:
# Files handed to source-extractor via `measured_params`: the configuration file,
# the output parameter list and the filter file.
# NOTE(review): absolute paths tied to this environment — adjust when running elsewhere.
extractor_config_path = '/home/jupyter/panoptes-pipeline/resources/source-extractor/panoptes.conf'
extractor_param_path = '/home/jupyter/panoptes-pipeline/resources/source-extractor/panoptes.param'
extractor_filter_path = '/usr/share/sextractor/default.conv'

In [None]:
# Set up output directories.
# Per-image source tables and stamp files are written under <base_dir>/<sequence_id>/sources.
sources_dir = f'{base_dir}/{sequence_id}/sources'
os.makedirs(sources_dir, exist_ok=True)

In [None]:
# Flag/value pairs passed as `measured_params` to the point-source lookup.
# NOTE(review): presumably forwarded as source-extractor command-line arguments — confirm.
measured_params = [
    '-c', extractor_config_path,
    '-PARAMETERS_NAME', extractor_param_path,
    '-FILTER_NAME', extractor_filter_path,    
    # Optional: uncomment to set an explicit catalog output name.
#     '-CATALOG_NAME', catalog_filename,
]

In [None]:
def load_sources(fits_file, sources_filename, catalog_stars, force_new=False):
    """Return the matched point sources for one image, caching them on disk.

    Args:
        fits_file: The FITS image handed to the point-source lookup.
        sources_filename: Parquet file used as the per-image cache.
        catalog_stars: Catalog stars passed to the lookup for matching.
        force_new: When True, ignore any cached file and redo the lookup.

    Returns:
        The sources table, either read back from `sources_filename` or
        freshly looked up (and then written to `sources_filename`).

    Note:
        Relies on the notebook-level `bq_client` and `measured_params`.
    """
    # Fast path: reuse the cached table unless a fresh lookup was requested.
    if os.path.exists(sources_filename) and not force_new:
        return pd.read_parquet(sources_filename)

    lookup_kwargs = dict(
        catalog_match=True,
        return_unmatched=False,
        catalog_stars=catalog_stars,
        bq_client=bq_client,
        measured_params=measured_params,
        force_new=force_new,
        # max_separation_arcsec=20,
    )
    matched = sources.lookup_point_sources(fits_file, **lookup_kwargs)

    # Cache the result so later calls can take the fast path.
    matched.to_parquet(sources_filename, index=False)

    return matched

In [None]:
# unmatched_stars = catalog_stars_df[~catalog_stars_df.picid.isin(sources_found.picid)]

In [None]:
# unmatched_stars.hvplot.scatter(x='catalog_x', y='catalog_y', color='catalog_vmag')

In [None]:
# sources_found.catalog_vmag.hist()

In [None]:
# sources_found.hvplot.scatter(x='measured_x', y='measured_y', color='measured_mag_best')

In [None]:
# When True, the source lookup, the merged metadata and the stamps below are
# regenerated even if their output files already exist.
force_new = True

In [None]:
# Look up the matched sources of every image, one parquet file per image.
for fits_file in tqdm(fits_files):
    # The IMAGEID header value names the per-image output file.
    image_id = fits_utils.getval(fits_file, 'IMAGEID')
    sources_filename = f'{sources_dir}/{image_id}-metadata.parquet'

    try:
        sources_found = load_sources(fits_file, 
                                     sources_filename, 
                                     catalog_stars_df, 
                                     force_new=force_new)
    except Exception as e:
        # Best effort: report the failing file and carry on with the next one.
        tqdm.write(f'Error: {fits_file} {e!r}')

In [None]:
# Read every per-image sources file back and stack them into one frame.
# The file list is sorted so the row order is reproducible (glob order is
# filesystem-dependent), matching how `fits_files` is built.
obs_sources_df = pd.concat([
    pd.read_parquet(fn)
    for fn
    in sorted(glob.glob(f'{sources_dir}/*metadata.parquet'))
])

### Merge data

In [None]:
# Save or load the full observation metadata
full_sources_fn = f'{base_dir}/{sequence_id}/observation-metadata.parquet'

# Rebuild when there is no saved file yet or when a rebuild is forced.
if not os.path.exists(full_sources_fn) or force_new:
    # Fix the datatypes for columns we will merge on.
    # NOTE: these assignments modify `obs_sources_df` and `images_df` in place.
    obs_sources_df.picid = obs_sources_df.picid.astype('int')
    obs_sources_df.time = pd.to_datetime(obs_sources_df.time, utc=True)

    images_df.time = pd.to_datetime(images_df.time, utc=True)    

    # Merge individual image metadata with full observation metadata.
    # Rows are paired on matching `time` and `unit_id` values.
    obs_sources_full_df = obs_sources_df.merge(images_df, on=['time', 'unit_id'])    
    
    # Save to file.
    obs_sources_full_df.to_parquet(full_sources_fn, index=False)
else:
    # Reuse the saved result; `convert_dtypes` is only applied on this path.
    obs_sources_full_df = pd.read_parquet(full_sources_fn).convert_dtypes()

In [None]:
# Cast the PICID column to a plain integer dtype (it is the grouping key below).
obs_sources_full_df.picid = obs_sources_full_df.picid.astype('int')

### Make stamps

In [None]:
# Build the xy catalog: per-source mean position over all measured frames,
# plus the variance of that position.

# Keep only the id and the measured position columns.
xy_catalog = obs_sources_full_df.filter(regex='picid|measured_x$|measured_y$')

# Mean position per source.
xy_mean = xy_catalog.groupby('picid').mean()

# Positional variance per source, renamed so it does not clash with the means.
xy_var = (
    xy_catalog
    .groupby('picid')
    .var()
    .rename(columns={'measured_x': 'x_var', 'measured_y': 'y_var'})
)

# One row per source with both the means and the variances.
xy_catalog = xy_mean.merge(xy_var, on='picid').reset_index()

In [None]:
# Summary statistics of the xy catalog, transposed so each column becomes a row.
xy_catalog.describe().T

#### Make stamp csv files

Make one CSV file per image containing the postage stamps of all sources.

In [None]:
# Mean of the per-source positional variances in x and y.
# NOTE: despite the `*_min_size` names, these values are mean variances.
x_min_size = xy_catalog.x_var.mean()
y_min_size = xy_catalog.y_var.mean()
print(f'x_var={x_min_size:.02f}')
print(f'y_var={y_min_size:.02f}')

In [None]:
# Based on above changes.
# Passed as `stamp_size` when cutting the postage stamps.
# NOTE(review): presumably a side length in pixels — confirm.
stamp_size = 10

In [None]:
# Get slices
# Cut the postage stamps of every image and collect the resulting file names.
image_slice_files = list()
for fits_file in tqdm(fits_files):
    # The IMAGEID header value names the per-image stamp file.
    image_id = fits_utils.getval(fits_file, 'IMAGEID')
    psc_fn = f'{sources_dir}/{image_id}-stamps.csv'
    csv_fn = processing.get_postage_stamps(xy_catalog, 
                                           fits_file, 
                                           stamp_size=stamp_size,
                                           output_fn=psc_fn, 
                                           force=force_new) 
    # Keep whatever `get_postage_stamps` returned for loading in the next step.
    image_slice_files.append(csv_fn)

In [None]:
# Load the stamps from all of the per-image stamp files.
stamps = processing.load_stamps(image_slice_files)

In [None]:
# Inspect the loaded stamps.
stamps