<h1 style="font-family:Impact,Arial;font-size:50px">Load</h1>
<p> This code loads the raw data from the VAST pipeline for the MWATS survey and adds additional columns that are needed for the analysis. These are:
    
    1.) Image pointing centre (for each flux measurement).
    2.) Distance from the pointing centre for each flux measurement (in degrees). 
    3.) Image gain. The multiplicative factor that has been applied to the image to give the raw_peak_flux value. 

In [1]:
import math
import dask.dataframe as dd
import pandas as pd
import numpy as np

In [2]:
def vectorized_distance_on_unit_sphere(df):
    """Angular separation between each source and its image pointing centre.

    The separation is computed with the spherical law of cosines,

        cos(d) = sin(dec1)*sin(dec2) + cos(dec1)*cos(dec2)*cos(ra1 - ra2),

    where declination plays the role of latitude and right ascension the
    role of longitude.

    Parameters
    ----------
    df : pandas.DataFrame (or any object exposing the same attributes)
        Must provide ``ra``, ``dec`` (source position) and ``im_ra``,
        ``im_dec`` (image centre), all in degrees.

    Returns
    -------
    Angular separation in degrees, element-wise, in the range [0, 180].
    """
    ra_source = np.radians(df.ra)
    ra_centre = np.radians(df.im_ra)

    dec_source = np.radians(df.dec)
    dec_centre = np.radians(df.im_dec)

    cosine = (np.sin(dec_source)*np.sin(dec_centre) +
              np.cos(dec_source)*np.cos(dec_centre)*np.cos(ra_source - ra_centre))
    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # for coincident or antipodal positions, which would make arccos return NaN.
    cosine = np.clip(cosine, -1.0, 1.0)
    return np.degrees(np.arccos(cosine))

## Load the data

In [3]:
%%time
# Read the raw parquet data set through dask, materialise it as an in-memory
# pandas DataFrame, and discard the columns listed below in the same step.
raw_data = (
    dd.read_parquet('mwats_27_sept_full.parq')
    .compute()
    .drop(columns=[
        'fit', 'blind_detection', 'polarisation', 'band', 'image_id',
        'extname', 'cube_id', 'good_fit', 'fit_flags', 'flux_gain',
        'pbcorr', 'peak_pixel',
    ])
)
# Scale the peak flux by 1/1000 (conversion to Jy; presumably the pipeline
# reports mJy -- confirm against the pipeline output).
raw_data['raw_peak_flux'] = raw_data['raw_peak_flux'] * (1.0/1000.0)
# Parsed copy of the 'time' column, stored alongside the original.
raw_data['datetime'] = pd.to_datetime(raw_data['time'])

CPU times: user 25.9 s, sys: 18.9 s, total: 44.8 s
Wall time: 51.8 s


## Load the data file containing the locations of the image centres

In [4]:
# Read the comma-separated table of image pointing centres. The separator is
# passed by keyword: pandas >= 2.0 accepts only the file path positionally in
# read_table, so the positional form raises TypeError there.
pointing = pd.read_table('pointing_centres.txt', sep=',')
# Remove surrounding whitespace from the image names (presumably so that they
# match the 'cube_name' values used as the merge key later on).
pointing['Image'] = pointing['Image'].str.strip()

## Merge the image centres data file

In [5]:
%%time
# Attach the pointing centre of the parent image to every measurement.
# The default (inner) join keeps only rows whose 'cube_name' has a matching
# 'Image' entry in the pointing table.
raw_data = (
    raw_data
    .merge(pointing, left_on='cube_name', right_on='Image')
    # Image-centre coordinates get an 'im_' prefix.
    .rename(columns={'RA': 'im_ra', 'DEC': 'im_dec'})
    # 'cube_name' is redundant now that the table carries 'Image'.
    .drop(columns=['cube_name'])
)

CPU times: user 31 s, sys: 41.3 s, total: 1min 12s
Wall time: 1min 19s


## Calculate the distance between the source and the image centre (then create the column)

In [6]:
%%time
# Per-row angular offset of each measurement from its image pointing centre,
# stored as a new 'distance' column.
raw_data['distance'] = raw_data.pipe(vectorized_distance_on_unit_sphere)

CPU times: user 707 ms, sys: 776 ms, total: 1.48 s
Wall time: 1.51 s


## Add in the gains to the data file

In [7]:
# Read the comma-separated table of per-image gains. The separator is passed
# by keyword: pandas >= 2.0 accepts only the file path positionally in
# read_table, so the positional form raises TypeError there.
gains = pd.read_table('all_gains.txt', sep=',')
# Join the gains onto the measurements via the shared 'Image' column. The
# default (inner) join drops measurements whose image has no gain entry.
raw_data = pd.merge(raw_data, gains, on='Image')

## Save the reduced data file

In [8]:
# Write the merged table (with pointing centres, distance and gains) to a
# Feather file in the current working directory.
# NOTE(review): to_feather presumably needs a default integer index; the
# preceding merges should leave one -- confirm if the pipeline changes.
raw_data.to_feather('mwats_raw_data.fth')