In [1]:
# TODO
#   1. open tphot, egs_candels, and egs_merged to read
#   2. match RA and DEC between egs_merged and tphot
#   3. get ID from egs_merged for matches (#2) and match to egs_candels
#   4. make catalog with all data
#   5. make catalog with only specific columns

# Note: there may be more than one match in #2. Make note of these sources; we will have to match them by eye.

In [2]:
## imports
# standard library
import os
# astronomy
from astropy.io import fits
from astropy.io import ascii
from astropy.table import Table
from astropy.coordinates import SkyCoord
import astropy.units as u
# plotting
import matplotlib.pyplot as plt
# data 
import numpy as np
import pandas as pd 

# 1. Read Data

In [3]:
# file paths
# Directory that holds the input catalogs. The default is the original
# hard-coded location; set the AGNERDS_CATALOGS environment variable to use a
# different directory without editing the notebook.
path_catalogs = os.environ.get(
    'AGNERDS_CATALOGS',
    'C:\\Users\\polar\\OneDrive - The University of Kansas\\AGNerds\\Catalogs',
)

In [4]:
# open tphot
# build the file path with os.path.join so the separator matches the platform
tphot_data = ascii.read(os.path.join(path_catalogs, 'tphot.cat'))
tphot_cols = tphot_data.colnames

# show table
# tphot_data.show_in_notebook()
# print(tphot_cols)

# important columns:
# name = 'ra'
# name = 'dec'

In [5]:
# open egs_merged 
# build the file path with os.path.join so the separator matches the platform
egs_merged_data = Table.read(os.path.join(path_catalogs, 'egs_merged_v1.1.fits'))
egs_merged_cols = egs_merged_data.columns

# print all columns
# for col in egs_merged_cols:
#     print(col)

# important columns:
# name = 'ID'; format = 'K'
# name = 'RA'; format = 'D'; unit = 'deg'
# name = 'DEC'; format = 'D'; unit = 'deg'
# name = 'z_best'; format = 'D'

In [6]:
# open egs_candels 
# build the file path with os.path.join so the separator matches the platform
egs_candels_data_bySrc = np.genfromtxt(os.path.join(path_catalogs, 'egs_candels_checkage_official.ir_fitting'))
# transposed view: first axis is the column, second axis is the source
egs_candels_data_byCol = egs_candels_data_bySrc.T
# manually type column names
# NOTE: the label 'er' is repeated, so it cannot be used on its own to select a single column
egs_candels_cols = ['Source','z','L(8-1000)','er','L(3-1100)','er','L1_1','L1_2','L1_3','L1_4','L2_1','L2_2','L2_3','L2_4','L_008','er',
                    'L_012','er','L_015','er','L_024','er','SFR_TIR','er','SFR_008','er','SFR_012','er','SFR_015','er','SFR_024','er',
                    'SFRonly24','SFR_R+09','SFR_R+13','SFR_E+11','er','SFR_W+11','SFR1600','SFR2800','UV_beta','A(V)','SFR1600c',
                    'SFR2800c','A(V)e','SFR1600ce','SFR2800ce','qPAH_DL+07','Umin_DL+07','gamma_DL+07','Mdust_DL+07','temp_1','temp_2',
                    'temp_3','temp_4','temp_only24','factor_1','factor_2','factor_3','factor_4','factor_only24','F(24)','R+09_valA',
                    'R+09_valB','R+13_valA','R+13_valB','W+11_val','l_IRAC_58','f_IRAC_58','l_IRAC_80','f_IRAC_80','l_MIPS24','f_MIPS24',
                    'l_MIPS70','f_MIPS70','l_PACS_100','f_PACS_100','l_PACS_160','f_PACS_160','l_SPIRE_250','f_SPIRE_250','l_SPIRE_350',
                    'f_SPIRE_350','l_SPIRE_500','f_SPIRE_500','plot_maxy','Nfit','Nobs']

# important columns (access by index)
EGSCAN_SOURCE = 0 # Source    1

# 2. Match RA and DEC 

In [7]:
## sky positions of both catalogs

# tphot: pull RA/DEC out as plain arrays, then build the coordinate object
tphot_RA, tphot_DEC = (np.array(tphot_data[name]) for name in ('ra', 'dec'))
tphot_coord = SkyCoord(ra=tphot_RA*u.deg, dec=tphot_DEC*u.deg)

# egs_merged: same procedure with its upper-case column names
egs_merged_RA, egs_merged_DEC = (np.array(egs_merged_data[name]) for name in ('RA', 'DEC'))
egs_merged_coord = SkyCoord(ra=egs_merged_RA*u.deg, dec=egs_merged_DEC*u.deg)

In [8]:
# report how many sources each catalog contributes
for label, values in (
    ('Number of tphot sources:\t', tphot_RA),
    ('Number of egs_merged sources:\t', egs_merged_RA),
):
    print(label, len(values))

Number of tphot sources:	 1734
Number of egs_merged sources:	 41457


In [9]:
## cross-match egs_merged against tphot on the sky
# DOC: https://docs.astropy.org/en/stable/coordinates/matchsep.html

# Call pattern: idx, d2d, d3d = c.match_to_catalog_sky(catalog)
#   idx : for every source in c (egs_merged), index of the closest source in catalog (tphot)
#   d2d : on-sky distance to that closest source
#   d3d : 3-dimensional distance to that closest source
idx, d2d, d3d = egs_merged_coord.match_to_catalog_sky(tphot_coord)

# keep only pairs closer than the maximum allowed separation
max_sep = 1.0 * u.arcsec
# max_sep = 0.5 * u.arcsec
sep_constraint = d2d < max_sep  # boolean mask over 'c' (egs_merged_coord)

# matched egs_merged sources, and the tphot index/source paired with each one
egs_merged_coord_matches = egs_merged_coord[sep_constraint]
idx_sep = idx[sep_constraint]   # indices into 'catalog' (tphot)
tphot_coord_matches = tphot_coord[idx_sep]

# both sides of the match must have the same length
for matched in (egs_merged_coord_matches, tphot_coord_matches):
    print('Number of matches:\t', len(matched))

Number of matches:	 1661
Number of matches:	 1661


In [10]:
# spot check: the two coordinates printed should be nearly identical
i = 12
for coords in (egs_merged_coord_matches, tphot_coord_matches):
    print(coords[i])

<SkyCoord (ICRS): (ra, dec) in deg
    (215.054167, 52.89869738)>
<SkyCoord (ICRS): (ra, dec) in deg
    (215.054131, 52.89869)>


# 3. Find Duplicate Matches

In [11]:
# # make test list 
# testlist = np.array([3,1,2,3,3,3,4,4])
# print(testlist)

# # make mask of unique sources 
# maskt = np.zeros(len(testlist), dtype=bool)
# maskt[np.unique(testlist, return_index=True)[1]] = True

# # get value of duplicates 
# duptestlist = np.unique(testlist[~maskt])
# # print(duptestlist)

# # set all non-unique sources to False 
# for dup in duptestlist :
#     inst = np.where(testlist == dup)
#     maskt[inst] = False

# # remove duplicates
# uniquetestlist = testlist[maskt]
# print(uniquetestlist)

# # show duplicates
# duplicatetestlist = testlist[~maskt]
# print(duplicatetestlist)

In [12]:
# flag the first occurrence of every tphot index among the matches
first_positions = np.unique(idx_sep, return_index=True)[1]
seen_first = np.zeros(len(idx_sep), dtype=bool)
seen_first[first_positions] = True

# a tphot index that appears again after its first occurrence is a duplicate
duplicates = np.unique(idx_sep[~seen_first])

# mask is True only where the tphot index is matched exactly once
mask = ~np.isin(idx_sep, duplicates)

# split the matched tphot indices into the two groups
idx_sep_duplicates = idx_sep[~mask]
idx_sep_unique = idx_sep[mask]

# print info
for label, group in (
    ('Number of unique:\t', idx_sep_unique),
    ('Number of duplicates:\t', idx_sep_duplicates),
):
    print(label, len(group))
print('Duplicate sources:\n', idx_sep_duplicates)

Number of unique:	 1606
Number of duplicates:	 55
Duplicate sources:
 [1412 1412 1427 1427 1397 1273 1273  801  660  930  930 1389 1389  970
  127  791  791 1279  983  771  771  942  942  973  719 1106  789 1106
 1092 1204 1204 1140 1075   38   38   34   34 1427 1397  801  660  379
  379  970  127 1279  983  973  719  789 1092 1140 1075 1058 1058]


In [13]:
# positions in egs_merged of every source that passed the separation cut
egs_merged_i = np.flatnonzero(sep_constraint)
# same unique/duplicate split as for the tphot indices
egs_merged_i_duplicates = egs_merged_i[~mask]
egs_merged_i_unique = egs_merged_i[mask]

# print info
for label, group in (
    ('Number of unique:\t', egs_merged_i_unique),
    ('Number of duplicates:\t', egs_merged_i_duplicates),
):
    print(label, len(group))
print('Duplicates:\n', egs_merged_i_duplicates)

Number of unique:	 1606
Number of duplicates:	 55
Duplicates:
 [ 8145  8213  9083  9116  9891 12029 12149 12519 12723 12730 12833 12999
 13046 13688 13808 14663 14683 15291 15985 16409 16454 16515 16528 16710
 18318 19018 19058 19070 19224 19521 19596 19795 19911 20003 20122 20378
 20393 34656 34858 35649 35703 35878 35903 35988 36045 36528 36661 36920
 37360 37593 37680 37791 37879 38123 38129]


In [14]:
# coordinates of the uniquely matched pairs
tphot_coord_unique = tphot_coord[idx_sep_unique]
egs_merged_coord_unique = egs_merged_coord[egs_merged_i_unique]

# spot check: both coordinates should be nearly identical
i = 1
for coords in (egs_merged_coord_unique, tphot_coord_unique):
    print(coords[i])

<SkyCoord (ICRS): (ra, dec) in deg
    (215.06161456, 52.90150667)>
<SkyCoord (ICRS): (ra, dec) in deg
    (215.061606, 52.901508)>


In [15]:
# coordinates of the pairs whose tphot source was matched more than once
tphot_coord_duplicates = tphot_coord[idx_sep_duplicates]
egs_merged_coord_duplicates = egs_merged_coord[egs_merged_i_duplicates]

# spot check one duplicate pair
i = 3
for coords in (egs_merged_coord_duplicates, tphot_coord_duplicates):
    print(coords[i])

<SkyCoord (ICRS): (ra, dec) in deg
    (215.01528726, 52.91275212)>
<SkyCoord (ICRS): (ra, dec) in deg
    (215.015239, 52.912741)>


# 4. Match ID

In [16]:
# IDs used to link egs_merged to egs_candels
# egs_candels: the 'Source' column, selected by its column index
egs_candels_ID = egs_candels_data_byCol[EGSCAN_SOURCE]
# egs_merged: the full ID column, then only the rows with a unique tphot match
egs_merged_ID = egs_merged_data['ID']
egs_merged_ID_tphotMatches = egs_merged_ID[egs_merged_i_unique]

In [17]:
# Used to match by id (code from Connor Auge)
def match(a, b):
    """Return index arrays that pair up the values common to `a` and `b`.

    Parameters
    ----------
    a, b : array-like
        One-dimensional collections of IDs. Lists are accepted and converted
        with ``np.asarray``.

    Returns
    -------
    (a_idx, b_idx) : tuple of integer ndarrays
        ``a_idx`` indexes `a` and ``b_idx`` indexes `b`. Both are ordered by
        ascending value, so ``a[a_idx]`` and ``b[b_idx]`` line up element by
        element when the values are unique within each input. Both arrays are
        empty when there is no common value.

    Notes
    -----
    Values repeated within `a` or `b` are each kept, so the two returned
    arrays can differ in length and no longer line up one-to-one.
    """
    a = np.asarray(a)
    b = np.asarray(b)

    b_values = set(b)
    a_values = set(a)

    # dtype=int keeps an empty result usable as an index array; a bare
    # np.asarray([]) is float64 and cannot be used for indexing
    in_b = np.asarray([i for i, v in enumerate(a) if v in b_values], dtype=int)
    in_a = np.asarray([i for i, v in enumerate(b) if v in a_values], dtype=int)

    # sort both sides by value so matching entries share the same position
    order_a = np.argsort(a[in_b])
    order_b = np.argsort(b[in_a])
    return in_b[order_a], in_a[order_b]

In [18]:
# match 
# key_merged indexes egs_merged_ID_tphotMatches, key_candels indexes egs_candels_ID
key_merged, key_candels = match(egs_merged_ID_tphotMatches, egs_candels_ID)

# apply match key
egs_merged_ID_tphotMatches_egsMatches = egs_merged_ID_tphotMatches[key_merged]
egs_candels_ID_egsMatches = egs_candels_ID[key_candels]

# fail loudly if the two ID lists do not agree row for row
# (for example when an ID is repeated in one of the catalogs)
assert np.array_equal(
    np.asarray(egs_merged_ID_tphotMatches_egsMatches),
    np.asarray(egs_candels_ID_egsMatches),
), 'egs_merged and egs_candels IDs are not aligned after matching'

In [19]:
# spot check: the two IDs printed should be the same number
i = 1
for matched_ids in (egs_merged_ID_tphotMatches_egsMatches, egs_candels_ID_egsMatches):
    print(matched_ids[i])

1498
1498.0


In [20]:
# both matched ID lists should have the same length
for label, matched_ids in (
    ('Number of egs_merged:\t', egs_merged_ID_tphotMatches_egsMatches),
    ('Number of egs_candels:\t', egs_candels_ID_egsMatches),
):
    print(label, len(matched_ids))

Number of egs_merged:	 1606
Number of egs_candels:	 1606


# Build Catalog

In [21]:
# helper indexing
# match() orders both of its keys by ascending ID, so key_candels is in ID
# order. Apply key_merged to the tphot and egs_merged row indices as well, so
# that row k of all three selections refers to the same source. It also drops
# any source without an egs_candels counterpart.
tphot_i         = idx_sep_unique[key_merged]
egs_merged_i    = egs_merged_i_unique[key_merged]
egs_candles_i   = key_candels

# verify that all lengths match
print(len(tphot_i))
print(len(egs_merged_i))
print(len(egs_candles_i))
assert len(tphot_i) == len(egs_merged_i) == len(egs_candles_i), \
    'row selections for tphot, egs_merged and egs_candels differ in length'

# save number of matches 
n_matches = len(tphot_i)

1606
1606
1606


In [22]:
# tphot as a pandas dataframe
tphot_df = tphot_data.to_pandas()
# keep the matched rows only (positional selection, all columns)
tphot_df_matched = tphot_df.iloc[tphot_i, :]
print(tphot_df_matched.shape)  # verify shape

(1606, 17)


In [23]:
# egs_merged as a pandas dataframe
egs_merged_df = egs_merged_data.to_pandas()
# keep the matched rows only (positional selection, all columns)
egs_merged_df_matched = egs_merged_df.iloc[egs_merged_i, :]
print(egs_merged_df_matched.shape)  # verify shape

(1606, 707)


In [24]:
# egs_candels as a pandas dataframe, labelled with the manually typed column names
egs_candels_df = pd.DataFrame(data=egs_candels_data_bySrc, columns=egs_candels_cols)
# the IDs came out of the numeric array as floats; store them as integers
egs_candels_df['Source'] = egs_candels_df['Source'].astype(int)
# keep the matched rows only (positional selection, all columns)
egs_candels_df_matched = egs_candels_df.iloc[egs_candles_i, :]
print(egs_candels_df_matched.shape)  # verify shape

# # TODO fix this?
# # these temp columns did not read from file correctly...
# print(egs_candels_data_byCol[55])
# print(egs_candels_df['temp_only24'])
# set dtype to object -- handle any data types. works now, but tricky to use values later... 

(1606, 88)


In [25]:
# put the three matched tables side by side
# each index is reset first so rows are paired by position, not by their original row labels
matched_frames = [
    frame.reset_index(drop=True)
    for frame in (tphot_df_matched, egs_merged_df_matched, egs_candels_df_matched)
]
full_table = pd.concat(matched_frames, axis=1)

print(full_table.shape)  # verify shape

(1606, 812)


In [26]:
# # print col names
# for col in full_table.columns :
#     print(col)

In [27]:
# write the full catalog to a csv file in the current working directory
all_columns_csv = 'JWST_EGS_UniqueSources_AllColumns.csv'
full_table.to_csv(all_columns_csv, index=False)
# NOTE columns that are not float datatype are 'nan'

# Reduce Table

In [28]:
# Columns kept in the reduced catalog, grouped by the table they come from.
# The order of this list is the column order of the reduced table.
wantedCols = [
    # ========================  tphot (all cols)
    'field',                    
    'ra', 
    'dec', 
    'f560w_uJy', 
    'f560w_uJy_err', 
    'f770w_uJy', 
    'f770w_uJy_err', 
    'f1000w_uJy', 
    'f1000w_uJy_err', 
    'f1280w_uJy', 
    'f1280w_uJy_err', 
    'f1500w_uJy', 
    'f1500w_uJy_err', 
    'f1800w_uJy', 
    'f1800w_uJy_err', 
    'f2100w_uJy', 
    'f2100w_uJy_err',
    # ========================  egs_merged
    'ID', 
    'z_best',
    'zbest',                                            # TODO: open question - why are there two "z best" columns?
    'RA',
    'DEC',
    'IRAC_CH1_FLUX',
    'IRAC_CH1_FLUXERR',
    'IRAC_CH2_FLUX',
    'IRAC_CH2_FLUXERR',
    'IRAC_CH3_FLUX',
    'IRAC_CH3_FLUXERR',
    'IRAC_CH4_FLUX',
    'IRAC_CH4_FLUXERR',
    'IRAC_CH3_V08_FLUX',
    'IRAC_CH3_V08_FLUXERR', 
    # ========================  egs_candels
    'Source',                   
    'z',                                                # TODO: open question - a third redshift column; how does it relate to the two above?
    'l_IRAC_58',    # IRAC CH 3                         # TODO: open question - what do the l_ and f_ prefixes mean?
    'f_IRAC_58',
    'l_IRAC_80',    # IRAC CH 4
    'f_IRAC_80',
    'l_MIPS24',
    'f_MIPS24',
    'l_MIPS70',
    'f_MIPS70',
    'l_PACS_100',
    'f_PACS_100',
    'l_PACS_160',
    'f_PACS_160',
    'l_SPIRE_250',
    'f_SPIRE_250',
    'l_SPIRE_350',
    'f_SPIRE_350',
    'l_SPIRE_500',
    'f_SPIRE_500'
]
# any more?

In [29]:
# keep every row, but only the wanted columns (a missing name raises KeyError)
reduced_table = full_table.loc[:, wantedCols]

# print cols (test)
# print(reduced_table.columns)

In [30]:
# write the reduced catalog to a csv file in the current working directory
reduced_columns_csv = 'JWST_EGS_UniqueSources_ReducedColumns.csv'
reduced_table.to_csv(reduced_columns_csv, index=False)