In [94]:
#!/usr/bin/env python
# coding: utf-8

# Import required libraries and modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from astropy.coordinates import SkyCoord
import astropy.units as u
from tqdm import tqdm

In [95]:

# Define a threshold for matching errors
space_match_threshold = 1 * u.arcsec
MJD_tolerance = 31./(24*3600) #31 sec in units of day 

# Define file paths.
sum_path = {}
sum_path["star"] = '../truth_star/truth_star_summary_v1-0-0.parquet'
#'/sdf/data/rubin/shared/dc2_run2.2i_truth/truth_star/truth_star_summary_v1-0-0.parquet'
sum_path["sn"] = "../truth_sn/truth_sn_summary_v1-0-0.parquet" 
#'/sdf/data/rubin/shared/dc2_run2.2i_truth/truth_sn/truth_sn_summary_v1-0-0.parquet'

var_path = {}
var_path["star"] = '../truth_star/truth_star_variability_v1-0-0.parquet'
#'/sdf/data/rubin/shared/dc2_run2.2i_truth/truth_star/truth_star_variability_v1-0-0.parquet'
var_path["sn"] = '../truth_sn/truth_sn_variability_v1-0-0.parquet'
#'/sdf/data/rubin/shared/dc2_run2.2i_truth/truth_sn/truth_sn_variability_v1-0-0.parquet'

detection_csv_pth = "../sources_with_labels.csv" #'exported_sources.csv'

#get DIA detections
dia_detections = pd.read_csv(detection_csv_pth, index_col="diaSourceId") #formerly known as exported_csv
#only for now based on the csv we are reading
dia_detections.drop(["real"], axis=1, inplace=True)    

dia_detections

Unnamed: 0_level_0,ra,dec,midpointMjdTai,type
diaSourceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1257927201521665,55.760339,-32.260622,59583.125051,
1257927201521666,55.674078,-32.283857,59583.125051,
1257927201521667,55.552914,-32.306395,59583.125051,
1257927201521668,55.547689,-32.309278,59583.125051,
1257927201521669,55.570127,-32.306400,59583.125051,
...,...,...,...,...
660667525163384915,55.889519,-32.485637,61392.194195,star
661047079476396040,55.863218,-32.167598,61393.204087,star
662500331589992590,55.971559,-32.358853,61404.195949,star
662500331589992596,55.881022,-32.482719,61404.195949,star


## load truth catalogs

In [96]:
# Get mind and max ra and dec values to filter out unnecessary records.
max_exp_ra, min_exp_ra = dia_detections.ra.max(), dia_detections.ra.min()
max_exp_dec, min_exp_dec = dia_detections.dec.max(), dia_detections.dec.min()


catalog = {}
result_sum = {}

# Stage 1: Match sources in Space.
for s in ["star", "sn"]:
    # Read Parquet and CSV files to begin ground truth derivation.
    result_sum[s] = pd.read_parquet(sum_path[s])

    # Keep only those records from summary tables which are within the max ra and dec values in the exported sources.
    result_sum[s] = result_sum[s][(result_sum[s]['ra'] >= min_exp_ra) & (result_sum[s]['ra'] <= max_exp_ra) &\
                                    (result_sum[s]['dec'] >= min_exp_dec) & (result_sum[s]['dec'] <= max_exp_dec)]


    # Initialize astropy.coordinates.SkyCoord class for matching in space.
    catalog[s] = SkyCoord(ra=result_sum[s].ra, dec=result_sum[s].dec, unit=u.deg)

# Match exported sources with stars and supernovae.
detections_cat = SkyCoord(ra=dia_detections.ra, dec=dia_detections.dec, unit=u.deg)

In [97]:
# By default, set on_source = 0 and real=0 (bogus) for all values in the exported sources.
dia_detections['on_source'] = 0
dia_detections['real'] = 0
dia_detections['type'] = None

## spatial crossmatch

In [98]:
# we want to put the DIA catalog first so we get a yes or no truth match for every DIASource
star_idx, star_d2d, star_d3d = detections_cat.match_to_catalog_sky(catalog['star'])
sn_idx, sn_d2d, sn_d3d = detections_cat.match_to_catalog_sky(catalog['sn'])

star_mask = star_d2d < space_match_threshold #remove matches that are too far
sn_mask = sn_d2d < space_match_threshold #remove matches that are too far

print(f"{np.sum(star_mask)} of {len(detections_cat)} stars matched after applying spatial threshold")
print(f"{np.sum(sn_mask)} of {len(detections_cat)} sne matched after applying spatial threshold")

7155 of 25446 stars matched after applying spatial threshold
146 of 25446 sne matched after applying spatial threshold


In [99]:


# Get all matched stars
matched_star_idx = star_idx[star_mask] #index in stars of matched dia_detections
print(f"Number of matched stars in Stage #1: {len(matched_star_idx)}")

# Get all matched supernovae
matched_sn_idx = sn_idx[sn_mask] #index in sn_cat of matched dia_detections
print(f"Number of matched sne in Stage #1: {len(matched_sn_idx)}")

dia_idx_stars = dia_detections.index[star_mask]
dia_idx_sn = dia_detections.index[sn_mask]

# Assign the variability sources catalog id to the detections
dia_detections["id"] = None
dia_detections.loc[dia_idx_stars,"id"] = result_sum["star"].iloc[matched_star_idx]["id"].to_numpy()
dia_detections.loc[dia_idx_sn,"id"] = result_sum['sn'].iloc[matched_sn_idx]["id"].to_numpy()



Number of matched stars in Stage #1: 7155
Number of matched sne in Stage #1: 146


In [100]:
dia_detections.head()

Unnamed: 0_level_0,ra,dec,midpointMjdTai,type,on_source,real,id
diaSourceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1257927201521665,55.760339,-32.260622,59583.125051,,0,0,30321355720.0
1257927201521666,55.674078,-32.283857,59583.125051,,0,0,
1257927201521667,55.552914,-32.306395,59583.125051,,0,0,
1257927201521668,55.547689,-32.309278,59583.125051,,0,0,
1257927201521669,55.570127,-32.3064,59583.125051,,0,0,


In [101]:
# The spatially matched detections get on_source = 1

dia_detections.loc[dia_idx_sn, "on_source"] = 1
dia_detections.loc[dia_idx_stars , "on_source"] = 1
dia_detections.loc[dia_idx_sn, "type"] = "sn"
dia_detections.loc[dia_idx_stars, "type"] = "star"
dia_detections

Unnamed: 0_level_0,ra,dec,midpointMjdTai,type,on_source,real,id
diaSourceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1257927201521665,55.760339,-32.260622,59583.125051,star,1,0,30321355720
1257927201521666,55.674078,-32.283857,59583.125051,,0,0,
1257927201521667,55.552914,-32.306395,59583.125051,,0,0,
1257927201521668,55.547689,-32.309278,59583.125051,,0,0,
1257927201521669,55.570127,-32.306400,59583.125051,,0,0,
...,...,...,...,...,...,...,...
660667525163384915,55.889519,-32.485637,61392.194195,star,1,0,31411443281
661047079476396040,55.863218,-32.167598,61393.204087,star,1,0,31102009372
662500331589992590,55.971559,-32.358853,61404.195949,star,1,0,31405685742
662500331589992596,55.881022,-32.482719,61404.195949,star,1,0,31411442918


In [102]:
# Print a summary at the end of first round of matching.
print("Summary at the end of First Stage:")
print(f"detections on a source", dia_detections["on_source"].sum(), "\n")
print(f"class detection: {dia_detections.groupby('type').count().iloc[:,0]}")

Summary at the end of First Stage:
detections on a source 7301 

class detection: type
sn       146
star    7155
Name: ra, dtype: int64


In [103]:
# Stage 2: Match sources in time.

matched = {}
matched["sn"] = dia_detections.loc[dia_idx_sn]
matched["star"] = dia_detections.loc[dia_idx_stars]

# make a column to report cases where there is no variability entry for that truth id
dia_detections['not_in_truth_var'] = False
# make a column to store minimum MJD difference
dia_detections['min_mjd_offset'] = np.inf

for s in ["sn", "star"]:

    print(f"working on class: {s}")
    
    # Get a list of all the unique MJDs of sources that matched in the previous stage for the object type.
    mjd_matched_in_space = matched[s].midpointMjdTai.unique()

    # Get min and max MJD values required for matching.
    max_mjd, min_mjd = mjd_matched_in_space.max(), mjd_matched_in_space.min()

    # Read star/sn lightcurve variability parquet for the object type
    df_var = pd.read_parquet(var_path[s])
    
    # Filter out records with unwanted MJDs.
    df_var = df_var[(df_var.MJD >= min_mjd) & (df_var.MJD <= max_mjd)]
    print(f"need to examine {len(df_var)} variability entries")
    
    for detected in tqdm(matched[s].index): #loop over indices of on_source detection 
        mask_matching_ids = df_var.id == matched[s].loc[detected, 'id'] #mask for sources with matching id in variability file 
        if np.sum(mask_matching_ids) == 0:
            dia_detections.loc[detected, 'not_in_truth_var'] = True
        else:
            dia_detections.loc[detected, 'min_mjd_offset'] = np.min(np.abs(df_var[mask_matching_ids].MJD - 
                                                                           matched[s].loc[detected].midpointMjdTai))
            dia_detections.loc[detected, "real"] = dia_detections.loc[detected, 'min_mjd_offset'] <= MJD_tolerance
    del df_var

dia_detections[dia_detections.real == 1]

working on class: sn
need to examine 12757691 variability entries


100%|████████████████████████████████████████| 146/146 [00:01<00:00, 114.90it/s]


working on class: star
need to examine 380870139 variability entries


100%|███████████████████████████████████████| 7155/7155 [24:33<00:00,  4.85it/s]


Unnamed: 0_level_0,ra,dec,midpointMjdTai,type,on_source,real,id,not_in_truth_var,min_mjd_offset
diaSourceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3162235706802184,55.790623,-32.270147,59588.095548,star,1,True,30321355767,False,0.000347
3162235706802192,55.792149,-32.292336,59588.095548,star,1,True,30830343259,False,0.000347
6683021725925403,55.790364,-32.269960,59597.081036,star,1,True,30321355767,False,0.000347
6683021725925406,55.792260,-32.292177,59597.081036,star,1,True,30830343259,False,0.000347
6686731503927339,55.792412,-32.292530,59597.084203,star,1,True,30830343259,False,0.000347
...,...,...,...,...,...,...,...,...,...
645816020564443153,55.861250,-32.394682,61343.105723,star,1,True,31405689453,False,0.000100
647699403989058266,55.679088,-32.198748,61356.103080,star,1,True,31405664797,False,0.000102
659732781902856397,55.670166,-32.156024,61389.170148,star,1,True,31405659819,False,0.000347
660667525163384893,55.967617,-32.464952,61392.194195,star,1,True,31102016005,False,0.000347


In [105]:
dia_detections.loc[(dia_detections.real == 0) & (dia_detections.on_source == 1), 
['type','not_in_truth_var', 'min_mjd_offset'] ]

Unnamed: 0_level_0,type,not_in_truth_var,min_mjd_offset
diaSourceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1257927201521665,star,True,inf
1257927201521673,star,False,4.97015
1257927201521688,star,True,inf
1257927201521708,star,True,inf
1257927201521726,star,True,inf
...,...,...,...
660667525163384904,star,True,inf
661047079476396040,star,True,inf
662500331589992590,star,True,inf
662500331589992596,star,True,inf


next step: look at image stamps to see if we can understand the behavior of some of this cases where there's no match in variability (either by MJD or by id).