In [14]:
#!/usr/bin/env python
# coding: utf-8

# Import required libraries and modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from astropy.coordinates import SkyCoord
import astropy.units as u

# Define file paths.
star_sum_path = '../truth_star/truth_star_summary_v1-0-0.parquet'
#'/sdf/data/rubin/shared/dc2_run2.2i_truth/truth_star/truth_star_summary_v1-0-0.parquet'
#'/sdf/data/rubin/shared/dc2_run2.2i_truth/truth_star/truth_star_variability_v1-0-0.parquet'
sn_sum_path = "../truth_sn/truth_sn_summary_v1-0-0.parquet" #'/sdf/data/rubin/shared/dc2_run2.2i_truth/truth_sn/truth_sn_summary_v1-0-0.parquet'

var_path = {}
var_path["star"] = '../truth_star/truth_star_variability_v1-0-0.parquet'
var_path["sn"] = '../truth_sn/truth_sn_variability_v1-0-0.parquet'
#'/sdf/data/rubin/shared/dc2_run2.2i_truth/truth_sn/truth_sn_variability_v1-0-0.parquet'
exported_csv_pth = "../sources_with_labels_v1.csv" #'exported_sources.csv'

# Read Parquet and CSV files to begin ground truth derivation.
result_star_sum = pd.read_parquet(star_sum_path)
result_sn_sum = pd.read_parquet(sn_sum_path)
dia_detections = pd.read_csv(exported_csv_pth, index_col="diaSourceId") #formerly known as exported_csv
#only for now based on the csv we are reading
dia_detections.drop(["real"], axis=1, inplace=True)
# Stage 1: Match sources in Space.

# Get mind and max ra and dec values to filter out unnecessary records.
max_exp_ra, min_exp_ra = dia_detections.ra.max(), dia_detections.ra.min()
max_exp_dec, min_exp_dec = dia_detections.dec.max(), dia_detections.dec.min()
print(max_exp_ra)

# Keep only those records from summary tables which are within the max ra and dec values in the exported sources.
filtered_star_sum = result_star_sum[(result_star_sum['ra'] >= min_exp_ra) & (result_star_sum['ra'] <= max_exp_ra) &\
                                    (result_star_sum['dec'] >= min_exp_dec) & (result_star_sum['dec'] <= max_exp_dec)]

filtered_sn_sum = result_sn_sum[(result_sn_sum['ra'] >= min_exp_ra) & (result_sn_sum['ra'] <= max_exp_ra) &\
                                (result_sn_sum['dec'] >= min_exp_dec) & (result_sn_sum['dec'] <= max_exp_dec)]

# Initialize astropy.coordinates.SkyCoord class for matching in space.
exported_cat = SkyCoord(ra=dia_detections.ra, dec=dia_detections.dec, unit=u.deg)
star_cat = SkyCoord(ra=filtered_star_sum.ra, dec=filtered_star_sum.dec, unit=u.deg)
sn_cat = SkyCoord(ra=filtered_sn_sum.ra, dec=filtered_sn_sum.dec, unit=u.deg)

# Match exported sources with stars and supernovae.
star_idx, star_d2d, star_d3d = star_cat.match_to_catalog_sky(exported_cat)
sn_idx, sn_d2d, sn_d3d = sn_cat.match_to_catalog_sky(exported_cat)

# Define a threshold for matching errors
threshold = 1 * u.arcsec
star_mask = star_d2d < threshold
sn_mask = sn_d2d < threshold

# Get all matched stars
matched_star_idx = star_idx[star_mask]
#matched_stars = exported_csv.iloc[matched_star_idx]
print(f"Number of matched stars in Stage #1: {len(matched_star_idx)}")

# Get all matched supernovae
matched_sn_idx = sn_idx[sn_mask]
#matched_sn = exported_csv.iloc[matched_sn_idx]
print(f"Number of matched sne in Stage #1: {len(matched_sn_idx)}")

# By default, set real=0 (bogus) for all values in the exported sources.
dia_detections['detected'] = 0
dia_detections['real'] = 0
dia_detections['type'] = None

dia_detections

55.95468796724601
Number of matched stars in Stage #1: 67
Number of matched sne in Stage #1: 3


Unnamed: 0_level_0,Unnamed: 0,ra,dec,detected,real,type
diaSourceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
506428274000265217,0,55.795261,-32.459151,0,0,
506428274000265218,1,55.689182,-32.489502,0,0,
506428274000265219,2,55.683139,-32.490595,0,0,
506428274000265220,3,55.753450,-32.456798,0,0,
506428274000265221,4,55.838220,-32.416279,0,0,
...,...,...,...,...,...,...
527736141479149714,293,55.922485,-32.442749,0,0,
527736141479149715,294,55.861709,-32.447215,0,0,
527736141479149717,295,55.696996,-32.434769,0,0,
527736141479149719,296,55.693576,-32.439695,0,0,


In [15]:
#FBB
listid_sn = dia_detections.iloc[matched_sn_idx]
listid_stars = dia_detections.iloc[matched_star_idx]

listid_stars.index

Index([527736141479149694, 506428274000265220, 506428274000265243,
       506428274000265258, 527736141479149704, 527736141479149684,
       527736141479149643, 506428274000265276, 527736141479149677,
       506428274000265309, 506428274000265342, 527736141479149618,
       506428274000265414, 527736141479149613, 506428274000265311,
       527736141479149573, 506428274000265242, 527736141479149686,
       527736141479149689, 506428274000265279, 506428274000265253,
       527736141479149692, 506428274000265265, 506428274000265248,
       506428274000265289, 506428274000265218, 506428274000265234,
       527736141479149707, 527736141479149709, 527736141479149702,
       527736141479149683, 527736141479149703, 527736141479149699,
       506428274000265232, 506428274000265217, 527736141479149715,
       506428274000265221, 506428274000265260, 527736141479149666,
       506428274000265320, 506428274000265303, 506428274000265373,
       527736141479149668, 527736141479149600, 527736141479149

In [16]:

dia_detections.loc[listid_sn.index, "detected"] = 1
dia_detections.loc[listid_stars.index , "detected"] = 1
dia_detections.loc[listid_sn.index, "type"] = "sn"
dia_detections.loc[listid_stars.index, "type"] = "star"
dia_detections["midpointMjdTai"] = 60753.011084
dia_detections

Unnamed: 0_level_0,Unnamed: 0,ra,dec,detected,real,type,midpointMjdTai
diaSourceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
506428274000265217,0,55.795261,-32.459151,1,0,star,60753.011084
506428274000265218,1,55.689182,-32.489502,1,0,star,60753.011084
506428274000265219,2,55.683139,-32.490595,0,0,,60753.011084
506428274000265220,3,55.753450,-32.456798,1,0,star,60753.011084
506428274000265221,4,55.838220,-32.416279,1,0,star,60753.011084
...,...,...,...,...,...,...,...
527736141479149714,293,55.922485,-32.442749,0,0,,60753.011084
527736141479149715,294,55.861709,-32.447215,1,0,star,60753.011084
527736141479149717,295,55.696996,-32.434769,0,0,,60753.011084
527736141479149719,296,55.693576,-32.439695,0,0,,60753.011084


In [17]:

# Print a summary at the end of first round of matching.
print("Summary at the end of First Stage:")
print(dia_detections[["detected"]].dropna().shape[0])
#print(df_combined.groupby("real").count())

# Stage 2: Match sources in time.

"""
# Separate the matched and unmatched sources.
df_matched = df_combined[df_combined.real==1]
df_matched.reset_index(drop=True, inplace=True)

df_unmatched = df_combined[df_combined.real==0]
df_unmatched.reset_index(drop=True, inplace=True)

"""
matched_index = ~dia_detections["detected"].isnull()


# Get a list of all the unique MJDs of sources that matched in previous stage.
mjd_matched_in_space = dia_detections.loc[matched_index].midpointMjdTai.unique()

# Get min and max MJD values required for matching.
max_mjd, min_mjd = mjd_matched_in_space.max(), mjd_matched_in_space.min()

# Define tolerance for errors while matching.
tolerance=0.00034
matched_mjd = []

for s in ["sn"]:#"star", ]:
    # Read star lightcurve variability parquet
    df_var = pd.read_parquet(var_path[s])
    # Filter out records with unwanted MJDs.
    df_var = df_var[(df_var.MJD>=min_mjd) & (df_var.MJD<=max_mjd)]

    # Get a list of unique star MJDs for matching.
    unique_mjd_var = df_var.MJD.unique()
    
    # Perform MJD matching for detection.
    for v in mjd_matched_in_space:
        #FBBB here there should be an index that matches IDs and runs the diff < tolerance only on the spatially matched ids
        if np.any(np.abs(unique_mjd_var-v) <= tolerance):
            matched_mjd.append(v)

    # Delete data not required anymore to save memory
    del df_var, unique_mjd_var
# Create a set of matched MJD to get rid of duplicate MJD matches across stars and sn.
matched_mjd = list(set(matched_mjd))
print(matched_mjd)

Summary at the end of First Stage:
298
[60753.011084]


In [19]:
# Match sources that were real in the first stage.
mask = dia_detections.loc[matched_index]['midpointMjdTai'].isin(matched_mjd)
dia_detections.loc[matched_index]['real'] = mask.astype(int)
# print(df_matched.groupby('real').count())

print(dia_detections.groupby("real").count())
# Write final CSV
dia_detections

      Unnamed: 0   ra  dec  detected  type  midpointMjdTai
real                                                      
0            298  298  298       298    70             298


Unnamed: 0_level_0,Unnamed: 0,ra,dec,detected,real,type,midpointMjdTai
diaSourceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
506428274000265217,0,55.795261,-32.459151,1,0,star,60753.011084
506428274000265218,1,55.689182,-32.489502,1,0,star,60753.011084
506428274000265219,2,55.683139,-32.490595,0,0,,60753.011084
506428274000265220,3,55.753450,-32.456798,1,0,star,60753.011084
506428274000265221,4,55.838220,-32.416279,1,0,star,60753.011084
...,...,...,...,...,...,...,...
527736141479149714,293,55.922485,-32.442749,0,0,,60753.011084
527736141479149715,294,55.861709,-32.447215,1,0,star,60753.011084
527736141479149717,295,55.696996,-32.434769,0,0,,60753.011084
527736141479149719,296,55.693576,-32.439695,0,0,,60753.011084
