## Analysis
In this notebook, we generate some of the summary statistics for the THOR runs on both simulations and ZTF alerts.

Data and results files for this notebook may be downloaded [here](https://dirac.astro.washington.edu/~moeyensj/projects/thor/paper1/).

In [1]:
%load_ext autoreload
%autoreload 2

import glob
import os
import numpy as np
import pandas as pd
import sqlite3 as sql
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as colors
import seaborn as sns
sns.set(font_scale=1.0, context="paper", style="ticks")
sns.set_palette("viridis")

from astropy.time import Time

%matplotlib inline

In [2]:
# Record the THOR and difi versions that produced the outputs shown in this notebook.
import thor

from thor import __version__
print("THOR version: {}".format(__version__))

import difi

# NOTE: this import re-binds __version__; after this cell the name holds difi's version.
from difi import __version__
print("difi version: {}".format(__version__))

THOR version: 1.1.dev199+g1c54766.d20210401
difi version: 1.1.dev57+gf3bff13.d20210323


In [3]:
def createComponentSummary(
        all_linkages, 
        all_truths, 
        components=("clustering", "iod", "od", "od+a")
    ):
    """
    Summarize linkage and object counts for each pipeline component.

    Parameters
    ----------
    all_linkages : `~pandas.DataFrame`
        One row per linkage. Must contain a "component" column and the
        columns "mixed", "partial", "pure" and "pure_complete", which are
        summed per component.
    all_truths : `~pandas.DataFrame`
        One row per (component, object). Must contain the columns
        "component", "obj_id", "findable", "found", "found_pure" and
        "found_partial".
    components : sequence of str, optional
        Component names to summarize, in output row order.

    Returns
    -------
    component_summary : `~pandas.DataFrame`
        One row per requested component with the raw counts plus the derived
        percentages "completeness", "completeness_pure",
        "completeness_partial", "purity" and "linkage_efficiency".
        Percentages are NaN/inf for components whose denominator is zero.
    """
    linkage_types = ["mixed", "partial", "pure", "pure_complete"]
    summary_quantities = [
        "component", 
        "num_linkages", "num_mixed", "num_partial", "num_pure", "num_pure_complete",
        "num_findable", "num_found", "num_found_pure", "num_found_partial"
    ]
    component_summary = {q : [] for q in summary_quantities}

    for component in components:
        # Filter each frame once per component instead of once per quantity
        component_linkages = all_linkages[all_linkages["component"] == component]
        component_truths = all_truths[all_truths["component"] == component]
        component_summary["component"].append(component)

        # Total number of linkages and the number of each linkage type
        component_summary["num_linkages"].append(len(component_linkages))
        for linkage_type in linkage_types:
            component_summary[f"num_{linkage_type}"].append(
                component_linkages[linkage_type].sum()
            )

        # Unique objects that are findable (flag == 1) or found (count >= 1)
        findable_mask = component_truths["findable"] == 1
        component_summary["num_findable"].append(
            component_truths[findable_mask]["obj_id"].nunique()
        )
        for found_type in ["found_pure", "found_partial", "found"]:
            found_mask = component_truths[found_type] >= 1
            component_summary[f"num_{found_type}"].append(
                component_truths[found_mask]["obj_id"].nunique()
            )

    component_summary = pd.DataFrame(component_summary)
    component_summary["completeness"] = 100 * component_summary["num_found"] / component_summary["num_findable"]
    component_summary["completeness_pure"] = 100 * component_summary["num_found_pure"] / component_summary["num_findable"]
    component_summary["completeness_partial"] = 100 * component_summary["num_found_partial"] / component_summary["num_findable"]
    component_summary["purity"] = 100 * component_summary["num_pure"] / component_summary["num_linkages"]
    component_summary["linkage_efficiency"] = 100 * component_summary["num_found_pure"] / component_summary["num_linkages"]

    return component_summary

### Simulations

In [4]:
# Locations of the simulated survey inputs and of the THOR run being analyzed.
DATA_DIR = "/mnt/data/projects/thor/thor_data/msst_4x4/"
RUN_DIR = "/mnt/data/projects/thor/thor_results/msst_4x4/v1.1/run_4/"

# obs_id is read as a string so that observation IDs are never parsed as numbers.
_observations_file = os.path.join(DATA_DIR, "preprocessed_observations.csv")
preprocessed_observations = pd.read_csv(_observations_file, index_col=False, dtype={"obs_id" : str})

_associations_file = os.path.join(DATA_DIR, "preprocessed_associations.csv")
preprocessed_associations = pd.read_csv(_associations_file, index_col=False, dtype={"obs_id" : str})

In [5]:
DATABASE = "/home/moeyensj/projects/thor/thor_data/msst_4x4/msst_survey.db"

# Read the full orbit catalog; the connection is closed even if the query fails.
con = sql.connect(DATABASE)
try:
    known_orbits = pd.read_sql("SELECT * FROM mpcOrbitCat", con)
finally:
    con.close()

# Keep only the catalog orbits of objects that appear in the associations.
known_orbits = known_orbits[known_orbits["designation"].isin(preprocessed_associations["obj_id"].unique())]

In [6]:
# Semi-major axis bin edges; each class holds the designations with
# bin_start <= a_au < bin_end and is keyed by a label such as "2.06<=a<2.5".
sma_bins = [0, 1.7, 2.06, 2.5, 2.82, 2.95, 3.27, 5.0, 50, 1000.0]
classes = {}
for bin_start, bin_end in zip(sma_bins[:-1], sma_bins[1:]):
    bin_mask = (known_orbits["a_au"] >= bin_start) & (known_orbits["a_au"] < bin_end)
    classes["{}<=a<{}".format(bin_start, bin_end)] = known_orbits[bin_mask]["designation"].unique()

# IDs of the form "u" followed by exactly eight digits are grouped as noise.
classes["Noise"] = preprocessed_associations[preprocessed_associations["obj_id"].str.contains("^u[0-9]{8}$", regex=True)]["obj_id"].unique()

In [7]:
from thor.orbits import Orbits
from thor import analyzeTHOR

ANALYSIS_DIR = os.path.join(RUN_DIR, "analysis")
os.makedirs(ANALYSIS_DIR, exist_ok=True)

# Orbits recovered by the run and the observations assigned to each of them
_recovered_orbits_file = os.path.join(RUN_DIR, "recovered_orbits.csv")
recovered_orbits = Orbits.from_csv(_recovered_orbits_file).to_df(include_units=False)

_recovered_members_file = os.path.join(RUN_DIR, "recovered_orbit_members.csv")
recovered_orbit_members = pd.read_csv(_recovered_members_file, index_col=False, dtype={"obs_id" : str})

# Analysis products cached in ANALYSIS_DIR as "<name>.csv", in write order
_analysis_products = (
    "all_orbits_recovered",
    "all_truths_recovered",
    "summary_recovered",
    "all_linkages",
    "all_truths",
    "summary",
    "summary_components",
)

# The presence of the first product decides whether the cache is used;
# the remaining files are assumed to exist alongside it.
if os.path.exists(os.path.join(ANALYSIS_DIR, "all_orbits_recovered.csv")):
    (
        all_orbits_recovered,
        all_truths_recovered,
        summary_recovered,
        all_linkages,
        all_truths,
        summary,
        summary_components,
    ) = [
        pd.read_csv(os.path.join(ANALYSIS_DIR, f"{_product}.csv"), index_col=False)
        for _product in _analysis_products
    ]

else:
    # Analyze THOR run
    run_analysis, test_orbit_analysis = analyzeTHOR(
        preprocessed_associations,
        RUN_DIR,
        classes=classes
    )
    all_orbits_recovered, all_truths_recovered, summary_recovered = run_analysis
    all_linkages, all_truths, summary = test_orbit_analysis

    # Compute component summary
    summary_components = createComponentSummary(all_linkages, all_truths)

    # Cache every product so later runs of this cell can skip the analysis
    _computed_products = (
        all_orbits_recovered,
        all_truths_recovered,
        summary_recovered,
        all_linkages,
        all_truths,
        summary,
        summary_components,
    )
    for _product, _frame in zip(_analysis_products, _computed_products):
        _frame.to_csv(os.path.join(ANALYSIS_DIR, f"{_product}.csv"), index=False)

In [8]:
# Detections whose obj_id is "u" followed by eight digits are counted as noise
_noise_mask = preprocessed_associations["obj_id"].str.contains("^u[0-9]{8}$", regex=True)
num_noise_obs = len(preprocessed_associations[_noise_mask])
num_detections = len(preprocessed_associations)
percent_noise = 100 * num_noise_obs / num_detections
print(f"{num_noise_obs} [{percent_noise:.3f}%]")

112078 [44.617%]


In [9]:
# Every detection that does not match the noise ID pattern counts as a real detection
_object_mask = ~preprocessed_associations["obj_id"].str.contains("^u[0-9]{8}$", regex=True)
num_object_obs = len(preprocessed_associations[_object_mask])
percent_object = 100 * num_object_obs / num_detections
print(f"{num_object_obs} [{percent_object:.3f}%]")

139120 [55.383%]


In [10]:
summary_components

Unnamed: 0,component,num_linkages,num_mixed,num_partial,num_pure,num_pure_complete,num_findable,num_found,num_found_pure,num_found_partial,completeness,completeness_pure,completeness_partial,purity,linkage_efficiency
0,clustering,120445,48107,13898,58440,10113,18332,16763,16739,3053,91.441196,91.310277,16.653938,48.520071,13.89763
1,iod,116127,45392,12308,58427,10110,18332,16762,16738,3053,91.435741,91.304822,16.653938,50.313019,14.41353
2,od,58238,188,0,58050,10062,18332,16725,16725,0,91.233908,91.233908,0.0,99.677187,28.718363
3,od+a,16729,1,0,16728,16697,18332,16728,16728,0,91.250273,91.250273,0.0,99.994022,99.994022


In [11]:
summary_recovered

Unnamed: 0,class,num_members,num_obs,completeness,findable,found,findable_found,findable_missed,not_findable_found,not_findable_missed,...,unique_in_partial_linkages_only,unique_in_pure_and_partial_linkages,unique_in_partial_linkages,unique_in_partial_contaminant_linkages,unique_in_mixed_linkages,obs_in_pure_linkages,obs_in_pure_complete_linkages,obs_in_partial_linkages,obs_in_partial_contaminant_linkages,obs_in_mixed_linkages
0,All,136559,251198,91.250273,18332,16728,16728,1604,0,118227,...,0,0,0,0,2,113385,113228,0,0,5
1,Noise,112078,112078,,0,0,0,0,0,112078,...,0,0,0,0,0,0,0,0,0,0
2,2.06<=a<2.5,9594,53196,85.148372,6942,5911,5911,1031,0,2652,...,0,0,0,0,1,39838,39793,0,0,1
3,2.5<=a<2.82,8450,48385,95.854193,6392,6127,6127,265,0,2058,...,0,0,0,0,0,41611,41586,0,0,0
4,2.95<=a<3.27,4400,25796,96.464793,3451,3329,3329,122,0,949,...,0,0,0,0,0,22652,22596,0,0,0
5,2.82<=a<2.95,1085,6464,96.270396,858,826,826,32,0,227,...,0,0,0,0,1,5638,5638,0,0,4
6,3.27<=a<5.0,405,2580,96.619718,355,343,343,12,0,50,...,0,0,0,0,0,2356,2325,0,0,0
7,1.7<=a<2.06,319,1565,61.458333,192,118,118,74,0,127,...,0,0,0,0,0,775,775,0,0,0
8,0<=a<1.7,163,694,18.75,80,15,15,65,0,83,...,0,0,0,0,0,102,102,0,0,0
9,5.0<=a<50,59,398,94.642857,56,53,53,3,0,3,...,0,0,0,0,0,371,371,0,0,0


In [12]:
# Object IDs flagged findable (== 1) and found (>= 1) among the recovered orbits
findable = all_truths_recovered.loc[all_truths_recovered["findable"] == 1, "obj_id"].values
found = all_truths_recovered.loc[all_truths_recovered["found"] >= 1, "obj_id"].values

# Completeness (found / findable) inside and outside a = 1.7
sma_bins = [0.0, 1.7, 50.0]
for a_min, a_max in zip(sma_bins[:-1], sma_bins[1:]):
    _in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    found_in_bin = len(known_orbits[_in_bin & known_orbits["designation"].isin(found)])
    findable_in_bin = len(known_orbits[_in_bin & known_orbits["designation"].isin(findable)])
    print("Completeness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, found_in_bin / findable_in_bin * 100.0, found_in_bin))

Completeness between 0.0 and 1.7 AU: 18.75 %, 15
Completeness between 1.7 and 50.0 AU: 91.58 %, 16733


In [13]:
# Same completeness split as the previous cell, now at a = 2.5
# (reuses `found` and `findable` defined there)
sma_bins = [0.0, 2.5, 50.0]
for a_min, a_max in zip(sma_bins[:-1], sma_bins[1:]):
    _in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    found_in_bin = len(known_orbits[_in_bin & known_orbits["designation"].isin(found)])
    findable_in_bin = len(known_orbits[_in_bin & known_orbits["designation"].isin(findable)])
    print("Completeness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, found_in_bin / findable_in_bin * 100.0, found_in_bin))

Completeness between 0.0 and 2.5 AU: 83.78 %, 6045
Completeness between 2.5 and 50.0 AU: 96.10 %, 10703


### ZTF Analysis with 2018 Catalog

In [14]:
DATA_DIR = "/mnt/data/projects/thor/thor_data/ztf/"
RUN_DIR = "/mnt/data/projects/thor/thor_results/ztf/v1.1/run4"

preprocessed_observations = pd.read_csv(
    os.path.join(DATA_DIR, "preprocessed_observations.csv"),
    index_col=False,
    dtype={"obs_id" : str}
)
preprocessed_associations = pd.read_csv(
    os.path.join(DATA_DIR, "preprocessed_associations.csv"),
    index_col=False,
    dtype={"obs_id" : str}
)

# Read original observations (space-separated; candid kept as a string so it
# can be matched against the string obs_id below)
observations = pd.read_csv(
    os.path.join(DATA_DIR, "ztf_observations_610_624.csv"),
    index_col=False,
    sep=" ",
    dtype={"candid" : str},
    low_memory=False
)

# Add magnitudes, magnitude errors, filter IDs, night IDs to the preprocessed observations.
# merge defaults to an inner join, so rows without a matching candid are dropped.
preprocessed_observations = (
    preprocessed_observations
    .merge(
        observations[["candid", "nid", "magpsf", "sigmapsf", "fid"]],
        left_on="obs_id",
        right_on="candid"
    )
    .drop(columns=["candid"])
    .rename(
        columns={
            "nid" : "night_id",
            "magpsf" : "mag",
            "sigmapsf" : "mag_sigma",
            "fid" : "filter"
        }
    )
)

# Filter ID (1=g; 2=r; 3=i) from https://zwickytransientfacility.github.io/ztf-avro-alert/schema.html
# replace() builds a new column, so no strings are written into the numeric
# column in place; any other filter ID is left unchanged.
preprocessed_observations["filter"] = preprocessed_observations["filter"].replace(
    {1 : "g", 2 : "r", 3 : "i"}
)

In [15]:
preprocessed_observations.head()

Unnamed: 0,obs_id,mjd_utc,RA_deg,Dec_deg,RA_sigma_deg,Dec_sigma_deg,observatory_code,night_id,mag,mag_sigma,filter
0,610130484415010015,58364.130486,255.347544,-23.059466,2.8e-05,2.8e-05,I41,610,14.8516,0.034435,r
1,610130481215010007,58364.130486,255.077637,-26.542553,2.8e-05,2.8e-05,I41,610,16.4782,0.044278,r
2,610130481215015021,58364.130486,254.708502,-26.721381,2.8e-05,2.8e-05,I41,610,17.4692,0.114528,r
3,610130483515015056,58364.130486,261.287334,-24.006969,2.8e-05,2.8e-05,I41,610,17.4321,0.1333,r
4,610130483515015069,58364.130486,261.062061,-23.963993,2.8e-05,2.8e-05,I41,610,17.6541,0.086178,r


In [16]:
from thor.utils import unpackMPCDesignation

# Read orbits file (MPCORB in OORB format from 2018).
# The file is whitespace-delimited and its first four lines are skipped.
known_orbits = pd.read_csv(
    "/mnt/data/projects/thor/thor_data/ztf/MPCORB_20181106_ZTF_keplerian.orb", 
    sep=r"\s+", 
    skiprows=4,
    names=["designation", "a_au", "e", "i_deg", "ascNode_deg", "argPeri_deg", "meanAnom_deg", "epoch_mjd_tt", "H", "G"],
    low_memory=False
)
# Convert the catalog designations with unpackMPCDesignation, then keep only
# the orbits of objects that appear in the associations.
known_orbits["designation"] = known_orbits["designation"].apply(unpackMPCDesignation)
known_orbits = known_orbits[known_orbits["designation"].isin(preprocessed_associations["obj_id"].unique())]

In [17]:
known_orbits

Unnamed: 0,designation,a_au,e,i_deg,ascNode_deg,argPeri_deg,meanAnom_deg,epoch_mjd_tt,H,G
24,25,2.400161,0.254614,21.60484,214.13060,90.26320,110.352151,58364.131287,7.83,0.15
34,35,2.994072,0.225588,7.93272,353.73890,213.39677,170.377664,58364.131287,8.50,0.15
47,48,3.110088,0.072444,6.54742,183.55392,253.05401,26.989431,58364.131287,6.90,0.15
57,58,2.700076,0.042782,5.06541,161.10757,30.80170,246.067061,58364.131287,8.86,0.15
64,65,3.425953,0.111943,3.56353,155.61620,102.63085,224.346157,58364.131287,6.62,0.01
...,...,...,...,...,...,...,...,...,...,...
788558,2018 RK1,2.329186,0.312291,27.48645,342.00803,353.21310,11.664594,58364.131287,18.80,0.15
788569,2018 RM3,2.630455,0.484081,11.08690,196.36231,197.33103,348.839281,58364.131287,20.50,0.15
788571,2018 RP3,2.140752,0.317913,6.65454,295.67620,347.31493,38.615980,58364.131287,18.70,0.15
788573,2018 RF4,2.701907,0.334816,13.50859,74.56787,329.56481,341.164499,58364.131287,17.40,0.15


In [18]:
# Semi-major axis bin edges; each class holds the designations with
# bin_start <= a_au < bin_end and is keyed by a label such as "2.06<=a<2.5".
sma_bins = [0, 1.7, 2.06, 2.5, 2.82, 2.95, 3.27, 5.0, 50, 1000.0]
classes = {}
for bin_start, bin_end in zip(sma_bins[:-1], sma_bins[1:]):
    bin_mask = (known_orbits["a_au"] >= bin_start) & (known_orbits["a_au"] < bin_end)
    classes["{}<=a<{}".format(bin_start, bin_end)] = known_orbits[bin_mask]["designation"].unique()

# IDs of the form "u" followed by exactly eight digits (computed once, used twice)
_is_unattributed = preprocessed_associations["obj_id"].str.contains("^u[0-9]{8}$", regex=True)

# Observations unattributed by ZTF
classes["Unknown"] = preprocessed_associations[_is_unattributed]["obj_id"].unique()

# Observations attributed by ZTF that could not be matched to the known catalog
# (probably designation changes or comets)
unclassified_mask = ~preprocessed_associations["obj_id"].isin(known_orbits["designation"].unique()) & (~_is_unattributed)
classes["Unmatched"] = preprocessed_associations[unclassified_mask]["obj_id"].unique()

In [19]:
from thor.orbits import Orbits
from difi import analyzeLinkages
from difi import analyzeObservations


ANALYSIS_DIR = os.path.join(RUN_DIR, "analysis_2018")
os.makedirs(ANALYSIS_DIR, exist_ok=True)

# Read the recovered orbits and recovered_orbit_members which were combined
# from the patches to see how the overall run performed
_recovered_orbits_file = os.path.join(RUN_DIR, "recovered_orbits.csv")
recovered_orbits = Orbits.from_csv(_recovered_orbits_file).to_df(include_units=False)

_recovered_members_file = os.path.join(RUN_DIR, "recovered_orbit_members.csv")
recovered_orbit_members = pd.read_csv(_recovered_members_file, index_col=False, dtype={"obs_id" : str})

column_mapping = {
    'linkage_id': 'orbit_id', 
    'obs_id': 'obs_id', 
    'truth': 'obj_id'
}

# Observations joined with their object associations
analysis_observations = preprocessed_observations.merge(preprocessed_associations, on="obs_id")

# Survey-level analysis; this runs on every execution, cached or not
all_truths_survey, findable_observations, summary_survey = analyzeObservations(
    analysis_observations,
    classes=classes,
    metric='min_obs',
    column_mapping=column_mapping,
    min_obs=5,
)

# Analysis products cached in ANALYSIS_DIR as "<name>.csv", in write order
_analysis_products = (
    "all_orbits_recovered",
    "all_truths_recovered",
    "summary_recovered",
    "all_linkages",
    "all_truths",
    "summary",
    "summary_components",
)

# The presence of the first product decides whether the cache is used;
# the remaining files are assumed to exist alongside it.
if os.path.exists(os.path.join(ANALYSIS_DIR, "all_orbits_recovered.csv")):
    (
        all_orbits_recovered,
        all_truths_recovered,
        summary_recovered,
        all_linkages,
        all_truths,
        summary,
        summary_components,
    ) = [
        pd.read_csv(os.path.join(ANALYSIS_DIR, f"{_product}.csv"), index_col=False)
        for _product in _analysis_products
    ]

else:
    # Go through each individual patch directory (which themselves
    # are individual THOR runs) and analyze the performance
    all_linkages_patches = []
    all_truths_patches = []
    summary_patches = []

    contents = sorted(glob.glob(os.path.join(RUN_DIR, "patch_*")))
    for c in contents:
        if not os.path.isdir(c):
            continue

        print(f"Analyzing Patch {os.path.basename(c)}")
        run_analysis, test_orbit_analysis = analyzeTHOR(
            preprocessed_associations,
            c,
            classes=classes,
        )

        # Only the per-test-orbit results of each patch are kept
        all_linkages_patches_i, all_truths_patches_i, summary_patches_i = test_orbit_analysis
        all_linkages_patches.append(all_linkages_patches_i)
        all_truths_patches.append(all_truths_patches_i)
        summary_patches.append(summary_patches_i)

    all_linkages_patches = pd.concat(all_linkages_patches, ignore_index=True)
    all_truths_patches = pd.concat(all_truths_patches, ignore_index=True)
    summary_patches = pd.concat(summary_patches, ignore_index=True)

    # Analyze the combined set of recovered orbits against the survey truths
    all_orbits_recovered, all_truths_recovered, summary_recovered = analyzeLinkages(
        analysis_observations,
        recovered_orbit_members,
        all_truths=all_truths_survey,
        min_obs=5,
        contamination_percentage=0.0,
        classes=classes,
        column_mapping=column_mapping
    )

    # Tag the combined results as their own component while they are
    # concatenated with the patch results, then remove the tag again
    for df in (all_orbits_recovered, all_truths_recovered, summary_recovered):
        df.insert(0, "component", "combined")

    all_linkages = pd.concat([all_linkages_patches, all_orbits_recovered], ignore_index=True)
    all_truths = pd.concat([all_truths_patches, all_truths_recovered], ignore_index=True)
    summary = pd.concat([summary_patches, summary_recovered], ignore_index=True)

    for df in (all_orbits_recovered, all_truths_recovered, summary_recovered):
        df.drop(columns=["component"], inplace=True)

    summary_components = createComponentSummary(
        all_linkages, 
        all_truths, 
        components=["clustering", "iod", "od", "od+a", "combined"]
    )

    # Cache every product so later runs of this cell can skip the analysis
    _computed_products = (
        all_orbits_recovered,
        all_truths_recovered,
        summary_recovered,
        all_linkages,
        all_truths,
        summary,
        summary_components,
    )
    for _product, _frame in zip(_analysis_products, _computed_products):
        _frame.to_csv(os.path.join(ANALYSIS_DIR, f"{_product}.csv"), index=False)

In [20]:
# Number of noise detections
num_noise_obs = len(preprocessed_associations[preprocessed_associations["obj_id"].str.contains("^u[0-9]{8}$", regex=True)])
num_detections = len(preprocessed_associations)
percent_noise = 100 * num_noise_obs / num_detections
print(f"{num_noise_obs} [{percent_noise:.3f}%]")

572188 [69.143%]


In [21]:
# Number of real detections
num_object_obs = len(preprocessed_associations[~preprocessed_associations["obj_id"].str.contains("^u[0-9]{8}$", regex=True)])
percent_object = 100 * num_object_obs / num_detections
print(f"{num_object_obs} [{percent_object:.3f}%]")

255358 [30.857%]


In [22]:
summary_recovered

Unnamed: 0,class,num_members,num_obs,completeness,findable,found,findable_found,findable_missed,not_findable_found,not_findable_missed,...,unique_in_partial_linkages_only,unique_in_pure_and_partial_linkages,unique_in_partial_linkages,unique_in_partial_contaminant_linkages,unique_in_mixed_linkages,obs_in_pure_linkages,obs_in_pure_complete_linkages,obs_in_partial_linkages,obs_in_partial_contaminant_linkages,obs_in_mixed_linkages
0,All,635058,827546,97.205459,21542,20940,20940,602,0,613516,...,0,0,0,0,4554,161849,154423,0,0,4973
1,Unknown,572188,572188,,0,0,0,0,0,572188,...,0,0,0,0,4384,0,0,0,0,4384
2,2.5<=a<2.82,22490,90941,97.904496,7874,7709,7709,165,0,14616,...,0,0,0,0,58,58153,56116,0,0,154
3,2.06<=a<2.5,19453,77147,95.574273,6530,6241,6241,289,0,12923,...,0,0,0,0,54,47572,45557,0,0,180
4,2.95<=a<3.27,15918,64226,99.048664,5466,5414,5414,52,0,10452,...,0,0,0,0,43,41418,39983,0,0,161
5,1.7<=a<2.06,1504,9177,93.489149,599,560,560,39,0,905,...,0,0,0,0,7,6762,5027,0,0,36
6,2.82<=a<2.95,2453,9105,98.913043,736,728,728,8,0,1717,...,0,0,0,0,1,5438,5383,0,0,1
7,3.27<=a<5.0,549,2275,97.959184,196,192,192,4,0,353,...,0,0,0,0,1,1481,1423,0,0,3
8,5.0<=a<50,331,1309,97.530864,81,79,79,2,0,250,...,0,0,0,0,2,803,781,0,0,2
9,0<=a<1.7,126,866,28.888889,45,13,13,32,0,81,...,0,0,0,0,3,190,121,0,0,24


In [23]:
# "All"-class clustering rows that carry a test_orbit_id, then the number of
# those rows with zero findable objects
_has_test_orbit = summary["test_orbit_id"].notna()
mask = (_has_test_orbit & (summary["class"] == "All") & (summary["component"] == "clustering"))
_none_findable = summary["findable"] == 0
len(summary[mask & _none_findable])

88

In [24]:
# "All"-class od+a rows that carry a test_orbit_id, then the number of
# those rows with zero found objects
_has_test_orbit = summary["test_orbit_id"].notna()
mask = (_has_test_orbit & (summary["class"] == "All") & (summary["component"] == "od+a"))
_none_found = summary["found"] == 0
len(summary[mask & _none_found])

220

In [25]:
summary_components

Unnamed: 0,component,num_linkages,num_mixed,num_partial,num_pure,num_pure_complete,num_findable,num_found,num_found_pure,num_found_partial,completeness,completeness_pure,completeness_partial,purity,linkage_efficiency
0,clustering,7136467,3981065,2885578,269824,43037,21542,21121,21112,2385,98.045678,98.003899,11.071395,3.780918,0.295833
1,iod,1343065,995310,82798,264957,42583,21542,21043,21038,2350,97.683595,97.660384,10.908922,19.727787,1.566417
2,od,306363,42262,0,264101,42203,21542,21018,21018,0,97.567542,97.567542,0.0,86.205253,6.860489
3,od+a,75120,3883,0,71237,64041,21542,20995,20995,0,97.460774,97.460774,0.0,94.830937,27.948616
4,combined,21723,705,0,21018,20407,21542,20940,20940,0,97.205459,97.205459,0.0,96.754592,96.395525


In [26]:
# Number of recovered orbits per linked truth; the displayed pair is
# (truths linked by more than one orbit, total orbits linked to those truths)
duplicate_orbits = all_orbits_recovered["linked_truth"].value_counts()
_repeated_counts = duplicate_orbits.values[duplicate_orbits.values > 1]
len(_repeated_counts), _repeated_counts.sum()

(72, 150)

In [27]:
def addNumNights(
        linkages, 
        linkage_members, 
        preprocessed_observations, 
        linkage_id_col="orbit_id"
    ):
    """
    Attach observation data to linkage members and count the number of
    distinct nights spanned by each linkage.

    Parameters
    ----------
    linkages : `~pandas.DataFrame`
        One row per linkage; must contain the column named by linkage_id_col.
    linkage_members : `~pandas.DataFrame`
        Linkage membership table with the linkage ID column and "obs_id".
    preprocessed_observations : `~pandas.DataFrame`
        Observations with at least "obs_id" and "night_id".
    linkage_id_col : str, optional
        Name of the linkage ID column shared by linkages and linkage_members.

    Returns
    -------
    linkages_ : `~pandas.DataFrame`
        Copy of linkages with an added "num_nights" column. Linkages without
        any member observation are dropped by the inner merge.
    linkage_members_ : `~pandas.DataFrame`
        linkage_members merged with preprocessed_observations on "obs_id".

    Neither input frame is modified.
    """
    linkage_members_ = linkage_members.merge(
        preprocessed_observations, 
        on="obs_id"
    )
    nights_per_linkage = (
        linkage_members_
        .groupby(by=[linkage_id_col])["night_id"]
        .nunique()
        .to_frame("num_nights")
        .reset_index()
    )
    # Merge on the configured ID column so a custom linkage_id_col is honoured
    linkages_ = linkages.merge(
        nights_per_linkage,
        on=linkage_id_col
    )
    return linkages_, linkage_members_
    
def calculateDeltas(
        linkage_members, 
        preprocessed_observations,
        linkage_id_col="orbit_id"
    ):
    """
    Add the differences between consecutive observations of the same linkage
    on the same night.

    Parameters
    ----------
    linkage_members : `~pandas.DataFrame`
        Linkage members with the linkage ID column, "night_id", "mjd_utc",
        "RA_deg", "Dec_deg" and "mag". Differences follow the row order of
        this frame, so rows should already be ordered in time within each
        linkage.
    preprocessed_observations : `~pandas.DataFrame`
        Not used; kept so existing calls keep working.
    linkage_id_col : str, optional
        Name of the linkage ID column.

    Returns
    -------
    linkage_members_ : `~pandas.DataFrame`
        New frame with the added columns "dt", "dRA_deg", "dDec_deg", "dmag"
        (row minus previous row within each linkage and night; NaN for the
        first row of each group) and "dt_sec" (dt multiplied by 86400).
    """
    deltas = (
        linkage_members
        .groupby(by=[linkage_id_col, "night_id"])[["mjd_utc", "RA_deg", "Dec_deg", "mag"]]
        .diff()
        .rename(
            columns={
                "mjd_utc" : "dt",
                "RA_deg" : "dRA_deg",
                "Dec_deg" : "dDec_deg",
                "mag" : "dmag",
            }
        )
    )
    # diff() keeps the original index, so the join lines up row by row
    linkage_members_ = linkage_members.join(deltas)
    linkage_members_["dt_sec"] = linkage_members_["dt"] * 86400

    return linkage_members_

# Add the per-orbit night count and the observation columns, then the
# within-night deltas, and finally the object association of every member.
# NOTE: recovered_orbits and recovered_orbit_members are rebound to the
# augmented frames, so this cell is not idempotent; reload them before
# running it a second time.
recovered_orbits, recovered_orbit_members = addNumNights(recovered_orbits, recovered_orbit_members, preprocessed_observations)
recovered_orbit_members = calculateDeltas(recovered_orbit_members, preprocessed_observations)
analysis_orbit_members = recovered_orbit_members.merge(preprocessed_associations, on="obs_id")

In [28]:
analysis_orbit_members.head()

Unnamed: 0,orbit_id,obs_id,residual_ra_arcsec,residual_dec_arcsec,chi2,outlier,mjd_utc,RA_deg,Dec_deg,RA_sigma_deg,...,night_id,mag,mag_sigma,filter,dt,dRA_deg,dDec_deg,dmag,dt_sec,obj_id
0,0001e66e00ce40ff9b706653dc12868c,611270055815015012,0.013711,0.033395,0.130321,0,58365.270058,303.899116,-14.52885,2.8e-05,...,611,19.3245,0.104672,g,,,,,,23532
1,0001e66e00ce40ff9b706653dc12868c,614251075815015018,-0.071151,-0.091398,1.341611,0,58368.251076,303.721667,-14.637633,2.8e-05,...,614,19.3183,0.105554,g,,,,,,23532
2,0001e66e00ce40ff9b706653dc12868c,614272965815015023,0.042106,-0.026438,0.247193,0,58368.272963,303.720463,-14.638358,2.8e-05,...,614,18.5029,0.071292,r,0.021887,-0.001204,-0.000725,-0.8154,1891.002257,23532
3,0001e66e00ce40ff9b706653dc12868c,617251605815015016,0.079715,0.029725,0.723809,0,58371.251609,303.606974,-14.735852,2.8e-05,...,617,19.2716,0.08743,g,,,,,,23532
4,0001e66e00ce40ff9b706653dc12868c,617293945815015028,-0.072533,0.09477,1.424245,0,58371.293947,303.605474,-14.737099,2.8e-05,...,617,18.5753,0.073987,r,0.042338,-0.0015,-0.001248,-0.6963,3658.003173,23532


In [29]:
recovered_orbits

Unnamed: 0,orbit_id,epoch,x,y,z,vx,vy,vz,covariance,r,...,v,v_sigma,arc_length,num_obs,num_params,num_iterations,chi2,rchi2,improved,num_nights
0,0001e66e00ce40ff9b706653dc12868c,58374.110403,1.903582,-1.609999,0.146526,0.008657,0.006809,0.000264,"[[1.14866073e-07, -1.80582249e-07, 1.83841721e...",2.497437,...,0.011017,0.000040,12.003472,9,6,2,4.389190,0.365766,True,5
1,0004c514d96d44f59a7d20708f9c4aa5,58370.434189,1.724032,0.955398,0.091693,-0.005506,0.011827,0.001510,"[[3.16433403e-06, 5.06194434e-06, 3.90161816e-...",1.973190,...,0.013133,0.000035,6.021921,5,6,10,0.101949,0.025487,True,3
2,00055205550742bc932b3b09154dc93f,58370.380725,3.048622,0.005230,-0.405679,0.000421,0.009138,0.001518,"[[2.33690847e-05, 2.733617e-06, -4.57912802e-0...",3.075500,...,0.009273,0.000044,9.038287,7,6,10,9.845222,1.230653,True,4
3,00080487093f4fb08ba1884ee23aa478,58375.331125,2.006455,-0.280056,-0.142242,0.004241,0.012417,0.002490,"[[2.07149799e-06, -2.58348355e-07, -2.91006074...",2.030893,...,0.013355,0.000008,11.959977,9,6,10,11.609602,0.967467,False,5
4,000810c9d4bf4fedab1020733b508f2b,58373.322733,2.197809,-0.025010,-0.137868,-0.000635,0.011965,-0.000747,"[[2.1675363e-06, 2.91533232e-07, -2.4693288e-0...",2.202271,...,0.012005,0.000019,8.957940,7,6,10,10.449729,1.306216,True,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21718,ff86b55cc6024b00b69ef638c1a8215b,58370.361122,2.197009,0.016085,0.134666,0.001648,0.012245,-0.001185,"[[4.85442404e-05, 1.00947778e-05, 5.36491947e-...",2.201191,...,0.012412,0.000104,12.082222,6,6,2,2.364404,0.394067,False,4
21719,ffb0c9078c9143d984cc26e707d57d1c,58371.304602,1.824960,-0.195746,0.587564,0.001693,0.012236,0.000950,"[[4.41684672e-09, 1.3325059e-10, 3.07464022e-0...",1.927181,...,0.012389,0.000001,14.031725,40,6,2,62.128736,0.839578,True,11
21720,ffc7eb73f79e4c8baf00f8d029df63e9,58368.388953,2.980501,0.712414,-0.273991,-0.000761,0.009101,0.001201,"[[3.97871151e-05, 1.94475042e-05, -5.42361722e...",3.076685,...,0.009212,0.000134,11.993495,7,6,2,0.623326,0.077916,True,4
21721,fff7bc534437490f84a7a33a8d64ddc0,58370.363740,1.717298,-0.071539,0.083221,-0.000577,0.014476,0.000745,"[[3.55678619e-07, 7.96915931e-08, 3.99356354e-...",1.720801,...,0.014507,0.000014,8.988912,5,6,2,2.174136,0.543534,True,3


In [30]:
len(recovered_orbits[recovered_orbits["num_nights"] == recovered_orbits["num_obs"]])

1783

In [31]:
# IDs of the recovered orbits flagged pure and of those flagged mixed
_is_pure = all_orbits_recovered["pure"] == 1
_is_mixed = all_orbits_recovered["mixed"] == 1
pure_orbits_2018 = all_orbits_recovered.loc[_is_pure, "orbit_id"].values
mixed_orbits_2018 = all_orbits_recovered.loc[_is_mixed, "orbit_id"].values

In [32]:
# Total number of observations recovered
analysis_orbit_members["obs_id"].nunique()

166822

In [33]:
# Total number of observations recovered in pure orbits
analysis_orbit_members[analysis_orbit_members["orbit_id"].isin(pure_orbits_2018)]["obs_id"].nunique()

161849

In [34]:
# Members of mixed orbits, split by whether their obj_id matches the
# "u" + eight digits pattern (unknown) or not (known)
_in_mixed_orbit = analysis_orbit_members["orbit_id"].isin(mixed_orbits_2018)
mixed_orbit_members = analysis_orbit_members[_in_mixed_orbit]
_is_unknown = mixed_orbit_members["obj_id"].str.contains("^u[0-9]{8}$", regex=True)
unknown_observations_in_mixed = mixed_orbit_members[_is_unknown]
known_observations_in_mixed = mixed_orbit_members[~_is_unknown]

In [35]:
len(unknown_observations_in_mixed)

4384

In [36]:
len(known_observations_in_mixed)

589

In [37]:
unknown_observations_in_mixed["orbit_id"].nunique(), unknown_observations_in_mixed["obj_id"].nunique()

(671, 4384)

In [38]:
known_observations_in_mixed["orbit_id"].nunique(), known_observations_in_mixed["obj_id"].nunique()

(135, 170)

How would MOPS and ZMODE perform?

In [39]:
from difi import analyzeObservations

# Re-bind column_mapping for the observation-level analysis: it now names the
# night and time columns in addition to the observation and truth columns.
column_mapping = {
    'obs_id': 'obs_id', 
    'truth': 'obj_id',
    'night' : 'night_id',
    'time' : 'mjd_utc'
}

# Findability under difi's 'nightly_linkages' metric, evaluated on all
# observations without per-class breakdown (classes=None).
# NOTE(review): presumably this requires at least 2 observations within
# 3 hours (3.0/24 days) on at least 3 nights - confirm against the difi
# documentation for these keyword arguments.
all_truths_survey_MOPS, findable_observations_MOPS, summary_survey_MOPS = analyzeObservations(
    analysis_observations,
    classes=None,
    metric='nightly_linkages',
    linkage_min_obs=2,
    max_obs_separation=3.0/24,
    min_linkage_nights=3,
    column_mapping=column_mapping
)

In [40]:
summary_survey_MOPS["findable"].values[0]

9381

In [41]:
summary_survey_MOPS["findable"].values[0] / summary_recovered[summary_recovered["class"] == "All"]["found"].values[0] 

0.4479942693409742

In [42]:
# Objects found by THOR ("All" class) relative to the objects findable under the MOPS metric
_thor_found = summary_recovered[summary_recovered["class"] == "All"]["found"].values[0]
_mops_findable = summary_survey_MOPS["findable"].values[0]
percent_mops = 100 * _thor_found / _mops_findable
print(f"THOR discovery potential over ideal MOPS: {percent_mops:.3f}%")

THOR discovery potential over ideal MOPS: 223.217%


In [43]:
def calcDiscoverableZMODE(
        observations,
        min_tracklets=2,
        tracklet_min_obs=2,
        min_obs_per_track=4,
        max_track_night_span=4,
        column_mapping=column_mapping
    ):
    """
    Return the truths (object IDs) that satisfy a ZMODE-style findability
    criterion built from per-night observation counts.

    The steps are:
      1. Count observations per (night, truth).
      2. For every truth, compute the gap to the previous night on which it
         was observed (0 for its first night) and drop the nights whose gap
         is not smaller than `max_track_night_span`.
      3. A remaining night counts as a possible tracklet when it has at
         least `tracklet_min_obs` observations; keep truths with at least
         `min_tracklets` possible tracklets.
      4. Of those, keep truths whose remaining nights hold at least
         `min_obs_per_track` observations in total.

    Parameters
    ----------
    observations : `~pandas.DataFrame`
        Observations with (at least) the truth and night columns named in
        `column_mapping`.
    min_tracklets : int, optional
        Minimum number of possible tracklets per truth.
    tracklet_min_obs : int, optional
        Minimum number of observations on a night for it to count as a
        possible tracklet.
    min_obs_per_track : int, optional
        Minimum total number of observations over the retained nights.
    max_track_night_span : int, optional
        Nights whose gap to the truth's previous observed night is greater
        than or equal to this value are discarded.
    column_mapping : dict, optional
        Must provide the keys "truth" and "night". The default is the
        notebook-level mapping as it existed when this function was defined.

    Returns
    -------
    `~numpy.ndarray`
        Truths that satisfy the criterion.
    """
    truth_col = column_mapping["truth"]
    night_col = column_mapping["night"]

    # Observations per (night, truth). Naming the count column explicitly
    # avoids depending on the truth column being called "obj_id" and on the
    # name pandas assigns to a grouped value_counts result.
    night_designation_count = (
        observations
        .groupby(by=[night_col, truth_col])
        .size()
        .reset_index(name="num_obs")
    )

    # Rows are ordered by night, so the per-truth diff is the gap to the
    # previous night on which that truth was observed (first night -> 0).
    night_designation_count["delta_night"] = (
        night_designation_count.groupby([truth_col])[night_col].diff().fillna(0)
    )
    night_designation_count["possible_tracklet"] = np.where(
        night_designation_count["num_obs"] >= tracklet_min_obs, 1, 0
    )

    night_designation_count = night_designation_count[
        night_designation_count["delta_night"] < max_track_night_span
    ]
    tracklets_per_designation = (
        night_designation_count.groupby(by=[truth_col])["possible_tracklet"].sum()
    )
    possibly_findable = tracklets_per_designation.index.values[
        (tracklets_per_designation >= min_tracklets).values
    ]

    obs_per_designation = (
        night_designation_count[night_designation_count[truth_col].isin(possibly_findable)]
        .groupby([truth_col])["num_obs"]
        .sum()
    )
    return obs_per_designation.index.values[
        (obs_per_designation >= min_obs_per_track).values
    ]
    

In [44]:
findableZMODE = calcDiscoverableZMODE(analysis_observations)

In [45]:
len(findableZMODE)

14291

In [46]:
len(findableZMODE) / summary_recovered[summary_recovered["class"] == "All"]["found"].values[0] 

0.6824737344794651

In [47]:
# Objects found by THOR (class "All") relative to those findable by the ZMODE-style criterion
thor_found_all = summary_recovered[summary_recovered["class"] == "All"]["found"].values[0]
zmode_findable = len(findableZMODE)
percent_zmode = 100 * thor_found_all / zmode_findable
print(f"THOR discovery potential over ideal ZMODE: {percent_zmode:.3f}%")

THOR discovery potential over ideal ZMODE: 146.526%


In [48]:
# Object IDs flagged as findable, and object IDs found at least once
findable = all_truths_recovered.loc[all_truths_recovered["findable"] == 1, "obj_id"].values
found = all_truths_recovered.loc[all_truths_recovered["found"] >= 1, "obj_id"].values

# Completeness (found / findable) of catalogued orbits per semi-major axis bin
sma_bins = [0.0, 1.7, 50.0]
for a_min, a_max in zip(sma_bins[:-1], sma_bins[1:]):
    in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    # int(...) keeps plain Python integers, as len() of the filtered frame would give
    found_in_bin = int((in_bin & known_orbits["designation"].isin(found)).sum())
    findable_in_bin = int((in_bin & known_orbits["designation"].isin(findable)).sum())
    print("Completeness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, found_in_bin / findable_in_bin * 100.0, found_in_bin))

Completeness between 0.0 and 1.7 AU: 28.89 %, 13
Completeness between 1.7 and 50.0 AU: 97.40 %, 20923


In [49]:
# Same completeness calculation with the split at 2.5 AU; relies on the
# `found` and `findable` arrays already present in the notebook namespace.
sma_bins = [0.0, 2.5, 50.0]
for a_min, a_max in zip(sma_bins[:-1], sma_bins[1:]):
    in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    # int(...) keeps plain Python integers, as len() of the filtered frame would give
    found_in_bin = int((in_bin & known_orbits["designation"].isin(found)).sum())
    findable_in_bin = int((in_bin & known_orbits["designation"].isin(findable)).sum())
    print("Completeness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, found_in_bin / findable_in_bin * 100.0, found_in_bin))

Completeness between 0.0 and 2.5 AU: 94.98 %, 6814
Completeness between 2.5 and 50.0 AU: 98.39 %, 14122


### ZTF Analysis with 2021 Catalog

In [50]:
# Directory holding the ZTF input files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATA_DIR = "/mnt/data/projects/thor/thor_data/ztf/"
# Associations file for the 2021 analysis; obs_id is read as a string so the
# long numeric IDs are not parsed as numbers.
preprocessed_associations_2021 = pd.read_csv(
    os.path.join(DATA_DIR, "preprocessed_associations_20210420_3arcsec.csv"),
    index_col=False,
    dtype={"obs_id" : str}
)

# Directory of the THOR run analysed below (also an absolute path).
RUN_DIR = "/mnt/data/projects/thor/thor_results/ztf/v1.1/run4"

In [51]:
# Read the 2021 orbit catalog (MPCORB_20210420.csv) from DATA_DIR and keep only
# the objects that appear in the 2021 associations.
# NOTE: this rebinds `known_orbits`. The semi-major-axis completeness cells above
# also read `known_orbits`, so re-running them after this cell will use the 2021
# catalog rather than the one they were originally run against.
known_orbits = pd.read_csv(
    os.path.join(DATA_DIR, "MPCORB_20210420.csv"),
    index_col=False,
    low_memory=False
)
known_orbits = known_orbits[known_orbits["designation"].isin(preprocessed_associations_2021["obj_id"].unique())]

In [52]:
known_orbits

Unnamed: 0,designation,number,name,provisional_designation,other_provisional_designations,mjd_tt,a_au,e,i_deg,ascNode_deg,...,neo_flag,1km_neo_flag,pha_flag,1_oppo_flag,critical_list_flag,hex_flags,perturbers1,perturbers2,reference,computer
24,25,(25),Phocaea,A853 GB,['1956 GC'],59200.0,2.399704,0.254997,21.60654,214.11874,...,,,,,,0000,M-v,3Ek,MPO593088,Pan
34,35,(35),Leukothea,A855 HA,"['1948 DC', '1950 RS1', '1976 WH']",59200.0,2.993820,0.225167,7.93306,353.71444,...,,,,,,0000,M-v,3Ek,MPO593088,Pan
47,48,(48),Doris,A857 SA,['1948 FE'],59200.0,3.110068,0.072024,6.54750,183.54569,...,,,,,,0000,M-v,3Ek,MPO593088,Pan
57,58,(58),Concordia,A860 FA,['1928 XE'],59200.0,2.700345,0.042711,5.06600,161.09651,...,,,,,,0000,M-v,3Ek,MPO593089,Pan
64,65,(65),Cybele,A861 EB,['1949 YQ'],59200.0,3.425277,0.112211,3.56404,155.61552,...,,,,,,0000,M-v,3Ek,MPO593089,Pan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043860,2018 RN34,,,2018 RN34,,58360.0,2.772612,0.288885,11.38082,126.54590,...,,,,1.0,,2000,,,MPO461097,MPCMEL
1043861,2018 RR36,,,2018 RR36,,58400.0,3.056892,0.188234,18.00847,13.62534,...,,,,1.0,,2000,,,MPO473502,MPCW
1043862,2018 SL2,,,2018 SL2,,58400.0,2.162376,0.383049,8.44163,242.34856,...,,,,1.0,,2005,,,MPO473502,MPCW
1043867,2018 TO,,,2018 TO,,58380.0,2.566532,0.403415,30.57138,3.63667,...,,,,1.0,,2005,,,MPO461101,MPCMEL


In [53]:
# Semi-major axis classes: one entry per [bin_start, bin_end) interval in AU
sma_bins = [0, 1.7, 2.06, 2.5, 2.82, 2.95, 3.27, 5.0, 50, 1000.0]
classes_2021 = {}
for bin_start, bin_end in zip(sma_bins[:-1], sma_bins[1:]):
    bin_mask = (known_orbits["a_au"] >= bin_start) & (known_orbits["a_au"] < bin_end)
    class_name = "{}<=a<{}".format(bin_start, bin_end)
    classes_2021[class_name] = known_orbits.loc[bin_mask, "designation"].unique()

# obj_id values of the form "u" + 8 digits mark observations unattributed by ZTF
is_unattributed = preprocessed_associations_2021["obj_id"].str.contains("^u[0-9]{8}$", regex=True)

# Observations unattributed by ZTF
classes_2021["Unknown"] = preprocessed_associations_2021[is_unattributed]["obj_id"].unique()

# Observations attributed by ZTF that could not be matched to the known catalog
# (probably designation changes or comets)
in_known_catalog = preprocessed_associations_2021["obj_id"].isin(known_orbits["designation"].unique())
unclassified_mask = ~in_known_catalog & ~is_unattributed
classes_2021["Unmatched"] = preprocessed_associations_2021[unclassified_mask]["obj_id"].unique()

In [54]:
from thor.orbits import Orbits
from difi import analyzeLinkages
from difi import analyzeObservations

# Analysis products for the 2021 catalog are cached as CSV files in
# RUN_DIR/analysis_2021.
ANALYSIS_DIR = os.path.join(RUN_DIR, "analysis_2021")
os.makedirs(ANALYSIS_DIR, exist_ok=True)

# Column names difi should use for linkage IDs, observation IDs and truths.
# NOTE: this rebinds the notebook-level `column_mapping`; unlike the mapping
# used for the 'nightly_linkages' analysis it has no 'night' / 'time' keys.
column_mapping = {
    'linkage_id': 'orbit_id', 
    'obs_id': 'obs_id', 
    'truth': 'obj_id'
}

# Attach the 2021 associations to the preprocessed observations (joined on obs_id).
analysis_observations_2021 = preprocessed_observations.merge(preprocessed_associations_2021, on="obs_id")

# Findability of each truth in the survey: 'min_obs' metric with min_obs=5.
all_truths_survey_2021, findable_observations_2021, summary_survey_2021 = analyzeObservations(
    analysis_observations_2021,
    classes=classes_2021,
    metric='min_obs',
    column_mapping=column_mapping,
    min_obs=5,
)

# Recompute only when the cache is absent. Only all_orbits_recovered.csv is
# checked; the other CSV files are expected to sit alongside it.
if not os.path.exists(os.path.join(ANALYSIS_DIR, "all_orbits_recovered.csv")):
    
    # Go through each individual patch directory (which themselves
    # are individual THOR runs) and analyze the performance
    all_linkages_patches_2021 = []
    all_truths_patches_2021 = []
    summary_patches_2021 = []

    contents = sorted(glob.glob(os.path.join(RUN_DIR, "patch_*")))
    for c in contents:
        if os.path.isdir(c):
            print(f"Analyzing Patch {os.path.basename(c)}")
            # analyzeTHOR is defined outside this cell; only its second
            # return value (test_orbit_analysis) is used below.
            run_analysis, test_orbit_analysis = analyzeTHOR(
                preprocessed_associations_2021,
                c,
                classes=classes_2021,
            )

            all_linkages_patches_i, all_truths_patches_i, summary_patches_i = test_orbit_analysis
            all_linkages_patches_2021.append(all_linkages_patches_i)
            all_truths_patches_2021.append(all_truths_patches_i)
            summary_patches_2021.append(summary_patches_i)
        
        
    # Stack the per-patch results into single data frames.
    all_linkages_patches_2021 = pd.concat(
        all_linkages_patches_2021,
        ignore_index=True
    )
    all_truths_patches_2021 = pd.concat(
        all_truths_patches_2021,
        ignore_index=True
    )
    summary_patches_2021 = pd.concat(
        summary_patches_2021,
        ignore_index=True
    )
    
    # Analyze the recovered orbit members against the 2021 associations:
    # min_obs=5 and contamination_percentage=0.0.
    all_orbits_recovered_2021, all_truths_recovered_2021, summary_recovered_2021 = analyzeLinkages(
        analysis_observations_2021,
        recovered_orbit_members,
        all_truths=all_truths_survey_2021,
        min_obs=5,
        contamination_percentage=0.0,
        classes=classes_2021,
        column_mapping=column_mapping
    )
    # Temporarily tag the recovered results with component="combined" so they
    # can be stacked with the per-patch results ...
    for df in [all_orbits_recovered_2021, all_truths_recovered_2021, summary_recovered_2021]:
        df.insert(0, "component", "combined")

    all_linkages_2021 = pd.concat([all_linkages_patches_2021, all_orbits_recovered_2021], ignore_index=True)
    all_truths_2021 = pd.concat([all_truths_patches_2021, all_truths_recovered_2021], ignore_index=True)
    summary_2021 = pd.concat([summary_patches_2021, summary_recovered_2021], ignore_index=True)
    
    # ... and remove the tag again (in place) before the frames are written out.
    for df in [all_orbits_recovered_2021, all_truths_recovered_2021, summary_recovered_2021]:
        df.drop(columns=["component"], inplace=True)
    
    summary_components_2021 = createComponentSummary(
        all_linkages_2021, 
        all_truths_2021, 
        components=["clustering", "iod", "od", "od+a", "combined"]
    )
    
    # Write every product to the cache directory.
    all_orbits_recovered_2021.to_csv(
        os.path.join(ANALYSIS_DIR, "all_orbits_recovered.csv"),
        index=False
    )
    all_truths_recovered_2021.to_csv(
        os.path.join(ANALYSIS_DIR, "all_truths_recovered.csv"),
        index=False
    )
    summary_recovered_2021.to_csv(
        os.path.join(ANALYSIS_DIR, "summary_recovered.csv"),
        index=False
    )
    all_linkages_2021.to_csv(
        os.path.join(ANALYSIS_DIR, "all_linkages.csv"),
        index=False
    )
    all_truths_2021.to_csv(
        os.path.join(ANALYSIS_DIR, "all_truths.csv"),
        index=False
    )
    summary_2021.to_csv(
        os.path.join(ANALYSIS_DIR, "summary.csv"),
        index=False
    )
    summary_components_2021.to_csv(
        os.path.join(ANALYSIS_DIR, "summary_components.csv"),
        index=False
    )

else:
    # Cached results exist: load them instead of recomputing.
    all_orbits_recovered_2021 = pd.read_csv(
        os.path.join(ANALYSIS_DIR, "all_orbits_recovered.csv"),
        index_col=False
    )
    all_truths_recovered_2021 = pd.read_csv(
        os.path.join(ANALYSIS_DIR, "all_truths_recovered.csv"),
        index_col=False
    )
    summary_recovered_2021 = pd.read_csv(
        os.path.join(ANALYSIS_DIR, "summary_recovered.csv"),
        index_col=False
    )
    all_linkages_2021 = pd.read_csv(
        os.path.join(ANALYSIS_DIR, "all_linkages.csv"),
        index_col=False
    )
    all_truths_2021 = pd.read_csv(
        os.path.join(ANALYSIS_DIR, "all_truths.csv"),
        index_col=False
    )
    summary_2021 = pd.read_csv(
        os.path.join(ANALYSIS_DIR, "summary.csv"),
        index_col=False
    )
    summary_components_2021 = pd.read_csv(
        os.path.join(ANALYSIS_DIR, "summary_components.csv"),
        index_col=False
    )

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [55]:
summary_components

Unnamed: 0,component,num_linkages,num_mixed,num_partial,num_pure,num_pure_complete,num_findable,num_found,num_found_pure,num_found_partial,completeness,completeness_pure,completeness_partial,purity,linkage_efficiency
0,clustering,7136467,3981065,2885578,269824,43037,21542,21121,21112,2385,98.045678,98.003899,11.071395,3.780918,0.295833
1,iod,1343065,995310,82798,264957,42583,21542,21043,21038,2350,97.683595,97.660384,10.908922,19.727787,1.566417
2,od,306363,42262,0,264101,42203,21542,21018,21018,0,97.567542,97.567542,0.0,86.205253,6.860489
3,od+a,75120,3883,0,71237,64041,21542,20995,20995,0,97.460774,97.460774,0.0,94.830937,27.948616
4,combined,21723,705,0,21018,20407,21542,20940,20940,0,97.205459,97.205459,0.0,96.754592,96.395525


In [56]:
summary_components_2021

Unnamed: 0,component,num_linkages,num_mixed,num_partial,num_pure,num_pure_complete,num_findable,num_found,num_found_pure,num_found_partial,completeness,completeness_pure,completeness_partial,purity,linkage_efficiency
0,clustering,7136467,3850260,3007833,278374,44178,22091,21635,21627,2443,97.935811,97.899597,11.058802,3.900726,0.303049
1,iod,1343065,983447,86302,273316,43724,22091,21555,21551,2407,97.573673,97.555566,10.89584,20.350169,1.604613
2,od,306363,34055,0,272308,43343,22091,21534,21534,0,97.478611,97.478611,0.0,88.884102,7.028917
3,od+a,75120,2093,0,73027,65595,22091,21515,21515,0,97.392603,97.392603,0.0,97.213791,28.640841
4,combined,21723,183,0,21540,20919,22091,21459,21459,0,97.139106,97.139106,0.0,99.157575,98.784698


In [57]:
summary_recovered

Unnamed: 0,class,num_members,num_obs,completeness,findable,found,findable_found,findable_missed,not_findable_found,not_findable_missed,...,unique_in_partial_linkages_only,unique_in_pure_and_partial_linkages,unique_in_partial_linkages,unique_in_partial_contaminant_linkages,unique_in_mixed_linkages,obs_in_pure_linkages,obs_in_pure_complete_linkages,obs_in_partial_linkages,obs_in_partial_contaminant_linkages,obs_in_mixed_linkages
0,All,635058,827546,97.205459,21542,20940,20940,602,0,613516,...,0,0,0,0,4554,161849,154423,0,0,4973
1,Unknown,572188,572188,,0,0,0,0,0,572188,...,0,0,0,0,4384,0,0,0,0,4384
2,2.5<=a<2.82,22490,90941,97.904496,7874,7709,7709,165,0,14616,...,0,0,0,0,58,58153,56116,0,0,154
3,2.06<=a<2.5,19453,77147,95.574273,6530,6241,6241,289,0,12923,...,0,0,0,0,54,47572,45557,0,0,180
4,2.95<=a<3.27,15918,64226,99.048664,5466,5414,5414,52,0,10452,...,0,0,0,0,43,41418,39983,0,0,161
5,1.7<=a<2.06,1504,9177,93.489149,599,560,560,39,0,905,...,0,0,0,0,7,6762,5027,0,0,36
6,2.82<=a<2.95,2453,9105,98.913043,736,728,728,8,0,1717,...,0,0,0,0,1,5438,5383,0,0,1
7,3.27<=a<5.0,549,2275,97.959184,196,192,192,4,0,353,...,0,0,0,0,1,1481,1423,0,0,3
8,5.0<=a<50,331,1309,97.530864,81,79,79,2,0,250,...,0,0,0,0,2,803,781,0,0,2
9,0<=a<1.7,126,866,28.888889,45,13,13,32,0,81,...,0,0,0,0,3,190,121,0,0,24


In [58]:
summary_recovered_2021

Unnamed: 0,class,num_members,num_obs,completeness,findable,found,findable_found,findable_missed,not_findable_found,not_findable_missed,...,unique_in_partial_linkages_only,unique_in_pure_and_partial_linkages,unique_in_partial_linkages,unique_in_partial_contaminant_linkages,unique_in_mixed_linkages,obs_in_pure_linkages,obs_in_pure_complete_linkages,obs_in_partial_linkages,obs_in_partial_contaminant_linkages,obs_in_mixed_linkages
0,All,628164,827546,97.139106,22091,21459,21459,632,0,606073,...,0,0,0,0,678,165847,158185,0,0,975
1,Unknown,560400,560400,,0,0,0,0,0,560400,...,0,0,0,0,552,0,0,0,0,552
2,2.5<=a<2.82,24216,95271,97.834962,8083,7908,7908,175,0,16133,...,0,0,0,0,35,59686,57533,0,0,95
3,2.06<=a<2.5,20754,80187,95.296585,6676,6362,6362,314,0,14078,...,0,0,0,0,50,48354,46345,0,0,176
4,2.95<=a<3.27,17349,67568,99.108258,5607,5557,5557,50,0,11742,...,0,0,0,0,29,42624,41160,0,0,81
5,1.7<=a<2.06,1694,9825,93.343899,631,589,589,42,0,1063,...,0,0,0,0,5,7076,5244,0,0,14
6,2.82<=a<2.95,2638,9528,98.811096,757,748,748,9,0,1881,...,0,0,0,0,0,5559,5504,0,0,0
7,3.27<=a<5.0,608,2409,98.029557,203,199,199,4,0,405,...,0,0,0,0,1,1528,1470,0,0,3
8,5.0<=a<50,337,1321,97.560976,82,80,80,2,0,255,...,0,0,0,0,2,808,786,0,0,2
9,0<=a<1.7,144,987,27.083333,48,13,13,35,0,96,...,0,0,0,0,4,190,121,0,0,52


Filtering out suspect orbits for 2018 and 2021

In [59]:
# Orbit IDs flagged as mixed / pure when analysed against the 2021 associations
is_mixed_2021 = all_orbits_recovered_2021["mixed"] == 1
is_pure_2021 = all_orbits_recovered_2021["pure"] == 1
mixed_orbits_2021 = all_orbits_recovered_2021.loc[is_mixed_2021, "orbit_id"].unique()
pure_orbits_2021 = all_orbits_recovered_2021.loc[is_pure_2021, "orbit_id"].unique()

# Recovered orbit members joined with the 2021 associations on obs_id
analysis_orbit_members_2021 = pd.merge(
    recovered_orbit_members,
    preprocessed_associations_2021,
    on="obs_id"
)

In [60]:
# Within the mixed 2018 orbits, keep only observations whose dt_sec is missing
# or larger than 1800 s (30 minutes), then retain the orbits that still have
# at least 5 such observations.
# NOTE(review): dt_sec is presumably the time since the previous observation — confirm.
in_mixed_2018 = analysis_orbit_members["orbit_id"].isin(mixed_orbits_2018)
dt_missing_or_large = (
    analysis_orbit_members["dt_sec"].isna()
    | (analysis_orbit_members["dt_sec"] > 1800)
)
high_quality_mask = in_mixed_2018 & dt_missing_or_large

occurences = analysis_orbit_members.loc[high_quality_mask, "orbit_id"].value_counts()
orbit_ids_keep = occurences.index.values[occurences.values >= 5]
mixed_orbits_high_quality_2018 = orbit_ids_keep
print(len(orbit_ids_keep))

526


In [61]:
# On top of the previous mask, keep only observations whose obj_id has the
# "u" + 8 digits form (i.e. drop observations of known objects, using 2018),
# and again require at least 5 remaining observations per orbit.
has_u_designation = analysis_orbit_members["obj_id"].str.contains("^u[0-9]{8}$", regex=True)
high_quality_mask_2 = high_quality_mask & has_u_designation

occurences = analysis_orbit_members.loc[high_quality_mask_2, "orbit_id"].value_counts()
orbit_ids_keep = occurences.index.values[occurences.values >= 5]
mixed_orbits_high_quality_2018 = orbit_ids_keep
print(len(orbit_ids_keep))

488


In [62]:
# Membership of every recovered orbit in the 2018 / 2021 orbit ID selections
recovered_orbit_ids = recovered_orbits["orbit_id"]
in_pure_2021 = recovered_orbit_ids.isin(pure_orbits_2021)
in_mixed_2021 = recovered_orbit_ids.isin(mixed_orbits_2021)
in_mixed_2018 = recovered_orbit_ids.isin(mixed_orbits_2018)
in_high_quality_2018 = recovered_orbit_ids.isin(mixed_orbits_high_quality_2018)

# Orbits that are pure in 2021, and the number of distinct truths they link
print("Known in 2021")
known_2021 = recovered_orbits[in_pure_2021]
known_2021_linkages = all_orbits_recovered_2021[
    all_orbits_recovered_2021["orbit_id"].isin(known_2021["orbit_id"].values)
]
print(len(known_2021), known_2021_linkages["linked_truth"].nunique())

# Orbits that are still mixed in 2021
print("Unknown in 2021")
unknown_2021 = recovered_orbits[in_mixed_2021]
print(len(unknown_2021))

# Orbits that were mixed in 2018 but are pure in 2021
print("Unknown in 2018, Found in 2021")
found_since_2018 = recovered_orbits[in_mixed_2018 & in_pure_2021]
print(len(found_since_2018))

# Same, restricted to the high-quality 2018 selection
print("Hiqh Quality Unknown in 2018, Found in 2021")
found_since_2018_high_quality = recovered_orbits[in_high_quality_2018 & in_pure_2021]
print(len(found_since_2018_high_quality))

# High-quality 2018 selection that is not pure in 2021
print("Hiqh Quality Unknown in 2018, Unknown in 2021")
unknown_2021_high_quality = recovered_orbits[in_high_quality_2018 & ~in_pure_2021]
print(len(unknown_2021_high_quality))

Known in 2021
21540 21459
Unknown in 2021
183
Unknown in 2018, Found in 2021
523
Hiqh Quality Unknown in 2018, Found in 2021
477
Hiqh Quality Unknown in 2018, Unknown in 2021
11


In [63]:
# Write the high-quality orbits that are not pure in 2021 to
# RUN_DIR/discovery_candidates.csv
discovery_candidates_file = os.path.join(RUN_DIR, "discovery_candidates.csv")
discovery_candidates = Orbits.from_df(unknown_2021_high_quality)
discovery_candidates.to_csv(discovery_candidates_file)

In [64]:
unknown_2021_high_quality

Unnamed: 0,orbit_id,epoch,x,y,z,vx,vy,vz,covariance,r,...,v,v_sigma,arc_length,num_obs,num_params,num_iterations,chi2,rchi2,improved,num_nights
399,01abe8115046493d8267339fb50074c1,58376.32808,1.761619,-0.064171,-0.228694,0.000884,0.014116,-0.002094,"[[2.36346937e-06, 2.24139386e-07, -7.06175187e...",1.77756,...,0.014297,1.6e-05,5.051516,5,6,2,21.279035,5.319759,False,3
5266,17be63d615a14ebd9c84e5622a59036a,58373.289621,2.027553,-0.640169,0.92274,0.002921,0.012404,0.00097,"[[1.16973398e-07, -5.08875874e-08, 1.03705851e...",2.317808,...,0.01278,9e-06,11.920995,5,6,10,11.199017,2.799754,False,5
6907,208a071be88844d6837218f5cd2e1d9b,58369.384905,2.053634,-0.113435,0.504916,0.00084,0.012497,0.004074,"[[4.39336398e-06, 5.70024193e-07, 2.05639032e-...",2.117834,...,0.013171,2.4e-05,4.996319,5,6,2,3.145274,0.786318,False,5
10442,35d34162c69948b68cfd25600a6b2cbb,58374.344944,2.058807,0.075163,0.451023,0.001313,0.012869,0.001614,"[[2.49615042e-06, 5.74066733e-07, 1.05446024e-...",2.108971,...,0.013036,2.4e-05,6.051146,5,6,10,2.957583,0.739396,False,4
12403,43cd7cc3099644e089129f11dc5ad3cf,58370.404675,1.66919,0.124587,0.461137,-0.001988,0.013284,0.002791,"[[7.30154796e-08, 3.81182063e-08, 4.87623749e-...",1.736193,...,0.013719,4e-06,9.9575,5,6,10,15.086939,3.771735,True,5
15457,5f5f205516d5445b9bbae9e80d6ab0ca,58369.14512,2.801118,-1.199882,9.111245,-0.003198,-0.004749,-0.005921,"[[3.5047149e-05, -1.81006764e-05, 0.0001747602...",9.607329,...,0.008236,0.000204,6.127639,5,6,10,29.017786,7.254447,False,4
15984,64fe94b3869d4ba5be6fff9c78198908,58367.328609,2.35247,0.371711,0.923012,0.000147,0.0116,0.000836,"[[3.65261227e-06, 1.73593133e-06, 2.43102112e-...",2.554258,...,0.011631,4.8e-05,7.99272,5,6,10,16.02269,4.005673,True,4
17691,7b4c42ee1e804630ac7712df0cd79642,58373.305308,1.493656,-0.076738,0.157969,0.001305,0.015983,0.00031,"[[3.16501492e-07, 6.96640011e-08, 9.89778756e-...",1.503945,...,0.016039,1.8e-05,5.955949,5,6,10,6.057678,1.51442,True,5
18527,88bf5ffbc3ed444c84ce1c97b6c8efd9,58373.302984,1.780391,-0.041714,0.314543,-0.000168,0.013158,0.005104,"[[6.99916674e-08, 1.28899385e-08, 2.77864709e-...",1.808444,...,0.014114,3e-06,10.982662,5,6,10,2.282007,0.570502,False,5
21348,ddffb3b24dfd411f8a15b0d8a298af39,58374.338842,1.757968,-0.101169,0.579006,-0.00109,0.012519,0.0067,"[[4.3984375e-07, 3.9686143e-08, 3.32306913e-07...",1.853628,...,0.014242,1e-05,6.974884,7,6,10,19.041634,2.380204,True,6


In [72]:
import astropy.units as u

from astroquery.imcce import Skybot
from astropy.coordinates import SkyCoord
from astropy.time import Time

def createMPCCheckerQuery(observation):
    """
    Format a single observation as the strings needed for a position/time
    query (per the function name, an MPChecker query).

    Parameters
    ----------
    observation : `~pandas.DataFrame`
        Exactly one row with the columns "RA_deg", "Dec_deg", "mjd_utc"
        and "observatory_code".

    Returns
    -------
    tuple
        (ra_hms, dec_dms, time, observatory_code) where
        ra_hms is "HH MM SS.ss", dec_dms is "+DD MM SS.ss" / "-DD MM SS.ss",
        time is "YYYY MM DD.dd" and observatory_code is passed through
        unchanged.

    Raises
    ------
    AssertionError
        If `observation` does not contain exactly one row.
    """
    assert len(observation) == 1

    # Configure RA, Dec
    coords = SkyCoord(
        ra=observation["RA_deg"].values[:1]*u.deg,
        dec=observation["Dec_deg"].values[:1]*u.deg
    )
    ra_hms = "{:02.0f} {:02.0f} {:05.2f}".format(*coords.ra[0].hms)

    # Angle.dms carries the sign on every component, so a negative declination
    # would be rendered as e.g. "-10 -30 -15.00". signed_dms returns the sign
    # separately together with non-negative d, m, s.
    dec_sign, dec_d, dec_m, dec_s = coords.dec[0].signed_dms
    dec_dms = "{}{:02.0f} {:02.0f} {:05.2f}".format(
        "-" if dec_sign < 0 else "+", dec_d, dec_m, dec_s
    )

    # Configure observation time
    mjd_utc = observation["mjd_utc"].values[0]
    observation_time = Time(
        mjd_utc,
        scale="utc",
        format="mjd"
    )
    year, month, day = observation_time.utc.isot.split("T")[0].split("-")

    # The day fraction is appended to the day as ".dd". Clamp it at 0.99 so a
    # fraction >= 0.995 cannot round up to "1.00" and produce e.g. "151.00".
    decimal_day = min(np.modf(mjd_utc)[0], 0.99)
    time_str = "{} {} {}{}".format(
        year,
        month,
        day,
        "{:.2f}".format(decimal_day).lstrip("0")
    )

    observatory_code = observation["observatory_code"].values[0]

    return (ra_hms, dec_dms, time_str, observatory_code)
    
    
def querySkyBot(observations, radius=50*u.arcsecond, orbit_id=None):
    """
    Run a SkyBoT cone search around every observation and stack the results.

    Parameters
    ----------
    observations : `~pandas.DataFrame`
        Observations with the columns "RA_deg", "Dec_deg", "mjd_utc",
        "observatory_code" and "obs_id".
    radius : `~astropy.units.Quantity`, optional
        Cone search radius.
    orbit_id : str, optional
        Orbit ID written into the "orbit_id" column of the result. When not
        given, the notebook-level variable `orbit_id` is used if it exists
        (this is how the function was originally called), otherwise None.

    Returns
    -------
    `~pandas.DataFrame`
        One or more rows per observation, with "orbit_id" and "obs_id" as the
        first two columns. Observations whose query raised a RuntimeError
        contribute a single row holding only those two columns.
    """
    if orbit_id is None:
        # Backward compatibility: earlier callers relied on a global
        # `orbit_id` (e.g. a loop variable) instead of passing it in.
        orbit_id = globals().get("orbit_id")

    obs_ids = observations["obs_id"].values
    if len(obs_ids) == 0:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame({"orbit_id": [], "obs_id": []})

    # Configure RA, Dec
    coords = SkyCoord(
        ra=observations["RA_deg"].values*u.deg,
        dec=observations["Dec_deg"].values*u.deg
    )

    # Configure observation time
    observation_times = Time(
        observations["mjd_utc"].values,
        scale="utc",
        format="mjd"
    )
    observatory_codes = observations["observatory_code"].values

    results = []
    for i, obs_id in enumerate(obs_ids):
        try:
            result = Skybot.cone_search(
                coords[i],
                radius,
                observation_times[i],
                location=observatory_codes[i]
            )
            result = result.to_pandas()
            result.insert(0, "orbit_id", orbit_id)
            result.insert(1, "obs_id", obs_id)
        except RuntimeError:
            # NOTE(review): astroquery appears to raise RuntimeError when no
            # object is found in the field — confirm. Such observations are
            # kept as a placeholder row so every obs_id appears in the output.
            result = pd.DataFrame({
                "orbit_id" : [orbit_id],
                "obs_id" : [obs_id]
            })

        results.append(result)

    results = pd.concat(
        results,
        ignore_index=True
    )
    return results

In [73]:
# Query SkyBoT for every observation of each remaining candidate orbit.
# The loop variable must stay named `orbit_id`: querySkyBot reads it from the
# notebook namespace to label its output.
result_dfs = []
for orbit_id in unknown_2021_high_quality["orbit_id"].unique():

    print(orbit_id)
    is_member = recovered_orbit_members["orbit_id"].isin([orbit_id])
    obs_ids = recovered_orbit_members.loc[is_member, "obs_id"].values
    selected_obs = preprocessed_observations[preprocessed_observations["obs_id"].isin(obs_ids)]
    result_dfs.append(querySkyBot(selected_obs))

results = pd.concat(result_dfs, ignore_index=True)

01abe8115046493d8267339fb50074c1
17be63d615a14ebd9c84e5622a59036a
208a071be88844d6837218f5cd2e1d9b
35d34162c69948b68cfd25600a6b2cbb
43cd7cc3099644e089129f11dc5ad3cf
5f5f205516d5445b9bbae9e80d6ab0ca
64fe94b3869d4ba5be6fff9c78198908
7b4c42ee1e804630ac7712df0cd79642
88bf5ffbc3ed444c84ce1c97b6c8efd9
ddffb3b24dfd411f8a15b0d8a298af39
fd0a80fb496040709c9b33b4f8148d1b


In [74]:
results

Unnamed: 0,orbit_id,obs_id,Number,Name,RA,DEC,Type,V,posunc,centerdist,...,heliodist,alpha,elong,x,y,z,vx,vy,vz,epoch
0,01abe8115046493d8267339fb50074c1,619330380815015010,,,,,,,,,...,,,,,,,,,,
1,01abe8115046493d8267339fb50074c1,619395040815015010,,,,,,,,,...,,,,,,,,,,
2,01abe8115046493d8267339fb50074c1,622331881115015006,,,,,,,,,...,,,,,,,,,,
3,01abe8115046493d8267339fb50074c1,622413021115015007,,,,,,,,,...,,,,,,,,,,
4,01abe8115046493d8267339fb50074c1,624381894915015004,,,,,,,,,...,,,,,,,,,,
5,17be63d615a14ebd9c84e5622a59036a,612354983715015046,,,,,,,,,...,,,,,,,,,,
6,17be63d615a14ebd9c84e5622a59036a,613319693715015045,,,,,,,,,...,,,,,,,,,,
7,17be63d615a14ebd9c84e5622a59036a,619297254015015014,,,,,,,,,...,,,,,,,,,,
8,17be63d615a14ebd9c84e5622a59036a,621343584315015004,,,,,,,,,...,,,,,,,,,,
9,17be63d615a14ebd9c84e5622a59036a,624275984315015043,,,,,,,,,...,,,,,,,,,,


In [75]:
recovered_orbits[recovered_orbits["orbit_id"] == "5f5f205516d5445b9bbae9e80d6ab0ca"]

Unnamed: 0,orbit_id,epoch,x,y,z,vx,vy,vz,covariance,r,...,v,v_sigma,arc_length,num_obs,num_params,num_iterations,chi2,rchi2,improved,num_nights
15457,5f5f205516d5445b9bbae9e80d6ab0ca,58369.14512,2.801118,-1.199882,9.111245,-0.003198,-0.004749,-0.005921,"[[3.5047149e-05, -1.81006764e-05, 0.0001747602...",9.607329,...,0.008236,0.000204,6.127639,5,6,10,29.017786,7.254447,False,4


In [77]:
# NOTE(review): presumably the perihelion distance q = a * (1 - e) with
# a = -47.494 and e = 1.101 (negative a with e > 1) — the source of these
# two values is not shown in the notebook; confirm which orbit they belong to.
q = (-47.494 * (1 - 1.101))
print(q)

4.796893999999999
