## Analysis
In this notebook, we generate some of the summary statistics for the THOR runs on both simulations and ZTF alerts.

Data and results files for this notebook may be downloaded [here](https://dirac.astro.washington.edu/~moeyensj/projects/thor/paper1/).

In [1]:
import glob
import os
import numpy as np
import pandas as pd
import sqlite3 as sql
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as colors
import seaborn as sns
sns.set(font_scale=1.2, context="paper", style="ticks")
sns.set_palette("viridis")

import mysql.connector as mariadb
from astropy.time import Time

from scipy.stats import binned_statistic_2d

%matplotlib inline

import plotly
plotly.offline.init_notebook_mode(connected=True)

import sys
sys.path.append("/epyc/projects/thor/thor")

In [2]:
import thor

### Simulations

In [3]:
# Directory holding the THOR run outputs for the simulated survey; the cells below
# read allObjects_survey.txt, summary_*.txt, orbits.txt and the orbit_XXXX/ folders from it
RUN_DIR = "/epyc/projects/thor/results/msst_4x4/run_16/"
# SQLite database queried below with pd.read_sql (table mpcOrbitCat)
DATABASE = "/epyc/projects/thor/data/msst_4x4/msst_survey.db"
# NOTE: this connection is reused by the ZTF section further down and is not closed
# anywhere in the visible cells
con = sql.connect(DATABASE)

In [4]:
# Survey-level result tables written by the run: all are space-delimited text files in RUN_DIR
allObjects_survey, summary_survey, summary_orbits, test_orbits_survey = (
    pd.read_csv(os.path.join(RUN_DIR, file_name), sep=" ", index_col=False)
    for file_name in (
        "allObjects_survey.txt",
        "summary_survey.txt",
        "summary_orbits.txt",
        "orbits.txt",
    )
)

# Orbit catalogue, restricted to the designations that appear in this run
known_orbits = pd.read_sql("SELECT * FROM mpcOrbitCat", con)
in_this_run = known_orbits["designation"].isin(allObjects_survey["designation"].values)
known_orbits = known_orbits.loc[in_this_run]

In [5]:
# Designation arrays selected by the 0/1 "findable" and "found" flags
findable = allObjects_survey.loc[allObjects_survey["findable"] == 1, "designation"].values
found = allObjects_survey.loc[allObjects_survey["found"] == 1, "designation"].values
missed = allObjects_survey.loc[
    (allObjects_survey["found"] == 0) & (allObjects_survey["findable"] == 1),
    "designation",
].values
test_orbits = test_orbits_survey["designation"].values

# IDs passed to printClusterStats so that these linkages are not counted as good
falsePositiveIDs = ["NS"]
unknownIDs = []

In [6]:
def printRunStats(allObjects, orbits):
    """
    Print completeness and object counts for a run.

    Parameters
    ----------
    allObjects : `~pandas.DataFrame`
        Per-object table with "findable" and "found" columns. A value of 1 marks an
        object as findable / found; a "found" value of 0 marks it as not found.
    orbits : sequence
        Test orbits used for the run; only its length is reported.

    Returns
    -------
    None
        Results are printed to stdout.
    """
    is_findable = allObjects["findable"] == 1
    num_findable = int(is_findable.sum())
    num_found = int((allObjects["found"] == 1).sum())
    num_missed = int(((allObjects["found"] == 0) & is_findable).sum())

    # Completeness is undefined when nothing is findable: report NaN instead of
    # raising ZeroDivisionError
    completeness = num_found / num_findable if num_findable > 0 else float("nan")

    print("Completeness: {:.2f}".format(completeness * 100.0))
    print("Number of objects findable: {}".format(num_findable))
    print("Number of objects found: {}".format(num_found))
    print("Number of objects missed: {}".format(num_missed))
    print("Number of test orbits: {}".format(len(orbits)))
    
def printClusterStats(allClusters, falsePositiveIDs=["NS"], unknownIDs=[]):
    """
    Print cluster counts and the cluster contamination percentage.

    A cluster counts as a good linkage when its "linked_object" is not NaN and is
    not one of the false positive or unknown IDs.

    Parameters
    ----------
    allClusters : `~pandas.DataFrame`
        Cluster table with "linked_object", "pure", "partial" and "num_dupes" columns.
    falsePositiveIDs : sequence, optional
        Linked-object IDs that mark false positives.
    unknownIDs : sequence, optional
        Linked-object IDs that mark unknown objects.

    Returns
    -------
    None
        Results are printed to stdout.
    """
    # list() lets tuples / arrays be passed as well; the default lists are never mutated
    excluded_ids = list(falsePositiveIDs) + list(unknownIDs)

    # Don't want linkages with NaN linked ID, or linked false positive IDs or unknownIDs
    linked = allClusters["linked_object"]
    good_linkages = allClusters[(~linked.isna()) & (~linked.isin(excluded_ids))]

    num_clusters = len(allClusters)
    num_good = len(good_linkages)
    # Contamination is undefined for an empty cluster table: report NaN instead of
    # raising ZeroDivisionError
    contamination = (1 - num_good / num_clusters) * 100.0 if num_clusters > 0 else float("nan")

    print("Total clusters: {}".format(num_clusters))
    print("Total pure clusters: {}".format(len(good_linkages[good_linkages["pure"] == 1])))
    print("Total partial clusters: {}".format(len(good_linkages[good_linkages["partial"] == 1])))
    print("Total pure + partial clusters: {}".format(num_good))
    print("Total false or unknown clusters: {}".format(num_clusters - num_good))
    print("Cluster Contamination [%]: {:.2f}".format(contamination))
    print("Total duplicate visit clusters: {}".format(len(allClusters[allClusters["num_dupes"] != 0])))

In [7]:
printRunStats(allObjects_survey, test_orbits)

Completeness: 94.57
Number of objects findable: 18332
Number of objects found: 17337
Number of objects missed: 995
Number of test orbits: 9


In [8]:
# One output folder per test orbit: orbit_0001 ... orbit_0009
orbit_dirs = [os.path.join(RUN_DIR, "orbit_{:04d}".format(orbit_number)) for orbit_number in range(1, 10)]

# Stack the per-orbit cluster and object tables into one frame each
allClusters_projection = pd.concat(
    [pd.read_csv(os.path.join(orbit_dir, "allClusters.txt"), sep=" ", index_col=False) for orbit_dir in orbit_dirs]
)
allObjects_projection = pd.concat(
    [pd.read_csv(os.path.join(orbit_dir, "allObjects.txt"), sep=" ", index_col=False) for orbit_dir in orbit_dirs]
)

# Observations in a cluster beyond one per visit; printClusterStats reports clusters
# where this is non-zero as "duplicate visit clusters"
allClusters_projection["num_dupes"] = allClusters_projection["num_obs"] - allClusters_projection["num_visits"]

In [9]:
printClusterStats(allClusters_projection, falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

Total clusters: 238205
Total pure clusters: 46688
Total partial clusters: 12910
Total pure + partial clusters: 59598
Total false or unknown clusters: 178607
Cluster Contamination [%]: 74.98
Total duplicate visit clusters: 157622


In [10]:
printClusterStats(allClusters_projection[allClusters_projection["num_dupes"] == 0], falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

Total clusters: 80583
Total pure clusters: 46688
Total partial clusters: 2457
Total pure + partial clusters: 49145
Total false or unknown clusters: 31438
Cluster Contamination [%]: 39.01
Total duplicate visit clusters: 0


In [11]:
printClusterStats(allClusters_projection[allClusters_projection["num_dupes"].isin([0, 1])], falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

Total clusters: 178243
Total pure clusters: 46688
Total partial clusters: 12910
Total pure + partial clusters: 59598
Total false or unknown clusters: 118645
Cluster Contamination [%]: 66.56
Total duplicate visit clusters: 97660


In [12]:
# Semi-major axis bin edges in AU; each bin is the half-open interval [a_min, a_max)
sma_bins = [1.7, 2.06, 2.5, 2.82, 2.95, 3.27, 5.0, 50.0]

is_found = known_orbits["designation"].isin(found)
is_findable = known_orbits["designation"].isin(findable)
for a_min, a_max in zip(sma_bins, sma_bins[1:]):
    in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    found_in_bin = int((in_bin & is_found).sum())
    findable_in_bin = int((in_bin & is_findable).sum())
    print("Completeness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, found_in_bin / findable_in_bin * 100.0, found_in_bin))

Completeness between 1.7 and 2.06 AU: 65.10 %, 125
Completeness between 2.06 and 2.5 AU: 88.78 %, 6164
Completeness between 2.5 and 2.82 AU: 98.99 %, 6351
Completeness between 2.82 and 2.95 AU: 99.65 %, 855
Completeness between 2.95 and 3.27 AU: 99.59 %, 3437
Completeness between 3.27 and 5.0 AU: 99.44 %, 353
Completeness between 5.0 and 50.0 AU: 100.00 %, 57


In [13]:
# Two coarse bins split at 1.7 AU; each bin is the half-open interval [a_min, a_max)
sma_bins = [0.0, 1.7, 50.0]
is_found = known_orbits["designation"].isin(found)
is_findable = known_orbits["designation"].isin(findable)
for a_min, a_max in zip(sma_bins, sma_bins[1:]):
    in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    found_in_bin = int((in_bin & is_found).sum())
    findable_in_bin = int((in_bin & is_findable).sum())
    print("Completeness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, found_in_bin / findable_in_bin * 100.0, found_in_bin))

Completeness between 0.0 and 1.7 AU: 18.75 %, 15
Completeness between 1.7 and 50.0 AU: 94.91 %, 17342


### ZTF 

In [14]:
# Maps the column names on the left to the column names used in the ZTF observations
# file on the right; passed to thor.analyzeObservations below.
columnMapping = {
    # --- Observation / exposure bookkeeping ---
    "obs_id": "obs_id",                # observation ID
    "exp_mjd": "exp_mjd",              # exposure time
    "visit_id": "visit_id",            # visit ID
    "field_id": "field",               # field ID
    "field_RA_deg": "fieldRA_deg",     # field RA in degrees
    "field_Dec_deg": "fieldDec_deg",   # field Dec in degrees
    "night": "nid",                    # night number

    # --- Astrometry / photometry ---
    "RA_deg": "ra",                    # RA in degrees
    "Dec_deg": "decl",                 # Dec in degrees
    "obs_x_au": "HEclObsy_X_au",       # observer's x coordinate in AU
    "obs_y_au": "HEclObsy_Y_au",       # observer's y coordinate in AU
    "obs_z_au": "HEclObsy_Z_au",       # observer's z coordinate in AU
    "mag": "magpsf",                   # magnitude (UNUSED)

    # --- Truth parameters ---
    "name": "designation",             # object name
    "Delta_au": "Delta_au",            # observer-object distance in AU
    "r_au": "r_au",                    # Sun-object (heliocentric) distance in AU
    "obj_x_au": "HEclObj_X_au",        # object's x coordinate in AU
    "obj_y_au": "HEclObj_Y_au",        # object's y coordinate in AU
    "obj_z_au": "HEclObj_Z_au",        # object's z coordinate in AU
    "obj_dx/dt_au_p_day": "HEclObj_dX/dt_au_p_day",  # object's x velocity in AU per day
    "obj_dy/dt_au_p_day": "HEclObj_dY/dt_au_p_day",  # object's y velocity in AU per day
    "obj_dz/dt_au_p_day": "HEclObj_dZ/dt_au_p_day",  # object's z velocity in AU per day
    "a_au": "a_au",                    # semi-major axis
    "i_deg": "i_deg",                  # inclination
    "e": "e",                          # eccentricity
}

In [15]:
# Point RUN_DIR at the ZTF run outputs; the loading cells below read from here.
# NOTE(review): this rebinds the name used by the simulation section, so re-running
# a simulation cell after this one would read the ZTF results instead.
RUN_DIR = "/epyc/projects/thor/results/ztf/run_16/"

In [16]:
# Survey-level result tables written by the run: all are space-delimited text files in RUN_DIR
allObjects_survey, summary_survey, summary_orbits, test_orbits_survey = (
    pd.read_csv(os.path.join(RUN_DIR, file_name), sep=" ", index_col=False)
    for file_name in (
        "allObjects_survey.txt",
        "summary_survey.txt",
        "summary_orbits.txt",
        "orbits.txt",
    )
)

# Orbit catalogue, restricted to the designations that appear in this run.
# NOTE(review): `con` is still the connection opened on the simulation database in the
# Simulations section -- confirm its mpcOrbitCat table is the intended catalogue for ZTF.
known_orbits = pd.read_sql("SELECT * FROM mpcOrbitCat", con)
in_this_run = known_orbits["designation"].isin(allObjects_survey["designation"].values)
known_orbits = known_orbits.loc[in_this_run]

In [17]:
# Designation arrays selected by the 0/1 "findable" and "found" flags
findable = allObjects_survey.loc[allObjects_survey["findable"] == 1, "designation"].values
found = allObjects_survey.loc[allObjects_survey["found"] == 1, "designation"].values
missed = allObjects_survey.loc[
    (allObjects_survey["found"] == 0) & (allObjects_survey["findable"] == 1),
    "designation",
].values
test_orbits = test_orbits_survey["designation"].values

# No false-positive label here; unknown detections carry -1, listed both as an
# integer and as a string
falsePositiveIDs = []
unknownIDs = [-1, "-1"]

In [18]:
DATA_DIR = "/epyc/projects/thor/data/ztf"

# Ephemeris and orbit files are read with THOR's own readers
simulated_ephemeris = thor.readEPHFile(os.path.join(DATA_DIR, "MPCORB_20181106_ZTF.eph"))
orbits = thor.readORBFile(os.path.join(DATA_DIR, "MPCORB_20181106_ZTF_keplerian.orb"), elementType="keplerian")

# Observations table, ordered by the "mjd" column
observations = pd.read_csv(
    os.path.join(DATA_DIR, "observations_nid_610_624.txt"),
    sep=" ",
    index_col=False,
    low_memory=False,
).sort_values(by="mjd")

# Detections without a designation are labelled -1 (the unknown ID used below)
no_designation = observations["designation"].isna()
observations.loc[no_designation, "designation"] = -1

In [19]:
def calcFindableMOPS(observations, trackletMinObs=2, trackMinNights=3, falsePositiveIDs=[-1], unknownIDs=[]):
    """
    Return the designations that have at least `trackletMinObs` observations on each
    of at least `trackMinNights` distinct nights.

    Parameters
    ----------
    observations : `~pandas.DataFrame`
        Observations with a night column "nid" and an object column "designation".
    trackletMinObs : int, optional
        Minimum number of observations of an object in one night for that night to count.
    trackMinNights : int, optional
        Minimum number of counted nights for an object to be returned.
    falsePositiveIDs : sequence, optional
        Designations marking false positives; these observations are ignored.
    unknownIDs : sequence, optional
        Designations marking unknown objects; these observations are ignored.

    Returns
    -------
    `~numpy.ndarray`
        Designations meeting the requirements (order is not significant). Empty if none do.
    """
    # list() lets tuples / arrays be passed as well; the default lists are never mutated
    excluded_ids = list(falsePositiveIDs) + list(unknownIDs)
    real_observations = observations[~observations["designation"].isin(excluded_ids)]
    if len(real_observations) == 0:
        return np.array([])

    # Number of observations of each object on each night. groupby().size() with an
    # explicit result name is used instead of value_counts() followed by a rename keyed
    # on the column name, because the name of the value_counts() result changed in
    # pandas 2.0 and the rename then silently did nothing.
    night_designation_count = (
        real_observations.groupby(["nid", "designation"])
        .size()
        .reset_index(name="num_obs")
    )

    # Remove nightly detections that would not be linked into a tracklet
    tracklet_nights = night_designation_count[night_designation_count["num_obs"] >= trackletMinObs]
    if len(tracklet_nights) == 0:
        # No objects satisfy the requirements, return empty array
        return np.array([])

    # Every remaining row is a unique (night, object) pair, so the number of rows per
    # object is the number of nights on which it has a possible tracklet
    tracklet_nights_possible = tracklet_nights["designation"].value_counts()
    meets_night_requirement = (tracklet_nights_possible >= trackMinNights).values
    return tracklet_nights_possible.index[meets_night_requirement].values

In [20]:
findableMOPS = calcFindableMOPS(observations)

In [21]:
len(findableMOPS)

9373

In [22]:
allObjects_survey[allObjects_survey["designation"].isin(findableMOPS)]["found"].sum() / len(findableMOPS)

0.9900778832817668

In [23]:
# Percentage of the run's findable objects that meet the MOPS requirement.
# len(findable) replaces the hard-coded 21401 (the "Number of objects findable"
# reported by printRunStats for this run), so the value tracks the loaded data.
len(findableMOPS) / len(findable) * 100

43.79701883089575

In [24]:
def calcDiscoverableZMODE(observations, 
                          minTracklets=2, 
                          trackletMinObs=2,
                          minObsPerTrack=4,
                          maxTrackNightSpan=4, 
                          falsePositiveIDs=[-1],
                          unknownIDs=[]):
    """
    Return the designations that satisfy the tracklet, night-gap and observation-count
    requirements below.

    An object is returned when, after dropping its nights that lie `maxTrackNightSpan`
    or more nights after its previous observed night, it has at least `minTracklets`
    nights with `trackletMinObs` or more observations and at least `minObsPerTrack`
    observations in total over the remaining nights.

    Parameters
    ----------
    observations : `~pandas.DataFrame`
        Observations with a night column "nid" and an object column "designation".
    minTracklets : int, optional
        Minimum number of nights with a possible tracklet.
    trackletMinObs : int, optional
        Minimum number of observations in a night for it to count as a possible tracklet.
    minObsPerTrack : int, optional
        Minimum total number of observations over the retained nights.
    maxTrackNightSpan : int, optional
        A night is retained only if its gap to the object's previous observed night is
        smaller than this (the first night has a gap of 0).
    falsePositiveIDs : sequence, optional
        Designations marking false positives; these observations are ignored.
    unknownIDs : sequence, optional
        Designations marking unknown objects; these observations are ignored.

    Returns
    -------
    `~numpy.ndarray`
        Designations meeting the requirements, sorted by designation. Empty if none do.
    """
    # list() lets tuples / arrays be passed as well; the default lists are never mutated
    excluded_ids = list(falsePositiveIDs) + list(unknownIDs)
    real_observations = observations[~observations["designation"].isin(excluded_ids)]
    if len(real_observations) == 0:
        return np.array([])

    # Number of observations of each object on each night, rows ordered by night.
    # groupby().size() with an explicit result name is used instead of value_counts()
    # followed by a rename keyed on the column name, because the name of the
    # value_counts() result changed in pandas 2.0 and the rename then did nothing.
    night_designation_count = (
        real_observations.groupby(["nid", "designation"])
        .size()
        .reset_index(name="num_obs")
    )

    # Gap (in nights) to the previous night on which the same object was observed;
    # the first night of each object has no predecessor and gets a gap of 0
    night_designation_count["delta_night"] = night_designation_count.groupby(["designation"])["nid"].diff()
    night_designation_count.loc[night_designation_count["delta_night"].isna(), "delta_night"] = 0
    night_designation_count["possible_tracklet"] = np.where(night_designation_count["num_obs"] >= trackletMinObs, 1, 0)

    # NOTE(review): this drops individual nights that follow a long gap; it does not
    # bound the total span between an object's first and last retained night -- confirm
    # this is the intended reading of maxTrackNightSpan
    night_designation_count = night_designation_count[night_designation_count["delta_night"] < maxTrackNightSpan]
    tracklets_per_designation = night_designation_count.groupby(["designation"])["possible_tracklet"].sum()
    possibly_findable = tracklets_per_designation.index.values[(tracklets_per_designation >= minTracklets).values]

    obs_per_designation = night_designation_count[night_designation_count["designation"].isin(possibly_findable)].groupby(["designation"])["num_obs"].sum()
    return obs_per_designation.index.values[(obs_per_designation >= minObsPerTrack).values]
    

In [25]:
findableZMODE = calcDiscoverableZMODE(observations)

In [26]:
len(findableZMODE)

14200

In [27]:
allObjects_survey[allObjects_survey["designation"].isin(findableZMODE)]["found"].sum() / len(findableZMODE)

0.9557746478873239

In [28]:
# Percentage of the run's findable objects that meet the ZMODE requirement.
# len(findable) replaces the hard-coded 21401 (the "Number of objects findable"
# reported by printRunStats for this run), so the value tracks the loaded data.
len(findableZMODE) / len(findable) * 100

66.35203962431662

In [29]:
_, _ = thor.analyzeObservations(observations, unknownIDs=unknownIDs, falsePositiveIDs=falsePositiveIDs, columnMapping=columnMapping)

THOR: analyzeObservations
-------------------------
Analyzing observations...
Known object observations: 252836
Unknown object observations: 574710
False positive observations: 0
Percent known object observations (%): 30.553
Percent unknown object observations (%): 69.447
Percent false positive observations (%): 0.000
Unique known objects: 62307
Unique known objects with at least 5 detections: 21401

Total time in seconds: 0.5905530452728271
-------------------------



In [30]:
printRunStats(allObjects_survey, test_orbits)

Completeness: 97.38
Number of objects findable: 21401
Number of objects found: 20840
Number of objects missed: 561
Number of test orbits: 821


In [31]:
allClusters_projection = []
allObjects_projection = []
# (orbit number, error) for every orbit whose result files could not be read
skipped_orbits = []
# One orbit_XXXX folder per test orbit (821 for this run)
num_test_orbits = len(test_orbits)
for i in range(num_test_orbits):
    orbit_dir = os.path.join(RUN_DIR, "orbit_{:04d}".format(i + 1))
    try:
        allClusters_projection.append(pd.read_csv(os.path.join(orbit_dir, "allClusters.txt"), sep=" ", index_col=False, low_memory=False))
        allObjects_projection.append(pd.read_csv(os.path.join(orbit_dir, "allObjects.txt"), sep=" ", index_col=False, low_memory=False))
    except Exception as err:
        # Still best effort (orbits without readable results are skipped), but unlike a
        # bare `except:` this does not swallow KeyboardInterrupt / SystemExit, and the
        # skipped orbits are recorded instead of vanishing silently
        skipped_orbits.append((i + 1, repr(err)))
        continue

if skipped_orbits:
    print("Skipped {} of {} test orbits with missing or unreadable results".format(len(skipped_orbits), num_test_orbits))

allClusters_projection = pd.concat(allClusters_projection)
allObjects_projection = pd.concat(allObjects_projection)

# Normalise the integer unknown ID to its string form so one label covers all unknowns
allClusters_projection.loc[allClusters_projection["linked_object"].isin([-1]), "linked_object"] = "-1"
# Observations in a cluster beyond one per visit; printClusterStats reports clusters
# where this is non-zero as "duplicate visit clusters"
allClusters_projection["num_dupes"] = allClusters_projection["num_obs"] - allClusters_projection["num_visits"] 

In [32]:
printClusterStats(allClusters_projection, falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

Total clusters: 4282373
Total pure clusters: 74148
Total partial clusters: 167486
Total pure + partial clusters: 241634
Total false or unknown clusters: 4040739
Cluster Contamination [%]: 94.36
Total duplicate visit clusters: 2521184


In [33]:
printClusterStats(allClusters_projection[allClusters_projection["num_dupes"] == 0], falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

Total clusters: 1761189
Total pure clusters: 73939
Total partial clusters: 162256
Total pure + partial clusters: 236195
Total false or unknown clusters: 1524994
Cluster Contamination [%]: 86.59
Total duplicate visit clusters: 0


In [34]:
printClusterStats(allClusters_projection[allClusters_projection["num_dupes"].isin([0, 1])], falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

Total clusters: 2026543
Total pure clusters: 74148
Total partial clusters: 167040
Total pure + partial clusters: 241188
Total false or unknown clusters: 1785355
Cluster Contamination [%]: 88.10
Total duplicate visit clusters: 265354


In [35]:
# Semi-major axis bin edges in AU; each bin is the half-open interval [a_min, a_max)
sma_bins = [1.7, 2.06, 2.5, 2.82, 2.95, 3.27, 5.0, 100.0]

is_found = known_orbits["designation"].isin(found)
is_findable = known_orbits["designation"].isin(findable)
for a_min, a_max in zip(sma_bins, sma_bins[1:]):
    in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    found_in_bin = int((in_bin & is_found).sum())
    findable_in_bin = int((in_bin & is_findable).sum())
    print("Completeness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, found_in_bin / findable_in_bin * 100.0, found_in_bin))

Completeness between 1.7 and 2.06 AU: 94.61 %, 544
Completeness between 2.06 and 2.5 AU: 95.69 %, 6218
Completeness between 2.5 and 2.82 AU: 97.79 %, 7639
Completeness between 2.82 and 2.95 AU: 99.04 %, 724
Completeness between 2.95 and 3.27 AU: 99.26 %, 5375
Completeness between 3.27 and 5.0 AU: 98.96 %, 191
Completeness between 5.0 and 100.0 AU: 97.56 %, 80


In [36]:
# Two coarse bins split at 1.7 AU; each bin is the half-open interval [a_min, a_max).
# NOTE(review): the upper edge is 50 AU here but 100 AU in the finer-binned ZTF cell
# above, so objects with 50 <= a < 100 AU are not counted here -- confirm intended.
sma_bins = [0.0, 1.7, 50.0]
is_found = known_orbits["designation"].isin(found)
is_findable = known_orbits["designation"].isin(findable)
for a_min, a_max in zip(sma_bins, sma_bins[1:]):
    in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    found_in_bin = int((in_bin & is_found).sum())
    findable_in_bin = int((in_bin & is_findable).sum())
    print("Completeness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, found_in_bin / findable_in_bin * 100.0, found_in_bin))

Completeness between 0.0 and 1.7 AU: 43.90 %, 18
Completeness between 1.7 and 50.0 AU: 97.49 %, 20770


In [37]:
# Two coarse bins split at 2.5 AU; each bin is the half-open interval [a_min, a_max).
# NOTE(review): the upper edge is 50 AU here but 100 AU in the finer-binned ZTF cell
# above, so objects with 50 <= a < 100 AU are not counted here -- confirm intended.
sma_bins = [0.0, 2.5, 50.0]
is_found = known_orbits["designation"].isin(found)
is_findable = known_orbits["designation"].isin(findable)
for a_min, a_max in zip(sma_bins, sma_bins[1:]):
    in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    found_in_bin = int((in_bin & is_found).sum())
    findable_in_bin = int((in_bin & is_findable).sum())
    print("Completeness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, found_in_bin / findable_in_bin * 100.0, found_in_bin))

Completeness between 0.0 and 2.5 AU: 95.31 %, 6780
Completeness between 2.5 and 50.0 AU: 98.43 %, 14008


In [38]:
summary_orbits["num_unique_known_objects_findable"].value_counts()

0       146
1        41
2        22
3        19
4        18
13       18
9        17
5        16
8        15
7        13
11       13
14       13
6        12
28       11
15       11
24       11
22       11
19       10
12       10
10        9
18        8
36        8
16        7
68        7
33        7
45        6
25        6
29        6
41        5
40        5
       ... 
1167      1
142       1
204       1
212       1
349       1
1238      1
333       1
332       1
328       1
2366      1
313       1
311       1
308       1
304       1
293       1
1311      1
273       1
269       1
266       1
257       1
256       1
255       1
246       1
244       1
243       1
240       1
239       1
224       1
217       1
157       1
Name: num_unique_known_objects_findable, Length: 237, dtype: int64

In [39]:
summary_orbits["num_unique_known_objects_found"].value_counts()

0       441
1        89
2        35
3        31
5        19
4        17
6        15
7        11
8        11
13        8
16        7
20        5
10        5
18        5
17        5
14        5
15        4
11        4
9         4
12        3
21        3
37        3
35        3
29        2
59        2
52        2
45        2
69        2
38        2
33        2
       ... 
395       1
404       1
514       1
517       1
672       1
704       1
829       1
929       1
176       1
154       1
1997      1
147       1
63        1
68        1
71        1
72        1
77        1
81        1
84        1
85        1
87        1
96        1
97        1
99        1
100       1
103       1
126       1
130       1
145       1
62        1
Name: num_unique_known_objects_found, Length: 91, dtype: int64

In [40]:
summary_orbits["num_unique_known_objects_found"].max()

2331

In [41]:
unknown_clusters = len(allClusters_projection[allClusters_projection["linked_object"].isin(unknownIDs)])
unknown_clusters

2002501

In [42]:
mixed_clusters = len(allClusters_projection[allClusters_projection["linked_object"].isna()])
mixed_clusters

2038238

In [43]:
true_clusters = len(allClusters_projection[(~allClusters_projection["linked_object"].isna()) & (~allClusters_projection["linked_object"].isin(["-1"]))])
true_clusters

241634

In [44]:
total_clusters = true_clusters + mixed_clusters + unknown_clusters

In [45]:
total_clusters == len(allClusters_projection)

True

In [46]:
start = Time(observations["exp_mjd"].min(), format="mjd", scale="utc")
end = Time(observations["exp_mjd"].max(), format="mjd", scale="utc")

In [47]:
start.isot

'2018-09-03T03:07:53.999'

In [48]:
end.isot

'2018-09-17T12:37:12.999'