In [1]:
import glob
import os
import numpy as np
import pandas as pd
import sqlite3 as sql
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as colors
import seaborn as sns
sns.set(font_scale=1.2, context="paper", style="ticks")
sns.set_palette("viridis")

import mysql.connector as mariadb
from astropy.time import Time

from scipy.stats import binned_statistic_2d

%matplotlib inline

import plotly
plotly.offline.init_notebook_mode(connected=True)

import sys
sys.path.append("../..")

In [2]:
import thor

### Simulations

In [3]:
# Run directory for this section; later cells build file paths with os.path.join(RUN_DIR, ...).
RUN_DIR = "../../analysis/msst_4x4/run_14/"
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
DATABASE = "/epyc/projects/thor/data/msst_survey.db"
# Connection reused by the pd.read_sql calls in later cells; it is not closed in the cells visible here.
con = sql.connect(DATABASE)

In [4]:
# Run-level tables: space-delimited text files read from RUN_DIR.
allObjects_survey = pd.read_csv(os.path.join(RUN_DIR, "allObjects_survey.txt"), sep=" ", index_col=False)
summary_survey = pd.read_csv(os.path.join(RUN_DIR, "summary_survey.txt"), sep=" ", index_col=False)
summary_orbits = pd.read_csv(os.path.join(RUN_DIR, "summary_orbits.txt"), sep=" ", index_col=False)
test_orbits_survey = pd.read_csv(os.path.join(RUN_DIR, "orbits.txt"), sep=" ", index_col=False)

# Full mpcOrbitCat table from the database, then restricted to the designations
# that appear in allObjects_survey.
known_orbits = pd.read_sql("SELECT * FROM mpcOrbitCat", con)
known_orbits = known_orbits[known_orbits["designation"].isin(allObjects_survey["designation"].values)]

In [5]:
# Designations split by the survey-level "findable" / "found" flags in allObjects_survey.
findable = allObjects_survey.loc[allObjects_survey["findable"] == 1, "designation"].values
found = allObjects_survey.loc[allObjects_survey["found"] == 1, "designation"].values
# Missed = findable but not found.
missed = allObjects_survey.loc[
    (allObjects_survey["findable"] == 1) & (allObjects_survey["found"] == 0),
    "designation",
].values
test_orbits = test_orbits_survey["designation"].values

# Linked-object IDs that must not count as good linkages (passed to printClusterStats below).
falsePositiveIDs = ["NS"]
unknownIDs = []

In [6]:
def printRunStats(allObjects, orbits):
    """
    Print completeness and object counts for a run.

    Parameters
    ----------
    allObjects : pandas.DataFrame
        Needs the columns "found" and "findable"; rows where the column
        equals 1 (or 0 for the missed count) are counted.
    orbits : sized collection
        Only its length is reported.

    Notes
    -----
    Completeness is (number found) / (number findable) * 100. When no object
    is findable the ratio is undefined and is printed as nan instead of
    raising ZeroDivisionError.
    """
    # Evaluate each mask once instead of re-filtering the frame for every line.
    is_found = allObjects["found"] == 1
    is_findable = allObjects["findable"] == 1
    num_found = int(is_found.sum())
    num_findable = int(is_findable.sum())
    num_missed = int(((allObjects["found"] == 0) & is_findable).sum())

    completeness = num_found / num_findable if num_findable > 0 else float("nan")

    print("Completeness: {:.2f}".format(completeness * 100.0))
    print("Number of objects findable: {}".format(num_findable))
    print("Number of objects found: {}".format(num_found))
    print("Number of objects missed: {}".format(num_missed))
    print("Number of test orbits: {}".format(len(orbits)))
    
def printClusterStats(allClusters, falsePositiveIDs=["NS"], unknownIDs=[]):
    """
    Print purity and contamination statistics for a table of clusters.

    Parameters
    ----------
    allClusters : pandas.DataFrame
        Needs the columns "linked_object", "pure", "partial" and "num_dupes".
    falsePositiveIDs : iterable, optional
        Linked-object IDs that mark false positive detections.
    unknownIDs : iterable, optional
        Linked-object IDs that mark unknown objects.

    Notes
    -----
    A "good" linkage has a non-NaN linked object that is in neither ID list.
    Contamination is the percentage of clusters that are not good linkages;
    for an empty table it is printed as nan instead of raising
    ZeroDivisionError. The default lists are only read, never mutated.
    """
    # Accept any iterable (list, tuple, array) for the two ID collections.
    excluded_ids = list(falsePositiveIDs) + list(unknownIDs)

    # Don't want linkages with NaN linked ID, or linked false positive IDs or unknownIDs
    linked = allClusters["linked_object"]
    good_linkages = allClusters[(~linked.isna()) & (~linked.isin(excluded_ids))]

    num_clusters = len(allClusters)
    num_good = len(good_linkages)
    if num_clusters > 0:
        contamination = (1 - num_good / num_clusters) * 100.0
    else:
        contamination = float("nan")

    print("Total clusters: {}".format(num_clusters))
    print("Total pure clusters: {}".format(len(good_linkages[good_linkages["pure"] == 1])))
    print("Total partial clusters: {}".format(len(good_linkages[good_linkages["partial"] == 1])))
    print("Total pure + partial clusters: {}".format(num_good))
    print("Total false clusters: {}".format(num_clusters - num_good))
    print("Cluster Contamination [%]: {:.2f}".format(contamination))
    print("Total duplicate visit clusters: {}".format(len(allClusters[allClusters["num_dupes"] != 0])))

In [7]:
# Completeness and object counts for the run-level tables loaded above.
printRunStats(allObjects_survey, test_orbits)

Completeness: 94.19
Number of objects findable: 18332
Number of objects found: 17267
Number of objects missed: 1065
Number of test orbits: 8


In [8]:
# Per-orbit tables live in sub-directories orbit_0001 ... orbit_0008 of RUN_DIR;
# read each one and stack them into a single frame per table type.
allClusters_projection = pd.concat([
    pd.read_csv(os.path.join(RUN_DIR, "orbit_{:04d}/allClusters.txt".format(orbit_number)), sep=" ", index_col=False)
    for orbit_number in range(1, 9)
])
allObjects_projection = pd.concat([
    pd.read_csv(os.path.join(RUN_DIR, "orbit_{:04d}/allObjects.txt".format(orbit_number)), sep=" ", index_col=False)
    for orbit_number in range(1, 9)
])

# Observations in a cluster beyond one per visit.
allClusters_projection["num_dupes"] = allClusters_projection["num_obs"] - allClusters_projection["num_visits"]

In [9]:
# Cluster statistics over every cluster from all test-orbit projections.
printClusterStats(allClusters_projection, falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

Total clusters: 234106
Total pure clusters: 46545
Total partial clusters: 12874
Total pure + partial clusters: 59419
Total false clusters: 174687
Cluster Contamination [%]: 74.62
Total duplicate visit clusters: 154312


In [10]:
# Same statistics restricted to clusters with num_dupes == 0 (num_obs equals num_visits).
printClusterStats(allClusters_projection[allClusters_projection["num_dupes"] == 0], falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

Total clusters: 79794
Total pure clusters: 46545
Total partial clusters: 2457
Total pure + partial clusters: 49002
Total false clusters: 30792
Cluster Contamination [%]: 38.59
Total duplicate visit clusters: 0


In [11]:
# Same statistics restricted to clusters with num_dupes of 0 or 1.
printClusterStats(allClusters_projection[allClusters_projection["num_dupes"].isin([0, 1])], falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

Total clusters: 175683
Total pure clusters: 46545
Total partial clusters: 12874
Total pure + partial clusters: 59419
Total false clusters: 116264
Cluster Contamination [%]: 66.18
Total duplicate visit clusters: 95889


In [37]:
# Semi-major axis bin edges in AU; consecutive edges define half-open bins [a_min, a_max).
sma_bins = [1.7, 2.06, 2.5, 2.82, 2.95, 3.27, 5.0]

# Membership in found / findable does not depend on the bin, so evaluate it once.
is_found = known_orbits["designation"].isin(found)
is_findable = known_orbits["designation"].isin(findable)

for a_min, a_max in zip(sma_bins[:-1], sma_bins[1:]):
    in_bin = (known_orbits["a_au"] >= a_min) & (known_orbits["a_au"] < a_max)
    found_in_bin = int((in_bin & is_found).sum())
    findable_in_bin = int((in_bin & is_findable).sum())
    # A bin without findable objects has no defined completeness: report nan
    # instead of raising ZeroDivisionError.
    completeness_in_bin = found_in_bin / findable_in_bin * 100.0 if findable_in_bin > 0 else float("nan")
    # NOTE(review): "Completness" is misspelled; the text is kept unchanged so it
    # still matches the recorded output of this cell.
    print("Completness between {} and {} AU: {:.2f} %, {}".format(a_min, a_max, completeness_in_bin, found_in_bin))

Completness between 1.7 and 2.06 AU: 65.10 %, 125
Completness between 2.06 and 2.5 AU: 88.69 %, 6158
Completness between 2.5 and 2.82 AU: 98.94 %, 6348
Completness between 2.82 and 2.95 AU: 99.65 %, 855
Completness between 2.95 and 3.27 AU: 99.59 %, 3437
Completness between 3.27 and 5.0 AU: 99.44 %, 353


### ZTF 

In [None]:
# Column-name mapping passed to thor.analyzeObservations further down (columnMapping=...).
# Values such as "nid", "ra", "decl" and "designation" are the column names this notebook
# uses on the ZTF `observations` frame; the keys are presumably the names THOR expects
# internally -- TODO confirm against the thor documentation.
columnMapping = {        
        
        # Observation ID
        "obs_id" : "obs_id",
        
        # Exposure time (the column name suggests an MJD -- TODO confirm)
        "exp_mjd" : "exp_mjd",
        
        # Visit ID
        "visit_id" : "visit_id",
        
        # Field ID
        "field_id" : "field",
        
        # Field RA in degrees
        "field_RA_deg" : "fieldRA_deg",
        
        # Field Dec in degrees
        "field_Dec_deg" : "fieldDec_deg",
        
        # Night number
        "night": "nid",
        
        # RA in degrees
        "RA_deg" : "ra",
        
        # Dec in degrees
        "Dec_deg" : "decl",
        
        # Observer's x coordinate in AU
        "obs_x_au" : "HEclObsy_X_au",
        
        # Observer's y coordinate in AU
        "obs_y_au" : "HEclObsy_Y_au",
        
        # Observer's z coordinate in AU
        "obs_z_au" : "HEclObsy_Z_au",
        
        # Magnitude (UNUSED)
        "mag" : "magpsf",
        
        ### Truth Parameters
        
        # Object name
        "name" : "designation",
        
        # Observer-object distance in AU
        "Delta_au" : "Delta_au",
        
        # Sun-object distance in AU (heliocentric distance)
        "r_au" : "r_au",
        
        # Object's x coordinate in AU
        "obj_x_au" : "HEclObj_X_au",
        
        # Object's y coordinate in AU
        "obj_y_au" : "HEclObj_Y_au",
        
        # Object's z coordinate in AU
        "obj_z_au" : "HEclObj_Z_au",
        
        # Object's x velocity in AU per day
        "obj_dx/dt_au_p_day" : "HEclObj_dX/dt_au_p_day",
        
        # Object's y velocity in AU per day
        "obj_dy/dt_au_p_day" : "HEclObj_dY/dt_au_p_day",
        
        # Object's z velocity in AU per day
        "obj_dz/dt_au_p_day" : "HEclObj_dZ/dt_au_p_day",
        
        # Semi-major axis
        "a_au" : "a_au",
        
        # Inclination
        "i_deg" : "i_deg",
        
        # Eccentricity
        "e" : "e",
    }

In [None]:
# Rebinds RUN_DIR: every later os.path.join(RUN_DIR, ...) now reads from the ZTF run directory.
RUN_DIR = "../../analysis/ztf/run_10/"

In [None]:
# Reload the run-level tables from the current RUN_DIR; this rebinds the same names
# that the Simulations section used.
allObjects_survey = pd.read_csv(os.path.join(RUN_DIR, "allObjects_survey.txt"), sep=" ", index_col=False)
summary_survey = pd.read_csv(os.path.join(RUN_DIR, "summary_survey.txt"), sep=" ", index_col=False)
summary_orbits = pd.read_csv(os.path.join(RUN_DIR, "summary_orbits.txt"), sep=" ", index_col=False)
test_orbits_survey = pd.read_csv(os.path.join(RUN_DIR, "orbits.txt"), sep=" ", index_col=False)

# NOTE(review): `con` is still the connection opened on DATABASE (the msst_survey.db path)
# at the top of the notebook -- confirm that catalog is the intended one for this run.
# known_orbits is also replaced by the .orb file read a couple of cells further down.
known_orbits = pd.read_sql("SELECT * FROM mpcOrbitCat", con)
known_orbits = known_orbits[known_orbits["designation"].isin(allObjects_survey["designation"].values)]

In [None]:
# Designations split by the survey-level "findable" / "found" flags in allObjects_survey.
findable = allObjects_survey.loc[allObjects_survey["findable"] == 1, "designation"].values
found = allObjects_survey.loc[allObjects_survey["found"] == 1, "designation"].values
# Missed = findable but not found.
missed = allObjects_survey.loc[
    (allObjects_survey["findable"] == 1) & (allObjects_survey["found"] == 0),
    "designation",
].values
test_orbits = test_orbits_survey["designation"].values

# No false-positive IDs for this data set; unknown detections carry -1, listed both as
# an int and as a string.
falsePositiveIDs = []
unknownIDs = [-1, "-1"]

In [None]:
# Ephemeris and orbit files for the known objects (nid 610-624); known_orbits is rebound here.
simulated_ephemeris = pd.read_csv(
    "../../analysis/ztf/known_object_observations_nid_610_624.eph",
    sep=" ",
    index_col=False,
    low_memory=False,
)
known_orbits = pd.read_csv(
    "../../analysis/ztf/known_object_observations_nid_610_624.orb",
    sep=" ",
    index_col=False,
    low_memory=False,
)

# Full observation file (the known-object-only file is the commented-out alternative).
#observations = pd.read_csv("../analysis/ztf/known_object_observations_nid_610_624.txt", sep=" ", index_col=False, low_memory=False)
observations = pd.read_csv(
    "../../analysis/ztf/observations_nid_610_624.txt",
    sep=" ",
    index_col=False,
    low_memory=False,
)

# Order by observation time, then tag detections without a designation with -1
# rather than dropping them.
observations = observations.sort_values(by="mjd")
#observations.drop(index=observations[observations["designation"].isna()].index, inplace=True)
observations.loc[observations["designation"].isna(), "designation"] = -1

In [None]:
def calcFindableMOPS(observations, trackletMinObs=2, trackMinNights=3, falsePositiveIDs=[-1], unknownIDs=[]):
    """
    Find the objects that have enough tracklet nights to be findable.

    An object qualifies when it has at least `trackletMinObs` observations on
    each of at least `trackMinNights` distinct nights.

    Parameters
    ----------
    observations : pandas.DataFrame
        Needs the columns "designation" and "nid" (night ID).
    trackletMinObs : int, optional
        Minimum observations of one object in one night for that night to count.
    trackMinNights : int, optional
        Minimum number of counting nights.
    falsePositiveIDs : iterable, optional
        Designations marking false positives; their rows are ignored.
    unknownIDs : iterable, optional
        Designations marking unknown objects; their rows are ignored.

    Returns
    -------
    numpy.ndarray
        Designations of the qualifying objects; an empty array when none
        qualify. The order of the entries is not significant.
    """
    # The default lists are only read, never mutated.
    excluded_ids = list(falsePositiveIDs) + list(unknownIDs)
    attributed = observations[~observations["designation"].isin(excluded_ids)]
    if attributed.empty:
        return np.array([])

    # Groupby night, then count number of occurences per night.
    # The counts are named on the Series itself: the name pandas gives the result of
    # value_counts() changed in pandas 2.0 ("designation" -> "count"), so renaming a
    # DataFrame column called "designation" silently does nothing on newer versions.
    night_designation_count = (
        attributed.groupby(["nid"])["designation"]
        .value_counts()
        .rename("num_obs")
        .reset_index()
    )

    # Remove nightly detections that would not be linked into a tracklet
    tracklet_nights = night_designation_count[night_designation_count["num_obs"] >= trackletMinObs]
    if tracklet_nights.empty:
        # No objects satisfy the requirements, return empty array
        return np.array([])

    # Each remaining row is a unique (night, object) pair, so counting rows per object
    # gives the number of tracklet nights for that object.
    tracklet_nights_possible = tracklet_nights["designation"].value_counts()

    # Grab objects that meet the night requirement
    return tracklet_nights_possible.index[(tracklet_nights_possible >= trackMinNights).values].values

In [None]:
# Objects that meet calcFindableMOPS's default tracklet and night requirements.
findableMOPS = calcFindableMOPS(observations)

In [None]:
# Sum of the "found" flags over the MOPS-findable designations, as a fraction of len(findableMOPS).
allObjects_survey[allObjects_survey["designation"].isin(findableMOPS)]["found"].sum() / len(findableMOPS)

In [None]:
# NOTE(review): 21401 is a hard-coded denominator whose origin is not shown in this
# notebook -- presumably the total number of known objects; confirm and name it.
len(findableMOPS) / 21401

In [None]:
def calcDiscoverableZMODE(observations, 
                          minTracklets=2, 
                          trackletMinObs=2,
                          minObsPerTrack=4,
                          maxTrackNightSpan=4, 
                          falsePositiveIDs=[-1],
                          unknownIDs=[]):
    """
    Find the objects that satisfy the tracklet criteria implemented below.

    For every object the per-night observation counts are ordered by night.
    A night is kept when it is the object's first night or starts fewer than
    `maxTrackNightSpan` nights after the object's previous night. Over the
    kept nights an object qualifies when at least `minTracklets` nights have
    `trackletMinObs` or more observations and the kept nights together hold
    at least `minObsPerTrack` observations.

    Parameters
    ----------
    observations : pandas.DataFrame
        Needs the columns "designation" and "nid" (night ID).
    minTracklets : int, optional
        Minimum number of kept nights that form a possible tracklet.
    trackletMinObs : int, optional
        Minimum observations in one night for a possible tracklet.
    minObsPerTrack : int, optional
        Minimum total observations over the kept nights.
    maxTrackNightSpan : int, optional
        Nights whose gap to the object's previous night is this large or
        larger are discarded.
    falsePositiveIDs : iterable, optional
        Designations marking false positives; their rows are ignored.
    unknownIDs : iterable, optional
        Designations marking unknown objects; their rows are ignored.

    Returns
    -------
    numpy.ndarray
        Designations of the qualifying objects (empty when none qualify).
    """
    # The default lists are only read, never mutated.
    excluded_ids = list(falsePositiveIDs) + list(unknownIDs)
    attributed = observations[~observations["designation"].isin(excluded_ids)]
    if attributed.empty:
        return np.array([])

    # Observations per object per night, rows ordered by night.
    # The counts are named on the Series itself: the name pandas gives the result of
    # value_counts() changed in pandas 2.0 ("designation" -> "count"), so renaming a
    # DataFrame column called "designation" silently does nothing on newer versions.
    night_designation_count = (
        attributed.groupby(["nid"])["designation"]
        .value_counts()
        .rename("num_obs")
        .reset_index()
    )

    # Gap to the same object's previous night; an object's first night gets a gap of 0.
    night_designation_count["delta_night"] = (
        night_designation_count.groupby(["designation"])["nid"].diff().fillna(0)
    )
    night_designation_count["possible_tracklet"] = np.where(night_designation_count["num_obs"] >= trackletMinObs, 1, 0)

    # Discard nights that follow the previous one by maxTrackNightSpan or more.
    night_designation_count = night_designation_count[night_designation_count["delta_night"] < maxTrackNightSpan]
    tracklets_per_designation = night_designation_count.groupby(["designation"])["possible_tracklet"].sum()
    possibly_findable = tracklets_per_designation.index.values[(tracklets_per_designation >= minTracklets).values]

    # Of those, keep the objects with enough observations over the kept nights.
    obs_per_designation = night_designation_count[night_designation_count["designation"].isin(possibly_findable)].groupby(["designation"])["num_obs"].sum()
    return obs_per_designation.index.values[(obs_per_designation >= minObsPerTrack).values]
    

In [None]:
# Objects that meet calcDiscoverableZMODE's default criteria.
findableZMODE = calcDiscoverableZMODE(observations)

In [None]:
# Sum of the "found" flags over the ZMODE-findable designations, as a fraction of len(findableZMODE).
allObjects_survey[allObjects_survey["designation"].isin(findableZMODE)]["found"].sum() / len(findableZMODE)

In [None]:
# NOTE(review): 21401 is a hard-coded denominator whose origin is not shown in this
# notebook -- presumably the total number of known objects; confirm and name it.
len(findableZMODE) / 21401

In [None]:
# Run thor.analyzeObservations on the ZTF observations with the column mapping defined above.
# Both returned values are discarded, so the call is presumably made for what it
# prints or plots -- TODO confirm.
_, _ = thor.analyzeObservations(observations, unknownIDs=unknownIDs, falsePositiveIDs=falsePositiveIDs, columnMapping=columnMapping)

In [None]:
# Completeness and object counts for the ZTF run-level tables.
printRunStats(allObjects_survey, test_orbits)

In [None]:
# Per-orbit tables live in sub-directories orbit_0001 ... orbit_0338 of RUN_DIR;
# read each one and stack them into a single frame per table type.
allClusters_projection = pd.concat([
    pd.read_csv(os.path.join(RUN_DIR, "orbit_{:04d}/allClusters.txt".format(orbit_number)), sep=" ", index_col=False, low_memory=False)
    for orbit_number in range(1, 339)
])
allObjects_projection = pd.concat([
    pd.read_csv(os.path.join(RUN_DIR, "orbit_{:04d}/allObjects.txt".format(orbit_number)), sep=" ", index_col=False, low_memory=False)
    for orbit_number in range(1, 339)
])

# Observations in a cluster beyond one per visit.
allClusters_projection["num_dupes"] = allClusters_projection["num_obs"] - allClusters_projection["num_visits"]

In [None]:
# Cluster statistics over every cluster from all ZTF test-orbit projections.
printClusterStats(allClusters_projection, falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

In [None]:
# Same statistics restricted to clusters with num_dupes == 0 (num_obs equals num_visits).
printClusterStats(allClusters_projection[allClusters_projection["num_dupes"] == 0], falsePositiveIDs=falsePositiveIDs, unknownIDs=unknownIDs)

In [None]:
# Classify every cluster once by its linked object; the masks are reused for the
# histogram data and for the legend counts so the two can never disagree.
linked_object = allClusters_projection["linked_object"]
is_false_cluster = linked_object.isna()
is_false_positive_cluster = linked_object.isin(falsePositiveIDs)
is_unknown_cluster = linked_object.isin(unknownIDs)
# Same definition of a good linkage as printClusterStats: a linked object exists and
# it is neither a false-positive nor an unknown ID.
is_true_positive_cluster = (~is_false_cluster) & (~linked_object.isin(falsePositiveIDs + unknownIDs))

accepted_clusters = allClusters_projection[is_true_positive_cluster]
# a: objects recovered by at least one cluster without same-exposure duplicates.
# b: objects recovered by at least one cluster that has such duplicates.
a = set(accepted_clusters[accepted_clusters["num_dupes"] == 0]["linked_object"].unique())
b = set(accepted_clusters[accepted_clusters["num_dupes"] > 0]["linked_object"].unique())

cluster_groups = [
    ("False Clusters", is_false_cluster),
    ("False Positive Clusters", is_false_positive_cluster),
    ("Unknown Clusters", is_unknown_cluster),
    # The count shown here is the size of the plotted series; the previous label used a
    # separate (partial == 1) | (pure == 1) mask that could drift from the plotted data.
    ("True Positive Clusters", is_true_positive_cluster),
]

fig, ax = plt.subplots(1, 1, dpi=600)
ax.hist([allClusters_projection[group_mask]["num_dupes"].values for _group_name, group_mask in cluster_groups],
        bins=range(7),
        label=["{} ({})".format(group_name, int(group_mask.sum())) for group_name, group_mask in cluster_groups])
ax.legend(fontsize=8)
ax.set_xlabel("Number of detections from the same exposure")
ax.vlines(1, 1, 10**7)
ax.set_yscale("log")
ax.set_ylim(1, 10**7)
# NOTE(review): 21401 is a hard-coded denominator whose origin is not shown in this
# notebook -- presumably the total number of known objects; confirm and name it.
ax.text(0.2, 10**6.5, "{:0.2f}%".format(len(a) / 21401 * 100))
# Objects that are only recovered through clusters with duplicates.
ax.text(1.2, 10**6.5, "{:0.2f}%".format(len(b.difference(a)) / 21401 * 100))
ax.axvspan(1, 6, color="k", alpha=0.2)
ax.set_xlim(0, 6)
# NOTE(review): orbit count and completeness in the title are hard-coded text.
ax.set_title("Cluster Summary (338 Orbits: 92.1% Completeness)")
ax.set_ylabel("Number")
# savefig does not create missing directories; make sure the target exists first.
os.makedirs("plots", exist_ok=True)
fig.savefig("plots/cluster_summary_ztf.png")

In [None]:
# Sanity check: rows with a NaN/excluded designation plus all remaining rows add up to
# the full table. The second mask is the exact complement of the first, so this is
# True by construction.
len(observations) == (
    len(observations[observations["designation"].isna() | observations["designation"].isin(falsePositiveIDs + unknownIDs)])
    + len(observations[~(observations["designation"].isna() | observations["designation"].isin(falsePositiveIDs + unknownIDs))])
)

In [None]:
# Sanity check: clusters with a NaN/excluded linked object plus all remaining clusters
# add up to the full table. The second mask is the exact complement of the first, so
# this is True by construction.
len(allClusters_projection) == (
    len(allClusters_projection[allClusters_projection["linked_object"].isna() | allClusters_projection["linked_object"].isin(falsePositiveIDs + unknownIDs)])
    + len(allClusters_projection[~(allClusters_projection["linked_object"].isna() | allClusters_projection["linked_object"].isin(falsePositiveIDs + unknownIDs))])
)

In [None]:
# `observations[]` is a SyntaxError (empty subscript) and stops Run All; preview the
# first rows instead.
observations.head()

In [None]:
# Groupby night, then count number of occurences per night
night_designation_count = (
    observations[~observations["designation"].isin(falsePositiveIDs + unknownIDs)]
    .groupby(["nid"])["designation"]
    .value_counts()
    # Name the counts on the Series itself: pandas 2.0 changed the name of the
    # value_counts() result from "designation" to "count", so renaming a DataFrame
    # column called "designation" silently does nothing on newer versions.
    .rename("num_obs")
    .reset_index()
)

In [None]:
# Observations of the single designation "09816" (false-positive/unknown IDs excluded).
test = observations[~observations["designation"].isin(falsePositiveIDs + unknownIDs) & observations["designation"].isin(["09816"])]

In [None]:
# Sky positions ("ra" against "decl") of the observations selected in `test`.
fig, ax = plt.subplots(1, 1)
ax.scatter(test["ra"].values, test["decl"].values)

In [None]:
# Number of observations in `test` per night, then the nights holding at least two.
obs_per_night = test["nid"].value_counts()
tracklet_nights = obs_per_night.index[obs_per_night >= 2].values

In [None]:
# Display the nights on which `test` has at least two observations.
tracklet_nights

In [None]:
# Observation count per attributed designation (NaN and excluded IDs left out),
# then the designations with at least four observations.
obs_per_object = observations.loc[
    ~(observations["designation"].isna() | observations["designation"].isin(falsePositiveIDs + unknownIDs)),
    "designation",
].value_counts()
possibly_findable = obs_per_object.index[obs_per_object >= 4].values

In [None]:
# Rebind `test` to the observations of the first entry in possibly_findable.
test = observations[observations["designation"].isin([possibly_findable[0]])]

In [None]:
# `test[]` is a SyntaxError (empty subscript) and stops Run All; preview the first
# rows instead.
test.head()

In [None]:
# Offset (in nid units) of every observation in `test` from each tracklet night.
# NOTE(review): delta_night is overwritten on every iteration, so only the offsets
# relative to the last entry of tracklet_nights survive the loop.
for tracklet_night in tracklet_nights:
    delta_night = test["nid"].values - tracklet_night

In [None]:
# The original line ended in a dangling `%` operator (SyntaxError). The comparison that
# was already written is kept as the filter.
# NOTE(review): confirm -3 is the intended threshold.
delta_night[delta_night > -3]

In [None]:
# Rebind delta_night to 614 minus each observation's nid (a Series indexed like `test`).
# NOTE(review): 614 is a hard-coded night ID.
delta_night =  614 - test["nid"]

In [None]:
# Display the night ID of every observation in `test`.
test["nid"]

In [None]:
# Display the current delta_night values.
delta_night

In [None]:
# Keep offsets strictly between 0 and 4; with delta_night = 614 - nid that selects
# observations with 610 < nid < 614.
delta_night[(delta_night > 0) & (delta_night < 4)]

In [None]:
# Display delta_night again (unchanged by the filter above, which is not assigned).
delta_night