In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

import os
import sys
import annoy
from annoy import AnnoyIndex
import random



from IPython.display import display_markdown

from collections import Counter

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import cKDTree





In [2]:
class AnnoyIndex():
    def __init__(self, vectors, labels, metric):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
        self.metric= metric
   
    def build(self, number_of_trees=100):
        self.index = annoy.AnnoyIndex(self.dimension, self.metric)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)
        
    def query(self, vector, k=1000):
        indices = self.index.get_nns_by_vector(
              vector.tolist(), 
              k, 
              search_k=-1)                                           
        return [self.labels[i] for i in indices]

In [3]:
# Visually vetted GHOST results split up over 6 files
# So merge them, and take subset of just YSE DR1 objects

dfs = []
for idx in range(1, 7):
    df = pd.read_csv(f'../YSE/photclass/Tables/Paper/Hosts/{idx}_of_5/ghost_hosts.csv')
    dfs.append(df)
    
g_hosts_df = pd.concat(dfs, axis=0).reset_index(drop=True)
g_hosts_df = g_hosts_df.set_index('TransientName')
g_hosts_df.sort_values('TransientName', inplace=True)
ps1_hosts_df = g_hosts_df.reset_index(drop=False)
ps1_hosts_df = ps1_hosts_df.set_index('TransientName')
ps1_hosts_df

Unnamed: 0_level_0,Unnamed: 0,level_0,index,objName,objAltName1,objAltName2,objAltName3,objID,uniquePspsOBid,ippObjID,...,NED_type,NED_vel,NED_redshift,NED_mag,class,dist/DLR,dist,TransientClass,TransientRA,TransientDEC
TransientName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017fgc,438,1083.0,1.0,PSO J012014.639+032412.408,,,,112080200610004597,920640000012027,44345537343227,...,,,,,,2.043633,3.864276,,20.060167,3.402767
2019aaag,242,1587.0,13.0,PSO J122902.877+124305.856,,,,123261872619862441,1075711000039539,102744207694452,...,G,,,19.8,False,0.042838,0.201594,,187.262053,12.718327
2019aaax,96,632.0,20.0,PSO J101958.736-030625.511,,,,104271549947461885,1044689000082043,378506877943937,...,G,,,21.8,False,0.015769,0.060278,,154.994731,-3.107074
2019aabv,338,2275.0,39.0,PSO J095020.993+320144.032,,,,146431475874425238,1055533000070586,214804199445433,...,*,,,20.1,False,0.058378,0.192407,,147.587449,32.028943
2019aacg,66,431.0,43.0,PSO J104738.560-043456.602,,,,102501619106751538,1050657000004462,379005094072685,...,IrS,,,,False,0.111747,0.244734,,161.910702,-4.582341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022cpk,462,2825.0,8244.0,PSO J091328.597+310703.441,,,,145341383690751801,1048059000053667,214151364399522,...,,,,,,1.665415,4.059217,,138.370342,31.117122
2022gy,210,1680.0,8252.0,PSO J122727.536+081137.071,,,,117831868647142827,1038671000072790,102894531583061,...,G,35795.0,0.119398,17.6,False,0.218665,2.091249,,186.864439,8.194126
2022jq,130,1096.0,8270.0,PSO J091327.581+010010.966,,,,109201383649304082,1011152000032043,58437325061418,...,G,,,18.8,False,0.343289,1.654076,,138.365050,1.003510
2022km,135,1129.0,8277.0,PSO J100351.342+011009.027,,,,109401509639963451,1018984000006514,59468117186930,...,G,,,20.0,False,0.354672,1.248986,,150.964005,1.168831


In [4]:
feature_names_hostgal = [
#  'Unnamed: 0',
#  'level_0',
#  'index',
#  'objName',
#  'objAltName1',
#  'objAltName2',
#  'objAltName3',
#  'objID',
#  'uniquePspsOBid',
#  'ippObjID',
#  'surveyID',
#  'htmID',
#  'zoneID',
#  'tessID',
#  'projectionID',
#  'skyCellID',
#  'randomID',
#  'batchID',
#  'dvoRegionID',
#  'processingVersion',
#  'objInfoFlag',
#  'qualityFlag',
#  'raStack',
#  'decStack',
#  'raStackErr',
#  'decStackErr',
#  'raMean',
#  'decMean',
#  'raMeanErr',
#  'decMeanErr',
#  'epochMean',
#  'posMeanChisq',
#  'cx',
#  'cy',
#  'cz',
#  'lambda',
#  'beta',
#  'l',
#  'b',
#  'nStackObjectRows',
#  'nStackDetections',
#  'nDetections',
#  'ng',
#  'nr',
#  'ni',
#  'nz',
#  'ny',
#  'uniquePspsSTid',
#  'primaryDetection',
#  'bestDetection',
#  'gippDetectID',
#  'gstackDetectID',
#  'gstackImageID',
#  'gra',
#  'gdec',
#  'graErr',
#  'gdecErr',
#  'gEpoch',
#  'gPSFMag',
#  'gPSFMagErr',
#  'gApMag',
#  'gApMagErr',
#  'gKronMag',
#  'gKronMagErr',
#  'ginfoFlag',
#  'ginfoFlag2',
#  'ginfoFlag3',
#  'gnFrames',
#  'gxPos',
#  'gyPos',
#  'gxPosErr',
#  'gyPosErr',
#  'gpsfMajorFWHM',
#  'gpsfMinorFWHM',
#  'gpsfTheta',
#  'gpsfCore',
#  'gpsfLikelihood',
#  'gpsfQf',
#  'gpsfQfPerfect',
#  'gpsfChiSq',
 'gmomentXX',
 'gmomentXY',
 'gmomentYY',
 'gmomentR1',
 'gmomentRH',
 'gPSFFlux',
#  'gPSFFluxErr',
 'gApFlux',
#  'gApFluxErr',
#  'gApFillFac',
#  'gApRadius',
 'gKronFlux',
#  'gKronFluxErr',
 'gKronRad',
#  'gexpTime',
 'gExtNSigma',
#  'gsky',
#  'gskyErr',
#  'gzp',
#  'gPlateScale',
#  'rippDetectID',
#  'rstackDetectID',
#  'rstackImageID',
#  'rra',
#  'rdec',
#  'rraErr',
#  'rdecErr',
#  'rEpoch',
# 'rPSFMag',
#  'rPSFMagErr',
# 'rApMag',
#  'rApMagErr',
# 'rKronMag',
#  'rKronMagErr',
#  'rinfoFlag',
#  'rinfoFlag2',
#  'rinfoFlag3',
#  'rnFrames',
#  'rxPos',
#  'ryPos',
#  'rxPosErr',
#  'ryPosErr',
#  'rpsfMajorFWHM',
#  'rpsfMinorFWHM',
#  'rpsfTheta',
#  'rpsfCore',
#  'rpsfLikelihood',
#  'rpsfQf',
#  'rpsfQfPerfect',
#  'rpsfChiSq',
 'rmomentXX',
 'rmomentXY',
 'rmomentYY',
 'rmomentR1',
 'rmomentRH',
'rPSFFlux',
#  'rPSFFluxErr',
'rApFlux',
#  'rApFluxErr',
#  'rApFillFac',
# 'rApRadius',
'rKronFlux',
#  'rKronFluxErr',
'rKronRad',
#  'rexpTime',
 'rExtNSigma',
#  'rsky',
#  'rskyErr',
#  'rzp',
#  'rPlateScale',
#  'iippDetectID',
#  'istackDetectID',
#  'istackImageID',
#  'ira',
#  'idec',
#  'iraErr',
#  'idecErr',
#  'iEpoch',
#  'iPSFMag',
#  'iPSFMagErr',
#  'iApMag',
#  'iApMagErr',
#  'iKronMag',
#  'iKronMagErr',
#  'iinfoFlag',
#  'iinfoFlag2',
#  'iinfoFlag3',
#  'inFrames',
#  'ixPos',
#  'iyPos',
#  'ixPosErr',
#  'iyPosErr',
#  'ipsfMajorFWHM',
#  'ipsfMinorFWHM',
#  'ipsfTheta',
#  'ipsfCore',
#  'ipsfLikelihood',
#  'ipsfQf',
#  'ipsfQfPerfect',
#  'ipsfChiSq',
  'imomentXX',
 'imomentXY',
 'imomentYY',
 'imomentR1',
 'imomentRH',
 'iPSFFlux',
#  'iPSFFluxErr',
 'iApFlux',
#  'iApFluxErr',
#  'iApFillFac',
#  'iApRadius',
 'iKronFlux',
#  'iKronFluxErr',
 'iKronRad',
#  'iexpTime',
  'iExtNSigma',
#  'isky',
#  'iskyErr',
#  'izp',
#  'iPlateScale',
#  'zippDetectID',
#  'zstackDetectID',
#  'zstackImageID',
#  'zra',
#  'zdec',
#  'zraErr',
#  'zdecErr',
#  'zEpoch',
#  'zPSFMag',
#  'zPSFMagErr',
#  'zApMag',
#  'zApMagErr',
#  'zKronMag',
#  'zKronMagErr',
#  'zinfoFlag',
#  'zinfoFlag2',
#  'zinfoFlag3',
#  'znFrames',
#  'zxPos',
#  'zyPos',
#  'zxPosErr',
#  'zyPosErr',
#  'zpsfMajorFWHM',
#  'zpsfMinorFWHM',
#  'zpsfTheta',
#  'zpsfCore',
#  'zpsfLikelihood',
#  'zpsfQf',
#  'zpsfQfPerfect',
#  'zpsfChiSq',
  'zmomentXX',
 'zmomentXY',
 'zmomentYY',
 'zmomentR1',
 'zmomentRH',
 'zPSFFlux',
#  'zPSFFluxErr',
 'zApFlux',
#  'zApFluxErr',
#  'zApFillFac',
#  'zApRadius',
 'zKronFlux',
#  'zKronFluxErr',
 'zKronRad',
#  'zexpTime',
  'zExtNSigma',
#  'zsky',
#  'zskyErr',
#  'zzp',
#  'zPlateScale',
#  'yippDetectID',
#  'ystackDetectID',
#  'ystackImageID',
#  'yra',
#  'ydec',
#  'yraErr',
#  'ydecErr',
#  'yEpoch',
#  'yPSFMag',
#  'yPSFMagErr',
#  'yApMag',
#  'yApMagErr',
#  'yKronMag',
#  'yKronMagErr',
#  'yinfoFlag',
#  'yinfoFlag2',
#  'yinfoFlag3',
#  'ynFrames',
#  'yxPos',
#  'yyPos',
#  'yxPosErr',
#  'yyPosErr',
#  'ypsfMajorFWHM',
#  'ypsfMinorFWHM',
#  'ypsfTheta',
#  'ypsfCore',
#  'ypsfLikelihood',
#  'ypsfQf',
#  'ypsfQfPerfect',
#  'ypsfChiSq',
  'ymomentXX',
  'ymomentXY',
  'ymomentYY',
  'ymomentR1',
  'ymomentRH',
  'yPSFFlux',
#   'yPSFFluxErr',
  'yApFlux',
#   'yApFluxErr',
#   'yApFillFac',
#  'yApRadius',
 'yKronFlux',
#  'yKronFluxErr',
 'yKronRad',
#  'yexpTime',
  'yExtNSigma',
#  'ysky',
#  'yskyErr',
#  'yzp',
#  'yPlateScale',
#  'distance',
#  'SkyMapper_StarClass',
#  'gelong',
#  'g_a',
#  'g_b',
#  'g_pa',
#  'relong',
#  'r_a',
#  'r_b',
#  'r_pa',
#  'ielong',
#  'i_a',
#  'i_b',
#  'i_pa',
#  'zelong',
#  'z_a',
#  'z_b',
#  'z_pa',
#    'i-z',
#    'g-r',
#    'r-i',
#    'g-i',
#    'z-y',
#   'g-rErr',
#   'r-iErr',
#   'i-zErr',
#   'z-yErr',
 'gApMag_gKronMag',
 'rApMag_rKronMag',
 'iApMag_iKronMag',
 'zApMag_zKronMag',
 'yApMag_yKronMag',
#  '7DCD',
#  'NED_name',
#  'NED_type',
#  'NED_vel',
#  'NED_redshift',
#  'NED_mag',
#  'class',
   'dist/DLR',
#   'dist',
#  'TransientClass',
#  'TransientRA',
#  'TransientDEC'
   ]

In [5]:
# only keep features of bank
host_dataset_bank = ps1_hosts_df[feature_names_hostgal]
        
# drop row if any feature values are nan
host_dataset_bank = host_dataset_bank[~host_dataset_bank[feature_names_hostgal].isnull().any(axis=1)]

#mask = (dataset_bank['num_mag_values'] >= 40) #& (dataset_bank['num_mag_values'] <= 40)
#dataset_bank = dataset_bank[mask]
host_dataset_bank

Unnamed: 0_level_0,gmomentXX,gmomentXY,gmomentYY,gmomentR1,gmomentRH,gPSFFlux,gApFlux,gKronFlux,gKronRad,gExtNSigma,...,yApFlux,yKronFlux,yKronRad,yExtNSigma,gApMag_gKronMag,rApMag_rKronMag,iApMag_iKronMag,zApMag_zKronMag,yApMag_yKronMag,dist/DLR
TransientName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019aaag,0.322234,0.003140,0.299576,1.616320,1.034360,0.000024,0.000042,0.000045,4.04080,12.911300,...,0.000243,0.000293,4.27831,13.846000,0.080601,0.140699,0.269800,0.248400,0.201000,0.042838
2019aaax,0.189772,-0.004596,0.198261,1.141030,0.849408,0.000030,0.000035,0.000033,2.85257,2.607320,...,0.000110,0.000107,2.78548,8.117690,-0.051600,0.044199,0.081400,0.188400,-0.030800,0.015769
2019aabv,0.283302,-0.002636,0.257574,1.130610,0.885985,0.000026,0.000028,0.000027,2.82652,2.351040,...,0.000158,0.000140,2.71481,3.608620,-0.066900,-0.054901,-0.008900,-0.004499,-0.130301,0.058378
2019aacg,0.193669,0.035234,0.161559,0.843921,0.666147,0.000005,0.000004,0.000004,2.10980,-1.447330,...,0.000019,0.000018,2.23357,2.534030,0.147301,-0.125401,-0.035101,-0.128801,-0.098099,0.111747
2019aacp,0.327138,0.013750,0.297035,1.360040,0.904056,0.000010,0.000015,0.000015,3.40009,7.569170,...,0.000114,0.000120,2.95051,9.019760,0.000399,0.025101,0.081900,0.022999,0.049299,0.060816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022cnt,0.326595,0.010935,0.345536,2.178740,1.169140,0.000067,0.000183,0.000242,5.44686,34.122898,...,0.000619,0.000796,4.98550,21.261999,0.299900,0.225000,0.287399,0.394600,0.274000,0.337622
2022cpj,0.249317,0.004646,0.267804,1.386830,0.984514,0.000037,0.000062,0.000061,3.46707,13.196400,...,0.000155,0.000148,3.01631,13.647900,-0.007299,-0.013500,0.022099,0.037600,-0.051899,0.063465
2022gy,0.382634,0.011327,0.331931,3.645380,1.195170,0.000055,0.000124,0.000284,9.11345,27.451300,...,0.000497,0.000945,7.61617,20.641300,0.900299,0.815701,0.880999,0.775600,0.697001,0.218665
2022jq,0.282136,0.015385,0.320016,1.901300,1.108720,0.000036,0.000089,0.000110,4.75325,22.964701,...,0.000367,0.000440,4.38128,18.240101,0.238401,0.191101,0.235600,0.283699,0.195000,0.343289


In [6]:
# https://github.com/uiucsn/laiss/blob/main/LAISS/notebooks/LAISS_tests.ipynb
idx_arr = np.hstack(np.array(host_dataset_bank.index, dtype='object'))
feat_arr = np.vstack(np.array(host_dataset_bank, dtype='object'))

scaler = StandardScaler() # Create a scalar
#scaler = MinMaxScaler() # Create a scalar
scaler.fit(feat_arr) # Fit only to training data
feat_arr_scaled = scaler.transform(feat_arr) # What your model learns on

# Build ANNOY index of scaled features of dataset_bank
index = AnnoyIndex(feat_arr_scaled, idx_arr, metric='euclidean')
index.build()


In [7]:
# Reference

#dataset_ref = host_dataset_bank[host_dataset_bank.index == '2019pmd'] # SN Ia, in spiral (medium size), matches all Ia in z=0.02 to z=0.06. Most spiral
#dataset_ref = host_dataset_bank[host_dataset_bank.index == '2021qxr'] # SN Ia, bright elliptical
#dataset_ref = host_dataset_bank[host_dataset_bank.index == '2021smj'] # SN Ia, bright spiral
#dataset_ref = host_dataset_bank[host_dataset_bank.index == '2021dbg'] # SN II, bright spiral
#dataset_ref = host_dataset_bank[host_dataset_bank.index == '2021qht'] # no spec class, bright spiral, edge on
#dataset_ref = host_dataset_bank[host_dataset_bank.index == '2021sev'] # SN II, smudgy top down gal (good matches)
dataset_ref = host_dataset_bank[host_dataset_bank.index == '2021xbg'] # SN Ibc, spiral on edge pointing down (first match is good, rest not as much)
#dataset_ref = host_dataset_bank[host_dataset_bank.index == '2021aadc'] # SLSN-II, small smudgy host (good matches)
dataset_ref 

Unnamed: 0_level_0,gmomentXX,gmomentXY,gmomentYY,gmomentR1,gmomentRH,gPSFFlux,gApFlux,gKronFlux,gKronRad,gExtNSigma,...,yApFlux,yKronFlux,yKronRad,yExtNSigma,gApMag_gKronMag,rApMag_rKronMag,iApMag_iKronMag,zApMag_zKronMag,yApMag_yKronMag,dist/DLR
TransientName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021xbg,0.328799,-0.018915,0.402941,3.75606,1.21929,6e-05,0.000136,0.000284,9.39015,41.703098,...,0.000701,0.004427,14.0968,49.02,0.797499,1.8875,2.285799,2.3229,2.001001,0.751558


In [8]:
dataset_ref.columns.to_list()

['gmomentXX',
 'gmomentXY',
 'gmomentYY',
 'gmomentR1',
 'gmomentRH',
 'gPSFFlux',
 'gApFlux',
 'gKronFlux',
 'gKronRad',
 'gExtNSigma',
 'rmomentXX',
 'rmomentXY',
 'rmomentYY',
 'rmomentR1',
 'rmomentRH',
 'rPSFFlux',
 'rApFlux',
 'rKronFlux',
 'rKronRad',
 'rExtNSigma',
 'imomentXX',
 'imomentXY',
 'imomentYY',
 'imomentR1',
 'imomentRH',
 'iPSFFlux',
 'iApFlux',
 'iKronFlux',
 'iKronRad',
 'iExtNSigma',
 'zmomentXX',
 'zmomentXY',
 'zmomentYY',
 'zmomentR1',
 'zmomentRH',
 'zPSFFlux',
 'zApFlux',
 'zKronFlux',
 'zKronRad',
 'zExtNSigma',
 'ymomentXX',
 'ymomentXY',
 'ymomentYY',
 'ymomentR1',
 'ymomentRH',
 'yPSFFlux',
 'yApFlux',
 'yKronFlux',
 'yKronRad',
 'yExtNSigma',
 'gApMag_gKronMag',
 'rApMag_rKronMag',
 'iApMag_iKronMag',
 'zApMag_zKronMag',
 'yApMag_yKronMag',
 'dist/DLR']

In [9]:
feat_arr_ref = np.vstack(np.array(dataset_ref, dtype='object'))

#scaler.fit(feat_arr_ref) # Fit only to training data
scaled_feat_arr_ref = scaler.transform(feat_arr_ref) #scaler.transform(feat_arr) # What your model learns on

snid = dataset_ref.index.values[0]
lcdata = {'name': snid, 'vector': scaled_feat_arr_ref[0]}
#lcdata = {'name': ztf_id, 'vector': feat_arr_scaled[-5:-4][0]}

#result_id, result_dist = index.index.get_nns_by_item(, n=10, include_distances=True)
result_id, result_dist = index.index.get_nns_by_vector(lcdata['vector'], n=10, include_distances=True)
#result_name = index.query(lcdata['vector'])

print(f"ZTF ID name (ref) {snid}") 
link_ref = display_markdown(f'https://ziggy.ucolick.org/yse/transient_detail/{snid}', raw=True)
print(link_ref)
print("%%%%%%")
print("ANN\t\tIDX\t\t\t\tZTF_id\t\t\t\tDIST")#\tFEAT")
for n, (i, d) in enumerate(zip(result_id, result_dist)):#, feat_arr):
    print(f"{n}\t\t{i}\t\t\t\t{display_markdown(f'https://ziggy.ucolick.org/yse/transient_detail/{idx_arr[i]}', raw=True)}\t\t\t{round(d, 4)}")#\t{f}")

ZTF ID name (ref) 2021xbg


https://ziggy.ucolick.org/yse/transient_detail/2021xbg

None
%%%%%%
ANN		IDX				ZTF_id				DIST


https://ziggy.ucolick.org/yse/transient_detail/2021xbg

0		1868				None			0.0


https://ziggy.ucolick.org/yse/transient_detail/2021aeuw

1		1188				None			5.5506


https://ziggy.ucolick.org/yse/transient_detail/2020aczc

2		162				None			6.4408


https://ziggy.ucolick.org/yse/transient_detail/2020svn

3		791				None			6.6931


https://ziggy.ucolick.org/yse/transient_detail/2021adtd

4		1154				None			6.8995


https://ziggy.ucolick.org/yse/transient_detail/2020zqv

5		1026				None			6.9434


https://ziggy.ucolick.org/yse/transient_detail/2020xua

6		972				None			7.0209


https://ziggy.ucolick.org/yse/transient_detail/2021aceh

7		1103				None			7.1529


https://ziggy.ucolick.org/yse/transient_detail/2020tug

8		844				None			7.3866


https://ziggy.ucolick.org/yse/transient_detail/2020kpz

9		485				None			7.3905


In [10]:
# TODO: do ANN for all matches, find the 1st ANN with largest d from its reference.

In [11]:
# TODO: try on YSE DR1 with kostya extracted features and these host gal features!

In [12]:
# TODO: make github repo!

# YSE DR1 lc features and host info

In [13]:
archive = np.load("./features/yse_dr1_zenodo.npz")
ids = archive["ids"]  # object names
features = archive["features"]  # 2d array, objects x features 
# Convert to record arrays with feature names:
with open("./features/names.txt") as fh:
    names = fh.read().split()
features = np.rec.array(features, names=names, formats=["f"] * len(names)).squeeze()

In [14]:
dr1_lc_allfeat_df = pd.DataFrame(features, columns=names)
dr1_lc_allfeat_df = dr1_lc_allfeat_df.set_index(ids)
dr1_lc_allfeat_df.sort_index(inplace=True)
dr1_lc_allfeat_df

Unnamed: 0,mag_anderson_darling_normal_g,mag_bins_window1.0_offset0.0_beyond_1_std_g,mag_bins_window1.0_offset0.0_beyond_2_std_g,mag_bins_window1.0_offset0.0_eta_e_g,mag_bins_window1.0_offset0.0_kurtosis_g,mag_bins_window1.0_offset0.0_linear_trend_g,mag_bins_window1.0_offset0.0_linear_trend_sigma_g,mag_bins_window1.0_offset0.0_linear_trend_noise_g,mag_bins_window1.0_offset0.0_maximum_slope_g,mag_bins_window1.0_offset0.0_minimum_time_interval_g,...,fullflux_bazin_fit_fall_time_Y,fullflux_bazin_fit_reduced_chi2_Y,fullflux_villar_fit_amplitude_Y,fullflux_villar_fit_baseline_Y,fullflux_villar_fit_reference_time_Y,fullflux_villar_fit_rise_time_Y,fullflux_villar_fit_fall_time_Y,fullflux_villar_fit_plateau_rel_amplitude_Y,fullflux_villar_fit_plateau_duration_Y,fullflux_villar_fit_reduced_chi2_Y
2019lbi,0.216406,0.444444,0.0,10.290580,-1.233801,0.007288,0.001786,0.242701,0.209614,3.0,...,85.686859,2.423075,9208.376953,-92.017288,-1195.647583,83.931435,94.492111,0.288869,4.881969,1.047435
2019pmd,0.391645,0.285714,0.0,1.806227,-0.705101,0.011165,0.002437,0.129676,0.078864,3.0,...,23.487520,6.380569,32643.027344,305.142731,-1265.290039,3.740786,27.303635,0.766776,38.719402,5.219153
2019ppi,,0.000000,0.0,2.000000,,,,,0.082812,9.0,...,21.816778,3.437780,4157.710449,230.103683,-1201.411377,49.079441,56.896420,0.799803,28.853926,2.696669
2019szh,,0.000000,0.0,2.000000,,,,,0.092147,6.0,...,24.085527,3.861329,4443.414062,179.280121,-1264.499390,13.052733,23.961723,0.004652,41.019459,13.561466
2019tvv,,0.333333,0.0,27.854954,,0.003921,0.004322,0.219535,0.047642,6.0,...,22.693514,4.903403,48788.632812,167.568161,-1209.337646,8.856142,30.131523,0.790199,11.299833,2.127344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021ztz,0.253991,0.250000,0.0,0.422869,3.625412,0.062985,0.011643,0.130696,0.086599,3.0,...,17.565086,1.424005,1643.183716,115.727654,-523.097534,3.430753,19.209139,0.029427,11.852758,1.670255
2021zuf,,0.000000,0.0,2.000000,,,,,0.001574,3.0,...,,,,,,,,,,
2021zvo,,0.333333,0.0,5.862638,,-0.029232,0.009775,0.200013,0.111863,2.0,...,21.983507,0.461530,1485.035889,-240.269516,-553.913696,5.401869,86654.609375,0.595883,35.150803,0.548194
2021zvx,,0.333333,0.0,1.354845,,0.008765,0.006001,0.064070,0.016849,6.0,...,30.308945,4.260723,492.952515,220.026062,-523.277039,3.950231,17.788208,0.000314,13.657744,0.278923


In [15]:
dr1_lc_allfeat_df.columns.to_list()

['mag_anderson_darling_normal_g',
 'mag_bins_window1.0_offset0.0_beyond_1_std_g',
 'mag_bins_window1.0_offset0.0_beyond_2_std_g',
 'mag_bins_window1.0_offset0.0_eta_e_g',
 'mag_bins_window1.0_offset0.0_kurtosis_g',
 'mag_bins_window1.0_offset0.0_linear_trend_g',
 'mag_bins_window1.0_offset0.0_linear_trend_sigma_g',
 'mag_bins_window1.0_offset0.0_linear_trend_noise_g',
 'mag_bins_window1.0_offset0.0_maximum_slope_g',
 'mag_bins_window1.0_offset0.0_minimum_time_interval_g',
 'mag_bins_window1.0_offset0.0_observation_count_g',
 'mag_bins_window1.0_offset0.0_skew_g',
 'mag_duration_g',
 'mag_inter_percentile_range_1_g',
 'mag_linear_fit_slope_g',
 'mag_linear_fit_slope_sigma_g',
 'mag_linear_fit_reduced_chi2_g',
 'mag_maximum_time_interval_g',
 'mag_observation_count_g',
 'mag_period_0_g',
 'mag_period_s_to_n_0_g',
 'mag_periodogram_median_g',
 'mag_periodogram_percent_difference_magnitude_percentile_25_g',
 'mag_chi2_g',
 'mag_stetson_K_g',
 'mag_weighted_mean_g',
 'mag_anderson_darling_n

In [16]:
lc_feature_names = [
#  'mag_anderson_darling_normal_g',
#  'mag_bins_window1.0_offset0.0_beyond_1_std_g',
#  'mag_bins_window1.0_offset0.0_beyond_2_std_g',
# #  'mag_bins_window1.0_offset0.0_eta_e_g',
#  'mag_bins_window1.0_offset0.0_kurtosis_g',
#  'mag_bins_window1.0_offset0.0_linear_trend_g',
#  'mag_bins_window1.0_offset0.0_linear_trend_sigma_g',
# #  'mag_bins_window1.0_offset0.0_linear_trend_noise_g',
#  'mag_bins_window1.0_offset0.0_maximum_slope_g',
#  'mag_bins_window1.0_offset0.0_minimum_time_interval_g',
#  'mag_bins_window1.0_offset0.0_observation_count_g',
#  'mag_bins_window1.0_offset0.0_skew_g',
#  'mag_duration_g',
#  'mag_inter_percentile_range_1_g',
# #  'mag_linear_fit_slope_g',
# #  'mag_linear_fit_slope_sigma_g',
# #  'mag_linear_fit_reduced_chi2_g',
# #  'mag_maximum_time_interval_g',
# #  'mag_observation_count_g',
# #  'mag_period_0_g',
#  'mag_period_s_to_n_0_g',
# #  'mag_periodogram_median_g',
# #  'mag_periodogram_percent_difference_magnitude_percentile_25_g',
# #  'mag_chi2_g',
#  'mag_stetson_K_g',
#  'mag_weighted_mean_g',
#  'mag_anderson_darling_normal_r',
#  'mag_bins_window1.0_offset0.0_beyond_1_std_r',
#  'mag_bins_window1.0_offset0.0_beyond_2_std_r',
# #  'mag_bins_window1.0_offset0.0_eta_e_r',
#  'mag_bins_window1.0_offset0.0_kurtosis_r',
#  'mag_bins_window1.0_offset0.0_linear_trend_r',
#  'mag_bins_window1.0_offset0.0_linear_trend_sigma_r',
# #  'mag_bins_window1.0_offset0.0_linear_trend_noise_r',
#  'mag_bins_window1.0_offset0.0_maximum_slope_r',
#  'mag_bins_window1.0_offset0.0_minimum_time_interval_r',
#  'mag_bins_window1.0_offset0.0_observation_count_r',
#  'mag_bins_window1.0_offset0.0_skew_r',
#  'mag_duration_r',
#  'mag_inter_percentile_range_1_r',
# #  'mag_linear_fit_slope_r',
# #  'mag_linear_fit_slope_sigma_r',
# #  'mag_linear_fit_reduced_chi2_r',
# #  'mag_maximum_time_interval_r',
# #  'mag_observation_count_r',
# #  'mag_period_0_r',
#  'mag_period_s_to_n_0_r',
#  'mag_periodogram_median_r',
#  'mag_periodogram_percent_difference_magnitude_percentile_25_r',
#  'mag_chi2_r',
#  'mag_stetson_K_r',
#  'mag_weighted_mean_r',
#  'mag_anderson_darling_normal_i',
#  'mag_bins_window1.0_offset0.0_beyond_1_std_i',
#  'mag_bins_window1.0_offset0.0_beyond_2_std_i',
#  'mag_bins_window1.0_offset0.0_eta_e_i',
#  'mag_bins_window1.0_offset0.0_kurtosis_i',
#  'mag_bins_window1.0_offset0.0_linear_trend_i',
#  'mag_bins_window1.0_offset0.0_linear_trend_sigma_i',
#  'mag_bins_window1.0_offset0.0_linear_trend_noise_i',
#  'mag_bins_window1.0_offset0.0_maximum_slope_i',
#  'mag_bins_window1.0_offset0.0_minimum_time_interval_i',
#  'mag_bins_window1.0_offset0.0_observation_count_i',
#  'mag_bins_window1.0_offset0.0_skew_i',
#  'mag_duration_i',
#  'mag_inter_percentile_range_1_i',
#  'mag_linear_fit_slope_i',
#  'mag_linear_fit_slope_sigma_i',
#  'mag_linear_fit_reduced_chi2_i',
#  'mag_maximum_time_interval_i',
#  'mag_observation_count_i',
#  'mag_period_0_i',
#  'mag_period_s_to_n_0_i',
#  'mag_periodogram_median_i',
#  'mag_periodogram_percent_difference_magnitude_percentile_25_i',
#  'mag_chi2_i',
#  'mag_stetson_K_i',
#  'mag_weighted_mean_i',
#  'mag_anderson_darling_normal_z',
#  'mag_bins_window1.0_offset0.0_beyond_1_std_z',
#  'mag_bins_window1.0_offset0.0_beyond_2_std_z',
#  'mag_bins_window1.0_offset0.0_eta_e_z',
#  'mag_bins_window1.0_offset0.0_kurtosis_z',
#  'mag_bins_window1.0_offset0.0_linear_trend_z',
#  'mag_bins_window1.0_offset0.0_linear_trend_sigma_z',
#  'mag_bins_window1.0_offset0.0_linear_trend_noise_z',
#  'mag_bins_window1.0_offset0.0_maximum_slope_z',
#  'mag_bins_window1.0_offset0.0_minimum_time_interval_z',
#  'mag_bins_window1.0_offset0.0_observation_count_z',
#  'mag_bins_window1.0_offset0.0_skew_z',
#  'mag_duration_z',
#  'mag_inter_percentile_range_1_z',
#  'mag_linear_fit_slope_z',
#  'mag_linear_fit_slope_sigma_z',
#  'mag_linear_fit_reduced_chi2_z',
#  'mag_maximum_time_interval_z',
#  'mag_observation_count_z',
#  'mag_period_0_z',
#  'mag_period_s_to_n_0_z',
#  'mag_periodogram_median_z',
#  'mag_periodogram_percent_difference_magnitude_percentile_25_z',
#  'mag_chi2_z',
#  'mag_stetson_K_z',
#  'mag_weighted_mean_z',
 'mag_anderson_darling_normal_X',
 'mag_bins_window1.0_offset0.0_beyond_1_std_X',
 'mag_bins_window1.0_offset0.0_beyond_2_std_X',
#  'mag_bins_window1.0_offset0.0_eta_e_X',
 'mag_bins_window1.0_offset0.0_kurtosis_X',
 'mag_bins_window1.0_offset0.0_linear_trend_X',
 'mag_bins_window1.0_offset0.0_linear_trend_sigma_X',
#  'mag_bins_window1.0_offset0.0_linear_trend_noise_X',
 'mag_bins_window1.0_offset0.0_maximum_slope_X',
 'mag_bins_window1.0_offset0.0_minimum_time_interval_X',
 'mag_bins_window1.0_offset0.0_observation_count_X',
 'mag_bins_window1.0_offset0.0_skew_X',
 'mag_duration_X',
 'mag_inter_percentile_range_1_X',
#  'mag_linear_fit_slope_X',
#  'mag_linear_fit_slope_sigma_X',
#  'mag_linear_fit_reduced_chi2_X',
#  'mag_maximum_time_interval_X',
#  'mag_observation_count_X',
#  'mag_period_0_X',
 'mag_period_s_to_n_0_X',
#  'mag_periodogram_median_X',
#  'mag_periodogram_percent_difference_magnitude_percentile_25_X',
#  'mag_chi2_X',
 'mag_stetson_K_X',
 'mag_weighted_mean_X',
 'mag_anderson_darling_normal_Y',
 'mag_bins_window1.0_offset0.0_beyond_1_std_Y',
 'mag_bins_window1.0_offset0.0_beyond_2_std_Y',
#  'mag_bins_window1.0_offset0.0_eta_e_Y',
 'mag_bins_window1.0_offset0.0_kurtosis_Y',
 'mag_bins_window1.0_offset0.0_linear_trend_Y',
 'mag_bins_window1.0_offset0.0_linear_trend_sigma_Y',
#  'mag_bins_window1.0_offset0.0_linear_trend_noise_Y',
 'mag_bins_window1.0_offset0.0_maximum_slope_Y',
 'mag_bins_window1.0_offset0.0_minimum_time_interval_Y',
 'mag_bins_window1.0_offset0.0_observation_count_Y',
 'mag_bins_window1.0_offset0.0_skew_Y',
 'mag_duration_Y',
 'mag_inter_percentile_range_1_Y',
#  'mag_linear_fit_slope_Y',
#  'mag_linear_fit_slope_sigma_Y',
#  'mag_linear_fit_reduced_chi2_Y',
#  'mag_maximum_time_interval_Y',
#  'mag_observation_count_Y',
#  'mag_period_0_Y',
 'mag_period_s_to_n_0_Y',
#  'mag_periodogram_median_Y',
#  'mag_periodogram_percent_difference_magnitude_percentile_25_Y',
#  'mag_chi2_Y',
 'mag_stetson_K_Y',
 'mag_weighted_mean_Y',
#  'flux_kurtosis_g',
#  'flux_skew_g',
#  'flux_kurtosis_r',
#  'flux_skew_r',
#  'flux_kurtosis_i',
#  'flux_skew_i',
#  'flux_kurtosis_z',
#  'flux_skew_z',
#  'flux_kurtosis_X',
#  'flux_skew_X',
#  'flux_kurtosis_Y',
#  'flux_skew_Y',
#  'fullflux_bazin_fit_amplitude_g',
#  'fullflux_bazin_fit_baseline_g',
#  'fullflux_bazin_fit_reference_time_g',
#  'fullflux_bazin_fit_rise_time_g',
#  'fullflux_bazin_fit_fall_time_g',
#  'fullflux_bazin_fit_reduced_chi2_g',
#  'fullflux_villar_fit_amplitude_g',
#  'fullflux_villar_fit_baseline_g',
#  'fullflux_villar_fit_reference_time_g',
#  'fullflux_villar_fit_rise_time_g',
#  'fullflux_villar_fit_fall_time_g',
#  'fullflux_villar_fit_plateau_rel_amplitude_g',
#  'fullflux_villar_fit_plateau_duration_g',
#  'fullflux_villar_fit_reduced_chi2_g',
#  'fullflux_bazin_fit_amplitude_r',
#  'fullflux_bazin_fit_baseline_r',
#  'fullflux_bazin_fit_reference_time_r',
#  'fullflux_bazin_fit_rise_time_r',
#  'fullflux_bazin_fit_fall_time_r',
#  'fullflux_bazin_fit_reduced_chi2_r',
#  'fullflux_villar_fit_amplitude_r',
#  'fullflux_villar_fit_baseline_r',
#  'fullflux_villar_fit_reference_time_r',
#  'fullflux_villar_fit_rise_time_r',
#  'fullflux_villar_fit_fall_time_r',
#  'fullflux_villar_fit_plateau_rel_amplitude_r',
#  'fullflux_villar_fit_plateau_duration_r',
#  'fullflux_villar_fit_reduced_chi2_r',
#  'fullflux_bazin_fit_amplitude_i',
#  'fullflux_bazin_fit_baseline_i',
#  'fullflux_bazin_fit_reference_time_i',
#  'fullflux_bazin_fit_rise_time_i',
#  'fullflux_bazin_fit_fall_time_i',
#  'fullflux_bazin_fit_reduced_chi2_i',
#  'fullflux_villar_fit_amplitude_i',
#  'fullflux_villar_fit_baseline_i',
#  'fullflux_villar_fit_reference_time_i',
#  'fullflux_villar_fit_rise_time_i',
#  'fullflux_villar_fit_fall_time_i',
#  'fullflux_villar_fit_plateau_rel_amplitude_i',
#  'fullflux_villar_fit_plateau_duration_i',
#  'fullflux_villar_fit_reduced_chi2_i',
#  'fullflux_bazin_fit_amplitude_z',
#  'fullflux_bazin_fit_baseline_z',
#  'fullflux_bazin_fit_reference_time_z',
#  'fullflux_bazin_fit_rise_time_z',
#  'fullflux_bazin_fit_fall_time_z',
#  'fullflux_bazin_fit_reduced_chi2_z',
#  'fullflux_villar_fit_amplitude_z',
#  'fullflux_villar_fit_baseline_z',
#  'fullflux_villar_fit_reference_time_z',
#  'fullflux_villar_fit_rise_time_z',
#  'fullflux_villar_fit_fall_time_z',
#  'fullflux_villar_fit_plateau_rel_amplitude_z',
#  'fullflux_villar_fit_plateau_duration_z',
#  'fullflux_villar_fit_reduced_chi2_z',
#  'fullflux_bazin_fit_amplitude_X',
#  'fullflux_bazin_fit_baseline_X',
#  'fullflux_bazin_fit_reference_time_X',
#  'fullflux_bazin_fit_rise_time_X',
#  'fullflux_bazin_fit_fall_time_X',
#  'fullflux_bazin_fit_reduced_chi2_X',
#  'fullflux_villar_fit_amplitude_X',
#  'fullflux_villar_fit_baseline_X',
#  'fullflux_villar_fit_reference_time_X',
#  'fullflux_villar_fit_rise_time_X',
#  'fullflux_villar_fit_fall_time_X',
#  'fullflux_villar_fit_plateau_rel_amplitude_X',
#  'fullflux_villar_fit_plateau_duration_X',
#  'fullflux_villar_fit_reduced_chi2_X',
#  'fullflux_bazin_fit_amplitude_Y',
#  'fullflux_bazin_fit_baseline_Y',
#  'fullflux_bazin_fit_reference_time_Y',
#  'fullflux_bazin_fit_rise_time_Y',
#  'fullflux_bazin_fit_fall_time_Y',
#  'fullflux_bazin_fit_reduced_chi2_Y',
#  'fullflux_villar_fit_amplitude_Y',
#  'fullflux_villar_fit_baseline_Y',
#  'fullflux_villar_fit_reference_time_Y',
#  'fullflux_villar_fit_rise_time_Y',
#  'fullflux_villar_fit_fall_time_Y',
#  'fullflux_villar_fit_plateau_rel_amplitude_Y',
#  'fullflux_villar_fit_plateau_duration_Y',
#  'fullflux_villar_fit_reduced_chi2_Y'
                    ]


# lc_feature_names = [
#      'fullflux_bazin_fit_amplitude_X',
#  'fullflux_bazin_fit_baseline_X',
#  'fullflux_bazin_fit_reference_time_X',
#  'fullflux_bazin_fit_rise_time_X',
#  'fullflux_bazin_fit_fall_time_X',
#  'fullflux_bazin_fit_reduced_chi2_X',
#  'fullflux_villar_fit_amplitude_X',
#  'fullflux_villar_fit_baseline_X',
#  'fullflux_villar_fit_reference_time_X',
#  'fullflux_villar_fit_rise_time_X',
#  'fullflux_villar_fit_fall_time_X',
#  'fullflux_villar_fit_plateau_rel_amplitude_X',
#  'fullflux_villar_fit_plateau_duration_X',
#  'fullflux_villar_fit_reduced_chi2_X',
#  'fullflux_bazin_fit_amplitude_Y',
#  'fullflux_bazin_fit_baseline_Y',
#  'fullflux_bazin_fit_reference_time_Y',
#  'fullflux_bazin_fit_rise_time_Y',
#  'fullflux_bazin_fit_fall_time_Y',
#  'fullflux_bazin_fit_reduced_chi2_Y',
#  'fullflux_villar_fit_amplitude_Y',
#  'fullflux_villar_fit_baseline_Y',
#  'fullflux_villar_fit_reference_time_Y',
#  'fullflux_villar_fit_rise_time_Y',
#  'fullflux_villar_fit_fall_time_Y',
#  'fullflux_villar_fit_plateau_rel_amplitude_Y',
#  'fullflux_villar_fit_plateau_duration_Y',
#  'fullflux_villar_fit_reduced_chi2_Y'
#                     ]

In [17]:
# only keep features of bank
lc_dataset_bank = dr1_lc_allfeat_df[lc_feature_names]
        
# drop row if any feature values are nan
lc_dataset_bank = lc_dataset_bank[~lc_dataset_bank[lc_feature_names].isnull().any(axis=1)]

#mask = (dataset_bank['num_mag_values'] >= 40) #& (dataset_bank['num_mag_values'] <= 40)
#dataset_bank = dataset_bank[mask]
lc_dataset_bank.index.name = 'TransientName'
lc_dataset_bank

Unnamed: 0_level_0,mag_anderson_darling_normal_X,mag_bins_window1.0_offset0.0_beyond_1_std_X,mag_bins_window1.0_offset0.0_beyond_2_std_X,mag_bins_window1.0_offset0.0_kurtosis_X,mag_bins_window1.0_offset0.0_linear_trend_X,mag_bins_window1.0_offset0.0_linear_trend_sigma_X,mag_bins_window1.0_offset0.0_maximum_slope_X,mag_bins_window1.0_offset0.0_minimum_time_interval_X,mag_bins_window1.0_offset0.0_observation_count_X,mag_bins_window1.0_offset0.0_skew_X,...,mag_bins_window1.0_offset0.0_linear_trend_sigma_Y,mag_bins_window1.0_offset0.0_maximum_slope_Y,mag_bins_window1.0_offset0.0_minimum_time_interval_Y,mag_bins_window1.0_offset0.0_observation_count_Y,mag_bins_window1.0_offset0.0_skew_Y,mag_duration_Y,mag_inter_percentile_range_1_Y,mag_period_s_to_n_0_Y,mag_stetson_K_Y,mag_weighted_mean_Y
TransientName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019lbi,1.085429,0.222222,0.000000,0.128628,0.018062,0.000526,0.121042,3.0,9.0,-1.286109,...,0.000262,0.101480,1.0,32.0,-0.872049,344.044006,3.648268,12.382952,0.831432,18.198195
2019pmd,0.726459,0.428571,0.000000,-1.441692,0.022316,0.004344,0.338969,2.0,21.0,-0.249814,...,0.002092,0.826923,1.0,21.0,0.295671,145.619019,4.427258,6.954203,0.933364,16.962160
2019ppi,1.309915,0.160000,0.080000,1.636233,0.011189,0.001569,0.376831,1.0,25.0,1.374264,...,0.000814,0.340778,1.0,43.0,0.206909,147.801025,1.946657,12.733513,0.831187,19.062292
2019tvv,4.366697,0.177778,0.066667,3.655688,0.023404,0.001816,0.296412,1.0,45.0,-1.953969,...,0.000712,0.202045,1.0,36.0,-0.788632,128.914917,4.447578,11.143915,0.920395,16.863365
2019ucc,0.130168,0.250000,0.000000,-4.296865,-0.012319,0.024869,0.061535,3.0,4.0,0.191147,...,0.004494,0.187197,2.0,11.0,0.686212,84.753052,2.013756,4.932055,0.842247,18.999735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021zep,0.509108,0.454545,0.000000,-1.627568,0.012449,0.013671,0.287601,2.0,11.0,0.150971,...,0.005810,0.150899,2.0,12.0,0.523085,75.838989,1.926023,4.859226,0.846542,19.030943
2021zqa,0.400820,0.142857,0.000000,1.401948,-0.032981,0.023129,0.217682,2.0,7.0,1.237870,...,0.010014,0.142833,2.0,7.0,0.229846,39.899994,1.112675,3.395781,0.838068,19.922844
2021zri,0.470219,0.357143,0.000000,-1.397271,0.053136,0.007138,0.347882,2.0,14.0,-0.203854,...,0.006439,0.157699,2.0,14.0,0.598503,65.880981,1.896738,5.734282,0.860473,17.555134
2021ztz,0.712603,0.285714,0.000000,-0.986181,0.001336,0.022291,0.165399,2.0,7.0,0.930388,...,0.008504,0.135857,2.0,9.0,0.722285,52.910980,1.722006,3.969246,0.842078,19.646832


In [18]:
# YSE DR1 features and host
dataset_bank = pd.concat([lc_dataset_bank, host_dataset_bank], axis=1)
dataset_bank = dataset_bank[~dataset_bank.isnull().any(axis=1)]
dataset_bank

Unnamed: 0_level_0,mag_anderson_darling_normal_X,mag_bins_window1.0_offset0.0_beyond_1_std_X,mag_bins_window1.0_offset0.0_beyond_2_std_X,mag_bins_window1.0_offset0.0_kurtosis_X,mag_bins_window1.0_offset0.0_linear_trend_X,mag_bins_window1.0_offset0.0_linear_trend_sigma_X,mag_bins_window1.0_offset0.0_maximum_slope_X,mag_bins_window1.0_offset0.0_minimum_time_interval_X,mag_bins_window1.0_offset0.0_observation_count_X,mag_bins_window1.0_offset0.0_skew_X,...,yApFlux,yKronFlux,yKronRad,yExtNSigma,gApMag_gKronMag,rApMag_rKronMag,iApMag_iKronMag,zApMag_zKronMag,yApMag_yKronMag,dist/DLR
TransientName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019lbi,1.085429,0.222222,0.000000,0.128628,0.018062,0.000526,0.121042,3.0,9.0,-1.286109,...,0.000282,0.000992,8.03673,45.397099,1.111500,1.045700,1.095100,1.185299,1.364901,1.456105
2019pmd,0.726459,0.428571,0.000000,-1.441692,0.022316,0.004344,0.338969,2.0,21.0,-0.249814,...,0.002036,0.008462,10.86290,31.184401,0.976000,1.221400,1.323200,1.253700,1.546600,0.420208
2019tvv,4.366697,0.177778,0.066667,3.655688,0.023404,0.001816,0.296412,1.0,45.0,-1.953969,...,0.000432,0.001037,7.87224,27.926201,0.928600,1.125900,1.000999,1.031599,0.950701,1.163162
2019ucc,0.130168,0.250000,0.000000,-4.296865,-0.012319,0.024869,0.061535,3.0,4.0,0.191147,...,0.001170,0.003381,10.28090,23.053499,1.068901,1.143600,1.316299,1.099200,1.152301,0.379586
2019uev,0.643400,0.294118,0.058824,-0.643401,0.008710,0.003826,0.174856,1.0,17.0,-0.629725,...,0.000255,0.000554,6.07050,32.341202,0.619699,0.749001,0.713200,0.764200,0.841801,0.612023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021zcj,0.546062,0.428571,0.000000,-0.651149,0.000512,0.004607,0.093615,1.0,7.0,-0.953949,...,0.000337,0.000429,4.03488,19.229700,0.235800,0.318800,0.279600,0.317801,0.263401,0.338579
2021zep,0.509108,0.454545,0.000000,-1.627568,0.012449,0.013671,0.287601,2.0,11.0,0.150971,...,0.000649,0.000926,5.89197,20.240700,0.361198,0.466999,0.493700,0.393900,0.385300,1.273129
2021zri,0.470219,0.357143,0.000000,-1.397271,0.053136,0.007138,0.347882,2.0,14.0,-0.203854,...,0.000840,0.002136,7.89484,49.709301,0.693201,0.732700,0.931401,0.949600,1.013801,0.129584
2021ztz,0.712603,0.285714,0.000000,-0.986181,0.001336,0.022291,0.165399,2.0,7.0,0.930388,...,0.000116,0.000120,3.12731,16.291100,0.176001,0.198000,0.186800,0.065802,0.041300,0.125253


In [19]:
# https://github.com/uiucsn/laiss/blob/main/LAISS/notebooks/LAISS_tests.ipynb
idx_arr = np.hstack(np.array(dataset_bank.index, dtype='object'))
feat_arr = np.vstack(np.array(dataset_bank, dtype='object'))

scaler = StandardScaler() # Create a scalar
#scaler = MinMaxScaler() # Create a scalar
scaler.fit(feat_arr) # Fit only to training data
feat_arr_scaled = scaler.transform(feat_arr) # What your model learns on

# Build ANNOY index of scaled features of dataset_bank
index = AnnoyIndex(feat_arr_scaled, idx_arr, metric='euclidean')
index.build()


In [23]:
# Reference

#dataset_ref = dataset_bank[dataset_bank.index == '2020awg'] # SN II, in spiral (medium size), matches all some II, some in Ia z=0.04 to z=0.06.
#dataset_ref = dataset_bank[dataset_bank.index == '2021ihp'] # SN II, in spiral (medium size)
#dataset_ref = dataset_bank[dataset_bank.index == '2020ulz'] # SN Ia, in spiral (medium size), matches all likely Ia. Most spiral
#dataset_ref = dataset_bank[dataset_bank.index == '2019pmd'] # SN Ia, in spiral (medium size), matches all Ia in z=0.02 to z=0.06. Most spiral
#dataset_ref = dataset_bank[dataset_bank.index == '2021xbg'] # SN Ibc, spiral on edge pointing down, matches to spirals, some Ib some II
dataset_ref = dataset_bank[dataset_bank.index == '2021aadc'] # SLSN-II, small smudgy host. Even in its ANN matches are the missed SLSN candidate
dataset_ref 

Unnamed: 0_level_0,mag_anderson_darling_normal_X,mag_bins_window1.0_offset0.0_beyond_1_std_X,mag_bins_window1.0_offset0.0_beyond_2_std_X,mag_bins_window1.0_offset0.0_kurtosis_X,mag_bins_window1.0_offset0.0_linear_trend_X,mag_bins_window1.0_offset0.0_linear_trend_sigma_X,mag_bins_window1.0_offset0.0_maximum_slope_X,mag_bins_window1.0_offset0.0_minimum_time_interval_X,mag_bins_window1.0_offset0.0_observation_count_X,mag_bins_window1.0_offset0.0_skew_X,...,yApFlux,yKronFlux,yKronRad,yExtNSigma,gApMag_gKronMag,rApMag_rKronMag,iApMag_iKronMag,zApMag_zKronMag,yApMag_yKronMag,dist/DLR
TransientName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021aadc,0.966049,0.142857,0.071429,1.933685,-0.023921,0.00378,0.09705,2.0,14.0,1.485604,...,4.8e-05,4.9e-05,2.59588,4.85476,0.019501,-0.059401,-0.021601,-0.008301,0.029701,0.128074


In [24]:
dataset_ref.columns.to_list()

['mag_anderson_darling_normal_X',
 'mag_bins_window1.0_offset0.0_beyond_1_std_X',
 'mag_bins_window1.0_offset0.0_beyond_2_std_X',
 'mag_bins_window1.0_offset0.0_kurtosis_X',
 'mag_bins_window1.0_offset0.0_linear_trend_X',
 'mag_bins_window1.0_offset0.0_linear_trend_sigma_X',
 'mag_bins_window1.0_offset0.0_maximum_slope_X',
 'mag_bins_window1.0_offset0.0_minimum_time_interval_X',
 'mag_bins_window1.0_offset0.0_observation_count_X',
 'mag_bins_window1.0_offset0.0_skew_X',
 'mag_duration_X',
 'mag_inter_percentile_range_1_X',
 'mag_period_s_to_n_0_X',
 'mag_stetson_K_X',
 'mag_weighted_mean_X',
 'mag_anderson_darling_normal_Y',
 'mag_bins_window1.0_offset0.0_beyond_1_std_Y',
 'mag_bins_window1.0_offset0.0_beyond_2_std_Y',
 'mag_bins_window1.0_offset0.0_kurtosis_Y',
 'mag_bins_window1.0_offset0.0_linear_trend_Y',
 'mag_bins_window1.0_offset0.0_linear_trend_sigma_Y',
 'mag_bins_window1.0_offset0.0_maximum_slope_Y',
 'mag_bins_window1.0_offset0.0_minimum_time_interval_Y',
 'mag_bins_window1.

In [25]:
feat_arr_ref = np.vstack(np.array(dataset_ref, dtype='object'))

#scaler.fit(feat_arr_ref) # Fit only to training data
scaled_feat_arr_ref = scaler.transform(feat_arr_ref) #scaler.transform(feat_arr) # What your model learns on

snid = dataset_ref.index.values[0]
lcdata = {'name': snid, 'vector': scaled_feat_arr_ref[0]}
#lcdata = {'name': ztf_id, 'vector': feat_arr_scaled[-5:-4][0]}

#result_id, result_dist = index.index.get_nns_by_item(, n=10, include_distances=True)
result_id, result_dist = index.index.get_nns_by_vector(lcdata['vector'], n=10, include_distances=True)
#result_name = index.query(lcdata['vector'])

print(f"ZTF ID name (ref) {snid}") 
link_ref = display_markdown(f'https://ziggy.ucolick.org/yse/transient_detail/{snid}', raw=True)
print(link_ref)
print("%%%%%%")
print("ANN\t\tIDX\t\t\t\tZTF_id\t\t\t\tDIST")#\tFEAT")
for n, (i, d) in enumerate(zip(result_id, result_dist)):#, feat_arr):
    print(f"{n}\t\t{i}\t\t\t\t{display_markdown(f'https://ziggy.ucolick.org/yse/transient_detail/{idx_arr[i]}', raw=True)}\t\t\t{round(d, 4)}")#\t{f}")

ZTF ID name (ref) 2021aadc


https://ziggy.ucolick.org/yse/transient_detail/2021aadc

None
%%%%%%
ANN		IDX				ZTF_id				DIST


https://ziggy.ucolick.org/yse/transient_detail/2021aadc

0		391				None			0.0


https://ziggy.ucolick.org/yse/transient_detail/2020vpn

1		352				None			5.472


https://ziggy.ucolick.org/yse/transient_detail/2020jvi

2		170				None			5.8975


https://ziggy.ucolick.org/yse/transient_detail/2020jlj

3		160				None			6.1749


https://ziggy.ucolick.org/yse/transient_detail/2020qlp

4		259				None			6.3496


https://ziggy.ucolick.org/yse/transient_detail/2020ohy

5		227				None			6.3691


https://ziggy.ucolick.org/yse/transient_detail/2021cbe

6		445				None			6.3994


https://ziggy.ucolick.org/yse/transient_detail/2021dpj

7		469				None			6.474


https://ziggy.ucolick.org/yse/transient_detail/2020nis

8		216				None			6.5073


https://ziggy.ucolick.org/yse/transient_detail/2021nsm

9		581				None			6.757
