# ALeRCE classes

https://github.com/ZwickyTransientFacility/ztf-avro-alert

1. **AGN:** Active Galactic Nuclei
1. **Blazar:** Blazar
1. **CV/Nova:** Cataclysmic Variable Star/Nova
1. **Ceph:** Cepheid Variable Star
1. **DSCT:** Delta Scuti Star
1. **EA:** Eclipsing Algol
1. **EB/EW:** Eclipsing Binaries/Eclipsing W Ursae Majoris
1. **LPV:** Long Period Variable
1. **Periodic-Other:** Periodic-Other
1. **QSO:** Quasi-Stellar Object
1. **RRL:** RR Lyrae Variable Star
1. **RSCVn:** RS Canum Venaticorum
1. **SLSN:** Super Luminous Supernova
1. **SNII:** Supernova II
1. **SNIIb:** Supernova IIb
1. **SNIIn:** Supernova IIn
1. **SNIa:** Supernova Ia
1. **SNIbc:** Supernova Ibc
1. **TDE:** Tidal disruption event (to remove)
1. **YSO:** Young Stellar Object
1. **ZZ:** ZZ Ceti Stars (to remove)

# Column names
1. **oid:** object id
1. **classALeRCE:** object class name
1. **fid:** band index, g=1, r=2

In [1]:
import sys

# Add the parent and grandparent directories to the import search path
# (presumably so the project's `mismatch` package can be imported -- confirm against the repo layout).
sys.path.extend(['../', '../../'])

In [2]:
import numpy as np
import pandas as pd

# Dataset release to read from ../data/<survey_name>.
#survey_name = 'alerceZTFv5.1'
survey_name = 'alerceZTFv7.1' # use this dataset

# Generic field name -> column name used by this dataset.
df_index_names = dict(
    oid='oid', # object id
    label='classALeRCE', # object class name
    ra='ra',
    dec='dec',
    band='fid', # band
    obs_day='mjd', # days
    obs='magpsf_corr', # observations
    obs_error='sigmapsf_corr', # observation errors
)

### load files
load_root_dir = f'../data/{survey_name}'

# Labels table: report its columns and the name of its index.
labels_df = pd.read_parquet(f'{load_root_dir}/labels.parquet')
print(f'labels_df; columns={list(labels_df.columns)}; id={labels_df.index.name}')

# Training features: report the index name, then every column with its position.
features_train_df = pd.read_parquet(f'{load_root_dir}/features_train.parquet')
print(f'features_train_df; id={features_train_df.index.name}')
for position, column in enumerate(features_train_df.columns):
    print(f'({position}) - {column}')

# Test features are loaded without printing a summary.
features_test_df = pd.read_parquet(f'{load_root_dir}/features_test.parquet')
#print(f'features_test_df - columns: {list(features_test_df.columns)} - id: {features_test_df.index.name}')

labels_df; columns=['classALeRCE', 'ra', 'dec', 'period', 'source', 'id_source', 'class_source', 'separation_arcsec']; id=oid
features_train_df; id=oid
(0) - Amplitude_1
(1) - Amplitude_2
(2) - AndersonDarling_1
(3) - AndersonDarling_2
(4) - Autocor_length_1
(5) - Autocor_length_2
(6) - Beyond1Std_1
(7) - Beyond1Std_2
(8) - Con_1
(9) - Con_2
(10) - Eta_e_1
(11) - Eta_e_2
(12) - ExcessVar_1
(13) - ExcessVar_2
(14) - GP_DRW_sigma_1
(15) - GP_DRW_sigma_2
(16) - GP_DRW_tau_1
(17) - GP_DRW_tau_2
(18) - Gskew_1
(19) - Gskew_2
(20) - Harmonics_mag_1_1
(21) - Harmonics_mag_1_2
(22) - Harmonics_mag_2_1
(23) - Harmonics_mag_2_2
(24) - Harmonics_mag_3_1
(25) - Harmonics_mag_3_2
(26) - Harmonics_mag_4_1
(27) - Harmonics_mag_4_2
(28) - Harmonics_mag_5_1
(29) - Harmonics_mag_5_2
(30) - Harmonics_mag_6_1
(31) - Harmonics_mag_6_2
(32) - Harmonics_mag_7_1
(33) - Harmonics_mag_7_2
(34) - Harmonics_mse_1
(35) - Harmonics_mse_2
(36) - Harmonics_phase_2_1
(37) - Harmonics_phase_2_2
(38) - Harmonics_phase_3

In [3]:
%load_ext autoreload
%autoreload 2
from mismatch.level_bars import LevelBar

classes, counts = np.unique(labels_df[df_index_names['label']].values, return_counts=True)
population_cdict = {c:counts[kc] for kc,c in enumerate(classes)}
print(LevelBar(population_cdict, ncols=60))

<tqdm.std.tqdm object at 0x7fbe90178650> AGN - 4,248/118,469 (3.59%)
<tqdm.std.tqdm object at 0x7fbe4cbbc550> Blazar - 1,234/118,469 (1.04%)
<tqdm.std.tqdm object at 0x7fbe4db4a250> CV/Nova - 884/118,469 (0.75%)
<tqdm.std.tqdm object at 0x7fbe4d2ca150> Ceph - 613/118,469 (0.52%)
<tqdm.std.tqdm object at 0x7fbdada75fd0> DSCT - 731/118,469 (0.62%)
<tqdm.std.tqdm object at 0x7fbe4d1a2750> EA - 6,196/118,469 (5.23%)
<tqdm.std.tqdm object at 0x7fbe90178650> EB/EW - 31,940/118,469 (26.96%)
<tqdm.std.tqdm object at 0x7fbe4cbbc550> LPV - 14,374/118,469 (12.13%)
<tqdm.std.tqdm object at 0x7fbe4db4a250> NLAGN - 5/118,469 (0.00%)
<tqdm.std.tqdm object at 0x7fbe4cbbc550> NLQSO - 74/118,469 (0.06%)
<tqdm.std.tqdm object at 0x7fbdada75fd0> Periodic-Other - 399/118,469 (0.34%)
<tqdm.std.tqdm object at 0x7fbe4d2ca150> QSO - 20,622/118,469 (17.41%)
<tqdm.std.tqdm object at 0x7fbe90178650> RRL - 32,607/118,469 (27.52%)
<tqdm.std.tqdm object at 0x7fbe4d1a2750> RSCVn - 995/118,469 (0.84%)
<tqdm.std.tqdm o

In [4]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
labels_df.info()

# Rich display of the first 20 labelled objects.
labels_df.head(20)

<class 'pandas.core.frame.DataFrame'>
Index: 118469 entries, ZTF19abmposz to ZTF18abgqxlw
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   classALeRCE        118469 non-null  object 
 1   ra                 118469 non-null  float64
 2   dec                118469 non-null  float64
 3   period             66253 non-null   object 
 4   source             118469 non-null  object 
 5   id_source          118469 non-null  object 
 6   class_source       118469 non-null  object 
 7   separation_arcsec  118469 non-null  float64
dtypes: float64(3), object(5)
memory usage: 8.1+ MB
None


Unnamed: 0_level_0,classALeRCE,ra,dec,period,source,id_source,class_source,separation_arcsec
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ZTF19abmposz,AGN,26.670031,-8.354787,,Oh2015,5.877271806085038e+17,AGN_galaxy_dominated,0.21399
ZTF19aapcxhy,AGN,154.202129,18.723076,,Oh2015,5.877420127343739e+17,AGN_galaxy_dominated,0.227455
ZTF18abtyspw,AGN,25.660298,0.087434,,Oh2015,5.880155092805878e+17,AGN_galaxy_dominated,0.141792
ZTF18aaodoxr,AGN,208.231995,25.483231,,Oh2015,5.877398104846501e+17,AGN_galaxy_dominated,0.100498
ZTF18acidtzj,AGN,173.724508,22.45237,,Oh2015,5.877420143531131e+17,AGN_galaxy_dominated,0.354353
ZTF18aaqdpre,AGN,172.851656,34.503369,,Oh2015,5.877394073146492e+17,AGN_galaxy_dominated,0.20117
ZTF18aawqdsi,AGN,172.457413,37.281501,,Oh2015,5.877386175635128e+17,AGN_galaxy_dominated,0.078565
ZTF19aaoznlm,AGN,196.13763,3.299838,,Oh2015,5.877260338661951e+17,AGN_galaxy_dominated,0.141731
ZTF19aauiwtc,AGN,199.213783,5.946401,,Oh2015,5.877291605789902e+17,AGN_galaxy_dominated,0.316322
ZTF19aapvkcg,AGN,184.594672,38.845402,,Oh2015,5.880179772823306e+17,AGN_galaxy_dominated,0.048622


In [5]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
features_train_df.info()

# Rich display of the first 10 rows of the training features.
features_train_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 118469 entries, ZTF17aaabgdt to ZTF20abfpkfh
Columns: 172 entries, Amplitude_1 to sgscore1
dtypes: float32(2), float64(170)
memory usage: 155.5+ MB
None


Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_after_fid_1,n_non_det_after_fid_2,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,rb,sgscore1
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aaabgdt,0.164834,,0.762395,,1.0,,0.285714,,0.0,,...,10.0,,0.0,,3.0,,0.428571,,0.874286,0.942
ZTF17aaadfsa,0.196974,0.185956,1.0,1.0,2.0,1.0,0.405405,0.470588,0.0,0.0,...,33.0,63.0,0.0,0.0,15.0,17.0,0.405405,0.5,0.822857,0.990833
ZTF17aaadxdb,0.253045,0.34963,1.0,1.0,1.0,1.0,0.5,0.35,0.0,0.0,...,72.0,106.0,0.0,0.0,27.0,17.0,0.385714,0.425,0.731429,0.973417
ZTF17aaadzlq,2.482923,1.983951,1.0,1.0,13.0,14.0,0.531646,0.469925,0.038298,0.049242,...,35.0,30.0,0.0,0.0,35.0,25.0,0.147679,0.093985,0.828571,0.939375
ZTF17aaaenfy,0.487668,0.455345,1.0,1.0,1.0,1.0,0.344444,0.318182,0.0,0.0,...,99.0,90.0,1.0,0.0,52.0,53.0,0.577778,0.481818,0.905714,1.0
ZTF17aaagrhs,0.466147,0.295652,1.0,1.0,1.0,1.0,0.331461,0.481781,0.0,0.0,...,239.0,288.0,0.0,0.0,151.0,61.0,0.424157,0.246964,0.778571,0.999
ZTF17aaagvzo,0.450122,0.347737,1.0,1.0,1.0,1.0,0.480769,0.326087,0.0,0.0,...,21.0,57.0,0.0,1.0,18.0,35.0,0.346154,0.76087,0.915,0.994583
ZTF17aaagwfr,0.481463,0.448709,1.0,1.0,1.0,1.0,0.4,0.368421,0.0,0.0,...,26.0,46.0,2.0,3.0,11.0,20.0,0.44,0.526316,0.82,1.0
ZTF17aaahtas,0.192741,0.105586,1.0,0.981686,1.0,1.0,0.428571,0.357143,0.0,0.0,...,25.0,38.0,9.0,1.0,15.0,0.0,0.535714,0.0,0.882857,0.979917
ZTF17aaaiogt,0.242838,0.16634,1.0,0.999619,1.0,1.0,0.3125,0.272727,0.0,0.0,...,14.0,17.0,0.0,0.0,11.0,7.0,0.6875,0.636364,0.788571,0.994375


In [6]:
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
features_test_df.info()

# Rich display of the first 10 rows of the test features.
features_test_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 779846 entries, ZTF17aaaafan to ZTF20abpvghz
Columns: 172 entries, Amplitude_1 to sgscore1
dtypes: float32(2), float64(170)
memory usage: 1023.4+ MB
None


Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_after_fid_1,n_non_det_after_fid_2,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,rb,sgscore1
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aaaafan,0.094794,0.26939,0.500701,0.997029,2.0,1.0,0.285714,0.571429,0.0,0.0,...,179.0,151.0,6.0,11.0,0.0,3.0,0.0,0.428571,0.644286,0.984375
ZTF17aaaafbs,0.399396,0.368885,1.0,1.0,1.0,1.0,0.294118,0.308824,0.0,0.0,...,134.0,117.0,7.0,1.0,36.0,38.0,0.705882,0.558824,0.764286,0.986917
ZTF17aaabdlz,0.151955,0.143481,1.0,1.0,1.0,1.0,0.214286,0.409091,0.0,0.0,...,86.0,88.0,1.0,0.0,11.0,13.0,0.785714,0.590909,0.720714,0.977125
ZTF17aaabelc,0.247272,0.114171,1.0,0.481141,1.0,2.0,0.272727,0.4,0.0,0.0,...,76.0,103.0,3.0,14.0,16.0,0.0,0.727273,0.0,0.85119,1.0
ZTF17aaabmro,0.530131,0.33439,1.0,1.0,1.0,1.0,0.352941,0.366667,0.0,0.0,...,29.0,32.0,5.0,0.0,4.0,19.0,0.235294,0.633333,0.931429,0.99875
ZTF17aaacvqh,0.208042,0.192441,1.0,1.0,1.0,1.0,0.52381,0.370968,0.0,0.0,...,88.0,60.0,0.0,2.0,7.0,33.0,0.166667,0.532258,0.792143,0.966208
ZTF17aaadina,1.004704,1.046327,0.818453,0.894374,1.0,1.0,0.230769,0.24,0.0,0.0,...,14.0,27.0,0.0,0.0,14.0,14.0,0.538462,0.56,0.793333,0.983125
ZTF17aaadkeg,0.203057,0.424995,0.325656,1.0,1.0,2.0,0.3,0.415094,0.0,0.039216,...,257.0,208.0,3.0,2.0,0.0,8.0,0.0,0.150943,0.787143,0.924417
ZTF17aaadotj,0.282452,0.239367,1.0,1.0,1.0,1.0,0.387097,0.407407,0.0,0.0,...,34.0,40.0,1.0,1.0,11.0,15.0,0.354839,0.555556,0.811429,0.987262
ZTF17aaadskg,0.490247,0.393772,1.0,1.0,1.0,1.0,0.383562,0.333333,0.0,0.0,...,35.0,65.0,0.0,1.0,19.0,37.0,0.260274,0.822222,0.917857,1.0


In [7]:
# NOTE: autoreload is already enabled by the earlier cell that imports LevelBar; loading the
# extension a second time here only produced an "already loaded" notice, so it is not repeated.
from mismatch import _C
from mismatch.utils import get_object_features
from dask import dataframe as dd

### example using dask
# Wrap each pandas table in a dask dataframe split into _C.N_DASK partitions
# (a dask dataframe can be faster for the lookups below).
features_train_ddf = dd.from_pandas(features_train_df, npartitions=_C.N_DASK)
features_test_ddf = dd.from_pandas(features_test_df, npartitions=_C.N_DASK)
labels_ddf = dd.from_pandas(labels_df, npartitions=_C.N_DASK)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
### get all features from an object
obj_name = 'ZTF18abvpirg' # from train
features, c, features_names = get_object_features(features_train_ddf, labels_ddf, obj_name)

# Pair every feature name with the value found at the same position.
fdict = {name: features[position] for position, name in enumerate(features_names)}

print(f'obj={obj_name}; class={c}; features={len(features)}')
for rank, (name, value) in enumerate(fdict.items()):
    print(f'({rank}); {name}={value}')

obj=ZTF18abvpirg; class=Ceph; features=172
(0); Amplitude_1=1.031099678461823
(1); Amplitude_2=0.7456786694723085
(2); AndersonDarling_1=0.9999358483604481
(3); AndersonDarling_2=0.9988798628424108
(4); Autocor_length_1=4.0
(5); Autocor_length_2=4.0
(6); Beyond1Std_1=0.34615384615384615
(7); Beyond1Std_2=0.34782608695652173
(8); Con_1=0.0
(9); Con_2=0.0
(10); Eta_e_1=0.010806181667720864
(11); Eta_e_2=0.003990760020892097
(12); ExcessVar_1=0.001266376138233806
(13); ExcessVar_2=0.0009696251301614164
(14); GP_DRW_sigma_1=0.20221442913297177
(15); GP_DRW_sigma_2=0.19696539950947367
(16); GP_DRW_tau_1=60.67272055353473
(17); GP_DRW_tau_2=76.49194777361599
(18); Gskew_1=0.21646808847086163
(19); Gskew_2=0.03816690760611152
(20); Harmonics_mag_1_1=0.6572860165719324
(21); Harmonics_mag_1_2=0.534231113216618
(22); Harmonics_mag_2_1=0.06727560624790807
(23); Harmonics_mag_2_2=0.09318407631460364
(24); Harmonics_mag_3_1=0.1119856901696408
(25); Harmonics_mag_3_2=0.07179964852751121
(26); Harmo

In [9]:
### get features per band from an object
obj_name = 'ZTF18abvpirg' # from train
band = 1
features, c, features_names = get_object_features(features_train_ddf, labels_ddf, obj_name, band=band)

# Pair every feature name with the value found at the same position.
fdict = {name: features[position] for position, name in enumerate(features_names)}

print(f'obj={obj_name}; class={c}; features={len(features)}')
for rank, (name, value) in enumerate(fdict.items()):
    print(f'({rank}) {name}={value}')

obj=ZTF18abvpirg; class=Ceph; features=78
(0) Amplitude_1=1.031099678461823
(1) AndersonDarling_1=0.9999358483604481
(2) Autocor_length_1=4.0
(3) Beyond1Std_1=0.34615384615384615
(4) Con_1=0.0
(5) Eta_e_1=0.010806181667720864
(6) ExcessVar_1=0.001266376138233806
(7) GP_DRW_sigma_1=0.20221442913297177
(8) GP_DRW_tau_1=60.67272055353473
(9) Gskew_1=0.21646808847086163
(10) Harmonics_mag_1_1=0.6572860165719324
(11) Harmonics_mag_2_1=0.06727560624790807
(12) Harmonics_mag_3_1=0.1119856901696408
(13) Harmonics_mag_4_1=0.12274118261108367
(14) Harmonics_mag_5_1=0.05710346340791948
(15) Harmonics_mag_6_1=0.06308390416802627
(16) Harmonics_mag_7_1=0.005024974773290941
(17) Harmonics_mse_1=0.04065159637204645
(18) Harmonics_phase_2_1=2.0009756351458234
(19) Harmonics_phase_3_1=6.04389265879566
(20) Harmonics_phase_4_1=6.168103655501289
(21) Harmonics_phase_5_1=1.8012355360321504
(22) Harmonics_phase_6_1=2.4778906023433684
(23) Harmonics_phase_7_1=0.04875669736539834
(24) IAR_phi_1=0.99115042444

In [10]:
### get the non-band-wise features of an object
# NOTE: autoreload, `get_object_features` and the dask dataframes are already set up by
# earlier cells; repeating `%load_ext autoreload` and the imports here only produced an
# "already loaded" notice.
obj_name = 'ZTF18abvpirg' # from train
band = -1 # requests the non-band-wise features
features, c, features_names = get_object_features(features_train_ddf, labels_ddf, obj_name, band=band)

# Pair every feature name with the value found at the same position.
fdict = {f:features[kf] for kf,f in enumerate(features_names)}

print(f'obj={obj_name}; class={c}; features={len(features)}')
for k,key in enumerate(fdict.keys()):
    print(f'({k}); {key}={fdict[key]}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
obj=ZTF18abvpirg; class=Ceph; features=15
(0); Multiband_period=120.48186199781824
(1); Period_fit=0.14047516331716914
(2); Power_rate_1/2=0.017695672810077667
(3); Power_rate_1/3=0.010019431822001934
(4); Power_rate_1/4=0.00591428903862834
(5); Power_rate_3=0.0020129734184592962
(6); Power_rate_4=0.008709192276000977
(7); g-r_max=1.3278560638427734
(8); g-r_max_corr=1.2227473430663895
(9); g-r_mean=1.1644687317685545
(10); g-r_mean_corr=1.386890650482556
(11); gal_b=-63.463776668220724
(12); gal_l=70.93750879526392
(13); rb=0.8971428871154785
(14); sgscore1=1.0


In [11]:
### get features from a non-labeled sample. returned class is None
obj_name = 'ZTF17aaacvqh' # from test
features, c, features_names = get_object_features(features_test_ddf, labels_ddf, obj_name, band=1)

# Pair every feature name with the value found at the same position.
fdict = {name: features[position] for position, name in enumerate(features_names)}

print(f'obj={obj_name}; class={c}; features={len(features)}')
for rank, (name, value) in enumerate(fdict.items()):
    print(f'({rank}); {name}={value}')

obj=ZTF17aaacvqh; class=None; features=78
(0); Amplitude_1=0.20804178165415887
(1); AndersonDarling_1=0.9999999996158513
(2); Autocor_length_1=1.0
(3); Beyond1Std_1=0.5238095238095238
(4); Con_1=0.0
(5); Eta_e_1=0.9310294146838879
(6); ExcessVar_1=7.536394261540391e-05
(7); GP_DRW_sigma_1=0.019762510785134756
(8); GP_DRW_tau_1=2.960422197639985
(9); Gskew_1=-0.24195298835015322
(10); Harmonics_mag_1_1=0.5847174335570929
(11); Harmonics_mag_2_1=0.44293660317862016
(12); Harmonics_mag_3_1=0.41101928752323746
(13); Harmonics_mag_4_1=0.2558478360411064
(14); Harmonics_mag_5_1=0.1591656406596421
(15); Harmonics_mag_6_1=0.13253637673847404
(16); Harmonics_mag_7_1=0.055432980343226385
(17); Harmonics_mse_1=0.00043336169026280227
(18); Harmonics_phase_2_1=3.98942263231893
(19); Harmonics_phase_3_1=1.1418135515878554
(20); Harmonics_phase_4_1=4.4329810273447166
(21); Harmonics_phase_5_1=1.9883027636802488
(22); Harmonics_phase_6_1=5.535276901546911
(23); Harmonics_phase_7_1=2.2313715695888803
(