# ALeRCE classes

https://github.com/ZwickyTransientFacility/ztf-avro-alert

1. **AGN:** Active Galactic Nuclei
1. **Blazar:** Blazar
1. **CV/Nova:** Cataclysmic Variable Star/Nova
1. **Ceph:** Cepheid Variable Star
1. **DSCT:** Delta Scuti Star
1. **EA:** Eclipsing Algol
1. **EB/EW:** Eclipsing Binaries/Eclipsing W Ursae Majoris
1. **LPV:** Long Period Variable
1. **Periodic-Other:** Periodic-Other
1. **QSO:** Quasi-Stellar Object
1. **RRL:** RR Lyrae Variable Star
1. **RSCVn:** RS Canum Venaticorum
1. **SLSN:** Super Luminous Supernova
1. **SNII:** Supernova II
1. **SNIIb:** Supernova IIb
1. **SNIIn:** Supernova IIn
1. **SNIa:** Supernova Ia
1. **SNIbc:** Supernova Ibc
1. **TDE:** Tidal Disruption Event (to remove)
1. **YSO:** Young Stellar Object
1. **ZZ:** ZZ Ceti Stars (to remove)

In [1]:
import sys

# Add the folder two levels up to the import path (presumably where the
# `mismatch` package used by the following cells lives -- confirm).
# The membership check keeps the cell idempotent: re-running it does not
# append duplicate entries to sys.path.
if '../../' not in sys.path:
    sys.path.append('../../')

In [2]:
%load_ext autoreload
%autoreload 2
from mismatch.alerce_utils import process_df_labels, process_df_detections, keep_only_valid_objs
import numpy as np
import pandas as pd

load_rootdir = '../../../../tesis/surveys_data'
survey_name = 'alerceZTFv7.1'
df_index_names = {
    'oid':'oid', # object id
    'oid_det':'index', # object id
    'label':'classALeRCE', # object class name
    'ra':'ra',
    'dec':'dec',
    'band':'fid', # band
    'obs_day':'mjd', # days
    'obs':'magpsf_corr', # observations
    'obs_error':'sigmapsf_corr', # observation errors
}

### load files and processing
features_df = pd.read_parquet(f'{load_rootdir}/{survey_name}/storage/ztf_workspace/historic_data_20200916')
features_df = features_df.set_index(['index'])
features_df, det_objs = process_df_detections(features_df, df_index_names['oid_det'], df_index_names['oid'])
print(f'features_df - columns: {list(features_df.columns)} - id: {features_df.index.name}')

labels_df = pd.read_csv(f'{load_rootdir}/{survey_name}/dfcrossmatches_prioritized_v7.0.1.csv')
labels_df, label_objs = process_df_labels(labels_df, df_index_names['oid'], det_objs)
print(f'labels - columns: {list(labels_df.columns)} - id: {labels_df.index.name}')

### filter
#valid_objs = list(set(det_objs) & set(label_objs))
#labels_df = keep_only_valid_objs(labels_df, valid_objs)
#features_df = keep_only_valid_objs(features_df, valid_objs)

### print info
classes = set(labels_df[df_index_names['label']].values)
print('classes:', classes)

features_df - columns: ['Amplitude_1', 'Amplitude_2', 'AndersonDarling_1', 'AndersonDarling_2', 'Autocor_length_1', 'Autocor_length_2', 'Beyond1Std_1', 'Beyond1Std_2', 'Con_1', 'Con_2', 'Eta_e_1', 'Eta_e_2', 'ExcessVar_1', 'ExcessVar_2', 'GP_DRW_sigma_1', 'GP_DRW_sigma_2', 'GP_DRW_tau_1', 'GP_DRW_tau_2', 'Gskew_1', 'Gskew_2', 'Harmonics_mag_1_1', 'Harmonics_mag_1_2', 'Harmonics_mag_2_1', 'Harmonics_mag_2_2', 'Harmonics_mag_3_1', 'Harmonics_mag_3_2', 'Harmonics_mag_4_1', 'Harmonics_mag_4_2', 'Harmonics_mag_5_1', 'Harmonics_mag_5_2', 'Harmonics_mag_6_1', 'Harmonics_mag_6_2', 'Harmonics_mag_7_1', 'Harmonics_mag_7_2', 'Harmonics_mse_1', 'Harmonics_mse_2', 'Harmonics_phase_2_1', 'Harmonics_phase_2_2', 'Harmonics_phase_3_1', 'Harmonics_phase_3_2', 'Harmonics_phase_4_1', 'Harmonics_phase_4_2', 'Harmonics_phase_5_1', 'Harmonics_phase_5_2', 'Harmonics_phase_6_1', 'Harmonics_phase_6_2', 'Harmonics_phase_7_1', 'Harmonics_phase_7_2', 'IAR_phi_1', 'IAR_phi_2', 'LinearTrend_1', 'LinearTrend_2', 'MHP

In [3]:
from mismatch.level_bars import LevelBar

# autoreload is already enabled by the loading cell; calling %load_ext again
# here only produced the "already loaded" warning, so it is not repeated.

# count how many labeled objects fall in each class (np.unique returns the
# sorted class names together with their occurrence counts)
classes, counts = np.unique(labels_df[df_index_names['label']].values, return_counts=True)
population_cdict = dict(zip(classes, counts)) # class name -> number of objects
print(LevelBar(population_cdict, ncols=60))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
|██                                                        | AGN - 4,248/118,469 (3.59%)
|▌                                                         | Blazar - 1,234/118,469 (1.04%)
|▍                                                         | CV/Nova - 884/118,469 (0.75%)
|▎                                                         | Ceph - 613/118,469 (0.52%)
|▎                                                         | DSCT - 731/118,469 (0.62%)
|███                                                       | EA - 6,196/118,469 (5.23%)
|███████████████▋                                          | EB/EW - 31,940/118,469 (26.96%)
|███████                                                   | LPV - 14,374/118,469 (12.13%)
|                                                          | NLAGN - 5/118,469 (0.00%)
|                                                          | NLQSO - 74/118,469 (0.06%)
|▏               

In [4]:
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() appended a spurious "None" after the label. Print the label first,
# then let info() emit the summary itself.
print('features_df:')
features_df.info()
features_df.head(10) # first rows, rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
Index: 898315 entries, ZTF17aaaafan to ZTF20abpvghz
Columns: 172 entries, Amplitude_1 to sgscore1
dtypes: float32(2), float64(170)
memory usage: 1.2+ GB
features_df: None


Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_after_fid_1,n_non_det_after_fid_2,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,rb,sgscore1
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aaaafan,0.094794,0.26939,0.500701,0.997029,2.0,1.0,0.285714,0.571429,0.0,0.0,...,179.0,151.0,6.0,11.0,0.0,3.0,0.0,0.428571,0.644286,0.984375
ZTF17aaaafbs,0.399396,0.368885,1.0,1.0,1.0,1.0,0.294118,0.308824,0.0,0.0,...,134.0,117.0,7.0,1.0,36.0,38.0,0.705882,0.558824,0.764286,0.986917
ZTF17aaabdlz,0.151955,0.143481,1.0,1.0,1.0,1.0,0.214286,0.409091,0.0,0.0,...,86.0,88.0,1.0,0.0,11.0,13.0,0.785714,0.590909,0.720714,0.977125
ZTF17aaabelc,0.247272,0.114171,1.0,0.481141,1.0,2.0,0.272727,0.4,0.0,0.0,...,76.0,103.0,3.0,14.0,16.0,0.0,0.727273,0.0,0.85119,1.0
ZTF17aaabgdt,0.164834,,0.762395,,1.0,,0.285714,,0.0,,...,10.0,,0.0,,3.0,,0.428571,,0.874286,0.942
ZTF17aaabmro,0.530131,0.33439,1.0,1.0,1.0,1.0,0.352941,0.366667,0.0,0.0,...,29.0,32.0,5.0,0.0,4.0,19.0,0.235294,0.633333,0.931429,0.99875
ZTF17aaacvqh,0.208042,0.192441,1.0,1.0,1.0,1.0,0.52381,0.370968,0.0,0.0,...,88.0,60.0,0.0,2.0,7.0,33.0,0.166667,0.532258,0.792143,0.966208
ZTF17aaadfsa,0.196974,0.185956,1.0,1.0,2.0,1.0,0.405405,0.470588,0.0,0.0,...,33.0,63.0,0.0,0.0,15.0,17.0,0.405405,0.5,0.822857,0.990833
ZTF17aaadina,1.004704,1.046327,0.818453,0.894374,1.0,1.0,0.230769,0.24,0.0,0.0,...,14.0,27.0,0.0,0.0,14.0,14.0,0.538462,0.56,0.793333,0.983125
ZTF17aaadkeg,0.203057,0.424995,0.325656,1.0,1.0,2.0,0.3,0.415094,0.0,0.039216,...,257.0,208.0,3.0,2.0,0.0,8.0,0.0,0.150943,0.787143,0.924417


In [5]:
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() appended a spurious "None" after the label. Print the label first,
# then let info() emit the summary itself.
print('labels_df:')
labels_df.info()
labels_df.head(10) # first rows, rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
Index: 118469 entries, ZTF19abmposz to ZTF18abgqxlw
Data columns (total 8 columns):
classALeRCE          118469 non-null object
ra                   118469 non-null float64
dec                  118469 non-null float64
period               66253 non-null object
source               118469 non-null object
id_source            118469 non-null object
class_source         118469 non-null object
separation_arcsec    118469 non-null float64
dtypes: float64(3), object(5)
memory usage: 8.1+ MB
labels_df: None


Unnamed: 0_level_0,classALeRCE,ra,dec,period,source,id_source,class_source,separation_arcsec
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ZTF19abmposz,AGN,26.670031,-8.354787,,Oh2015,5.877271806085038e+17,AGN_galaxy_dominated,0.21399
ZTF19aapcxhy,AGN,154.202129,18.723076,,Oh2015,5.877420127343739e+17,AGN_galaxy_dominated,0.227455
ZTF18abtyspw,AGN,25.660298,0.087434,,Oh2015,5.880155092805878e+17,AGN_galaxy_dominated,0.141792
ZTF18aaodoxr,AGN,208.231995,25.483231,,Oh2015,5.877398104846501e+17,AGN_galaxy_dominated,0.100498
ZTF18acidtzj,AGN,173.724508,22.45237,,Oh2015,5.877420143531131e+17,AGN_galaxy_dominated,0.354353
ZTF18aaqdpre,AGN,172.851656,34.503369,,Oh2015,5.877394073146492e+17,AGN_galaxy_dominated,0.20117
ZTF18aawqdsi,AGN,172.457413,37.281501,,Oh2015,5.877386175635128e+17,AGN_galaxy_dominated,0.078565
ZTF19aaoznlm,AGN,196.13763,3.299838,,Oh2015,5.877260338661951e+17,AGN_galaxy_dominated,0.141731
ZTF19aauiwtc,AGN,199.213783,5.946401,,Oh2015,5.877291605789902e+17,AGN_galaxy_dominated,0.316322
ZTF19aapvkcg,AGN,184.594672,38.845402,,Oh2015,5.880179772823306e+17,AGN_galaxy_dominated,0.048622


In [6]:
from mismatch.alerce_utils import get_train_test_split

# autoreload is already enabled by the loading cell; calling %load_ext again
# here only produced the "already loaded" warning, so it is not repeated.

# split the feature table in two using the label table and the object-id name
# (judging by the recorded outputs, the train part has as many rows as
# labels_df and the test part holds the remaining objects -- confirm in
# get_train_test_split)
train_features_df, test_features_df = get_train_test_split(features_df, labels_df, df_index_names['oid'])

# sanity check: every object of features_df must land in exactly one of the two parts
assert len(train_features_df) + len(test_features_df) == len(features_df), \
    'train/test split does not partition features_df'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Inspect the train/test feature tables

In [7]:
# DataFrame.info() prints the summary itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
train_features_df.info()
train_features_df.head(10) # first rows, rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
Index: 118469 entries, ZTF17aaabgdt to ZTF20abfpkfh
Columns: 172 entries, Amplitude_1 to sgscore1
dtypes: float32(2), float64(170)
memory usage: 155.5+ MB
None


Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_after_fid_1,n_non_det_after_fid_2,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,rb,sgscore1
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aaabgdt,0.164834,,0.762395,,1.0,,0.285714,,0.0,,...,10.0,,0.0,,3.0,,0.428571,,0.874286,0.942
ZTF17aaadfsa,0.196974,0.185956,1.0,1.0,2.0,1.0,0.405405,0.470588,0.0,0.0,...,33.0,63.0,0.0,0.0,15.0,17.0,0.405405,0.5,0.822857,0.990833
ZTF17aaadxdb,0.253045,0.34963,1.0,1.0,1.0,1.0,0.5,0.35,0.0,0.0,...,72.0,106.0,0.0,0.0,27.0,17.0,0.385714,0.425,0.731429,0.973417
ZTF17aaadzlq,2.482923,1.983951,1.0,1.0,13.0,14.0,0.531646,0.469925,0.038298,0.049242,...,35.0,30.0,0.0,0.0,35.0,25.0,0.147679,0.093985,0.828571,0.939375
ZTF17aaaenfy,0.487668,0.455345,1.0,1.0,1.0,1.0,0.344444,0.318182,0.0,0.0,...,99.0,90.0,1.0,0.0,52.0,53.0,0.577778,0.481818,0.905714,1.0
ZTF17aaagrhs,0.466147,0.295652,1.0,1.0,1.0,1.0,0.331461,0.481781,0.0,0.0,...,239.0,288.0,0.0,0.0,151.0,61.0,0.424157,0.246964,0.778571,0.999
ZTF17aaagvzo,0.450122,0.347737,1.0,1.0,1.0,1.0,0.480769,0.326087,0.0,0.0,...,21.0,57.0,0.0,1.0,18.0,35.0,0.346154,0.76087,0.915,0.994583
ZTF17aaagwfr,0.481463,0.448709,1.0,1.0,1.0,1.0,0.4,0.368421,0.0,0.0,...,26.0,46.0,2.0,3.0,11.0,20.0,0.44,0.526316,0.82,1.0
ZTF17aaahtas,0.192741,0.105586,1.0,0.981686,1.0,1.0,0.428571,0.357143,0.0,0.0,...,25.0,38.0,9.0,1.0,15.0,0.0,0.535714,0.0,0.882857,0.979917
ZTF17aaaiogt,0.242838,0.16634,1.0,0.999619,1.0,1.0,0.3125,0.272727,0.0,0.0,...,14.0,17.0,0.0,0.0,11.0,7.0,0.6875,0.636364,0.788571,0.994375


In [8]:
# DataFrame.info() prints the summary itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
test_features_df.info()
test_features_df.head(10) # first rows, rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
Index: 779846 entries, ZTF17aaaafan to ZTF20abpvghz
Columns: 172 entries, Amplitude_1 to sgscore1
dtypes: float32(2), float64(170)
memory usage: 1023.4+ MB
None


Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_after_fid_1,n_non_det_after_fid_2,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,rb,sgscore1
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aaaafan,0.094794,0.26939,0.500701,0.997029,2.0,1.0,0.285714,0.571429,0.0,0.0,...,179.0,151.0,6.0,11.0,0.0,3.0,0.0,0.428571,0.644286,0.984375
ZTF17aaaafbs,0.399396,0.368885,1.0,1.0,1.0,1.0,0.294118,0.308824,0.0,0.0,...,134.0,117.0,7.0,1.0,36.0,38.0,0.705882,0.558824,0.764286,0.986917
ZTF17aaabdlz,0.151955,0.143481,1.0,1.0,1.0,1.0,0.214286,0.409091,0.0,0.0,...,86.0,88.0,1.0,0.0,11.0,13.0,0.785714,0.590909,0.720714,0.977125
ZTF17aaabelc,0.247272,0.114171,1.0,0.481141,1.0,2.0,0.272727,0.4,0.0,0.0,...,76.0,103.0,3.0,14.0,16.0,0.0,0.727273,0.0,0.85119,1.0
ZTF17aaabmro,0.530131,0.33439,1.0,1.0,1.0,1.0,0.352941,0.366667,0.0,0.0,...,29.0,32.0,5.0,0.0,4.0,19.0,0.235294,0.633333,0.931429,0.99875
ZTF17aaacvqh,0.208042,0.192441,1.0,1.0,1.0,1.0,0.52381,0.370968,0.0,0.0,...,88.0,60.0,0.0,2.0,7.0,33.0,0.166667,0.532258,0.792143,0.966208
ZTF17aaadina,1.004704,1.046327,0.818453,0.894374,1.0,1.0,0.230769,0.24,0.0,0.0,...,14.0,27.0,0.0,0.0,14.0,14.0,0.538462,0.56,0.793333,0.983125
ZTF17aaadkeg,0.203057,0.424995,0.325656,1.0,1.0,2.0,0.3,0.415094,0.0,0.039216,...,257.0,208.0,3.0,2.0,0.0,8.0,0.0,0.150943,0.787143,0.924417
ZTF17aaadotj,0.282452,0.239367,1.0,1.0,1.0,1.0,0.387097,0.407407,0.0,0.0,...,34.0,40.0,1.0,1.0,11.0,15.0,0.354839,0.555556,0.811429,0.987262
ZTF17aaadskg,0.490247,0.393772,1.0,1.0,1.0,1.0,0.383562,0.333333,0.0,0.0,...,35.0,65.0,0.0,1.0,19.0,37.0,0.260274,0.822222,0.917857,1.0


In [9]:
import os
import pandas as pd

### save files
save_root_dir = f'../../data/{survey_name}'
# to_parquet does not create missing folders, so make sure the target exists;
# exist_ok keeps the cell re-runnable
os.makedirs(save_root_dir, exist_ok=True)
labels_df.to_parquet(f'{save_root_dir}/labels.parquet')
train_features_df.to_parquet(f'{save_root_dir}/features_train.parquet')
test_features_df.to_parquet(f'{save_root_dir}/features_test.parquet')