# ALeRCE classes

https://github.com/ZwickyTransientFacility/ztf-avro-alert

1. **AGN:** Active Galactic Nuclei
1. **Blazar:** Blazar
1. **CV/Nova:** Cataclysmic Variable Star/Nova
1. **Ceph:** Cepheid Variable Star
1. **DSCT:** Delta Scuti Star
1. **EA:** Eclipsing Algol
1. **EB/EW:** Eclipsing Binaries (Beta Lyrae / W Ursae Majoris types)
1. **LPV:** Long Period Variable
1. **Periodic-Other:** Periodic-Other
1. **QSO:** Quasi-Stellar Object
1. **RRL:** RR Lyrae Variable Star
1. **RSCVn:** RS Canum Venaticorum
1. **SLSN:** Superluminous Supernova
1. **SNII:** Type II Supernova
1. **SNIIb:** Type IIb Supernova
1. **SNIIn:** Type IIn Supernova
1. **SNIa:** Type Ia Supernova
1. **SNIbc:** Type Ib/c Supernova
1. **TDE:** Tidal disruption event (to remove)
1. **YSO:** Young Stellar Object
1. **ZZ:** ZZ Ceti Stars (to remove)

In [1]:
import os

import numpy as np
import pandas as pd

def subset_df_columns(df, subset_cols):
    """Restrict `df` to the requested columns that it actually contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from.
    subset_cols : iterable of column labels, or None
        Columns to keep. ``None`` disables the selection.

    Returns
    -------
    pandas.DataFrame
        `df` itself when `subset_cols` is None. Otherwise a frame holding the
        requested columns in the order given by `subset_cols`; labels that
        are not columns of `df` are skipped without raising.
    """
    if subset_cols is None:
        return df
    available = list(df.columns)
    kept = []
    for label in subset_cols:
        if label in available:
            kept.append(label)
    return df[kept]

def set_index(df, index_name):
    """Return `df` indexed by the column `index_name`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to (re)index.
    index_name : column label
        Column to promote to the index.

    Returns
    -------
    pandas.DataFrame
        `df` itself when its index is already named `index_name`; otherwise
        the result of ``df.set_index([index_name])``.

    Raises
    ------
    AssertionError
        If `index_name` is neither the current index name nor a column.
    """
    # Already indexed as requested: nothing to do, hand back the same object.
    if df.index.name is not None and df.index.name == index_name:
        return df
    df_cols = list(df.columns)
    # The message names the missing column so the failure is diagnosable.
    assert index_name in df_cols, (
        f'cannot set index: column {index_name!r} not found in {df_cols}'
    )
    return df.set_index([index_name])

def df_to_float32(df):
    """Downcast every float64 column of `df` to float32, in place.

    The frame passed in is modified; nothing is returned. Columns of any
    other dtype are left as they are.
    """
    float64_columns = [name for name in df.columns if df[name].dtype == 'float64']
    for name in float64_columns:
        df[name] = df[name].astype(np.float32)

In [2]:
import numpy as np
import pandas as pd

# Location of the raw survey files and name of the survey release to read.
load_root_dir = '../../TESIS/surveys_data'
survey_name = 'alerceZTF_v7.1'

# Logical field name -> column name used in the input tables.
df_index_names = dict(
    oid='oid',  # object id
    oid_det='objectId',  # object id (name used for the detections table)
    label='classALeRCE',  # object class name
    ra='ra',
    dec='dec',
    band='fid',  # band
    obs_day='mjd',  # days
    obs='magpsf_corr',  # observations
    obs_error='sigmapsf_corr',  # observation errors
)

# Columns to keep from each input table.
subset_columns_names = dict(
    labels=['oid', 'classALeRCE', 'ra', 'dec'],
    detections=['objectId', 'fid', 'mjd', 'magpsf_corr', 'sigmapsf_corr'],
)

### load the three input tables and index each of them by object id
oid_index_name = df_index_names['oid']

# labels (CSV): the object-id column becomes the index
labels_path = f'{load_root_dir}/{survey_name}/dfcrossmatches_prioritized_v7.0.1.csv'
labels_df = pd.read_csv(labels_path)
print(f'labels - columns: {list(labels_df.columns)} - id: {labels_df.index.name}')
#labels_df = subset_df_columns(labels_df, subset_columns_names['labels']) # sub sample columns
labels_df = set_index(labels_df, oid_index_name)

# detections (parquet): keep only the light-curve columns, then index by
# object id and give that index the same name the labels table uses
detections_path = f'{load_root_dir}/{survey_name}/detections_with_xmatch'
detections_df = pd.read_parquet(detections_path)
print(f'detections_df - columns: {list(detections_df.columns)} - id: {detections_df.index.name}')
detections_df = set_index(
    subset_df_columns(detections_df, subset_columns_names['detections']),
    df_index_names['oid_det'],
)
detections_df.index.name = oid_index_name
df_to_float32(detections_df)

# features (parquet): the object ids are stored in a column called 'index'
features_path = f'{load_root_dir}/{survey_name}/storage/ztf_workspace/historic_data_20200916'
features_df = pd.read_parquet(features_path)
print(f'features_df - columns: {list(features_df.columns)} - id: {features_df.index.name}')
features_df = set_index(features_df, 'index')
features_df.index.name = oid_index_name
df_to_float32(features_df)

### print info
label_values = labels_df[df_index_names['label']].values
classes = np.unique(label_values)
print('classes:', classes)

labels - columns: ['oid', 'classALeRCE', 'ra', 'dec', 'period', 'source', 'id_source', 'class_source', 'separation_arcsec'] - id: None
detections_df - columns: ['fid', 'isdiffpos', 'sigmapsf', 'field', 'fwhm', 'dec', 'magpsf', 'rcid', 'ra', 'sky', 'rb', 'ssmagnr', 'distpsnr3', 'sgscore2', 'maggaiabright', 'distpsnr2', 'distpsnr1', 'maggaia', 'exptime', 'drb', 'sgscore3', 'neargaia', 'sgscore1', 'mjd', 'corrected', 'magpsf_corr', 'sigmapsf_corr', 'sigmapsf_corr_ext', 'dubious', 'has_stamp'] - id: objectId
features_df - columns: ['index', 'Amplitude_1', 'Amplitude_2', 'AndersonDarling_1', 'AndersonDarling_2', 'Autocor_length_1', 'Autocor_length_2', 'Beyond1Std_1', 'Beyond1Std_2', 'Con_1', 'Con_2', 'Eta_e_1', 'Eta_e_2', 'ExcessVar_1', 'ExcessVar_2', 'GP_DRW_sigma_1', 'GP_DRW_sigma_2', 'GP_DRW_tau_1', 'GP_DRW_tau_2', 'Gskew_1', 'Gskew_2', 'Harmonics_mag_1_1', 'Harmonics_mag_1_2', 'Harmonics_mag_2_1', 'Harmonics_mag_2_2', 'Harmonics_mag_3_1', 'Harmonics_mag_3_2', 'Harmonics_mag_4_1', 'Harmo

In [3]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() emitted a spurious "labels_df: None" line.
print('labels_df:')
labels_df.info()
print(f'id: {labels_df.index.name}')
labels_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 173879 entries, ZTF19abegncu to ZTF18abgqxlw
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   classALeRCE        173879 non-null  object 
 1   ra                 173879 non-null  float64
 2   dec                173879 non-null  float64
 3   period             92683 non-null   object 
 4   source             173879 non-null  object 
 5   id_source          173879 non-null  object 
 6   class_source       173879 non-null  object 
 7   separation_arcsec  173879 non-null  float64
dtypes: float64(3), object(5)
memory usage: 11.9+ MB
labels_df: None
id: oid


Unnamed: 0_level_0,classALeRCE,ra,dec,period,source,id_source,class_source,separation_arcsec
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ZTF19abegncu,AGN,357.296363,-8.941186,,Oh2015,5.8772718059579e+17,AGN_galaxy_dominated,0.243081
ZTF18acejdhu,AGN,11.958444,-10.496615,,Oh2015,5.877272256953059e+17,AGN_galaxy_dominated,0.301126
ZTF18acdzene,AGN,17.47862,-10.11025,,Oh2015,5.877271789938934e+17,AGN_galaxy_dominated,0.158643
ZTF18abwzuzw,AGN,25.038255,-10.35243,,Oh2015,5.877272294485526e+17,AGN_galaxy_dominated,0.485322
ZTF19abmposz,AGN,26.670031,-8.354787,,Oh2015,5.877271806085038e+17,AGN_galaxy_dominated,0.21399
ZTF19abeytti,AGN,321.623931,10.576962,,Oh2015,5.877272213970623e+17,AGN_galaxy_dominated,0.340093
ZTF18abwvoze,AGN,323.925552,0.55196,,Oh2015,5.87730847965905e+17,AGN_galaxy_dominated,0.149686
ZTF18acatykj,AGN,326.479279,12.176132,,Oh2015,5.877272224729663e+17,AGN_galaxy_dominated,0.300553
ZTF19aapcxhy,AGN,154.202129,18.723076,,Oh2015,5.877420127343739e+17,AGN_galaxy_dominated,0.227455
ZTF19aawscoj,AGN,342.102634,0.155742,,Oh2015,5.877343048816394e+17,AGN_galaxy_dominated,0.333872


In [4]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() emitted a spurious "detections_df: None" line.
print('detections_df:')
detections_df.info()
print(f'id: {detections_df.index.name}')
detections_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 26688137 entries, ZTF17aaaemke to ZTF20abfpkfh
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   fid            int64  
 1   mjd            float32
 2   magpsf_corr    float32
 3   sigmapsf_corr  float32
dtypes: float32(3), int64(1)
memory usage: 712.7+ MB
detections_df: None
id: oid


Unnamed: 0_level_0,fid,mjd,magpsf_corr,sigmapsf_corr
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZTF17aaaemke,1,58700.460938,16.478577,0.008541
ZTF17aaaemke,1,58754.289062,16.520813,100.0
ZTF17aaaemke,1,58763.378906,16.586325,0.02466
ZTF17aaaemke,1,58718.414062,16.350399,100.0
ZTF17aaaemke,1,58372.402344,16.530643,0.014738
ZTF17aaaemke,1,58679.445312,16.080139,100.0
ZTF17aaaemke,1,58510.128906,16.741459,0.026502
ZTF17aaaemke,1,58736.460938,16.580482,0.025537
ZTF17aaaemke,1,58789.207031,16.571587,100.0
ZTF17aaaemke,1,58482.128906,16.707977,0.000809


In [5]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() emitted a spurious "features_df: None" line.
print('features_df:')
features_df.info()
print(f'id: {features_df.index.name}')
features_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 898315 entries, ZTF17aaaafan to ZTF20abpvghz
Columns: 172 entries, Amplitude_1 to sgscore1
dtypes: float32(172)
memory usage: 596.3+ MB
features_df: None
id: oid


Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_after_fid_1,n_non_det_after_fid_2,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,rb,sgscore1
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aaaafan,0.094794,0.26939,0.500701,0.997029,2.0,1.0,0.285714,0.571429,0.0,0.0,...,179.0,151.0,6.0,11.0,0.0,3.0,0.0,0.428571,0.644286,0.984375
ZTF17aaaafbs,0.399396,0.368885,1.0,1.0,1.0,1.0,0.294118,0.308824,0.0,0.0,...,134.0,117.0,7.0,1.0,36.0,38.0,0.705882,0.558824,0.764286,0.986917
ZTF17aaabdlz,0.151955,0.143481,1.0,1.0,1.0,1.0,0.214286,0.409091,0.0,0.0,...,86.0,88.0,1.0,0.0,11.0,13.0,0.785714,0.590909,0.720714,0.977125
ZTF17aaabelc,0.247272,0.114171,1.0,0.481141,1.0,2.0,0.272727,0.4,0.0,0.0,...,76.0,103.0,3.0,14.0,16.0,0.0,0.727273,0.0,0.85119,1.0
ZTF17aaabgdt,0.164834,,0.762395,,1.0,,0.285714,,0.0,,...,10.0,,0.0,,3.0,,0.428571,,0.874286,0.942
ZTF17aaabmro,0.530131,0.33439,1.0,1.0,1.0,1.0,0.352941,0.366667,0.0,0.0,...,29.0,32.0,5.0,0.0,4.0,19.0,0.235294,0.633333,0.931429,0.99875
ZTF17aaacvqh,0.208042,0.192441,1.0,1.0,1.0,1.0,0.52381,0.370968,0.0,0.0,...,88.0,60.0,0.0,2.0,7.0,33.0,0.166667,0.532258,0.792143,0.966208
ZTF17aaadfsa,0.196974,0.185956,1.0,1.0,2.0,1.0,0.405405,0.470588,0.0,0.0,...,33.0,63.0,0.0,0.0,15.0,17.0,0.405405,0.5,0.822857,0.990833
ZTF17aaadina,1.004704,1.046327,0.818453,0.894374,1.0,1.0,0.230769,0.24,0.0,0.0,...,14.0,27.0,0.0,0.0,14.0,14.0,0.538462,0.56,0.793333,0.983125
ZTF17aaadkeg,0.203057,0.424995,0.325656,1.0,1.0,2.0,0.3,0.415094,0.0,0.039216,...,257.0,208.0,3.0,2.0,0.0,8.0,0.0,0.150943,0.787143,0.924417


In [6]:
def isin_filter_df(df, index, values, inverse=False):
    """Keep (or drop) the rows of `df` whose `index` value is in `values`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    index : label
        Name of the index level or column to test. The returned frame is
        indexed by it.
    values : list-like
        Values to match against.
    inverse : bool, default False
        When True, keep the rows whose value is NOT in `values`.

    Returns
    -------
    pandas.DataFrame
        The selected rows, indexed by `index`.
    """
    # Fast path: `index` is already the frame's index, so the mask can be
    # built from it directly. This avoids the reset_index()/set_index()
    # round trip, which copies the whole frame twice.
    if df.index.name is not None and df.index.name == index and index not in df.columns:
        mask = df.index.isin(values)
        return df[~mask if inverse else mask]

    # General path: `index` is a regular column. The previous index is kept
    # as a column (via reset_index) and `index` becomes the new index.
    flat = df.reset_index()
    mask = flat[index].isin(values)
    return flat[~mask if inverse else mask].set_index(index)

# Object ids that have a label define the "train" side; everything else is "test".
oid_key = df_index_names['oid']
train_indexs = list(labels_df.index.values)

# inverse=False keeps the labelled objects, inverse=True keeps the rest
train_detections_df, test_detections_df = (
    isin_filter_df(detections_df, oid_key, train_indexs, inverse=keep_unlabelled)
    for keep_unlabelled in (False, True)
)
train_features_df, test_features_df = (
    isin_filter_df(features_df, oid_key, train_indexs, inverse=keep_unlabelled)
    for keep_unlabelled in (False, True)
)

# prints

In [7]:
# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the stray "None" line that print(...) appended.
train_detections_df.info()
train_detections_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 26688137 entries, ZTF17aaaemke to ZTF20abfpkfh
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   fid            int64  
 1   mjd            float32
 2   magpsf_corr    float32
 3   sigmapsf_corr  float32
dtypes: float32(3), int64(1)
memory usage: 712.7+ MB
None


Unnamed: 0_level_0,fid,mjd,magpsf_corr,sigmapsf_corr
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZTF17aaaemke,1,58700.460938,16.478577,0.008541
ZTF17aaaemke,1,58754.289062,16.520813,100.0
ZTF17aaaemke,1,58763.378906,16.586325,0.02466
ZTF17aaaemke,1,58718.414062,16.350399,100.0
ZTF17aaaemke,1,58372.402344,16.530643,0.014738
ZTF17aaaemke,1,58679.445312,16.080139,100.0
ZTF17aaaemke,1,58510.128906,16.741459,0.026502
ZTF17aaaemke,1,58736.460938,16.580482,0.025537
ZTF17aaaemke,1,58789.207031,16.571587,100.0
ZTF17aaaemke,1,58482.128906,16.707977,0.000809


In [8]:
# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the stray "None" line that print(...) appended.
train_features_df.info()
train_features_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 118469 entries, ZTF17aaabgdt to ZTF20abfpkfh
Columns: 172 entries, Amplitude_1 to sgscore1
dtypes: float32(172)
memory usage: 78.6+ MB
None


Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_after_fid_1,n_non_det_after_fid_2,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,rb,sgscore1
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aaabgdt,0.164834,,0.762395,,1.0,,0.285714,,0.0,,...,10.0,,0.0,,3.0,,0.428571,,0.874286,0.942
ZTF17aaadfsa,0.196974,0.185956,1.0,1.0,2.0,1.0,0.405405,0.470588,0.0,0.0,...,33.0,63.0,0.0,0.0,15.0,17.0,0.405405,0.5,0.822857,0.990833
ZTF17aaadxdb,0.253045,0.34963,1.0,1.0,1.0,1.0,0.5,0.35,0.0,0.0,...,72.0,106.0,0.0,0.0,27.0,17.0,0.385714,0.425,0.731429,0.973417
ZTF17aaadzlq,2.482923,1.983951,1.0,1.0,13.0,14.0,0.531646,0.469925,0.038298,0.049242,...,35.0,30.0,0.0,0.0,35.0,25.0,0.147679,0.093985,0.828571,0.939375
ZTF17aaaenfy,0.487668,0.455345,1.0,1.0,1.0,1.0,0.344444,0.318182,0.0,0.0,...,99.0,90.0,1.0,0.0,52.0,53.0,0.577778,0.481818,0.905714,1.0
ZTF17aaagrhs,0.466147,0.295652,1.0,1.0,1.0,1.0,0.331461,0.481781,0.0,0.0,...,239.0,288.0,0.0,0.0,151.0,61.0,0.424157,0.246964,0.778571,0.999
ZTF17aaagvzo,0.450122,0.347737,1.0,1.0,1.0,1.0,0.480769,0.326087,0.0,0.0,...,21.0,57.0,0.0,1.0,18.0,35.0,0.346154,0.76087,0.915,0.994583
ZTF17aaagwfr,0.481463,0.448709,1.0,1.0,1.0,1.0,0.4,0.368421,0.0,0.0,...,26.0,46.0,2.0,3.0,11.0,20.0,0.44,0.526316,0.82,1.0
ZTF17aaahtas,0.192741,0.105586,1.0,0.981686,1.0,1.0,0.428571,0.357143,0.0,0.0,...,25.0,38.0,9.0,1.0,15.0,0.0,0.535714,0.0,0.882857,0.979917
ZTF17aaaiogt,0.242838,0.16634,1.0,0.999619,1.0,1.0,0.3125,0.272727,0.0,0.0,...,14.0,17.0,0.0,0.0,11.0,7.0,0.6875,0.636364,0.788571,0.994375


In [9]:
# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the stray "None" line that print(...) appended.
test_detections_df.info()
test_detections_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fid            0 non-null      int64  
 1   mjd            0 non-null      float32
 2   magpsf_corr    0 non-null      float32
 3   sigmapsf_corr  0 non-null      float32
dtypes: float32(3), int64(1)
memory usage: 0.0+ bytes
None


Unnamed: 0_level_0,fid,mjd,magpsf_corr,sigmapsf_corr
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [10]:
# DataFrame.info() prints its own report and returns None; calling it
# directly avoids the stray "None" line that print(...) appended.
test_features_df.info()
test_features_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 779846 entries, ZTF17aaaafan to ZTF20abpvghz
Columns: 172 entries, Amplitude_1 to sgscore1
dtypes: float32(172)
memory usage: 517.6+ MB
None


Unnamed: 0_level_0,Amplitude_1,Amplitude_2,AndersonDarling_1,AndersonDarling_2,Autocor_length_1,Autocor_length_2,Beyond1Std_1,Beyond1Std_2,Con_1,Con_2,...,n_non_det_after_fid_1,n_non_det_after_fid_2,n_non_det_before_fid_1,n_non_det_before_fid_2,n_pos_1,n_pos_2,positive_fraction_1,positive_fraction_2,rb,sgscore1
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZTF17aaaafan,0.094794,0.26939,0.500701,0.997029,2.0,1.0,0.285714,0.571429,0.0,0.0,...,179.0,151.0,6.0,11.0,0.0,3.0,0.0,0.428571,0.644286,0.984375
ZTF17aaaafbs,0.399396,0.368885,1.0,1.0,1.0,1.0,0.294118,0.308824,0.0,0.0,...,134.0,117.0,7.0,1.0,36.0,38.0,0.705882,0.558824,0.764286,0.986917
ZTF17aaabdlz,0.151955,0.143481,1.0,1.0,1.0,1.0,0.214286,0.409091,0.0,0.0,...,86.0,88.0,1.0,0.0,11.0,13.0,0.785714,0.590909,0.720714,0.977125
ZTF17aaabelc,0.247272,0.114171,1.0,0.481141,1.0,2.0,0.272727,0.4,0.0,0.0,...,76.0,103.0,3.0,14.0,16.0,0.0,0.727273,0.0,0.85119,1.0
ZTF17aaabmro,0.530131,0.33439,1.0,1.0,1.0,1.0,0.352941,0.366667,0.0,0.0,...,29.0,32.0,5.0,0.0,4.0,19.0,0.235294,0.633333,0.931429,0.99875
ZTF17aaacvqh,0.208042,0.192441,1.0,1.0,1.0,1.0,0.52381,0.370968,0.0,0.0,...,88.0,60.0,0.0,2.0,7.0,33.0,0.166667,0.532258,0.792143,0.966208
ZTF17aaadina,1.004704,1.046327,0.818453,0.894374,1.0,1.0,0.230769,0.24,0.0,0.0,...,14.0,27.0,0.0,0.0,14.0,14.0,0.538462,0.56,0.793333,0.983125
ZTF17aaadkeg,0.203057,0.424995,0.325656,1.0,1.0,2.0,0.3,0.415094,0.0,0.039216,...,257.0,208.0,3.0,2.0,0.0,8.0,0.0,0.150943,0.787143,0.924417
ZTF17aaadotj,0.282452,0.239367,1.0,1.0,1.0,1.0,0.387097,0.407407,0.0,0.0,...,34.0,40.0,1.0,1.0,11.0,15.0,0.354839,0.555556,0.811429,0.987262
ZTF17aaadskg,0.490247,0.393771,1.0,1.0,1.0,1.0,0.383562,0.333333,0.0,0.0,...,35.0,65.0,0.0,1.0,19.0,37.0,0.260274,0.822222,0.917857,1.0


In [11]:
import pandas as pd

### save files
save_root_dir = f'data/{survey_name}'
# to_csv / to_parquet do not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(save_root_dir, exist_ok=True)

labels_df.to_csv(f'{save_root_dir}/labels.csv')
# The detections tables are currently not exported; only labels and features are written.
#train_detections_df.to_parquet(f'{save_root_dir}/detections_train.parquet')
train_features_df.to_parquet(f'{save_root_dir}/features_train.parquet')
#test_detections_df.to_parquet(f'{save_root_dir}/detections_test.parquet')
test_features_df.to_parquet(f'{save_root_dir}/features_test.parquet')