# ALeRCE classes

https://github.com/ZwickyTransientFacility/ztf-avro-alert

1. **AGN:** Active Galactic Nuclei
1. **Blazar:** Blazar
1. **CV/Nova:** Cataclysmic Variable Star/Nova
1. **Ceph:** Cepheid Variable Star
1. **DSCT:** Delta Scuti Star
1. **EA:** Eclipsing Algol
1. **EB/EW:** Eclipsing Binaries/Eclipsing W Ursa Majoris
1. **LPV:** Long Period Variable
1. **Periodic-Other:** Periodic-Other
1. **QSO:** Quasi-Stellar Object
1. **RRL:** RRLyrae Variable Star
1. **RSCVn:** RS Canum Venaticorum
1. **SLSN:** Super Luminous Supernova
1. **SNII:** Supernova II
1. **SNIIb:** Supernova IIb
1. **SNIIn:** Supernova IIn
1. **SNIa:** Supernova Ia
1. **SNIbc:** Supernova Ibc
1. **TDE:** Tidal disruption event (to remove)
1. **YSO:** Young Stellar Object
1. **ZZ:** ZZ Ceti Stars (to remove)

In [1]:
import numpy as np
import pandas as pd

def subset_df_columns(df, subset_cols):
    df_cols = list(df.columns)
    return df[[c for c in subset_cols if c in df_cols]]

def set_index(df, index_name):
    if not df.index.name is None and df.index.name==index_name:
        return df
    df_cols = list(df.columns)
    assert index_name in df_cols
    return df.set_index([index_name])

def df_to_float32(df):
    for c in df.columns:
        if df[c].dtype=='float64':
            df[c] = df[c].astype(np.float32)

In [2]:
import numpy as np
import pandas as pd

load_root_dir = '../../TESIS/surveys_data'
survey_name = 'alerceZTFv5.1'
df_index_names = {
    'oid':'oid', # object id
    'oid_det':'oid', # object id
    'label':'classALeRCE', # object class name
    'ra':'ra',
    'dec':'dec',
    'band':'fid', # band
    'obs_day':'mjd', # days
    'obs':'magpsf_corr', # observations
    'obs_error':'sigmapsf_corr', # observation errors
}
subset_columns_names = {
    'labels':['oid', 'classALeRCE', 'ra', 'dec'],
    'detections':['oid', 'fid', 'mjd', 'magpsf_corr', 'sigmapsf_corr'],
}

### load files and processing
labels_df = pd.read_csv(f'{load_root_dir}/{survey_name}/dfcrossmatches_prioritized_v5.1.csv')
print(f'labels - columns: {list(labels_df.columns)} - id: {labels_df.index.name}')
labels_df = subset_df_columns(labels_df, subset_columns_names['labels']) # sub sample columns
labels_df = set_index(labels_df, df_index_names['oid']) # set index

detections_df = pd.read_csv(f'{load_root_dir}/{survey_name}/detections.csv')
print(f'detections_df - columns: {list(detections_df.columns)} - id: {detections_df.index.name}')
detections_df = subset_df_columns(detections_df, subset_columns_names['detections']) # sub sample columns
detections_df = set_index(detections_df, df_index_names['oid_det']) # set index
detections_df.index.rename(df_index_names['oid'], inplace=True)
df_to_float32(detections_df)

### print info
classes = np.unique(labels_df[df_index_names['label']].values)
print('classes:', classes)

labels - columns: ['oid', 'classALeRCE', 'ra', 'dec', 'period', 'source', 'id_source', 'class_source', 'separation_arcsec'] - id: None


  interactivity=interactivity, compiler=compiler, result=result)


detections_df - columns: ['oid', 'candid', 'mjd', 'fid', 'diffmaglim', 'magpsf', 'magap', 'sigmapsf', 'sigmagap', 'ra', 'dec', 'sigmara', 'sigmadec', 'isdiffpos', 'distpsnr1', 'sgscore1', 'field', 'rcid', 'magnr', 'sigmagnr', 'rb', 'magpsf_corr', 'magap_corr', 'sigmapsf_corr', 'sigmagap_corr', 'has_stamps', 'parent_candid'] - id: None
classes: ['AGN' 'Blazar' 'CV/Nova' 'Ceph' 'DSCT' 'EA' 'EB/EW' 'LPV'
 'Periodic-Other' 'QSO' 'RRL' 'RSCVn' 'SLSN' 'SNII' 'SNIIb' 'SNIIn' 'SNIa'
 'SNIbc' 'TDE' 'YSO' 'ZZ']


In [3]:
print('labels_df:', labels_df.info())
print(f'id: {labels_df.index.name}')
labels_df[:10]

<class 'pandas.core.frame.DataFrame'>
Index: 170976 entries, ZTF19aavdfaf to ZTF18aaarlcy
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   classALeRCE  170976 non-null  object 
 1   ra           170976 non-null  float64
 2   dec          170976 non-null  float64
dtypes: float64(2), object(1)
memory usage: 5.2+ MB
labels_df: None
id: oid


Unnamed: 0_level_0,classALeRCE,ra,dec
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ZTF19aavdfaf,CV/Nova,323.746659,40.671826
ZTF19aaniokz,CV/Nova,306.822433,43.689485
ZTF18accatjz,CV/Nova,331.153495,53.506598
ZTF19aarfrrf,CV/Nova,271.876088,45.858938
ZTF19aarfrre,CV/Nova,271.876073,45.859003
ZTF19aapbkgn,CV/Nova,286.886476,52.974638
ZTF18abcoxgp,CV/Nova,329.384641,8.92086
ZTF18aclmvom,CV/Nova,45.945404,64.909859
ZTF18aaavxnm,CV/Nova,225.67054,33.573209
ZTF18acgtkde,CV/Nova,89.349715,72.698083


In [4]:
print('detections_df:', detections_df.info())
print(f'id: {detections_df.index.name}')
detections_df[:10]

<class 'pandas.core.frame.DataFrame'>
Index: 33950426 entries, ZTF18abnusmf to ZTF18abnusmf
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   fid            int64  
 1   mjd            float32
 2   magpsf_corr    float32
 3   sigmapsf_corr  float32
dtypes: float32(3), int64(1)
memory usage: 906.6+ MB
detections_df: None
id: oid


Unnamed: 0_level_0,fid,mjd,magpsf_corr,sigmapsf_corr
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZTF18abnusmf,1,58493.175781,16.425383,0.035894
ZTF18abnusmf,1,58490.113281,16.462883,0.028562
ZTF18abnusmf,1,58426.316406,16.422421,0.03471
ZTF18abcoetg,1,58645.402344,15.652393,0.023788
ZTF18abcoetg,1,58643.421875,15.625578,0.020292
ZTF18abcoetg,2,58642.476562,14.838137,0.027933
ZTF18abcoetg,1,58642.382812,15.422068,0.018376
ZTF18abcoetg,2,58638.292969,14.868429,0.025667
ZTF18abcoetg,2,58634.347656,14.854899,0.026655
ZTF18abcoetg,1,58632.4375,15.402876,0.01662


In [5]:
def get_valid_classes_ids(df, target_classes, df_index_names):
    new_df = df.reset_index()
    valid_oids = new_df.loc[new_df[df_index_names['label']].isin(target_classes)][df_index_names['oid']]
    return list(valid_oids.values)
    
target_classes = [
    #'EB/EW', 'EA', # Eclipsing Binaries
    'EB/EW', # Eclipsing Binaries
    'Ceph', # Cefeidas
    'RRL', # RR Lyrae
    'DSCT', # Delta Scuti
    'LPV', # Long Period Variables
]
valid_oids = get_valid_classes_ids(labels_df, target_classes, df_index_names)
print(valid_oids)

['ZTF18actbgzh', 'ZTF18acrttyk', 'ZTF18abclwyc', 'ZTF18abtpabq', 'ZTF17aaahryv', 'ZTF18abxdqcy', 'ZTF18abuknoz', 'ZTF18abifsvz', 'ZTF17aaarspi', 'ZTF18abfiwlb', 'ZTF18abiguhd', 'ZTF18adidkhj', 'ZTF19acshofd', 'ZTF19acdxcud', 'ZTF18aceihjp', 'ZTF18absgnis', 'ZTF18abasdby', 'ZTF17aaawwmu', 'ZTF18abtxznj', 'ZTF18abcphdo', 'ZTF18absgstj', 'ZTF18abskobh', 'ZTF18abascjw', 'ZTF18abcoxet', 'ZTF18abatafa', 'ZTF18abbfbhd', 'ZTF17aabumkz', 'ZTF18abascir', 'ZTF18abajxhg', 'ZTF18abcoxfl', 'ZTF18abdeyof', 'ZTF18abadjra', 'ZTF17aaadoxz', 'ZTF18abastui', 'ZTF18abcjrlo', 'ZTF17aabvhsb', 'ZTF18abadjtk', 'ZTF18aazuywy', 'ZTF18abmwrae', 'ZTF17aabuxmy', 'ZTF17aaafhpz', 'ZTF17aabuxrb', 'ZTF18abtpdln', 'ZTF18abvmijd', 'ZTF18abwbcpe', 'ZTF18abuyktn', 'ZTF17aacxkye', 'ZTF18acgehyy', 'ZTF19acjkrvh', 'ZTF17aabzwwn', 'ZTF18aaaygjr', 'ZTF18acowxdl', 'ZTF17aadlopo', 'ZTF19ackeyrc', 'ZTF18acxwqme', 'ZTF18abxtnhx', 'ZTF18acectrt', 'ZTF17aadmiyr', 'ZTF18aczehgf', 'ZTF19aaaokno', 'ZTF18aalkuax', 'ZTF18adbczks', 'ZTF19a

In [6]:
new_labels_df = labels_df[labels_df.index.isin(valid_oids)]
print(new_labels_df.info())
new_labels_df[:50]

<class 'pandas.core.frame.DataFrame'>
Index: 123957 entries, ZTF18actbgzh to ZTF18aaarlcy
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   classALeRCE  123957 non-null  object 
 1   ra           123957 non-null  float64
 2   dec          123957 non-null  float64
dtypes: float64(2), object(1)
memory usage: 3.8+ MB
None


Unnamed: 0_level_0,classALeRCE,ra,dec
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ZTF18actbgzh,RRL,84.253885,-6.553095
ZTF18acrttyk,LPV,67.336799,27.702003
ZTF18abclwyc,LPV,252.839219,-16.049543
ZTF18abtpabq,LPV,312.420821,72.264397
ZTF17aaahryv,EB/EW,86.549444,0.540504
ZTF18abxdqcy,LPV,315.529398,73.430312
ZTF18abuknoz,LPV,323.927968,68.652012
ZTF18abifsvz,LPV,277.018908,0.830122
ZTF17aaarspi,EB/EW,86.549471,0.540504
ZTF18abfiwlb,LPV,277.154754,1.828487


In [7]:
new_detections_df = detections_df[detections_df.index.isin(valid_oids)]
print(new_detections_df.info())
new_detections_df[:50]

<class 'pandas.core.frame.DataFrame'>
Index: 7023095 entries, ZTF18aanveug to ZTF18abfzzue
Data columns (total 4 columns):
 #   Column         Dtype  
---  ------         -----  
 0   fid            int64  
 1   mjd            float32
 2   magpsf_corr    float32
 3   sigmapsf_corr  float32
dtypes: float32(3), int64(1)
memory usage: 187.5+ MB
None


Unnamed: 0_level_0,fid,mjd,magpsf_corr,sigmapsf_corr
oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ZTF18aanveug,1,58647.386719,13.094085,0.011792
ZTF18aanveug,1,58641.429688,13.502243,0.010722
ZTF18aanveug,2,58632.320312,13.003044,0.012715
ZTF18aanveug,2,58627.40625,13.339897,0.020102
ZTF18aanveug,1,58616.472656,13.500403,0.011066
ZTF18aanveug,1,58577.496094,13.671066,0.023224
ZTF18aanveug,2,58612.445312,13.078698,0.014321
ZTF18aanveug,2,58580.488281,13.022718,0.012975
ZTF18aanveug,2,58512.539062,13.416394,0.018671
ZTF18aanveug,1,58573.496094,13.07948,0.011031


In [8]:
import pandas as pd

### save files
save_root_dir = f'data/{survey_name}'
new_labels_df.to_csv(f'{save_root_dir}/labels_vs.csv')
new_detections_df.to_parquet(f'{save_root_dir}/detections_vs.parquet')