In [2]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os

from astroquery.gaia import Gaia 

# DATASET

Dataset de estrellas ***RR Lyrae*** y de ***binarias eclipsantes***:
- **N** ejemplos de cada clase
- periodo de catálogo no nulo
- al menos **L** puntos en cada banda (G, BP y RP)
- magnitud promedio en la banda G menor a **M**.

In [12]:
def crear_carpeta_dataset(parent_dir, directory):
    """Create the dataset folder and one sub-folder per star class.

    Builds ``<parent_dir>/<directory>`` and, inside it, the ``ECL`` and
    ``RR`` folders.

    Parameters
    ----------
    parent_dir : str
        Folder in which the dataset folder is created.
    directory : str
        Name of the dataset folder.

    Notes
    -----
    Folders that already exist are left untouched, so the call can be
    repeated (e.g. after restarting the kernel) without raising.
    """
    dataset_path = os.path.join(parent_dir, directory)
    os.makedirs(dataset_path, exist_ok=True)

    # The class folders hang from the path built from the arguments, not from
    # a fixed location, so the function honours whatever the caller passes in.
    for class_name in ('ECL', 'RR'):
        os.makedirs(os.path.join(dataset_path, class_name), exist_ok=True)

In [13]:
def consulta(n, star_type, star_list):
    """Query a Gaia DR3 variability table and download the matching light curves.

    Parameters
    ----------
    n : int
        Number of rows requested (``select top n``).
    star_type : str
        Table name inside the ``gaiadr3`` schema. ``"vari_rrlyrae"`` is
        filtered on a non-null ``pf``; any other table is filtered on a
        non-null ``frequency`` and ordered by ``global_ranking`` descending.
    star_list : iterable
        Source ids to exclude from the result. May be empty, in which case
        no exclusion clause is added to the query.

    Returns
    -------
    tuple
        ``(datalink, type_star)``: the object returned by ``Gaia.load_data``
        for the selected sources and the class label (``"RR"`` or ``"ECL"``).
    """
    if star_type == "vari_rrlyrae":
        period_column, order_clause, type_star = "pf", "", "RR"
    else:
        period_column = "frequency"
        order_clause = "order by global_ranking desc"
        type_star = "ECL"

    conditions = [f"{period_column} IS NOT NULL"]
    if star_list:
        # An empty exclusion list would render as `not in ()`, which is not
        # valid ADQL, so the clause is only emitted when there are ids.
        star_list_str = ",".join(["'{}'".format(item) for item in star_list])
        conditions.append(f"source_id not in ({star_list_str})")
    where_clause = " AND ".join(conditions)

    query = f"""
        select top {n} *
        from gaiadr3.{star_type}
        where {where_clause}
        {order_clause}
        """

    job = Gaia.launch_job(query)
    ids = job.get_results().to_pandas("SOURCE_ID")

    # Fetch the epoch photometry (light curves) of the selected sources
    datalink = Gaia.load_data(ids=ids.index,
                    data_release='Gaia DR3',
                    retrieval_type='EPOCH_PHOTOMETRY',
                    format='csv',
                    valid_data=False)

    return datalink, type_star

In [14]:
def verificar(lc, direct, type_star, L, M):
    """Check whether a light curve qualifies for the dataset and, if so, store it.

    Only rows whose ``rejected_by_variability`` flag is false are counted.
    The light curve qualifies when the G, BP and RP bands are all present
    among those rows, each with at least ``L`` points, and the mean G
    magnitude is below ``M``.

    Parameters
    ----------
    lc : pandas.DataFrame
        Light curve with the columns ``source_id``, ``band``, ``mag`` and
        ``rejected_by_variability``.
    direct : str
        Dataset folder; the light curve is stored under
        ``<direct>/<type_star>/<source_id>/``.
    type_star : str
        Class sub-folder name.
    L : int
        Minimum number of points required in each band.
    M : float
        Upper bound (exclusive) for the mean G magnitude.

    Returns
    -------
    tuple
        ``(accepted, name)`` where ``name`` is the first ``source_id`` value.
        When accepted, the complete (unfiltered) light curve is pickled to
        ``<source_id>.pkl`` inside a newly created folder.
    """
    rejected = lc["rejected_by_variability"]
    # The flag may arrive either as the text "false" or as a real boolean
    keep = (rejected == "false") | (rejected == False)

    name = lc.source_id[0]
    accepted = lc.loc[keep]

    per_band = {
        band: accepted[accepted["band"] == band]
        for band in ("G", "BP", "RP")
    }

    # Every band must survive the rejection mask
    if any(len(rows) == 0 for rows in per_band.values()):
        return False, name

    mag_mean_G = np.mean(per_band["G"]["mag"])
    bright_enough = mag_mean_G < M
    enough_points = all(len(rows) >= L for rows in per_band.values())

    if not (enough_points and bright_enough):
        return False, name

    # One folder per star, holding the pickled light curve
    star_dir = os.path.join(direct, type_star, str(name))
    os.mkdir(star_dir)
    lc.to_pickle(os.path.join(star_dir, str(name) + '.pkl'))

    return True, name


In [15]:
def create_df(type_star, data_global, ids_stars, tabla):
    """Append the catalogue period and type of the given stars to ``data_global``.

    Parameters
    ----------
    type_star : str
        Class label stored in the ``type`` column. For ``"ECL"`` the table's
        ``frequency`` column is read and inverted into a period; otherwise
        the ``pf`` column is read directly.
    data_global : pandas.DataFrame
        Table accumulated so far; it is not modified.
    ids_stars : iterable
        Source ids to look up. May be empty.
    tabla : str
        Table name inside the ``gaiadr3`` schema.

    Returns
    -------
    pandas.DataFrame
        ``data_global`` followed by one row per star found, indexed by
        ``SOURCE_ID`` with the columns ``pf`` and ``type``. With no ids, a
        copy of ``data_global`` is returned and no query is sent.
    """
    # Render the ids explicitly: interpolating a tuple would produce `(x,)`
    # for a single id, `()` for none, and `np.int64(...)` items under NumPy 2,
    # none of which is valid ADQL.
    id_list = ", ".join(str(source_id) for source_id in ids_stars)
    if not id_list:
        return data_global.copy()

    period_column = "frequency" if type_star == "ECL" else "pf"
    query = f"""
            select source_id,{period_column}
            from gaiadr3.{tabla}
            where source_id IN ({id_list})
        """
    job = Gaia.launch_job(query)
    info_star_type = job.get_results().to_pandas("SOURCE_ID")

    if type_star == "ECL":
        # Store the period (1 / frequency) under the same column name as RR
        info_star_type["frequency"] = 1/info_star_type["frequency"]
        info_star_type = info_star_type.rename(columns={"frequency": "pf"})

    info_star_type["type"] = type_star

    return pd.concat([data_global, info_star_type])

In [16]:
def dataset(N, L, M):
    """Build the light-curve dataset on disk and return its summary table.

    For each of the tables ``vari_eclipsing_binary`` and ``vari_rrlyrae``,
    candidates are requested in batches through ``consulta`` and filtered
    with ``verificar`` until ``N`` light curves have been accepted or a
    request returns no light curves.

    Parameters
    ----------
    N : int
        Number of accepted light curves wanted per table.
    L : int
        Minimum number of points per band (forwarded to ``verificar``).
    M : float
        Upper bound for the mean G magnitude (forwarded to ``verificar``).

    Returns
    -------
    pandas.DataFrame
        Table built by ``create_df`` with the period and type of every
        accepted star.
    """
    # Create the dataset folder tree
    directory = "dataset"
    parent_dir = "../../PeriodogramsGaia/"
    path = os.path.join(parent_dir, directory)

    crear_carpeta_dataset(parent_dir, directory)

    # Summary table with source_id, period and type
    data_global = pd.DataFrame()

    agregados = []
    no_agregados = ['0000000000000000000'] # placeholder id so the exclusion list sent to consulta is never empty

    # Largest batch requested at once (the original author notes 2000 as the
    # maximum number of light curves returned per request).
    max_batch = 2000

    for tabla in ["vari_eclipsing_binary", "vari_rrlyrae"]:
        # Stars accepted for this table only
        agregados_tabla = []
        while len(agregados_tabla) < N:
            batch = min(max_batch, N - len(agregados_tabla))
            datalink, type_star = consulta(batch, tabla, agregados + no_agregados)

            # No light curves came back: the exclusion list cannot grow, so
            # the next request would be identical. Stop instead of looping forever.
            if len(datalink) == 0:
                break

            for key, value in datalink.items():
                lc = value[0].to_pandas()
                es_valido, name = verificar(lc, path, type_star, L, M)
                if es_valido:
                    agregados.append(name)
                    agregados_tabla.append(name)
                else:
                    no_agregados.append(name)

        # Look up periods only for the stars accepted from this table, so a
        # star accepted from another table is never labelled with this type.
        if agregados_tabla:
            data_global = create_df(type_star, data_global, agregados_tabla, tabla)

    # Sanity check: every accepted star must appear in the summary table
    indexed = set(data_global.index)
    print(all(agregado in indexed for agregado in agregados))

    return data_global

In [7]:
def init_dataset():
    """Create the dataset folders and return the empty state used to fill them.

    Returns
    -------
    tuple
        ``(path, data_global, agregados, no_agregados)``: the dataset path,
        an empty DataFrame for the summary table, an empty list of accepted
        ids and the list of rejected ids holding a single placeholder id.
    """
    parent_dir = "../../PeriodogramsGaia/"
    directory = "dataset"

    crear_carpeta_dataset(parent_dir, directory)

    return (
        os.path.join(parent_dir, directory),
        pd.DataFrame(),              # summary table: source_id, period and type
        [],                          # accepted ids
        ['0000000000000000000'],     # placeholder id so the query does not fail
    )

In [18]:
# Create the folder tree and the empty bookkeeping state for a manual run
path, data_global, agregados, no_agregados = init_dataset()
N = 200000  # target number of accepted light curves for the download loop
L = 10      # minimum number of points per band required by verificar
M = 18      # upper bound for the mean G magnitude checked by verificar

In [None]:
# Candidate tables: ["vari_eclipsing_binary", "vari_rrlyrae"]
for tabla in ["vari_eclipsing_binary"]:
    # Stop once N stars are accepted or 271779 candidates have been examined
    # (presumably the number of usable rows in the table; TODO confirm).
    # The -1 discounts the placeholder id stored in no_agregados.
    while (len(agregados) < N) and (len(no_agregados)+len(agregados)-1 < 271779):
        # Ask for what is still missing, at most 2000 light curves per request
        datalink, type_star = consulta(min(2000, N-len(agregados)), tabla, agregados + no_agregados)
        for key, value in datalink.items():
            lc = value[0].to_pandas()
            es_valido, name = verificar(lc, path, type_star, L, M)
            # Record the id as accepted or rejected so it is not requested again
            (agregados if es_valido else no_agregados).append(name)

In [None]:
# Summary DataFrame with pf, type and source_id for the accepted stars.
# Not idempotent: create_df concatenates onto data_global, so re-running this
# cell appends the same rows again.
data_global = create_df(type_star, data_global, agregados, tabla)

In [17]:
# Build the dataset end to end with N=900 light curves per table, L=10 points
# per band and M=18 as the mean G magnitude limit; prints the consistency check.
data_global = dataset(900,10,18)

True


In [18]:
# Persist the summary table inside the dataset folder
data_global.to_pickle("../../PeriodogramsGaia/dataset/data_global.pkl")

In [19]:
# Display the summary table (indexed by SOURCE_ID, columns pf and type)
data_global

Unnamed: 0_level_0,pf,type
SOURCE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
30259178472081408,0.900459,ECL
46008479948911360,0.590689,ECL
60393566669599744,0.609095,ECL
130944967216353536,3.021296,ECL
141075008362446080,0.479051,ECL
...,...,...
6070356594053780352,0.460950,RR
6070357345669486336,0.612006,RR
6070406170857888384,0.572528,RR
6070466781433899904,0.779386,RR
