In [2]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os

from astroquery.gaia import Gaia 

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# DATASET

Dataset de ***RR Lyrae*** y de ***Eclipse binaria***:
- **N** ejemplos cada uno
- periodo de catálogo no nulo
- al menos **L** puntos en cada banda
- magnitud promedio en banda g menor a **M**.

In [32]:
def dataset(N, L, M):

    # Crear carpeta dataset
    directory = "dataset"
    parent_dir = "../../PeriodogramsGaia/"
    path = os.path.join(parent_dir, directory)  
    os.mkdir(path)

    # Lista de directorios
    directories = ['ECL', 'RR']
    
    # crea carpetas para cada directories
    for directory in directories:
        parent_dir = "../../PeriodogramsGaia/dataset"
        path = os.path.join(parent_dir, directory)  
        os.mkdir(path)

    # Crea dataFrame con source_id, periodo y tipo
    data_global = pd.DataFrame()



    for tabla in ["vari_eclipsing_binary","vari_rrlyrae"]:
    
        ids_stars = []
        NN = 0
        while (NN < N):
        
            if tabla == "vari_rrlyrae":
                # Hace la consulta
                query = f"""
                select top { N-NN } *
                from gaiadr3.{tabla}
                where pf IS NOT NULL
                """
                type_star = "RR"
            else:
                query = f"""
                select top { N-NN } *
                from gaiadr3.{tabla}
                where frequency IS NOT NULL
                order by global_ranking desc
                """
                type_star = "ECL"
            job = Gaia.launch_job(query)
            ids = job.get_results().to_pandas("source_id")
            # Extrae las curvas de Gaia DR3
            datalink = Gaia.load_data(ids=ids.index, 
                            data_release='Gaia DR3', 
                            retrieval_type='EPOCH_PHOTOMETRY', 
                            format='csv', 
                            valid_data=False)
            type(datalink), len(datalink)
            # Evalua las curvas
            for key, value in datalink.items():

                lc = value[0].to_pandas()
                # Ver que la mascara no de vacia para time, flux y flux_err
                mask_empty = False
                for (name, data) in lc.groupby("band"):
                    mask = data["rejected_by_variability"]=="false"
                    time, flux, flux_err = data.loc[mask][['time', 'flux', 'flux_error']].values.T
                    if len(time) == 0:
                        mask_empty = True
                points_G = lc.groupby("band").get_group("G").shape[0]
                points_BP = lc.groupby("band").get_group("BP").shape[0]
                points_RP = lc.groupby("band").get_group("RP").shape[0]
                mag_mean_G = np.mean(lc.groupby("band").get_group("G")["mag"])

                name = lc.source_id[0]
                # Ver si cumple las condiciones de puntos en cada banda, mag promedio en la banda G y que no este repetido. Ademas que la mascara no sea vacia para alguna banda
                if ((points_BP >= L) & (points_G >= L) & (points_RP >= L) & (mag_mean_G < M) & (name not in ids_stars) & (mask_empty == False)):
                    ids_stars.append(name)
                    NN += 1
                    # Creo carpeta para cada lc
                    direct = "../../PeriodogramsGaia/dataset"
                    parent_dir = os.path.join(direct, type_star)
                    path = os.path.join(parent_dir, str(name))
                    os.mkdir(path) 
                    path_lc = os.path.join(path, str(name)+'.pkl')
                    lc.to_pickle(path_lc)
        
        # DataFrame con pf, tipo y source_id
        if type_star == "ECL":
            query = f"""
                select source_id,frequency
                from gaiadr3.{tabla}
                where source_id IN {tuple(ids_stars)}
            """

            job = Gaia.launch_job(query)
            info_star_type = job.get_results().to_pandas("source_id")
            info_star_type["frequency"] = 1/info_star_type["frequency"]
            info_star_type = info_star_type.rename(columns= {"frequency":"pf"})
        else:
            query = f"""
                select source_id,pf
                from gaiadr3.{tabla}
                where source_id IN {tuple(ids_stars)}
            """
            job = Gaia.launch_job(query)
            info_star_type = job.get_results().to_pandas("source_id")
        info_star_type["type"] = type_star
        data_global = pd.concat([data_global,info_star_type])

    return data_global

In [34]:
data_global = dataset(100,10,18)

KeyboardInterrupt: 

In [26]:

N = 100
L = 5
M=100
# Crear carpeta dataset
directory = "dataset"
parent_dir = "../../PeriodogramsGaia/"
path = os.path.join(parent_dir, directory)  
os.mkdir(path)

# Lista de directorios
directories = ['ECL', 'RR']

# crea carpetas para cada directories
for directory in directories:
    parent_dir = "../../PeriodogramsGaia/dataset"
    path = os.path.join(parent_dir, directory)  
    os.mkdir(path)

# Crea dataFrame con source_id, periodo y tipo
data_global = pd.DataFrame()



for tabla in ["vari_eclipsing_binary","vari_rrlyrae"]:
    
    ids_stars = []
    NN = 0
    while (NN < N):
        
        if tabla == "vari_rrlyrae":
            # Hace la consulta
            query = f"""
            select top { N-NN } *
            from gaiadr3.{tabla}
            where pf IS NOT NULL
            """
            type_star = "RR"
        else:
            query = f"""
            select top { N-NN } *
            from gaiadr3.{tabla}
            where frequency IS NOT NULL
            order by global_ranking desc
            """
            type_star = "ECL"
        job = Gaia.launch_job(query)
        ids = job.get_results().to_pandas("source_id")
        # Extrae las curvas de Gaia DR3
        datalink = Gaia.load_data(ids=ids.index, 
                        data_release='Gaia DR3', 
                        retrieval_type='EPOCH_PHOTOMETRY', 
                        format='csv', 
                        valid_data=False)
        type(datalink), len(datalink)
        # Evalua las curvas
        for key, value in datalink.items():

            lc = value[0].to_pandas()
            # Ver que la mascara no de vacia para time, flux y flux_err
            mask_empty = False
            for (name, data) in lc.groupby("band"):
                print(name)
                mask = data["rejected_by_variability"]=="false"
                time, flux, flux_err = data.loc[mask][['time', 'flux', 'flux_error']].values.T
                if len(time) == 0:
                    mask_empty = True
            points_G = lc.groupby("band").get_group("G").shape[0]
            points_BP = lc.groupby("band").get_group("BP").shape[0]
            points_RP = lc.groupby("band").get_group("RP").shape[0]
            mag_mean_G = np.mean(lc.groupby("band").get_group("G")["mag"])

            name = lc.source_id[0]
            # Ver si cumple las condiciones de puntos en cada banda, mag promedio en la banda G y que no este repetido. Ademas que la mascara no sea vacia para alguna banda
            if ((points_BP >= L) & (points_G >= L) & (points_RP >= L) & (mag_mean_G < M) & (name not in ids_stars) & (mask_empty == False)):
                ids_stars.append(name)
                NN += 1
                # Creo carpeta para cada lc
                direct = "../../PeriodogramsGaia/dataset"
                parent_dir = os.path.join(direct, type_star)
                path = os.path.join(parent_dir, str(name))
                os.mkdir(path) 
                path_lc = os.path.join(path, str(name)+'.pkl')
                lc.to_pickle(path_lc)
        
    # DataFrame con pf, tipo y source_id
    if type_star == "ECL":
        query = f"""
            select source_id,frequency
            from gaiadr3.{tabla}
            where source_id IN {tuple(ids_stars)}
        """

        job = Gaia.launch_job(query)
        info_star_type = job.get_results().to_pandas("source_id")
        info_star_type["frequency"] = 1/info_star_type["frequency"]
        info_star_type = info_star_type.rename(columns= {"frequency":"pf"})
    else:
        query = f"""
            select source_id,pf
            from gaiadr3.{tabla}
            where source_id IN {tuple(ids_stars)}
        """
        job = Gaia.launch_job(query)
        info_star_type = job.get_results().to_pandas("source_id")
    info_star_type["type"] = type_star
    data_global = pd.concat([data_global,info_star_type])

BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP
BP
G
RP


KeyboardInterrupt: 

In [30]:
data_global.to_pickle("../../PeriodogramsGaia/dataset/data_global.pkl")

In [31]:
data_global

Unnamed: 0_level_0,pf,type
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3129144706069228928,0.572676,ECL
3326843532470471168,0.375135,ECL
3341917626431837824,21.96483,ECL
3342230338707766912,1.563882,ECL
3351646380193556480,0.569235,ECL
4092207868554142976,8.46748,ECL
4100457405574370176,0.407195,ECL
4185297337638694528,1.873843,ECL
6095201502389599488,1.044925,ECL
6751021430570803968,2.564934,ECL
