In [3]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os

from astroquery.gaia import Gaia 

# DATASET

Dataset de ***RR Lyrae*** y de ***Eclipse binaria***:
- **N** ejemplos cada uno
- periodo de catálogo no nulo
- al menos **L** puntos en cada banda
- magnitud promedio en banda g menor a **M**.

In [80]:
def dataset(N, L, M):

    # Crear carpeta dataset
    directory = "dataset"
    parent_dir = "../../PeriodogramsGaia/"
    path = os.path.join(parent_dir, directory)  
    os.mkdir(path) 

    # Lista de directorios
    directories = ['ECL', 'RR']
    # crea carpetas para cada directories
    for directory in directories:
        parent_dir = "../../PeriodogramsGaia/dataset"
        path = os.path.join(parent_dir, directory)  
        os.mkdir(path)

    # Crea dataFrame con source_id, periodo y tipo
    data_global = pd.DataFrame()



    for tabla in ["vari_eclipsing_binary","vari_rrlyrae"]:
        ids_stars = []
        NN = 0
        while (NN < N):
        
            if tabla == "vari_rrlyrae":
                # Hace la consulta
                query = f"""
                select top { N-NN } *
                from gaiadr3.{tabla}
                where pf IS NOT NULL
                """
                type_star = "RR"
            else:
                query = f"""
                select top { N-NN } *
                from gaiadr3.{tabla}
                where frequency IS NOT NULL
                order by global_ranking desc
                """
                type_star = "ECL"
            job = Gaia.launch_job(query)
            ids = job.get_results().to_pandas("source_id")
            # Extrae las curvas de Gaia DR3
            datalink = Gaia.load_data(ids=ids.index, 
                            data_release='Gaia DR3', 
                            retrieval_type='EPOCH_PHOTOMETRY', 
                            format='csv', 
                            valid_data=False)
            type(datalink), len(datalink)
            # Evalua las curvas
            for key, value in datalink.items():
            
                lc = value[0].to_pandas()
                points_BP = lc.groupby(['band']).count().source_id['BP']
                points_G = lc.groupby(['band']).count().source_id['G']
                points_RP = lc.groupby(['band']).count().source_id['RP']
                mag_mean_G = lc.groupby(['band']).mean().mag['G'] 

                name = lc.source_id[0]
                # Ver si cumple las condiciones de puntos en cada banda, mag promedio en la banda G y que no este repetido
                if ((points_BP >= L) & (points_G >= L) & (points_RP >= L) & (mag_mean_G < M) & (name not in ids_stars)):
                    ids_stars.append(name)
                    NN += 1
                    # Creo carpeta para cada lc
                    direct = "../../PeriodogramsGaia/dataset"
                    parent_dir = os.path.join(direct, type_star)
                    path = os.path.join(parent_dir, str(name))
                    os.mkdir(path) 
                    path_lc = os.path.join(path, str(name)+'.pkl')
                    lc.to_pickle(path_lc)
        
        # DataFrame con pf, tipo y source_id
        if type_star == "ECL":
            query = f"""
                select source_id,frequency
                from gaiadr3.{tabla}
                where source_id IN {tuple(ids_stars)}
            """

            job = Gaia.launch_job(query)
            info_star_type = job.get_results().to_pandas("source_id")
            info_star_type["frequency"] = 1/info_star_type["frequency"]
            info_star_type = info_star_type.rename(columns= {"frequency":"pf"})
        else:
            query = f"""
                select source_id,pf
                from gaiadr3.{tabla}
                where source_id IN {tuple(ids_stars)}
            """
            job = Gaia.launch_job(query)
            info_star_type = job.get_results().to_pandas("source_id")
        info_star_type["type"] = type_star
        data_global = pd.concat([data_global,info_star_type])

    return data_global

In [83]:
data_global = dataset(1000,5,100)

  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G = lc.groupby(['band']).mean().mag['G']
  mag_mean_G

In [84]:
data_global.to_pickle("../../PeriodogramsGaia/dataset/data_global.pkl")

In [85]:
data_global

Unnamed: 0_level_0,pf,type
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1
30259178472081408,0.900459,ECL
46008479948911360,0.590689,ECL
60393566669599744,0.609095,ECL
130944967216353536,3.021296,ECL
141075008362446080,0.479051,ECL
...,...,...
6070365699376026496,0.638262,RR
6070406170857888384,0.572528,RR
6070466781433899904,0.779386,RR
6070483205388188800,0.644421,RR
