In [2]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os

from astroquery.gaia import Gaia 

In [24]:
query = """
select top 10 source_id,frequency, global_ranking 
from gaiadr3.vari_eclipsing_binary 
order by global_ranking desc
"""

job = Gaia.launch_job(query)
eclipsing_binaries = job.get_results().to_pandas("source_id")
eclipsing_binaries

Unnamed: 0_level_0,frequency,global_ranking
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3129144706069228928,1.746188,0.842134
4185297337638694528,0.533663,0.829747
3326843532470471168,2.665704,0.819494
3342230338707766912,0.639434,0.817321
4100457405574370176,2.455824,0.816518
6751021430570803968,0.389874,0.813557
6095201502389599488,0.957006,0.811477
3341917626431837824,0.045527,0.810064
4092207868554142976,0.118099,0.808853
3351646380193556480,1.756745,0.808433


In [4]:
type(eclipsing_binaries)

pandas.core.frame.DataFrame

In [5]:
datalink = Gaia.load_data(ids=eclipsing_binaries.index, 
                          data_release='Gaia DR3', 
                          retrieval_type='EPOCH_PHOTOMETRY', 
                          format='csv', 
                          valid_data=False)

type(datalink), len(datalink)

(dict, 10)

In [19]:
for key, value in datalink.items():
    break
print(key)
lc = value[0].to_pandas()
lc

dict_items

In [22]:
type(eclipsing_binaries)

astropy.table.table.Table

In [2]:
lista = ["123", "456", "789"]
lista_str = ",".join(["'{}'".format(item) for item in lista])

consulta = f"""
SELECT source_id
FROM x
WHERE source_id NOT IN ({lista_str})
"""

print(consulta)



SELECT source_id
FROM x
WHERE source_id NOT IN ('123','456','789')



In [3]:
lista_str

"'123','456','789'"

In [15]:

N = 100
L = 5
M=100
agregados = []
no_agregados = []

# Crear carpeta dataset
directory = "dataset"
parent_dir = "../../PeriodogramsGaia/"
path = os.path.join(parent_dir, directory)  
os.mkdir(path)

# agregados de directorios
directories = ['ECL', 'RR']

# crea carpetas para cada directories
for directory in directories:
    parent_dir = "../../PeriodogramsGaia/dataset"
    path = os.path.join(parent_dir, directory)  
    os.mkdir(path)

# Crea dataFrame con source_id, periodo y tipo
data_global = pd.DataFrame()



for tabla in ["vari_eclipsing_binary","vari_rrlyrae"]:
    NN = 0
    while (NN < N):
        lista1_str = ",".join(["'{}'".format(item) for item in agregados])
        lista2_str = ",".join(["'{}'".format(item) for item in no_agregados])
        if tabla == "vari_rrlyrae":
            # Hace la consulta
            query = f"""
            select top { N-NN } *
            from gaiadr3.{tabla}
            where pf IS NOT NULL
            """
            type_star = "RR"
        else:
            query = f"""
            select top { N-NN } *
            from gaiadr3.{tabla}
            where frequency IS NOT NULL
            order by global_ranking desc
            """
            type_star = "ECL"
            
        job = Gaia.launch_job(query)
        ids = job.get_results().to_pandas("source_id")

        # Extrae las curvas de Gaia DR3
        datalink = Gaia.load_data(ids=ids.index, 
                        data_release='Gaia DR3', 
                        retrieval_type='EPOCH_PHOTOMETRY', 
                        format='csv', 
                        valid_data=False)
        type(datalink), len(datalink)

        # Evalua las curvas
        for key, value in datalink.items():
            lc = value[0].to_pandas()

            mask = lc["rejected_by_variability"]=="false"
            mask2 = lc["rejected_by_variability"]== False
            mask3 = mask | mask2

            lc = lc.loc[mask3]

            bands = ['G', 'BP', 'RP']
            points_G = points_BP = points_RP = mag_mean_G = 0

            if all(band in lc["band"].unique() for band in bands):                    
                points_G = lc.groupby("band").get_group("G").shape[0]
                points_BP = lc.groupby("band").get_group("BP").shape[0]
                points_RP = lc.groupby("band").get_group("RP").shape[0]
                mag_mean_G = np.mean(lc.groupby("band").get_group("G")["mag"])

                name = lc["source_id"].iloc[0]
                # Ver si cumple las condiciones de puntos en cada banda, mag promedio en la banda G y que no este repetido. Ademas que la mascara no sea vacia para alguna banda
                if ((points_BP >= L) & (points_G >= L) & (points_RP >= L) & (mag_mean_G < M)):
                    agregados.append(name)
                    NN += 1
                    # Creo carpeta para cada lc
                    direct = "../../PeriodogramsGaia/dataset"
                    parent_dir = os.path.join(direct, type_star)
                    path = os.path.join(parent_dir, str(name))
                    os.mkdir(path) 
                    path_lc = os.path.join(path, str(name)+'.pkl')
                    lc.to_pickle(path_lc)
                else:
                    print("nou", name)
                    no_agregados.append(name)
        
    # DataFrame con pf, tipo y source_id
    if type_star == "ECL":
        query = f"""
            select source_id,frequency
            from gaiadr3.{tabla}
            where source_id IN {tuple(agregados)}
        """

        job = Gaia.launch_job(query)
        info_star_type = job.get_results().to_pandas("source_id")
        info_star_type["frequency"] = 1/info_star_type["frequency"]
        info_star_type = info_star_type.rename(columns= {"frequency":"pf"})
    else:
        query = f"""
            select source_id,pf
            from gaiadr3.{tabla}
            where source_id IN {tuple(agregados)}
        """
        job = Gaia.launch_job(query)
        info_star_type = job.get_results().to_pandas("source_id")
    info_star_type["type"] = type_star
    data_global = pd.concat([data_global,info_star_type])

nou 5835725794871365248
nou 3352102441297986560
nou 5835722668131699584
nou 4317937403912800384
nou 5835722771288644096
nou 5937091180561923968
nou 4312131638903915008
nou 5902591804246642944
nou 4312275915434370944
nou 414152842984521856
nou 5817459814331374464
nou 4312180876429210752
nou 4312093499581028992
nou 3352989128702678912


FileExistsError: [Errno 17] File exists: '../../PeriodogramsGaia/dataset/RR/3352538913055092736'

In [12]:
query = f"""
select top 10 *
from gaiadr3.vari_eclipsing_binary 
where frequency IS NOT NULL and source_id not in ('3129144706069228928', '4185297337638694528')
order by global_ranking desc
"""

job = Gaia.launch_job(query)
eclipsing_binaries = job.get_results().to_pandas("source_id")
eclipsing_binaries

Unnamed: 0_level_0,solution_id,global_ranking,reference_time,frequency,frequency_error,geom_model_reference_level,geom_model_reference_level_error,geom_model_gaussian1_phase,geom_model_gaussian1_phase_error,geom_model_gaussian1_sigma,...,derived_primary_ecl_duration,derived_primary_ecl_duration_error,derived_primary_ecl_depth,derived_primary_ecl_depth_error,derived_secondary_ecl_phase,derived_secondary_ecl_phase_error,derived_secondary_ecl_duration,derived_secondary_ecl_duration_error,derived_secondary_ecl_depth,derived_secondary_ecl_depth_error
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3326843532470471168,375316653866487564,0.819494,2274.619273,2.665704,1.2e-05,15.084185,0.000419,0.431179,2.9e-05,0.041335,...,0.231474,0.000171,5.082809,0.0,,,,,,
3342230338707766912,375316653866487564,0.817321,2256.612373,0.639434,1.9e-05,14.262313,0.00064,0.206075,0.001488,0.018492,...,0.103554,0.016111,0.587355,0.115889,0.709763,0.001984,0.096218,0.006379,0.555792,0.111453
4100457405574370176,375316653866487564,0.816518,2251.443895,2.455824,1e-06,15.797403,0.001802,0.260684,0.00034,0.04775,...,0.26519,0.007469,0.427325,0.00155,0.260684,0.00034,0.2674,0.007114,0.41543,0.003225
6751021430570803968,375316653866487564,0.813557,2216.52525,0.389874,1.1e-05,14.426311,0.000288,0.355864,0.009735,0.0105,...,0.116823,0.162784,0.588057,0.332215,0.355774,0.009837,0.058801,0.050499,0.002903,0.001382
6095201502389599488,375316653866487564,0.811477,2227.871266,0.957006,0.000186,14.189692,0.000337,0.286422,0.00064,0.044224,...,0.247654,0.006733,0.512218,0.006222,0.583061,0.017243,0.187291,0.083279,0.003428,0.000623
3341917626431837824,375316653866487564,0.810064,2257.915601,0.045527,2e-06,13.19306,0.002168,0.276434,0.000241,0.040749,...,0.228193,0.002231,0.674043,0.005798,0.781169,0.0425,0.218736,0.13658,0.667595,169.202942
4092207868554142976,375316653866487564,0.808853,2278.700703,0.118099,4.1e-05,16.510635,0.002507,0.687832,0.000915,0.034834,...,0.117697,0.000458,4.548398,0.016942,0.688042,0.001828,0.195072,0.008846,0.033274,0.001026
3351646380193556480,375316653866487564,0.808433,2198.458593,1.756745,4e-06,15.510347,0.002324,0.292442,0.000401,0.041034,...,0.221929,0.022245,0.630327,0.175819,0.292442,0.000401,0.229789,0.002708,0.622444,0.005009
4105521859197026688,375316653866487564,0.808133,2230.337185,1.202608,4.1e-05,14.376719,0.000794,0.524402,0.004964,0.021843,...,0.283049,0.023482,0.838137,0.177745,0.524532,0.005199,0.122319,0.044525,0.004709,0.000922
4313561656883135232,375316653866487564,0.805022,2151.356456,0.306668,1e-06,15.11642,0.000587,0.791276,5e-06,0.014746,...,0.082578,0.00014,0.636917,0.0,,,,,,


In [5]:
datalink = Gaia.load_data(ids=eclipsing_binaries.index, 
                          data_release='Gaia DR3', 
                          retrieval_type='EPOCH_PHOTOMETRY', 
                          format='csv', 
                          valid_data=False)

type(datalink), len(datalink)

(dict, 10)

In [19]:
def crear_carpeta_dataset(parent_dir, directory):
    path = os.path.join(parent_dir, directory)  
    os.mkdir(path)
    # Lista de directorios
    directories = ['ECL', 'RR']

    # crea carpetas para cada directories
    for directory in directories:
        parent_dir = "../../PeriodogramsGaia/dataset"
        path = os.path.join(parent_dir, directory)  
        os.mkdir(path)

In [84]:
def consulta(n, star_type, star_list):
    star_list_str = ",".join(["'{}'".format(item) for item in star_list])

    if star_type == "vari_rrlyrae":
        # Hace la consulta
        query = f"""
        select top {n} *
        from gaiadr3.{tabla}
        where pf IS NOT NULL AND source_id not in ({star_list_str})
        """
        type_star = "RR"
    else:
        query = f"""
        select top {n} *
        from gaiadr3.{tabla}
        where frequency IS NOT NULL AND source_id not in ({star_list_str})
        order by global_ranking desc
        """
        type_star = "ECL"

    job = Gaia.launch_job(query)
    ids = job.get_results().to_pandas("source_id")
    
    # Extrae las curvas de Gaia DR3
    datalink = Gaia.load_data(ids=ids.index, 
                    data_release='Gaia DR3', 
                    retrieval_type='EPOCH_PHOTOMETRY', 
                    format='csv', 
                    valid_data=False)
    
    return datalink, type_star

In [71]:
def verificar(lc, direct, type_star):
    # Ver que la mascara no de vacia para time, flux y flux_err
    mask = lc["rejected_by_variability"]=="false"
    mask2 = lc["rejected_by_variability"]== False
    mask3 = mask | mask2

    name = lc.source_id[0]
    
    lc = lc.loc[mask3]

    bands = ['G', 'BP', 'RP']
    points_G = points_BP = points_RP = mag_mean_G = 0

    if all(band in lc["band"].unique() for band in bands):                    
        points_G = lc.groupby("band").get_group("G").shape[0]
        points_BP = lc.groupby("band").get_group("BP").shape[0]
        points_RP = lc.groupby("band").get_group("RP").shape[0]
        mag_mean_G = np.mean(lc.groupby("band").get_group("G")["mag"])


        # Ver si cumple las condiciones de puntos en cada banda, mag promedio en la banda G y que no este repetido. Ademas que la mascara no sea vacia para alguna banda
        if ((points_BP >= L) & (points_G >= L) & (points_RP >= L) & (mag_mean_G < M)):
            # Creo carpeta para cada lc
            parent_dir = os.path.join(direct, type_star)
            path = os.path.join(parent_dir, str(name))
            # print(parent_dir + " -> " + path)
            os.mkdir(path) 
            path_lc = os.path.join(path, str(name)+'.pkl')
            lc.to_pickle(path_lc)
            
            return True, name
        
        return False, name
    
    return False, name


In [78]:
def create_df(type_star, data_global, ids_stars):
    if type_star == "ECL":
        query = f"""
            select source_id,frequency
            from gaiadr3.{tabla}
            where source_id IN {tuple(ids_stars)}
        """
        job = Gaia.launch_job(query)
        info_star_type = job.get_results().to_pandas("source_id")
        info_star_type["frequency"] = 1/info_star_type["frequency"]
        info_star_type = info_star_type.rename(columns= {"frequency":"pf"})
    else:
        query = f"""
            select source_id,pf
            from gaiadr3.{tabla}
            where source_id IN {tuple(ids_stars)}
        """
        job = Gaia.launch_job(query)
        info_star_type = job.get_results().to_pandas("source_id")
        
    info_star_type["type"] = type_star

    return pd.concat([data_global,info_star_type])

In [94]:

N = 100
L = 5
M = 100

# Crear carpeta dataset
directory = "dataset"
parent_dir = "../../PeriodogramsGaia/"
path = os.path.join(parent_dir, directory)

crear_carpeta_dataset(parent_dir, directory)

# Crea dataFrame con source_id, periodo y tipo
data_global = pd.DataFrame()

agregados = []
no_agregados = ['0000000000000000000'] # id inicial para que consulta no se caiga

for tabla in ["vari_eclipsing_binary", "vari_rrlyrae"]:
    NN = 0
    while (NN < N):
        datalink, type_star = consulta((N-NN), tabla, agregados + no_agregados)
        for key, value in datalink.items():
            lc = value[0].to_pandas()
            es_valido, name = verificar(lc, path, type_star)
            if es_valido:
                agregados.append(name)
                NN += 1
            else:
                #print(name)
                no_agregados.append(name)

        
    # DataFrame con pf, tipo y source_id
    data_global = create_df(type_star, data_global, agregados)


In [102]:
all ([agregado in list(data_global.index) for agregado in agregados])

True

In [None]:
'''
def dataset(N, L, M):

    # Crear carpeta dataset
    directory = "dataset"
    parent_dir = "../../PeriodogramsGaia/"
    path = os.path.join(parent_dir, directory)  
    os.mkdir(path)

    # Lista de directorios
    directories = ['ECL', 'RR']
    
    # crea carpetas para cada directories
    for directory in directories:
        parent_dir = "../../PeriodogramsGaia/dataset"
        path = os.path.join(parent_dir, directory)  
        os.mkdir(path)

    # Crea dataFrame con source_id, periodo y tipo
    data_global = pd.DataFrame()



    for tabla in ["vari_eclipsing_binary","vari_rrlyrae"]:
    
        ids_stars = []
        NN = 0
        while (NN < N):
        
            if tabla == "vari_rrlyrae":
                # Hace la consulta
                query = f"""
                select top { N-NN } *
                from gaiadr3.{tabla}
                where pf IS NOT NULL
                """
                type_star = "RR"
            else:
                query = f"""
                select top { N-NN } *
                from gaiadr3.{tabla}
                where frequency IS NOT NULL
                order by global_ranking desc
                """
                type_star = "ECL"
            job = Gaia.launch_job(query)
            ids = job.get_results().to_pandas("source_id")
            # Extrae las curvas de Gaia DR3
            datalink = Gaia.load_data(ids=ids.index, 
                            data_release='Gaia DR3', 
                            retrieval_type='EPOCH_PHOTOMETRY', 
                            format='csv', 
                            valid_data=False)
            type(datalink), len(datalink)
            # Evalua las curvas
            for key, value in datalink.items():

                lc = value[0].to_pandas()
                # Ver que la mascara no de vacia para time, flux y flux_err
                mask_empty = False
                for (name, data) in lc.groupby("band"):
                    mask = data["rejected_by_variability"]=="false"
                    time, flux, flux_err = data.loc[mask][['time', 'flux', 'flux_error']].values.T
                    if len(time) == 0:
                        mask_empty = True
                points_G = lc.groupby("band").get_group("G").shape[0]
                points_BP = lc.groupby("band").get_group("BP").shape[0]
                points_RP = lc.groupby("band").get_group("RP").shape[0]
                mag_mean_G = np.mean(lc.groupby("band").get_group("G")["mag"])

                name = lc.source_id[0]
                # Ver si cumple las condiciones de puntos en cada banda, mag promedio en la banda G y que no este repetido. Ademas que la mascara no sea vacia para alguna banda
                if ((points_BP >= L) & (points_G >= L) & (points_RP >= L) & (mag_mean_G < M) & (name not in ids_stars) & (mask_empty == False)):
                    ids_stars.append(name)
                    NN += 1
                    # Creo carpeta para cada lc
                    direct = "../../PeriodogramsGaia/dataset"
                    parent_dir = os.path.join(direct, type_star)
                    path = os.path.join(parent_dir, str(name))
                    os.mkdir(path) 
                    path_lc = os.path.join(path, str(name)+'.pkl')
                    lc.to_pickle(path_lc)
        
        # DataFrame con pf, tipo y source_id
        if type_star == "ECL":
            query = f"""
                select source_id,frequency
                from gaiadr3.{tabla}
                where source_id IN {tuple(ids_stars)}
            """

            job = Gaia.launch_job(query)
            info_star_type = job.get_results().to_pandas("source_id")
            info_star_type["frequency"] = 1/info_star_type["frequency"]
            info_star_type = info_star_type.rename(columns= {"frequency":"pf"})
        else:
            query = f"""
                select source_id,pf
                from gaiadr3.{tabla}
                where source_id IN {tuple(ids_stars)}
            """
            job = Gaia.launch_job(query)
            info_star_type = job.get_results().to_pandas("source_id")
        info_star_type["type"] = type_star
        data_global = pd.concat([data_global,info_star_type])

    return data_global
'''