In [1]:
import gzip
import os
import shutil
import urllib
import urllib.request

import numpy as np
import pandas as pd
from IPython.display import clear_output, display

In [2]:
# Directory holding the downloaded Gaia DR2 '.csv.gz' files. File names are
# appended to it with plain string concatenation (`data_path + name`), so the
# trailing path separator is required.
data_path = "d:\\gaia_dr2\\"

## Download the list of data file names

In [3]:
def get_path_list():
    """
    Fetch the Gaia DR2 `gaia_source` file index from the ESA CDN.

    The remote 'MD5SUM.txt' is parsed as a header-less, space-separated
    table.

    Returns
    -------
        - numpy.ndarray
          Values of the column labelled 2 of the parsed table, without the
          first two rows. `download_files` consumes these values as the
          data file names.
    """
    index_url = 'http://cdn.gea.esac.esa.int/Gaia/gdr2/gaia_source/csv/MD5SUM.txt'

    with urllib.request.urlopen(index_url) as response:
        index_table = pd.read_csv(response, sep=' ', header=None)

    # The first two rows are dropped -- presumably entries that are not data
    # files (TODO confirm against the index); column 2 is kept.
    data_rows = index_table.iloc[2:]
    return data_rows[2].values

In [4]:
def download_files(md5sum, n_min=0, n_max=100):
    """
    Download a slice of the Gaia DR2 data files into `data_path`.

    Files that already exist in `data_path` are skipped, so the function can
    be re-run to resume an interrupted download. Each file is first written
    to a temporary '<name>.part' file and renamed only after the transfer
    has completed, so a failed transfer never leaves a truncated file that a
    later run would skip as "already downloaded".

    Parameters
    ----------
        - md5sum : array of str
          Array, containing Gaia DR2 datafile names
        - n_min : int
          Index in `md5sum` of the first file to download
        - n_max : int
          Index in `md5sum` one past the last file to download
    """
    url_template = 'http://cdn.gea.esac.esa.int/Gaia/gdr2/gaia_source/csv/{}'

    # Make sure the target directory exists before the first write.
    os.makedirs(data_path, exist_ok=True)

    for data_set in md5sum[n_min:n_max]:
        target = data_path + data_set
        if os.path.exists(target):
            continue

        # Announce the file before the transfer starts. With wait=True the
        # previous message is only cleared once the new one is printed.
        clear_output(wait=True)
        print('Current :', data_set)

        partial = target + '.part'
        try:
            # Stream the response to disk instead of buffering it in memory;
            # both the connection and the file are closed by the `with`.
            with urllib.request.urlopen(url_template.format(data_set)) as response, \
                    open(partial, 'wb') as f:
                shutil.copyfileobj(response, f)
            os.replace(partial, target)
        finally:
            # Only present here if the transfer or the rename failed.
            if os.path.exists(partial):
                os.remove(partial)

In [5]:
def aggregate(gaia_source, n_min=24000, n_max=24100, mode='coords', offset=42000):
    """
    Concatenate selected columns of a range of downloaded Gaia DR2 files
    and save them into a single '.csv' file.

    The output is written to
    'd:\\gaia_dr2_<kind>\\dr2_data_<kind>_<n_min>-<n_max>.csv', where <kind>
    is 'coords' or 'hrd'. If that file already exists nothing is done.

    Parameters
    ----------
        - gaia_source : array of str
          Array, containing already downloaded Gaia DR2 datafile names
        - n_min : int
          Index of the first file to aggregate; `gaia_source[n_min - offset]`
          is the first file read
        - n_max : int
          Index one past the last file to aggregate
        - mode : str
          Case-insensitive selector. If it contains 'c' the columns
          'l', 'b' are kept ('coords'); otherwise, if it contains 'h', the
          columns 'parallax', 'phot_g_mean_mag', 'bp_rp', 'bp_g' are kept
          ('hrd').
        - offset : int
          Value subtracted from `n_min` / `n_max` to obtain positions in
          `gaia_source`. The default is the previously hard-coded 42000.

    Returns
    -------
        - 0 if the output file already exists, None otherwise

    Raises
    ------
        - ValueError
          If `mode` contains neither 'c' nor 'h', if the requested range
          lies before the start of `gaia_source`, or if it selects no files
    """
    mode_key = mode.lower()
    if 'c' in mode_key:
        annote = 'coords'
        columns = ['l', 'b']

    elif 'h' in mode_key:
        annote = 'hrd'
        columns = ['parallax', 'phot_g_mean_mag', 'bp_rp', 'bp_g']

    else:
        raise ValueError(
            "mode must contain 'c' (coords) or 'h' (hrd), got {!r}".format(mode))

    file_name = 'd:\\gaia_dr2_{2}\\dr2_data_{2}_{0}-{1}.csv'.format(n_min, n_max, annote)
    if os.path.exists(file_name):
        return 0

    # A negative bound would silently wrap around and pick files from the
    # end of `gaia_source`, producing an output whose name does not match
    # its content -- refuse instead.
    first, last = n_min - offset, n_max - offset
    if first < 0 or last < 0:
        raise ValueError(
            "range {}-{} lies before offset {}".format(n_min, n_max, offset))

    selected = gaia_source[first:last]
    if len(selected) == 0:
        raise ValueError(
            "range {}-{} (offset {}) selects no files".format(n_min, n_max, offset))

    li = []
    for data_set in selected:
        with gzip.open(data_path + data_set, 'r') as f:
            # Parse only the needed columns; the trailing [columns] keeps
            # them in the requested order.
            df = pd.read_csv(f, index_col=None, header=0, usecols=columns)[columns]
            li.append(df)

    # Create the output directory up front so the save cannot fail on a
    # missing folder after all files have been read.
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save relevant data into '.csv'
    print("Concatenating data started...")
    pd.concat(li, axis=0, ignore_index=True).to_csv(file_name, index=False, mode='w+')

In [7]:
# Fetch the file names listed in the remote 'MD5SUM.txt' index.
md5sum = get_path_list()

In [12]:
# Number of entries returned by get_path_list().
len(md5sum)

61234

In [13]:
# Download the entries at positions 60000-61233 of `md5sum`, i.e. up to the
# end of the 61234-entry list; files already present on disk are skipped.
download_files(md5sum, n_min=60000, n_max=61234)

Current : GaiaSource_999922404314639104_1000172126596665472.csv.gz


In [15]:
# os.listdir() returns entries in arbitrary order, but aggregate() selects
# files by their position in this list, so sort the names to make the order
# reproducible, and keep only the '.csv.gz' data files.
# NOTE(review): assumes the sorted order matches the order of the
# 'MD5SUM.txt' index -- confirm.
gaia_source = sorted(name for name in os.listdir(data_path) if name.endswith('.csv.gz'))

In [16]:
# Size of every listed file in MiB (bytes / 1024**2), as a float array.
bytes_per_mib = 1024 * 1024
sizes = np.array(
    [os.stat(data_path + file_name).st_size / bytes_per_mib for file_name in gaia_source],
    dtype=float,
)

In [17]:
# Mean file size in MiB.
sizes.mean()

11.154716697913813

In [18]:
# Number of entries found in `data_path`.
len(gaia_source)

19234

In [22]:
# Aggregate the 1000-file chunk covering indices 60000-60999, first the
# coordinates and then the HRD columns.
for chunk in range(60, 61):
    first, last = chunk * 1000, (chunk + 1) * 1000
    for banner, mode_flag in (("\rStarting coords aggregating...", 'c'),
                              ("\rStarting HRD aggregating...", 'h')):
        clear_output(wait=True)
        print(banner)
        aggregate(gaia_source, n_min=first, n_max=last, mode=mode_flag)

Starting HRD aggregating...
Concatenating data started...


In [23]:
# Aggregate the final, partial chunk (indices 61000-61233) in both modes.
for mode_flag in ('c', 'h'):
    aggregate(gaia_source, n_min=61000, n_max=61234, mode=mode_flag)

Concatenating data started...
Concatenating data started...
