In [6]:
import gzip
import os
import urllib
import urllib.request

import numpy as np
import pandas as pd
from IPython.display import clear_output, display

In [7]:
# Directory that holds the downloaded Gaia DR2 source files. Paths are built
# from this prefix, so keep the trailing separator.
data_path = "d:\\gaia_dr2\\"
# Alternative location for the aggregated files (disabled):
#aggregated_path = "c:\\Users\\finance\\Documents\\GitHub\\Random_Interesting_Things\\Physics\\Star Map\\data\\"
# Directory whose files are read by `aggregate_coords`; keep the trailing separator.
aggregated_path = "d:\\gaia_dr2_csv\\"

## Download namelist

In [8]:
def get_path_list():
    """
    Fetch the Gaia DR2 `MD5SUM.txt` listing and return its file-name column.

    The listing is parsed as a header-less table separated by single spaces;
    the first two rows are dropped and the values of column 2 are returned
    as an array.
    """
    listing_url = 'http://cdn.gea.esac.esa.int/Gaia/gdr2/gaia_source/csv/MD5SUM.txt'

    with urllib.request.urlopen(listing_url) as response:
        # The table is read completely here, so it can be sliced after the
        # connection has been closed.
        listing = pd.read_csv(response, sep=' ', header=None)

    file_names = listing.iloc[2:][2].values
    return file_names

In [9]:
def download_files(md5sum, n_min=0, n_max=100):
    """
    Download a slice of the Gaia DR2 data files into `data_path`.

    Files that already exist in `data_path` are skipped, so the function can
    be called again to continue an interrupted download.

    Parameters
    ----------
        - md5sum : array of str
          Array, containing Gaia DR2 datafile names
        - n_min : int
          Index in `md5sum` of the first file to download
        - n_max : int
          Index in `md5sum` one past the last file to download
    """
    url_template = 'http://cdn.gea.esac.esa.int/Gaia/gdr2/gaia_source/csv/{}'

    for data_set in md5sum[n_min:n_max]:
        target = data_path + data_set
        if os.path.exists(target):
            continue

        # The context manager closes the HTTP connection even if reading fails.
        with urllib.request.urlopen(url_template.format(data_set)) as response:
            payload = response.read()

        print('Current :', data_set)
        clear_output(wait=True)

        # Write under a temporary name and rename afterwards: a file that
        # exists under its final name is then always complete, so the
        # existence check above never skips a truncated download.
        partial = target + '.part'
        try:
            with open(partial, 'wb') as f:
                f.write(payload)
            os.replace(partial, target)
        except BaseException:
            # Also covers KeyboardInterrupt; do not leave a stray partial file.
            if os.path.exists(partial):
                os.remove(partial)
            raise

In [10]:
def aggregate(gaia_source, n_min=24000, n_max=24100, mode='coords', offset=24000):
    """
    Extract selected columns from a range of downloaded Gaia DR2 files and
    save them together into a single CSV file.

    Parameters
    ----------
        - gaia_source : array of str
          Array, containing already downloaded Gaia DR2 datafile names
        - n_min : int
          Number of the first file to aggregate; also used in the output
          file name
        - n_max : int
          Number one past the last file to aggregate; also used in the
          output file name
        - mode : str
          Column selection, matched case-insensitively. A mode containing
          'c' keeps the columns 'l' and 'b' (label 'coords'); otherwise a
          mode containing 'h' keeps 'parallax', 'phot_g_mean_mag', 'bp_rp'
          and 'bp_g' (label 'hrd').
        - offset : int
          Value subtracted from `n_min` and `n_max` to obtain positions in
          `gaia_source`. Presumably the number of the first locally
          available file -- confirm against the downloaded range.

    Returns
    -------
        0 if the output file already exists (nothing is read or written),
        None after the output file has been written.

    Raises
    ------
        ValueError
          If `mode` selects no column set, if the requested range maps to
          negative or reversed positions in `gaia_source`, or if the range
          selects no files.
    """
    mode_key = mode.lower()
    if 'c' in mode_key:
        annote = 'coords'
        columns = ['l', 'b']

    elif 'h' in mode_key:
        annote = 'hrd'
        columns = ['parallax', 'phot_g_mean_mag', 'bp_rp', 'bp_g']

    else:
        raise ValueError(
            "mode must contain 'c' (coords) or 'h' (hrd), got {!r}".format(mode))

    file_name = 'd:\\gaia_dr2_{2}\\dr2_data_{2}_{0}-{1}.csv'.format(n_min, n_max, annote)
    if os.path.exists(file_name):
        return 0

    # A negative position would silently wrap around and select files from
    # the end of the list, so reject it explicitly.
    start, stop = n_min - offset, n_max - offset
    if not 0 <= start <= stop:
        raise ValueError(
            'n_min={} and n_max={} must satisfy offset <= n_min <= n_max '
            '(offset={})'.format(n_min, n_max, offset))

    selected = gaia_source[start:stop]
    if len(selected) == 0:
        raise ValueError(
            'no files selected for range {}-{} (offset={})'.format(n_min, n_max, offset))

    # Create the output directory before the expensive read loop so a missing
    # directory does not waste the whole run.
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    li = []
    for data_set in selected:
        with gzip.open(data_path + data_set, 'r') as f:
            # `usecols` avoids parsing unused columns; the trailing
            # `[columns]` restores the requested column order.
            df = pd.read_csv(f, index_col=None, header=0, usecols=columns)[columns]
            li.append(df)

    # Save relevant data into '.csv'
    pd.concat(li, axis=0, ignore_index=True).to_csv(file_name, index=False, mode='w+')

In [11]:
def aggregate_coords(l_min, l_max, b_min=-45, b_max=45, directory=None):
    """
    Collect the rows of all aggregated CSV files that fall inside an
    `l` / `b` window.

    Parameters
    ----------
        - l_min, l_max : number
          Exclusive lower and upper limits for the column `l`
        - b_min, b_max : number
          Exclusive lower and upper limits for the column `b`
        - directory : str or None
          Directory to read the CSV files from; `aggregated_path` is used
          when it is None

    Returns
    -------
        pandas.DataFrame with the selected rows of all files and a fresh
        0..N-1 index. Files are processed in sorted name order.

    Raises
    ------
        ValueError
          If the directory contains no files.
    """
    source_dir = aggregated_path if directory is None else directory

    li = []
    # Sorted so the row order of the result does not depend on the arbitrary
    # order returned by os.listdir.
    for data_set in sorted(os.listdir(source_dir)):
        df = pd.read_csv(os.path.join(source_dir, data_set), index_col=None, header=0)
        print(data_set)
        li.append(df[(df.l > l_min) & (df.l < l_max) & (df.b > b_min) & (df.b < b_max)])

    if not li:
        raise ValueError('no files found in {!r}'.format(source_dir))

    return pd.concat(li, axis=0, ignore_index=True)

In [12]:
# Fetch the array of Gaia DR2 data file names used by the download calls.
md5sum = get_path_list()

In [20]:
# Download the files at positions 40000-41999 of `md5sum`.
download_files(md5sum, n_min=40000, n_max=42000)

Current : GaiaSource_5347467002028376192_5347525108659028864.csv.gz


In [None]:
# Download the files at positions 42000-43999 of `md5sum`.
download_files(md5sum, n_min=42000, n_max=44000)

In [None]:
# Download the files at positions 44000-45999 of `md5sum`.
download_files(md5sum, n_min=44000, n_max=46000)

In [None]:
# Download the files at positions 46000-47999 of `md5sum`.
download_files(md5sum, n_min=46000, n_max=48000)

In [13]:
# Names of the files found in `data_path`. os.listdir returns them in
# arbitrary order, but `aggregate` selects files by position, so sort the
# listing to make those positions reproducible.
gaia_source = sorted(os.listdir(data_path))

In [14]:
# Size of every downloaded file in MiB (bytes divided by 1024 * 1024).
sizes = np.array(
    [os.path.getsize(data_path + name) / (1024 * 1024) for name in gaia_source],
    dtype=float,
)

In [15]:
# Mean size of the downloaded files in MiB.
sizes.mean()

10.365297745333777

In [16]:
# Number of entries found in `data_path`.
len(gaia_source)

18000

In [18]:
# Build the 'c' and 'h' aggregates for the file ranges 34000-35000 and
# 35000-36000, one output file per range and mode.
for chunk in range(34, 36):
    first, last = chunk * 1000, (chunk + 1) * 1000
    for selection in ('c', 'h'):
        aggregate(gaia_source, n_min=first, n_max=last, mode=selection)