# Download data and create registry

This is a Python notebook that downloads Seaglider data files over HTTPS into the local cache, and generates a registry file `../seagliderOG1/seaglider_registry.txt` listing each filename with its SHA256 checksum.


In [None]:
import sys
import importlib
import yaml
# NOTE(review): machine-specific absolute path. Point this at your own checkout of
# seagliderOG1 (or install the package) before running on another machine.
sys.path.append('/Users/eddifying/Cloudfree/gitlab-cloudfree/seagliderOG1')

# Shell one-liner that builds a registry file directly from the cache directory:
# python -c "import pooch; pooch.make_registry('/Users/eddifying/Library/Caches/seagliderOG1','seagliderOG1/registry.txt')"

In [None]:
from seagliderOG1 import fetchers
import xarray as xr
import os
import warnings
import pooch
from seagliderOG1 import vocabularies
# Silence every warning category (Warning is the base class of all of them),
# presumably to keep the cell output readable. NOTE(review): this also hides
# deprecation and runtime warnings from the libraries used below.
warnings.simplefilter("ignore", category=Warning)


In [3]:
# Names of the data files
# Found these by searching on https://catalog.data.gov/dataset/?q=eriksen&sort=score+desc%2C+name+asc&ext_location=&ext_bbox=&ext_prev_extent=&page=4
# All directories live under the same NCEI root; only the trailing
# "<3-digit id>/<8-digit date>" part differs (presumably the glider serial
# number and the deployment date -- TODO confirm against the NCEI catalogue).
ncei_root = "https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/"

deployment_dirs = [
    # Labrador Sea
    "015/20040924",
    "014/20040924",
    "008/20031002",
    "004/20031002",
    "016/20050406",
    # RAPID/MOCHA
    "033/20100729",
    "034/20110128",

    # Either Iceland, Faroes or RAPID/MOCHA
    "005/20090829",
    "005/20080606",
    "005/20081106",
    "012/20070831",
    "014/20080214",
    "014/20080222",
    "016/20061112",
    "016/20090605",
    "016/20071113",
    "016/20080607",
    "033/20100518",
    "033/20100903",
    "101/20081108",
    "101/20061112",
    "101/20070609",
    "102/20061112",
    "102/20071113",
    "103/20090223",
    "103/20071113",
    "103/20070218",
    "104/20080214",
    "104/20070901",
]

# Full directory URLs, each ending in "/" so a file name can be appended directly.
servers = [f"{ncei_root}{deployment}/" for deployment in deployment_dirs]

for server in servers:
    print(f"Processing server: {server}")

    # File names offered by this server directory; each one is appended to the
    # server URL below to form the download URL.
    filenames = fetchers.list_files_in_https_server(server)

    # Base url for the data files
    base_url = server

    # Download into the per-user cache directory that pooch selects for the
    # current OS (~/Library/Caches/seagliderOG1 on macOS) rather than a path
    # tied to one particular user's home directory.
    basedir = str(pooch.os_cache("seagliderOG1"))
    directory = basedir
    # exist_ok avoids the check-then-create race and makes re-runs a no-op.
    os.makedirs(directory, exist_ok=True)

    # Download each data file into the cache directory.
    # known_hash=None disables verification on purpose: the checksums are not
    # known yet -- they are what the registry generated below records.
    for fname in filenames:
        pooch.retrieve(
            url=base_url + fname, known_hash=None, fname=fname, path=directory
        )

    # Create a registry file from everything currently in the cache directory
    pooch.make_registry(basedir, "registry_new.txt")

    def update_registry(existing_registry_path, new_registry_path):
        """Merge the entries of a newly generated registry into an existing one.

        Both files are plain-text registries with one ``<name> <hash> [<url>]``
        entry per line. The existing registry is rewritten in place so that each
        file name appears exactly once:

        * entries already present keep their position and their hash, unless the
          recorded hash is the placeholder ``None``, in which case the first real
          hash found for that name (in either file) replaces it;
        * names found only in the new registry are added at the end.

        Blank lines are dropped. ``#`` comment lines from the existing registry
        are kept and written ahead of the entries. An optional third column is
        carried along with the hash it belongs to.

        Parameters
        ----------
        existing_registry_path : str
            Registry file to update in place. Treated as empty if it does not
            exist yet (its directory must exist).
        new_registry_path : str
            Registry file whose entries are merged in.

        Raises
        ------
        ValueError
            If a non-blank, non-comment line has a name but no hash column.
        FileNotFoundError
            If ``new_registry_path`` does not exist.
        """
        def read_registry(path):
            """Parse ``path`` into (comment lines, [(name, 'hash [url]'), ...])."""
            comment_lines = []
            entries = []
            # Explicit encoding so parsing does not depend on the platform default.
            with open(path, 'r', encoding='utf-8') as f:
                for lineno, raw in enumerate(f, start=1):
                    fields = raw.split()
                    if not fields:
                        continue  # blank line
                    if fields[0].startswith('#'):
                        comment_lines.append(raw.rstrip())
                        continue
                    if len(fields) < 2:
                        raise ValueError(
                            f"{path}:{lineno}: expected '<name> <hash>', got {raw!r}"
                        )
                    entries.append((fields[0], ' '.join(fields[1:])))
            return comment_lines, entries

        # A missing target registry simply means there is nothing to merge into yet.
        try:
            comment_lines, existing_entries = read_registry(existing_registry_path)
        except FileNotFoundError:
            comment_lines, existing_entries = [], []
        _, new_entries = read_registry(new_registry_path)

        # Merge in memory (existing entries first, so they win) instead of
        # appending raw lines to the file: appending glued the first new entry
        # onto an unterminated last line and corrupted the registry.
        registry_dict = {}
        for key, value in existing_entries + new_entries:
            # The first token of value is the hash; 'None' marks "hash unknown".
            if key not in registry_dict or registry_dict[key].split()[0] == 'None':
                registry_dict[key] = value

        # Write the cleaned registry back to the file
        with open(existing_registry_path, 'w', encoding='utf-8') as f:
            for comment in comment_lines:
                f.write(f"{comment}\n")
            for key, value in registry_dict.items():
                f.write(f"{key} {value}\n")

    # Merge the registry just generated for the cache directory into the
    # project registry file, once per server so progress is kept if the run
    # is interrupted part-way through the server list.
    update_registry('../seagliderOG1/seaglider_registry.txt', 'registry_new.txt')

Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/015/20040924/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/014/20040924/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/008/20031002/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/004/20031002/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/016/20050406/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/033/20100729/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/034/20110128/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/005/20090829/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/005/20080606/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/005/20081106/
Processing server: https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/012/20070831/

Downloading data from 'https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/102/20071113/p1020105_20071201.nc' to file '/Users/eddifying/Library/Caches/seagliderOG1/p1020105_20071201.nc'.
SHA256 hash of downloaded file: cfc4ba146a54510cf261459f3910f56c37808191e0c587d789fc16f9b5422790
Use this value as the 'known_hash' argument of 'pooch.retrieve' to ensure that the file hasn't changed if it is downloaded again in the future.
Downloading data from 'https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/102/20071113/p1020106_20071202.nc' to file '/Users/eddifying/Library/Caches/seagliderOG1/p1020106_20071202.nc'.
SHA256 hash of downloaded file: 6fe3a7fecdcc07708cc77a3de5e6c4c9470022da9b31425e102ca5af73f4c790
Use this value as the 'known_hash' argument of 'pooch.retrieve' to ensure that the file hasn't changed if it is downloaded again in the future.
Downloading data from 'https://www.ncei.noaa.gov/data/oceans/glider/seaglider/uw/102/20071113/p1020107_20071202.nc' to file '/Users/

KeyboardInterrupt: 