In [1]:
# Install, import libraries
import sys
!{sys.executable} -m pip install python-dotenv pandas pydap geopandas thredds_crawler

import geopandas as gpd
import pandas as pd
from pydap.client import open_url
from pydap.cas.urs import setup_session

import xarray as xr
import requests
import re

%load_ext dotenv
%dotenv
from os import environ



In [2]:
def list_urls(year, timeout=60):
    """List the server-relative paths of the ``.nc4`` files catalogued for one year.

    Fetches ``catalog.xml`` for ``year`` from the OCO2_L2_Lite_FP.9r OPeNDAP
    directory and extracts every ``ID="..."`` attribute that points at a
    ``.nc4`` file below ``/opendap/hyrax/OCO2_L2_Lite_FP.9r/<yyyy>/``.

    Parameters
    ----------
    year : int or str
        Year folder to list; interpolated into the catalog URL.
    timeout : float, optional
        Seconds to wait for the server, passed to ``requests.get``.

    Returns
    -------
    list of str
        Matching paths in the order they appear in the catalog. Each path
        starts with ``/opendap/hyrax/``, so it has to be appended to the bare
        host name to form a full URL.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, instead of silently
        reporting "Found 0 files".
    """
    catalog_url = f"https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.9r/{year}/catalog.xml"
    r = requests.get(catalog_url, timeout=timeout)
    r.raise_for_status()
    # Raw string so \d is a regex escape, literal dots are escaped, and
    # [^"]* keeps a match inside one attribute value even when several
    # IDs share a line (a greedy .* would swallow them into one match).
    paths = re.findall(r'ID="(/opendap/hyrax/OCO2_L2_Lite_FP\.9r/\d{4}/[^"]*\.nc4)"', r.text)
    print(f"Found {len(paths)} files for {year}.")
    return paths

# Gather the file paths of both years into one flat list, 2019 first.
datasets = [path for year in (2019, 2020) for path in list_urls(year)]

Found 355 files for 2019.
Found 17 files for 2020.


In [None]:
base_url = "https://oco2.gesdisc.eosdis.nasa.gov"
## Based on code from http://xarray.pydata.org/en/stable/io.html
def read_datasets(files, dim, transform_func=None):
    """Open each remote file, load it into memory and concatenate the results.

    A single authenticated session is created from the ``EARTHDATA_USER`` and
    ``EARTHDATA_PASS`` environment variables and reused for every file.

    Parameters
    ----------
    files : sequence of str
        Server-relative paths; each one is appended to ``base_url``. They are
        processed in sorted order.
    dim : str
        Dimension handed to ``xr.concat``.
    transform_func : callable, optional
        Called with each opened dataset before it is loaded; should return
        the (typically reduced) dataset to keep.

    Returns
    -------
    The result of ``xr.concat`` over all loaded datasets.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    if not files:
        raise ValueError("read_datasets() needs at least one file path")

    session = setup_session(environ.get("EARTHDATA_USER"), environ.get("EARTHDATA_PASS"), check_url=base_url+files[0])
    paths = sorted(files)
    total = len(paths)

    def process_one_path(path, position):
        # position is 1-based so the last message reads "total/total"
        print(f"Loading dataset {position}/{total}")
        # use a context manager, to ensure the file gets closed after use
        with xr.backends.PydapDataStore.open(path, session=session) as store:
            with xr.open_dataset(store) as ds:
                # transform_func should do some sort of selection or
                # aggregation
                if transform_func is not None:
                    ds = transform_func(ds)
                # load before the with-blocks exit and close the dataset
                ds.load()
                return ds

    loaded = [process_one_path(base_url+p, i) for i, p in enumerate(paths, start=1)]

    return xr.concat(loaded, dim)

# Per-file transform: keep only the variables used in this notebook
def reshape_dataset(ds):
    """Index ``ds`` with the list of variable names this analysis needs."""
    wanted = ['sounding_id', 'latitude', 'longitude', 'time', 'xco2']
    return ds[wanted]

# Load every listed file, reduced by reshape_dataset to the variables of
# interest, and join them end to end along the sounding_id dimension.
combined = read_datasets(datasets, transform_func=reshape_dataset, dim='sounding_id')

print(f"Done. Loaded {combined.nbytes * (2 ** -20)} MB into a dataset.")

# Save the flat dataset first; the re-indexed version below is kept in memory only.
combined.to_netcdf('all_soundings.nc')

# Re-index sounding_id by (latitude, longitude, time).
index_levels = ['latitude', 'longitude', 'time']
combined = combined.set_index(sounding_id=index_levels)

Loading dataset 0/372
Loading dataset 1/372
Loading dataset 2/372
Loading dataset 3/372
Loading dataset 4/372
Loading dataset 5/372
Loading dataset 6/372
Loading dataset 7/372
Loading dataset 8/372
Loading dataset 9/372
Loading dataset 10/372
Loading dataset 11/372
Loading dataset 12/372
Loading dataset 13/372
Loading dataset 14/372
Loading dataset 15/372
Loading dataset 16/372
Loading dataset 17/372
Loading dataset 18/372
Loading dataset 19/372
Loading dataset 20/372
Loading dataset 21/372
Loading dataset 22/372
Loading dataset 23/372
Loading dataset 24/372
Loading dataset 25/372
Loading dataset 26/372
Loading dataset 27/372
Loading dataset 28/372
Loading dataset 29/372
Loading dataset 30/372
Loading dataset 31/372
Loading dataset 32/372
Loading dataset 33/372
Loading dataset 34/372
Loading dataset 35/372
Loading dataset 36/372
Loading dataset 37/372
Loading dataset 38/372
Loading dataset 39/372
Loading dataset 40/372
Loading dataset 41/372
Loading dataset 42/372
Loading dataset 43/37

In [None]:
# Point of interest as [longitude, latitude] and the half-width of the search
# window around it (same units as the coordinates — presumably degrees).
unc = [-79.0558,35.9132]
tol = 1
result = None

lon_window = slice(unc[0]-tol, unc[0]+tol)
lat_window = slice(unc[1]-tol, unc[1]+tol)

# A KeyError from the selection is reported as text rather than raised.
try:
    result = combined.sel(longitude=lon_window, latitude=lat_window)
except KeyError as e:
    result = f"Not found: {e}"

print(result)

In [None]:
# Mask-based alternative to the .sel() lookup: keep the soundings whose
# coordinates fall within `tol` of the point `unc` ([longitude, latitude]).
# NOTE(review): the comparison in this cell had no right-hand side; the bounds
# used here are assumed from the `unc`/`tol` cell above — confirm the intent.
in_box = (
    (combined.longitude >= unc[0] - tol) & (combined.longitude <= unc[0] + tol)
    & (combined.latitude >= unc[1] - tol) & (combined.latitude <= unc[1] + tol)
)
combined.where(in_box, drop=True)

In [None]:
# Open only the first listed file for quick experiments.
# The entries of `datasets` are server-relative and already begin with
# "/opendap/hyrax/...", so the base has to be the bare host; this is also the
# value read_datasets() expects to find in `base_url`.
base_url = "https://oco2.gesdisc.eosdis.nasa.gov"
first_url = base_url + datasets[0]
session = setup_session(environ.get("EARTHDATA_USER"), environ.get("EARTHDATA_PASS"), check_url=first_url)
# No context manager here: `ds` is used again by the cells below, so the
# store is deliberately left open.
store = xr.backends.PydapDataStore.open(first_url, session=session)
ds = xr.open_dataset(store)

# Keep the variables of interest and index sounding_id by (latitude, longitude, time).
ds = (ds[[ 'sounding_id', 'latitude', 'longitude', 'time', 'xco2' ]]
   .set_index(sounding_id=['latitude', 'longitude', 'time']))

In [None]:
import time

# Measure how long it takes to unstack the first n soundings of `ds`.
unstacked = None
for n in [100]:
    start = time.time()
    print(f"Unstacking {n} rows")
    head = ds.isel(sounding_id=slice(n))
    unstacked = head.unstack()
    print(f"Elapsed for {n} time: {time.time()-start}s")

In [None]:
# Size of the single-file dataset: bytes divided by 2**20
ds.nbytes / 2 ** 20

In [None]:
# Size of the unstacked subset: bytes divided by 2**20
unstacked.nbytes / 2 ** 20