In [2]:
# Install, import libraries
import sys
!{sys.executable} -m pip install python-dotenv pandas pydap geopandas thredds_crawler

import geopandas as gpd
import pandas as pd
from pydap.client import open_url
from pydap.cas.urs import setup_session

import xarray as xr
import requests
import re

%load_ext dotenv
%dotenv
from os import environ

Collecting python-dotenv
  Downloading python_dotenv-0.12.0-py2.py3-none-any.whl (17 kB)
Collecting thredds_crawler
  Using cached thredds_crawler-1.5.4-py3-none-any.whl (10 kB)
Installing collected packages: python-dotenv, thredds-crawler
Successfully installed python-dotenv-0.12.0 thredds-crawler-1.5.4


In [4]:
## Run this if you don't have a .nc file to work with

# Scheme and host only: read_datasets() prepends this to every path it is given,
# so it must not end with a path component.
base_url = "https://oco2.gesdisc.eosdis.nasa.gov"
def list_urls(year):
    """Return the server-relative paths of all ``.nc4`` files listed for ``year``.

    Fetches the ``catalog.xml`` of the ``OCO2_L2_Lite_FP.9r`` collection for the
    given year and extracts the value of every ``ID="..."`` attribute that points
    at a ``.nc4`` file under ``/opendap/hyrax/OCO2_L2_Lite_FP.9r/<yyyy>/``.

    Parameters
    ----------
    year : int or str
        Year directory to list; interpolated into the catalog URL as-is.

    Returns
    -------
    list of str
        Matching paths in the order they appear in the catalog.

    Raises
    ------
    requests.HTTPError
        If the catalog request returns an error status (previously an error
        page was silently reported as "Found 0 files").
    """
    r = requests.get(
        f"https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.9r/{year}/catalog.xml",
        timeout=60,  # do not hang forever on an unresponsive server
    )
    r.raise_for_status()
    # Raw string so \d is a regex class rather than an invalid string escape;
    # literal dots are escaped, and [^"]* keeps the match inside one attribute
    # value even when several quoted attributes share a line.
    paths = re.findall(r'ID="(/opendap/hyrax/OCO2_L2_Lite_FP\.9r/\d{4}/[^"]*\.nc4)"', r.text)
    print(f"Found {len(paths)} files for {year}.")
    return paths

## Based on code from http://xarray.pydata.org/en/stable/io.html
def read_datasets(files, dim, transform_func=None):
    """Load every remote file in ``files`` and concatenate the results along ``dim``.

    Each path is appended to the module-level ``base_url`` and opened through a
    pydap session created with the ``EARTHDATA_USER`` / ``EARTHDATA_PASS``
    environment variables. Files are processed in sorted order.

    Parameters
    ----------
    files : iterable of str
        Server-relative paths (for example the list returned by ``list_urls``).
    dim : str
        Dimension passed to ``xr.concat``.
    transform_func : callable, optional
        Called with each opened dataset before it is loaded; should select or
        aggregate so that only the needed data is loaded.

    Returns
    -------
    The result of ``xr.concat`` over the loaded per-file datasets.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    paths = sorted(files)  # TODO use Glob
    if not paths:
        raise ValueError("read_datasets() needs at least one file path")
    total = len(paths)

    # Authenticate once against the first file that will actually be loaded and
    # reuse the session for every request.
    session = setup_session(
        environ.get("EARTHDATA_USER"),
        environ.get("EARTHDATA_PASS"),
        check_url=base_url + paths[0],
    )

    def process_one_path(url, position):
        print(f"Loading dataset {position}/{total}")
        # Context managers make sure the remote store and dataset are closed
        # once the (transformed) data has been loaded.
        with xr.backends.PydapDataStore.open(url, session=session) as store:
            with xr.open_dataset(store) as ds:
                if transform_func is not None:
                    ds = transform_func(ds)
                return ds.load()

    # Count from 1 so the progress line ends at "total/total".
    datasets = [process_one_path(base_url + p, i) for i, p in enumerate(paths, start=1)]

    return xr.concat(datasets, dim)

# Method to reshape one dataset
def reshape_dataset(ds):
    """Return ``ds`` indexed by the list of variable names this notebook keeps."""
    wanted = ['sounding_id', 'latitude', 'longitude', 'time', 'xco2']
    return ds[wanted]

# Load every file listed for 2018, keep only the variables picked by
# reshape_dataset, and concatenate the per-file results along sounding_id.
combined = read_datasets(list_urls(2018), dim='sounding_id', transform_func=reshape_dataset)

# nbytes * 2**-20 converts bytes to MiB (the message labels it "MB").
print(f"Done. Loaded {combined.nbytes * (2 ** -20)} MB into a dataset.")

# Write the concatenated dataset to disk before the index is changed below.
combined.to_netcdf('2018_soundings.nc')

# Index sounding_id by (latitude, longitude, time) for the in-memory copy only.
combined = combined.set_index(sounding_id=['latitude', 'longitude', 'time'])

Found 350 files for 2018.
Loading dataset 0/350
Loading dataset 1/350
Loading dataset 2/350
Loading dataset 3/350
Loading dataset 4/350
Loading dataset 5/350
Loading dataset 6/350
Loading dataset 7/350
Loading dataset 8/350
Loading dataset 9/350
Loading dataset 10/350
Loading dataset 11/350
Loading dataset 12/350
Loading dataset 13/350
Loading dataset 14/350
Loading dataset 15/350
Loading dataset 16/350
Loading dataset 17/350
Loading dataset 18/350
Loading dataset 19/350
Loading dataset 20/350
Loading dataset 21/350
Loading dataset 22/350
Loading dataset 23/350
Loading dataset 24/350
Loading dataset 25/350
Loading dataset 26/350
Loading dataset 27/350
Loading dataset 28/350
Loading dataset 29/350
Loading dataset 30/350
Loading dataset 31/350
Loading dataset 32/350
Loading dataset 33/350
Loading dataset 34/350
Loading dataset 35/350
Loading dataset 36/350
Loading dataset 37/350
Loading dataset 38/350
Loading dataset 39/350
Loading dataset 40/350
Loading dataset 41/350
Loading dataset 42

In [3]:
## Run this if you already have a file

# Open the saved soundings file and index sounding_id by
# (latitude, longitude, time); the bare name on the last line displays it.
combined = (
    xr.open_dataset('2019_20_soundings.nc')
    .set_index(sounding_id=['latitude', 'longitude', 'time'])
)
combined

In [None]:
# Reference point as [longitude, latitude] (the order the slices below rely on)
# and the half-width of the search box, in the same units as the coordinates.
unc = [-79.0558,35.9132]
tol = 1
result = None

# Select everything within +/- tol of the reference point; a KeyError from
# .sel() is turned into a message instead of a traceback.
try:
    result = combined.sel(
        longitude=slice(unc[0] - tol, unc[0] + tol),
        latitude=slice(unc[1] - tol, unc[1] + tol),
    )
except KeyError as e:
    result = f"Not found: {e}"

print(result)

In [None]:
# NOTE(review): this cell was left unfinished -- `combined.latitude <` had no
# right-hand operand, which is a SyntaxError and stops "Run All" here.
# The bound used below is an assumption (upper latitude edge of the unc +/- tol
# box from the previous cell) -- TODO confirm the intended threshold.
combined.where(combined.latitude < unc[1] + tol)

In [None]:
# Open one remote file lazily for the unstacking experiments in the next cells.
#
# The URL lives in its own variable: rebinding the shared `base_url` here would
# break read_datasets(), which prepends `base_url` to full server paths.
# The file path comes from list_urls() (defined in the download cell); the cell
# previously indexed `datasets`, which only exists as a local inside
# read_datasets() and is therefore undefined on a fresh kernel.
sample_url = "https://oco2.gesdisc.eosdis.nasa.gov" + list_urls(2018)[0]
session = setup_session(environ.get("EARTHDATA_USER"), environ.get("EARTHDATA_PASS"), check_url=sample_url)
# No context manager on purpose: `ds` stays lazy and later cells still read
# from the open store.
store = xr.backends.PydapDataStore.open(sample_url, session=session)
ds = xr.open_dataset(store)

# Keep only the variables of interest and index sounding_id by
# (latitude, longitude, time).
ds = (ds[[ 'sounding_id', 'latitude', 'longitude', 'time', 'xco2' ]]
   .set_index(sounding_id=['latitude', 'longitude', 'time']))

In [None]:
import time

# Time how long unstack() takes on the first n entries of ds along sounding_id.
unstacked = None
for n in [100]:
    start = time.time()
    print(f"Unstacking {n} rows")
    unstacked = (
        ds.isel(sounding_id=slice(0, n))
        .unstack()
    )
    print(f"Elapsed for {n} time: {time.time() - start}s")

In [None]:
# Size of ds in MiB (bytes / 2**20).
ds.nbytes / 2 ** 20

In [None]:
# Size of the unstacked subset in MiB (bytes / 2**20).
unstacked.nbytes / 2 ** 20

In [8]:
# Size of the combined dataset in GiB (bytes / 2**30).
combined.nbytes / 2 ** 30

1.6487814523279667