In [1]:
# Install, import libraries
import sys
!{sys.executable} -m pip install google-api-python-client earthengine-api

import re
from pathlib import Path

import geopandas as gpd
import pandas as pd
import requests
import xarray as xr
from pydap.cas.urs import setup_session
from pydap.client import open_url

%load_ext dotenv
%dotenv
from os import environ



In [None]:
## Run this if you don't have a .nc file to work with

base_url = "https://oco2.gesdisc.eosdis.nasa.gov"


def list_urls(year):
    """List the server paths of the OCO2_L2_Lite_FP.9r files for one year.

    Downloads the year's ``catalog.xml`` from ``base_url`` and extracts every
    ``ID="..."`` attribute that points at a ``.nc4`` file.

    Args:
        year: Year folder to query (interpolated into the catalog URL).

    Returns:
        list[str]: Paths relative to ``base_url``, in catalog order.

    Raises:
        requests.HTTPError: If the catalog request returns an error status.
    """
    catalog_url = f"{base_url}/opendap/OCO2_L2_Lite_FP.9r/{year}/catalog.xml"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(catalog_url, timeout=60)
    # Fail loudly rather than reporting "Found 0 files" for an error page.
    r.raise_for_status()
    # Raw string so \d is a regex escape; dots are escaped and [^"]* stops at
    # the closing quote, so two IDs on the same line are never merged.
    paths = re.findall(r'ID="(/opendap/hyrax/OCO2_L2_Lite_FP\.9r/\d{4}/[^"]*\.nc4)"', r.text)
    print(f"Found {len(paths)} files for {year}.")
    return paths

## Based on code from http://xarray.pydata.org/en/stable/io.html
def read_datasets(files, dim, transform_func=None):
    """Load several remote datasets and concatenate them along one dimension.

    Each path is prefixed with ``base_url``, opened through pydap with a
    session created from the ``EARTHDATA_USER`` / ``EARTHDATA_PASS``
    environment variables, optionally transformed, loaded into memory and
    finally concatenated.

    Args:
        files: Iterable of server paths (relative to ``base_url``). They are
            processed in sorted order.
        dim: Name of the dimension to concatenate along.
        transform_func: Optional callable applied to each opened dataset
            before it is loaded; it should select or aggregate so that less
            data has to be transferred.

    Returns:
        The result of ``xr.concat`` over all loaded datasets.

    Raises:
        ValueError: If ``files`` is empty.
    """
    paths = sorted(files)  # TODO use Glob
    if not paths:
        # Without this guard the failure would be an IndexError on the first
        # path lookup, which says nothing about the real problem.
        raise ValueError("read_datasets() needs at least one file path, got none.")

    session = setup_session(
        environ.get("EARTHDATA_USER"),
        environ.get("EARTHDATA_PASS"),
        check_url=base_url + paths[0],
    )

    def process_one_path(path, i):
        print(f"Loading dataset {i}/{len(paths)}")
        # use a context manager, to ensure the file gets closed after use
        with xr.backends.PydapDataStore.open(path, session=session) as store:
            with xr.open_dataset(store) as ds:
                # transform_func should do some sort of selection or
                # aggregation
                if transform_func is not None:
                    ds = transform_func(ds)
                # Pull the data into memory before the store is closed.
                ds.load()
                return ds

    # Count from 1 so the progress line ends at N/N instead of N-1/N.
    datasets = [process_one_path(base_url + p, i) for i, p in enumerate(paths, start=1)]

    return xr.concat(datasets, dim)

def reshape_dataset(ds):
    """Return ``ds`` narrowed down to the fields kept for every sounding."""
    kept_fields = ['sounding_id', 'latitude', 'longitude', 'time', 'xco2']
    return ds[kept_fields]

# Year of files to download; it also names the NetCDF file written below, so
# the two can no longer drift apart.
YEAR = 2019

# Every file is cut down by reshape_dataset before it is loaded, and the
# pieces are then concatenated along the sounding_id dimension.
# You might also use indexing operations like .sel to subset datasets.
combined = read_datasets(list_urls(YEAR), dim='sounding_id', transform_func=reshape_dataset)

print(f"Done. Loaded {combined.nbytes * (2 ** -20)} MB into a dataset.")

# Keep a local copy so the download does not have to be repeated.
combined.to_netcdf(f'{YEAR}_soundings.nc')

# Replace the sounding_id index with a (latitude, longitude, time) multi-index.
nc_file = combined.set_index(sounding_id=['latitude', 'longitude', 'time'])

Found 355 files for 2019.
Loading dataset 0/355
Loading dataset 1/355
Loading dataset 2/355
Loading dataset 3/355
Loading dataset 4/355
Loading dataset 5/355
Loading dataset 6/355
Loading dataset 7/355
Loading dataset 8/355
Loading dataset 9/355
Loading dataset 10/355
Loading dataset 11/355
Loading dataset 12/355
Loading dataset 13/355
Loading dataset 14/355
Loading dataset 15/355
Loading dataset 16/355
Loading dataset 17/355
Loading dataset 18/355
Loading dataset 19/355
Loading dataset 20/355
Loading dataset 21/355
Loading dataset 22/355
Loading dataset 23/355
Loading dataset 24/355
Loading dataset 25/355
Loading dataset 26/355
Loading dataset 27/355
Loading dataset 28/355
Loading dataset 29/355
Loading dataset 30/355
Loading dataset 31/355
Loading dataset 32/355
Loading dataset 33/355
Loading dataset 34/355
Loading dataset 35/355
Loading dataset 36/355
Loading dataset 37/355
Loading dataset 38/355
Loading dataset 39/355
Loading dataset 40/355
Loading dataset 41/355
Loading dataset 42

In [7]:
## Run this if you already have a file

# Previously saved soundings file to read instead of downloading again.
soundings_path = '2017_soundings_backup.nc'
nc_file = xr.open_dataset(soundings_path)
# Bare expression so the notebook renders the dataset summary.
nc_file

In [None]:
# Flatten the dataset into a table with one row per sounding.
pd_frame = nc_file.to_dataframe()
# Build real point geometries (x = longitude, y = latitude) in one vectorised
# call. WKT strings are not accepted as a geometry column by GeoDataFrame,
# and a row-wise apply is very slow on a full year of soundings.
pd_frame['geometry'] = gpd.points_from_xy(pd_frame['longitude'], pd_frame['latitude'])
# The coordinates now live in the geometry column. errors='ignore' because
# sounding_id may be the frame's index rather than a column, depending on how
# the dataset was indexed (assumption, verify against your file).
pd_frame = pd_frame.drop(columns=['longitude', 'latitude', 'sounding_id'], errors='ignore')
pd_frame

In [None]:
# Promote the table to a GeoDataFrame using its 'geometry' column.
# The CRS is declared so exported files carry projection information.
# NOTE(review): assumes latitude/longitude are WGS84 degrees (EPSG:4326);
# confirm against the data product documentation.
gpd_frame = gpd.GeoDataFrame(pd_frame, geometry='geometry', crs='EPSG:4326')
gpd_frame

In [None]:
# NOTE(review): this path says 2018 while other cells reference 2017 and 2019
# files; confirm the intended year before exporting.
shapefile_path = Path('oco2_out/2018/oco2_2018.shp')
# Create the destination folder up front so the export does not fail on a
# fresh checkout where oco2_out/2018 does not exist yet.
shapefile_path.parent.mkdir(parents=True, exist_ok=True)
gpd_frame.to_file(str(shapefile_path), driver='ESRI Shapefile')