# Read script for xCO2 data
## NOAA Greenhouse Gas Marine Boundary Layer Reference

Can be downloaded from: 
https://gml.noaa.gov/ccgg/mbl/
(Choose 'Surface' with all dates, all latitudes to enable download of entire dataset, though plot will show only 8 years.)
Filename of format:  co2_GHGreference.##########_surface.txt.   

This file should be included with the contents of this notebook (~2.1 MB).  Download it to ./data/raw/ if it is not. 

### Notes on data file
Data is in rows: 
  - ```decimalYear, measurement, uncertainty, measurement, uncertainty ...``` 
for each of the sine-of-latitude points given in the file.  There is only one spatial dimension: data is averaged by latitude.  

Geolocation given by sineLatitude values as 'Sine of latitude steps: ' in comment/header at start of file.   These are converted to latitude values for use in these CO2 calculations. 

Dates are roughly every 8 days.  Nearest neighbor is used initially for corresponding time.  Proper treatment for monthly data requires interpolation by time to daily values, then averaging for each month at each latitude.  

Data starts in the first row after comments/header indicated by '#' at start of each line.

Data source, contacts, and citation are given in the data file.


In [119]:
import os 
import io
import datetime

import numpy as np 
import xarray as xr
import pandas as pd 

from path import Path
import regex as re
import PyAstronomy.pyasl as pyasl

## Read xCO2 data from fully documented downloaded file.  
This file needs to be downloaded from the website  https://gml.noaa.gov/ccgg/mbl/ as instructed above.  Filename is of the format:  co2_GHGreference.##########_surface.txt, with the '#' generated at download time.  The file is assumed to be in ./data/raw.

In [120]:

def check_xCO2_file(mblFile):
    """Select the xCO2 MBL file to use and verify it is an MBL SURFACE file.

    Parameters
    ----------
    mblFile : Path or list of Path
        Either a single file, or a list of candidate files (for example
        every file in ./data/raw matching
        co2_GHGreference.###########_surface.txt).  When a list is given,
        the most recently modified file (largest ``getmtime()``) is used.

    Returns
    -------
    Path
        The selected file.

    Raises
    ------
    Exception
        If ``mblFile`` is neither a Path nor a non-empty list, or if no line
        up to and including the first non-comment line matches
        ``MBL.*SURFACE``.
    """
    missing_msg = "Need xCO2 MBL file in ./data/raw/  of form co2_GHGreference.###########_surface.txt"

    if isinstance(mblFile, list):
        if not mblFile:
            # An empty glob result previously surfaced as an AttributeError
            # on None; report the real problem instead.
            raise Exception(missing_msg)
        # Most recent file based on modification time (for unix this is
        # .getmtime, not .getctime).  max() keeps the first of any ties.
        mblFile = max(mblFile, key=lambda candidate: candidate.getmtime())
    elif not isinstance(mblFile, Path):
        raise Exception(missing_msg)

    # Check that this is an MBL SURFACE file: scan the '#' header lines (and
    # the first data line, where scanning stops) for the product marker.
    is_mbl_surface = False
    with open(mblFile.abspath()) as header_file:
        for line in header_file:
            if re.search("MBL.*SURFACE", line):
                is_mbl_surface = True
                break
            if not line.startswith("#"):
                break
    if not is_mbl_surface:
        raise Exception(
            "The file at the provided url is not an MBL SURFACE file. "
            "Please check that you have downloaded the correct surface file. "
        )

    # TODO:  save name of xCO2 MBL file to log file

    return mblFile

In [121]:
def read_xCO2_file(mblFile):
    """Read an xCO2 MBL surface data file into an xarray dataset.

    Steps:
      - read the '#' comment header at the start of the file
      - parse the sine-of-latitude column positions from the header and
        convert them to degrees latitude
      - read the data rows into a pandas dataframe
      - convert decimal-year dates to datetime objects
      - reshape to an xarray dataset with dims/coordinates time, latitude

    Parameters
    ----------
    mblFile : Path
        The data file; must provide ``abspath()``.

    Returns
    -------
    xarray.Dataset
        Variables 'xco2' and 'xco2_uncert' on (time, latitude), with the
        attribute ``datasetName = 'xco2'``.

    Raises
    ------
    ValueError
        If the header has no 'Sine of latitude steps' line, so the latitude
        coordinate cannot be built.
    """
    # round latitude degrees to number decimals
    ROUND_TO = 2

    fname = mblFile.abspath()

    # read header of datafile: all leading lines that start with '#'
    header = []
    with open(fname) as thisFile:
        for line in thisFile:
            if not line.startswith('#'):
                break
            header.append(line)

    # Parse data from header, look for phrases below and split lines at delimiter ':'
    # - Product constraints on dataset: dateRange and degLatRange
    # - sinLat column headings, convert to degLat
    degLat = None
    for line in header:

        # read data from 'Product Constraints' in header for later error checking
        # (parsed here but not yet used further down)
        if 'Date Range' in line:
            dateRange = [x.strip() for x in line.split(':')[1].split(',')]
            dateRange = [datetime.datetime.strptime(x, '%Y-%m-%d') for x in dateRange]

        if 'Degree Latitude Range' in line:
            degLatRange = [float(x) for x in line.split(':')[1].split(',')]

        # get sinLatitude values for data columns, and convert to degreesLatitude
        if 'Sine of latitude steps' in line:
            sinLat = np.array([float(x) for x in line.split(':')[1].split()])
            degLat = np.round(np.rad2deg(np.arcsin(sinLat)), ROUND_TO)

    if degLat is None:
        raise ValueError(
            "No 'Sine of latitude steps' line found in the header of "
            + str(mblFile)
        )

    # Read dataset into a pandas dataframe;  xarray does not handle csv files so well.
    # Raw string so that '\s' reaches the regex engine as written.
    df = pd.read_csv(mblFile, skiprows=len(header), skipinitialspace=True, header=None, sep=r'\s+')

    # --- convert dates from year.decimalyear to datetime
    # use pyAstronomy.decimalYearGregorianDate to convert to datetime object; give form = str, tuple, or datetime(default)
    # have to iterate through items in df, since pyasl does not handle df series properly; insert into first column (ie col 0)
    # use 'time' as col label here, as consistent with L Gregor code
    times = [pyasl.decimalYearGregorianDate(ii, form='datetime') for ii in df[0]]
    df.insert(0, 'time', times)
    del times

    # set the index to time.
    df = df.set_index(['time'])

    # ----
    # Convert the dataframe to an xarray dataset  - easier to work with for later calculations because the latitudes are listed as a dimension
    # Variable names are 'xco2' and 'xco2_uncert'

    # Remaining columns are labelled 0, 1, 2, ...: column 0 is the decimal
    # year, odd columns are measurements, even columns (from 2) uncertainties.
    colNames = df.columns.values.tolist()

    ds = df.to_xarray()

    # concat uncertainties first to temp dataset ds2
    ds2 = xr.concat([ds[ii] for ii in colNames[2::2]], 'lat')
    # then concat values back to ds
    ds = xr.concat([ds[ii] for ii in colNames[1::2]], 'lat')

    ds = ds.assign_coords(coords={'lat': degLat})
    ds = xr.merge([ds, ds2])
    ds = ds.rename({1: 'xco2', 2: 'xco2_uncert'})
    ds = ds.transpose()   # transpose coordinates to time, lat
    ds = ds.rename({'lat': 'latitude'})   # rename dims/coordinates to latitude

    # assign_attrs returns a new object, so the result must be kept.
    ds = ds.assign_attrs({'datasetName': 'xco2'})

    # TODO
    # add attributes to file:  file origin, maybe reference, dateRange and degLatRange

    # ---
    # store the dataset to be used within another notebook as dsXco2
    dsXco2 = ds

    return dsXco2


In [124]:
if __name__ == "__main__":  # allows run without calling this part - incase this notebook is run independently of others.
    
    print('running main read_xco2.ipynb')
    rawDir = './data/raw'
    mblFile  = Path(rawDir).files('co2_GHGreference*_surface.txt')[0]
    
    mblFile = check_xCO2_file(mblFile)
    dsXco2 = read_xCO2_file(mblFile)
    
    %store dsXco2


running main read_xco2.ipynb
Stored 'dsXco2' (Dataset)
name __main__
