In [20]:
import pandas as pd
# from py12box_invert import utils
from pathlib import Path
#from py12box_agage import agage_process 
import getpass
from datetime import datetime
import numpy as np

In [21]:
def get_uea_dict():
    ueafn = "/user/home/lw13938/work/py12box_laube/obs_raw/CapeGrim-20220126-for-Luke-112-112a-113-113a-114-114a-.csv"
    df = pd.read_csv(ueafn)
    df_dict = {}
    for i in range(8):
        spdf = df.iloc[:,(i*6):((i+1)*6)]
        day = spdf.iloc[2:,0]
        month = spdf.iloc[2:,1]
        year = spdf.iloc[2:,2]
        mf = spdf.iloc[2:,3]
        error = spdf.iloc[2:,4]
        species = spdf.iloc[0,3]
        if species == "c-C4F8":
            species = "C4F8"
        
        dfdate = pd.DataFrame({'year': [int(y) for y in year.values if isinstance(y,str)],
                        'month': [int(m) for m in month.values if isinstance(m,str)],
                        'day': [int(d) for d in day.values if isinstance(d,str)]})
        dfsp = pd.DataFrame(index=pd.to_datetime(dfdate), 
                data={"mf":[float(f) for f in mf.values if isinstance(f,str)],
                "pmf":[float(f) for f in error.values if isinstance(f,str)]})

        df_dict[species] = dfsp #.resample("1AS").mean().dropna()
    return df_dict

In [22]:
def laube_sites(species, site):
    """
    Read Johanne's files and output in same format as 
    AGAGE Georgia Tech files.
    """
    scale = "UEA"

    df_v_site = get_uea_dict()[species]


    df_format = pd.DataFrame(data={"var":df_v_site.index.values,"mf":df_v_site.mf.values.astype(float), "mf_variability":df_v_site.pmf.values.astype(float), "instrument":["Medusa",]*len(df_v_site.index.values)})
    df_format = df_format.set_index("var")
    df_monthly = df_format.dropna().resample("MS").mean()#.dropna()
    intruments = [ "Unknown" if np.isfinite(item) else np.nan for item in df_monthly.mf ]
    df_monthly.insert(2, "instrument", intruments, True)
    species_columns = df_monthly.columns
    df_monthly["mf_variability"][np.where(df_monthly["mf_variability"] == 0.0)[0]] = np.nanmean(df_monthly["mf_variability"])
    dfs = df_monthly[species_columns]    
    
    if dfs.empty:
        return None, None        
        
    dfs.columns = pd.MultiIndex.from_tuples(((scale,"ppt", "mf"),
                                            (scale,"ppt", f"mf_variability"),
                                            ("-","-", f"instrument")),
                                            names=("scale", "unit", "var"))
    dfs.index.name = None
    
    comment_site = f'# {site} \n#   1. Unknown\n#------------------\n'
    return dfs,comment_site


In [23]:
species_list = ["CFC-112", "CFC-112a", "CFC-113","CFC-113a", "CFC-114", "CFC-114a"]

for species in species_list:
    output_directory=None
    repeatability=0.
    sites = {"CGO":3}

    # Check if species is in data selector
    data_selector = pd.read_csv("/user/home/lw13938/work/py12box_agage/py12box_agage/agage_data_selector.csv",
                                index_col="Species")
        
    dfs = []
    rmsite = []
    comment_string = f"# UEA/FJZ data for {species}\n"
    comment_string += f"# Created by {getpass.getuser()} on {datetime.now()}\n"
    comment_string += f"# Contact data owners before use \n"
    comment_string += f"# This file contains data from the following sites/instruments: \n"
    comment_string += "#===================================================\n"

    for site in sites.keys():
        

        df_site, comment_site = laube_sites(species, site)


        if df_site is not None:
            dfs.append(df_site)
            comment_string += comment_site
        else:
            rmsite = rmsite + [site]

    for rms in rmsite:
        sites.pop(rms)

    if len(dfs) == 0:
        print("No data found")

    for bi in range(3,-1, -1):
        if bi not in sites.values():
            df_empty = dfs[0].copy() #
            df_empty.loc[:] = np.nan
            # dfs.append(df_empty)
            # sites[f"AA{bi}"] = bi
            dfs.insert(0, df_empty)
            sites = {**{f"XX{bi}" : bi}, **sites}

    # Find units
    units = [df.xs("mf", level="var", axis=1).columns.get_level_values("unit").values[0] for df in dfs]
    if len(set(units)) != 1:
        raise(f"ERROR: Units don't match: {units}")

    # Find scales
    scales = [df.xs("mf", level="var", axis=1).columns.get_level_values("scale").values[0] for df in dfs]
    if len(set(scales)) != 1:
        raise(f"ERROR: Units don't match: {scales}")

    # Concatenate and average numeric data
    df_concat = pd.concat([df.xs(units[0], level="unit", axis=1) for df in dfs], 
                        axis=1, keys=[(site, box) for site, box in sites.items()], names=["site", "box"])
    df_grouped = df_concat.groupby(by=["var", "box", "scale"], axis=1).mean()

    # Drop scale level
    df_grouped = df_grouped.droplevel("scale", axis=1)

    # Collect instrument data
    df_instrument_concat = pd.concat([df.xs("instrument", level="var", axis=1) for df in dfs], 
                                    axis=1, keys=[(site, box) for site, box in sites.items()], names=["site", "box"])


    instrument_list = pd.DataFrame(
                            columns=pd.MultiIndex.from_tuples((("instruments", 0),
                                                                ("instruments", 1),
                                                                ("instruments", 2),
                                                                ("instruments", 3)), names=["var", "box"]),
                            index=df_grouped.index)#, dtype=pd.StringDtype())

    for bi in range(4):
        df_instrument_concat_bi = df_instrument_concat.xs(bi, level="box", axis=1)
        instrument_list.iloc[:, bi] = df_instrument_concat_bi.iloc[:, 0]
        for sitei in range(len(df_instrument_concat_bi.columns))[1:]:
            instrument_list.iloc[:, bi] = instrument_list.iloc[:, bi].str.cat(df_instrument_concat_bi.iloc[:,sitei],
                                                                                sep="|",
                                                                                na_rep="")

    # Collect sites
    site_list = pd.DataFrame(
                            columns=pd.MultiIndex.from_tuples((("sites", 0),
                                                                ("sites", 1),
                                                                ("sites", 2),
                                                                ("sites", 3)), names=["var", "box"]),
                            index=df_grouped.index)#, dtype=pd.StringDtype())

    for i in df_concat.index:
        dfi = df_concat.loc[i].dropna()
        if len(dfi) == 0:
            site_list.loc[i] = ["", "", "", ""]
        else:
            site_row = []
            for bi in range(4):
                if bi in dfi.index.get_level_values("box"):
                    site_set = set(dfi.xs(bi, level="box").index.get_level_values("site"))
                    if len(site_set) == 0:
                        site_row.append("")
                    else:
                        site_row.append("|".join(site_set))
                else:
                    site_row.append("")
            site_list.loc[i] = site_row

    # Put measurements, sites and instruments together into one dataframe
    df_grouped = pd.concat([df_grouped, site_list, instrument_list], axis=1)

    # Add a repeatability
    df_grouped["mf_variability"] = np.sqrt(df_grouped["mf_variability"]**2 + (df_grouped["mf"]*repeatability)**2)

    # Append site and units
    comment_string += f"# SCALE: {scales[0]}\n# UNITS: {units[0]}\n"

    # If no output directory given, default to data folder
    if not output_directory:
        #output_directory = Path(__file__).parent.parent / "data" / species / "inputs"
        output_directory = f"/user/home/lw13938/work/py12box_laube/data/{species}/inputs/"
    # If output directories don't exist, create them
    if not Path(output_directory).exists():
        if not Path(output_directory).parent.exists():
            print(f"... creating directory {Path(output_directory).parent}")
            Path(output_directory).parent.mkdir()
        print(f"... creating directory {output_directory}")
        Path(output_directory).mkdir()
        
    with open(Path(output_directory) / f"{species}_obs_laube.csv", 'w') as fout:
        fout.write(comment_string)
        df_grouped.to_csv(fout)

Exiting: CFC-112 not in agage_data_selector.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_monthly["mf_variability"][np.where(df_monthly["mf_variability"] == 0.0)[0]] = np.nanmean(df_monthly["mf_variability"])


Exiting: CFC-112a not in agage_data_selector.csv
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-112a
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-112a/inputs/
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-113
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-113/inputs/
Exiting: CFC-113a not in agage_data_selector.csv
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-113a
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-113a/inputs/
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-114
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-114/inputs/
Exiting: CFC-114a not in agage_data_selector.csv
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-114a
... creating directory /user/home/lw13938/work/py12box_laube/data/CFC-114a/inputs/


3
2
1
0


[('CGO', 3), ('XX0', 0), ('XX1', 1), ('XX2', 2)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_monthly["mf_variability"][np.where(df_monthly["mf_variability"] == 0.0)[0]] = np.nanmean(df_monthly["mf_variability"])


(scale            UEA                         -
 unit             ppt                         -
 var               mf mf_variability instrument
 1978-07-01  0.087399       0.002613    Unknown
 1978-08-01       NaN            NaN        NaN
 1978-09-01       NaN            NaN        NaN
 1978-10-01       NaN            NaN        NaN
 1978-11-01       NaN            NaN        NaN
 ...              ...            ...        ...
 2020-07-01       NaN            NaN        NaN
 2020-08-01       NaN            NaN        NaN
 2020-09-01       NaN            NaN        NaN
 2020-10-01       NaN            NaN        NaN
 2020-11-01  0.386666       0.004041    Unknown
 
 [509 rows x 3 columns],
 '# CGO \n#   1. Unknown\n#------------------\n')

DatetimeIndex(['1978-07-07', '1980-09-26', '1983-08-01', '1985-08-08',
               '1987-05-28', '1989-04-20', '1991-03-13', '1993-06-07',
               '1994-02-11', '1995-06-13', '1997-01-07', '1999-06-21',
               '2000-02-23', '2000-09-29', '2001-04-27', '2001-12-11',
               '2002-06-20', '2003-05-21', '2004-06-30', '2004-09-01',
               '2004-12-16', '2005-04-06', '2005-10-03', '2006-08-23',
               '2007-07-05', '2008-06-13', '2009-03-06', '2009-07-27',
               '2010-12-10', '2011-02-08', '2011-03-17', '2011-04-20',
               '2011-05-26', '2011-06-21', '2012-04-27', '2012-08-30',
               '2012-12-04', '2013-03-14', '2013-07-05', '2013-10-14',
               '2014-02-21', '2014-09-11', '2015-02-16', '2015-07-27',
               '2015-09-22', '2015-11-26', '2016-01-22', '2016-04-07',
               '2016-10-04', '2016-11-25', '2017-01-18', '2017-02-23',
               '2018-02-16', '2018-09-28', '2019-03-26', '2019-07-02',
      