# Process UEA/FZJ H-1202 values

In [23]:
import pandas as pd
from py12box_invert.utils import decimal_to_pandas
import pandas as pd
from pathlib import Path
import getpass
from datetime import datetime
import numpy as np

In [24]:
def get_h1202():
        """
        Read the H1202 data from the CGO site and return a DataFrame with the data
        """
        ueafn = "/user/home/lw13938/work/py12box_laube/obs_raw/UEAFZJ/CGO_H1202.csv"
        df = pd.read_csv(ueafn, delimiter=',', encoding="utf-8", skipinitialspace=True)

        mf = df["H1202"]
        error = mf * 0.031

        dfsp = pd.DataFrame(index=pd.to_datetime(decimal_to_pandas(df["Time"].values)), 
                data={"mf":[float(f) for f in mf.values],
                "pmf":[float(f) for f in error.values]})

        return dfsp


In [27]:

def laube_sites(species, site):
    """
    Read Johanne's files and output in same format as 
    AGAGE Georgia Tech files.
    """
    scale = "UEA"

    df_v_site = get_h1202()


    df_format = pd.DataFrame(data={"var":df_v_site.index.values,"mf":df_v_site.mf.values.astype(float), "mf_variability":df_v_site.pmf.values.astype(float), "instrument":["GCMS",]*len(df_v_site.index.values)})
    df_format = df_format.set_index("var")
    df_monthly = df_format.dropna().resample("MS").mean(numeric_only =True)#.dropna()
    intruments = [ "GCMS" if np.isfinite(item) else np.nan for item in df_monthly.mf ]
    df_monthly.insert(2, "instrument", intruments, True)
    species_columns = df_monthly.columns
    df_monthly["mf_variability"][np.where(df_monthly["mf_variability"] == 0.0)[0]] = np.nanmean(df_monthly["mf_variability"])
    dfs = df_monthly[species_columns]    
    
    if dfs.empty:
        return None, None        
        
    dfs.columns = pd.MultiIndex.from_tuples(((scale,"ppt", "mf"),
                                            (scale,"ppt", f"mf_variability"),
                                            ("-","-", f"instrument")),
                                            names=("scale", "unit", "var"))
    dfs.index.name = None
    
    comment_site = f'# {site} \n#   1. GCMS\n#------------------\n'
    return dfs,comment_site

species_list = ["H-1202"]

for species in species_list:
    output_directory=None
    repeatability=0.
    sites = {"CGO":3}

    # Check if species is in data selector
    data_selector = pd.read_csv("/user/home/lw13938/work/py12box_agage/py12box_agage/agage_data_selector.csv",
                                index_col="Species")
        
    dfs = []
    rmsite = []
    comment_string = f"# UEA/FJZ data for {species}\n"
    comment_string += f"# Created by {getpass.getuser()} on {datetime.now()}\n"
    comment_string += f"# Contact data owners before use \n"
    comment_string += f"# This file contains data from the following sites/instruments: \n"
    comment_string += "#===================================================\n"

    for site in sites.keys():
        

        df_site, comment_site = laube_sites(species, site)


        if df_site is not None:
            dfs.append(df_site)
            comment_string += comment_site
        else:
            rmsite = rmsite + [site]

    for rms in rmsite:
        sites.pop(rms)

    if len(dfs) == 0:
        print("No data found")

    for bi in range(3,-1, -1):
        if bi not in sites.values():
            df_empty = dfs[0].copy() #
            df_empty.loc[:] = np.nan
            # dfs.append(df_empty)
            # sites[f"AA{bi}"] = bi
            dfs.insert(0, df_empty)
            sites = {**{f"XX{bi}" : bi}, **sites}

    # Find units
    units = [df.xs("mf", level="var", axis=1).columns.get_level_values("unit").values[0] for df in dfs]
    if len(set(units)) != 1:
        raise(f"ERROR: Units don't match: {units}")

    # Find scales
    scales = [df.xs("mf", level="var", axis=1).columns.get_level_values("scale").values[0] for df in dfs]
    if len(set(scales)) != 1:
        raise(f"ERROR: Units don't match: {scales}")

    # Concatenate and average numeric data
    df_concat = pd.concat([df.xs(units[0], level="unit", axis=1) for df in dfs], 
                        axis=1, keys=[(site, box) for site, box in sites.items()], names=["site", "box"])
    df_grouped = df_concat.groupby(by=["var", "box", "scale"], axis=1).mean()

    # Drop scale level
    df_grouped = df_grouped.droplevel("scale", axis=1)

    # Collect instrument data
    df_instrument_concat = pd.concat([df.xs("instrument", level="var", axis=1) for df in dfs], 
                                    axis=1, keys=[(site, box) for site, box in sites.items()], names=["site", "box"])


    instrument_list = pd.DataFrame(
                            columns=pd.MultiIndex.from_tuples((("instruments", 0),
                                                                ("instruments", 1),
                                                                ("instruments", 2),
                                                                ("instruments", 3)), names=["var", "box"]),
                            index=df_grouped.index)#, dtype=pd.StringDtype())

    for bi in range(4):
        df_instrument_concat_bi = df_instrument_concat.xs(bi, level="box", axis=1)
        instrument_list.iloc[:, bi] = df_instrument_concat_bi.iloc[:, 0]
        for sitei in range(len(df_instrument_concat_bi.columns))[1:]:
            instrument_list.iloc[:, bi] = instrument_list.iloc[:, bi].str.cat(df_instrument_concat_bi.iloc[:,sitei],
                                                                                sep="|",
                                                                                na_rep="")

    # Collect sites
    site_list = pd.DataFrame(
                            columns=pd.MultiIndex.from_tuples((("sites", 0),
                                                                ("sites", 1),
                                                                ("sites", 2),
                                                                ("sites", 3)), names=["var", "box"]),
                            index=df_grouped.index)#, dtype=pd.StringDtype())

    for i in df_concat.index:
        dfi = df_concat.loc[i].dropna()
        if len(dfi) == 0:
            site_list.loc[i] = ["", "", "", ""]
        else:
            site_row = []
            for bi in range(4):
                if bi in dfi.index.get_level_values("box"):
                    site_set = set(dfi.xs(bi, level="box").index.get_level_values("site"))
                    if len(site_set) == 0:
                        site_row.append("")
                    else:
                        site_row.append("|".join(site_set))
                else:
                    site_row.append("")
            site_list.loc[i] = site_row

    # Put measurements, sites and instruments together into one dataframe
    df_grouped = pd.concat([df_grouped, site_list, instrument_list], axis=1)

    # Add a repeatability
    df_grouped["mf_variability"] = np.sqrt(df_grouped["mf_variability"]**2 + (df_grouped["mf"]*repeatability)**2)

    # Append site and units
    comment_string += f"# SCALE: {scales[0]}\n# UNITS: {units[0]}\n"

    # If no output directory given, default to data folder
    if not output_directory:
        #output_directory = Path(__file__).parent.parent / "data" / species / "inputs"
        output_directory = f"/user/home/lw13938/work/py12box_laube/data/{species}/inputs/"
    # If output directories don't exist, create them
    if not Path(output_directory).exists():
        if not Path(output_directory).parent.exists():
            print(f"... creating directory {Path(output_directory).parent}")
            Path(output_directory).parent.mkdir()
        print(f"... creating directory {output_directory}")
        Path(output_directory).mkdir()
        
    with open(Path(output_directory) / f"{species}_obs_laube.csv", 'w') as fout:
        fout.write(comment_string)
        df_grouped.to_csv(fout)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_monthly["mf_variability"][np.where(df_monthly["mf_variability"] == 0.0)[0]] = np.nanmean(df_monthly["mf_variability"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable