In [65]:
from typing import Union, Optional, Dict
from pathlib import Path
import json
import pandas as pd
from collections import defaultdict

In [2]:
def read_file(
        data_filepath: Union[str, Path],
        site: str,
        network: str,
        inlet: Optional[str] = None,
        instrument: Optional[str] = "shinyei",
        sampling_period: Optional[str] = None,
        measurement_type: Optional[str] = None,
        site_data_filepath: Optional[Union[str, Path]] = None,
    ) -> Dict:
        """Read a BEACO2N data file and split it into one entry per species.

        The CSV is read by column position (column 1 is the "datetime" column,
        columns 5-10 are the measurement / QC columns), indexed by time, and
        negative numeric values are replaced with NaN.

        Args:
            data_filepath: Path to the BEACO2N CSV file.
            site: Site name; looked up upper-cased in the site data JSON.
            network: Accepted but currently unused; the returned metadata
                always reports "beaco2n".
            inlet: Accepted but currently unused; the returned metadata
                always reports "NA".
            instrument: Accepted but currently unused.
            sampling_period: Sampling period stored in the metadata,
                "NOT_SET" when None.
            measurement_type: Accepted but currently unused.
            site_data_filepath: Path to the JSON file mapping upper-case site
                names to site attributes. When None the original hard-coded
                location is used, which only exists on the author's machine.
        Returns:
            dict: Keyed by species ("pm", "co2"); each value holds "data"
            (DataFrame with the species and its "<species>_qc" column),
            "metadata" and "attributes" (a per-species copy of the site data).
        Raises:
            ValueError: If the CSV is not in the expected format or the site
                is not present in the site data.
            FileNotFoundError: If the data file or site data file is missing.
        """
        import pandas as pd
        from collections import defaultdict

        if sampling_period is None:
            sampling_period = "NOT_SET"

        if site_data_filepath is None:
            # Legacy default kept for backward compatibility. It is machine
            # specific, so callers should normally pass site_data_filepath.
            site_data_filepath = "/Users/gar/Documents/Devel/openghg/openghg/data/beaco2n_site_data.json"

        rename_cols = {
            "PM_ug/m3": "pm",
            "PM_ug/m3_QC_level": "pm_qc",
            "co2_ppm": "co2",
            "co2_ppm_QC_level": "co2_qc",
            "co_ppm": "co",
            "co_ppm_QC_level": "co_qc",
        }

        use_cols = [1, 5, 6, 7, 8, 9, 10]
        data_filepath = Path(data_filepath)

        try:
            # Parse the "datetime" column directly instead of the nested-dict
            # form of parse_dates, which newer pandas releases no longer accept.
            data = pd.read_csv(
                data_filepath,
                index_col="datetime",
                parse_dates=["datetime"],
                na_values=[-999.0, "1a"],
                usecols=use_cols,
            )
        except ValueError as e:
            raise ValueError(
                f"Unable to read data file, please make sure it is in the standard BEACO2N format.\nError: {e}"
            ) from e

        # Downstream code expects the index to be called "time".
        data.index.name = "time"

        site_data_path = Path(site_data_filepath)
        try:
            beaco2n_site_data = json.loads(site_data_path.read_text())
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"BEACO2N site data file not found at {site_data_path}; "
                "pass site_data_filepath to read_file."
            ) from e

        try:
            site_metadata = beaco2n_site_data[site.upper()]
        except KeyError:
            raise ValueError(f"Site {site} not recognized.") from None

        site_metadata["comment"] = "Retrieved from http://beacon.berkeley.edu/"

        # Set all values below zero to NaN. Only numeric columns are compared so
        # that a QC column holding string flags cannot raise a TypeError.
        for col in data.select_dtypes(include="number").columns:
            data[col] = data[col].mask(data[col] < 0)

        data = data.rename(columns=rename_cols)

        # NOTE(review): the "co" columns are read and renamed but not returned;
        # confirm whether "co" should be added here.
        measurement_types = ["pm", "co2"]
        units = {"pm": "ug/m3", "co2": "ppm"}

        gas_data: Dict[str, Dict[str, Union[pd.DataFrame, Dict]]] = defaultdict(dict)
        for mt in measurement_types:
            m_data = data[[mt, f"{mt}_qc"]]

            species_metadata = {
                "units": units[mt],
                "site": str(site),
                "species": str(mt),
                "inlet": "NA",
                "network": "beaco2n",
                "sampling_period": str(sampling_period),
            }

            gas_data[mt]["data"] = m_data
            gas_data[mt]["metadata"] = species_metadata
            # Copy so that editing one species' attributes cannot leak into another.
            gas_data[mt]["attributes"] = dict(site_metadata)

        # TODO - add CF Compliant attributes?

        return gas_data

In [5]:
# Location of the BEACO2N CSV to load. NOTE(review): this is an absolute path on the author's machine, so the cell is not portable.
data_path = "/home/gar/Documents/Devel/RSE/web-scrape/beaco2n/data/174_HILLPARKSECONDARYSCHOOL.csv"

In [6]:
# Parse the CSV into a per-species dictionary.
# NOTE(review): read_file does not use the `network` and `inlet` arguments; its metadata hard-codes "beaco2n" and "NA".
data = read_file(data_filepath=data_path, site="HILLPARKSECONDARYSCHOOL", network="BEACO2N", inlet="50m")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/gar/Documents/Devel/openghg/openghg/data/beaco2n_site_data.json'

In [35]:
# Display the dictionary returned by read_file (one entry per species).
data

defaultdict(dict,
            {'pm': {'data':                        pm  pm_qc
              time                            
              2021-07-28 09:00:00   NaN    NaN
              2021-07-28 10:00:00   2.8    NaN
              2021-07-28 11:00:00   3.9    NaN
              2021-07-28 12:00:00   2.5    NaN
              2021-07-28 13:00:00   2.9    NaN
              ...                   ...    ...
              2021-09-29 11:00:00   5.9    NaN
              2021-09-29 12:00:00   6.4    NaN
              2021-09-29 13:00:00   4.8    NaN
              2021-09-29 14:00:00   8.9    NaN
              2021-09-29 15:00:00  10.3    NaN
              
              [1516 rows x 2 columns],
              'metadata': {'units': 'ug/m3',
               'site': 'HILLPARKSECONDARYSCHOOL',
               'species': 'pm',
               'inlet': 'NA',
               'network': 'beaco2n',
               'sampling_period': 'NOT_SET'},
              'attributes': {'deployed': '2021-07-28',
        

In [58]:
# Load the raw BEACO2N CSV for a single site into a time-indexed DataFrame.
# NOTE(review): absolute path on the author's machine, so the cell is not portable.
data_path = filepath = "/home/gar/Documents/Devel/RSE/web-scrape/beaco2n/data/156_KILLEARNSTIRLINGSHIREGLASGOWS22002.csv"

# Parse the "datetime" column and expose it under the name "time".
datetime_columns = {"time": ["datetime"]}
# Raw BEACO2N column headers -> short species / QC names.
rename_cols = {
    "PM_ug/m3": "pm",
    "PM_ug/m3_QC_level": "pm_qc",
    "co2_ppm": "co2",
    "co2_ppm_QC_level": "co2_qc",
    "co_ppm": "co",
    "co_ppm_QC_level": "co_qc",
}

# Column positions to read: 1 is the datetime column, 5-10 the measurement / QC columns.
use_cols = [1, 5, 6, 7, 8, 9, 10]
# Sentinel used in the file for missing values.
na_values = [-999.0]

data = pd.read_csv(
    data_path,
    index_col="time",
    usecols=use_cols,
    parse_dates=datetime_columns,
    na_values=na_values,
)

In [59]:
# Replace the raw BEACO2N headers with the short names defined in rename_cols.
data = data.rename(columns=rename_cols)
# Species to process; each has a matching "<species>_qc" column after the rename.
measurement_types = ["pm", "co", "co2"]
# NOTE(review): negative values are not set to NaN in this cell — confirm whether that step is still wanted here.
# Show the renamed columns.
data.columns

Index(['pm', 'pm_qc', 'co', 'co_qc', 'co2', 'co2_qc'], dtype='object')

In [62]:
# Drop every row in which any of the species columns is missing (dropna defaults to how="any").
# NOTE: this rebinds `data`, so the unfiltered frame is no longer available after this cell.
data = data.dropna(axis=0, subset=measurement_types)
# Conversion to xarray is left disabled here.
# data = data.to_xarray()

In [63]:
# Display the filtered DataFrame.
data

Unnamed: 0_level_0,pm,pm_qc,co,co_qc,co2,co2_qc
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-03-01 01:00:00,3.3,,0.28,1a,469.2,
2021-03-01 02:00:00,3.3,,0.26,1a,459.4,
2021-03-01 03:00:00,2.1,,0.26,1a,453.3,
2021-03-01 04:00:00,1.1,,0.27,1a,453.1,
2021-03-01 05:00:00,1.2,,0.27,1a,453.8,
...,...,...,...,...,...,...
2021-08-25 13:00:00,2.4,,0.11,1a,427.0,
2021-08-25 14:00:00,1.8,,0.11,1a,424.9,
2021-08-25 15:00:00,2.1,,0.10,1a,424.4,
2021-08-25 16:00:00,10.4,,0.11,1a,422.7,


In [78]:
# Units reported for each species.
units = {"pm": "ug/m3", "co2": "ppm", "co": "ppm"}
# Maps species name -> xarray Dataset holding that species and its QC column.
gas_data = defaultdict(dict)
for mt in measurement_types:
    # Select the measurement column together with its QC column, then convert to xarray.
    m_data = data[[mt, f"{mt}_qc"]]
    m_data = m_data.to_xarray()

    # NOTE(review): species_metadata is rebuilt on every iteration but never stored in gas_data,
    # and the site / species / inlet values look like placeholders — confirm the intent.
    species_metadata = {
        "units": units[mt],
        "site": "s",
        "species": "spec",
        "inlet": "inlet",
        "network": "beaco2n",
        "sampling_period": str(1),
    }
    
    # The Dataset is stored directly under the species key, so the defaultdict's dict factory is never used.
    gas_data[mt] = m_data

# TODO - add CF Compliant attributes?

gas_data


defaultdict(dict,
            {'pm': <xarray.Dataset>
             Dimensions:  (time: 4265)
             Coordinates:
               * time     (time) datetime64[ns] 2021-03-01T01:00:00 ... 2021-08-25T17:00:00
             Data variables:
                 pm       (time) float64 3.3 3.3 2.1 1.1 1.2 1.2 ... 2.4 1.8 2.1 10.4 22.9
                 pm_qc    (time) float64 nan nan nan nan nan nan ... nan nan nan nan nan nan,
             'co': <xarray.Dataset>
             Dimensions:  (time: 4265)
             Coordinates:
               * time     (time) datetime64[ns] 2021-03-01T01:00:00 ... 2021-08-25T17:00:00
             Data variables:
                 co       (time) float64 0.28 0.26 0.26 0.27 0.27 ... 0.11 0.11 0.1 0.11 0.26
                 co_qc    (time) object '1a' '1a' '1a' '1a' '1a' ... '1a' '1a' '1a' '1a' '1a',
             'co2': <xarray.Dataset>
             Dimensions:  (time: 4265)
             Coordinates:
               * time     (time) datetime64[ns] 2021-03-01T01:

In [37]:
# Group the data by calendar year of the "time" coordinate.
# NOTE(review): the recorded output is an xarray DatasetGroupBy, so `data` must be an xarray Dataset here;
# the cells above leave `data` as a pandas DataFrame, so this depends on kernel state not shown — confirm.
data.groupby("time.year")

DatasetGroupBy, grouped over 'year'
1 groups with labels 2021.

In [38]:
# Materialise the yearly groups as (year, Dataset) pairs for inspection.
# NOTE(review): requires `data` to be an xarray Dataset, as in the recorded output — confirm where the conversion happens.
list(data.groupby("time.year"))

[(2021,
  <xarray.Dataset>
  Dimensions:            (time: 1516)
  Coordinates:
    * time               (time) datetime64[ns] 2021-07-28T09:00:00 ... 2021-09-...
  Data variables:
      PM_ug/m3           (time) float64 nan 2.8 3.9 2.5 2.9 ... 6.4 4.8 8.9 10.3
      PM_ug/m3_QC_level  (time) float64 nan nan nan nan nan ... nan nan nan nan
      co_ppm             (time) float64 nan nan 0.02 0.04 ... 0.27 0.31 0.34 nan
      co_ppm_QC_level    (time) object '1a' '1a' '1a' '1a' ... '1a' '1a' '1a' '1a'
      co2_ppm            (time) float64 nan 416.6 417.3 ... 421.5 418.4 419.0
      co2_ppm_QC_level   (time) float64 nan nan nan nan nan ... nan nan nan nan)]