# Harmonization script
## To-do list
- create a lookup table for standardizing column names/descriptions/units
- standardize encoding and micro/mu symbol
- standardize names/descriptions
- convert names/descriptions values to standard unit derived from [essdive-leaf-gas-exchange](https://github.com/ess-dive-community/essdive-leaf-gas-exchange/blob/main/docs/definedVariables.md)
- merge now duplicate columns into one
    - consider differences in value precision between datasets (some datasets have the same data at different precision)
    - ensure values aren't wildly different when merging
- check for out-of-bounds data values and determine if that's due to unit assignments in my checklist, errors in the values themselves, or unit conversion errors
    - if all the errors come from one data source, that probably means the was a unit conversion issue
- resolve all the data issues
- separate point from ACi timeseries data

saunders: temp/description/C
anderegg: temp/description/K --> temp/description/C

expected: temp/description/C

In [92]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import chardet
import glob
import os
import shutil
import unicodedata

## Load data, unit information, and column names of interest

In [93]:
# get data
file_pattern = "../data/**/*_data.csv"
csv_files = glob.glob(file_pattern, recursive=True)
csv_files = [file for file in csv_files if "non_utf8" not in file]

# info about units
files = glob.glob('../data/*_variables.csv')
dfs = []
for file in files:
    df = pd.read_csv(file, usecols=[0,1,2,3, 4])
    dfs.append(df)
bounds = pd.concat(dfs).reset_index(drop=True)

# this file was manually curated based on info from _variables csvs
standard_info = pd.read_csv('../data/columns_of_interest.csv')
standard_info = standard_info.replace('1', 1)

## Export file of all possible header names

In [94]:
# combine data
all_headers = []
for file in csv_files:
    headers = pd.read_csv(file, nrows=3, header=None, encoding='utf-8').T
    headers.columns = ['Variable', 'Description', 'Unit']
    all_headers.append(headers)

# Concatenate all header data into a single DataFrame
combined_headers = pd.concat(all_headers).drop_duplicates()

In [95]:
combined_headers

Unnamed: 0,Variable,Description,Unit
0,site,Canopy crane location,1
1,species,STRI species code,1
2,sample_id,sample number,1
3,yyyymmdd,Measurement date,1
4,hhmmss,Measurement local time,1
...,...,...,...
145,VPD_gs,Average leaf to air vapor pressure deficit dur...,kPa
146,PAR_gs,Average photosynthetically active radiation du...,µmol m-2 s-1
147,aci_id,id corresponding to A/Ci curve file,1
148,leaf_shape,shape of leaf for individual (broadleaf or nee...,1


## Create a lookup table for converting headers/descriptions/units to standard values

In [96]:
combined_headers.to_csv('../data/unique_headers_review.csv', index=False, encoding='utf-8')

## Resolve encoding issues that occur when merging csvs

In [97]:
import pandas as pd
import unicodedata

# Helper function to normalize character encoding for a single text value
def normalize_encoding(text):
    if isinstance(text, str):
        return unicodedata.normalize('NFC', text)  # Normalize Unicode to NFKC form
    return text

# Function to normalize all text in a DataFrame (for headers and data)
def normalize_dataframe(df):
    # Normalize each level in the column headers if it is a multi-index
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = pd.MultiIndex.from_tuples([
            tuple(normalize_encoding(level) for level in col) for col in df.columns
        ], names=df.columns.names)
    else:
        df.columns = [normalize_encoding(col) for col in df.columns]

    # Normalize each cell in the DataFrame
    for col in df.columns:
        df[col] = df[col].apply(lambda x: normalize_encoding(x) if isinstance(x, str) else x)
    
    return df

# Load and normalize the lookup table
lookup_table = pd.read_csv('../data/unique_headers_lookup.csv', encoding='utf-8')
lookup_table = normalize_dataframe(lookup_table)
lookup_table.drop_duplicates(inplace=True)
lookup_table

Unnamed: 0,Variable,Description,Unit,Standard_Variable,Standard_Description,Standard_Unit
0,site,Canopy crane location,1,siteIdentifier,Location descriptor of where data was collected,1
1,species,STRI species code,1,speciesCode,Code used to identify species in data tables,1
2,sample_id,sample number,1,record,Observation record number,1
3,yyyymmdd,Measurement date,1,date,Date of observation,YYYY-MM-DD
4,hhmmss,Measurement local time,1,time,Time of observation,HH:MM:SS
...,...,...,...,...,...,...
309,VPD_gs,Average leaf to air vapor pressure deficit dur...,kPa,growing_season_vpd_mean,The average vapor pressure deficit (VPD) betwe...,kPa
310,PAR_gs,Average photosynthetically active radiation du...,µmol m-2 s-1,growing_season_par_mean,The average photosynthetically active radiatio...,µmol m-2 s-1
311,aci_id,id corresponding to A/Ci curve file,1,record,Observation record number,1
312,leaf_shape,shape of leaf for individual (broadleaf or nee...,1,plantForm,"Life form of the plant, categorizing it as tre...",1


## Standardize all column headers/descriptions but keep the original units

In [42]:
# Create a lookup dictionary from the normalized lookup table
lookup_dict = lookup_table.set_index(['Variable', 'Description', 'Unit']).to_dict('index')

In [43]:
# Function to standardize headers of a single dataset
def standardize_headers(df):
    # First normalize the DataFrame for consistent encoding
    df = normalize_dataframe(df)

    standardized_columns = []
    # Iterate over the multi-header columns in the dataset
    for col in df.columns:
        # Extract the variable, description, and unit for each column
        var, desc, unit = col[0], col[1], col[2]
        lookup_key = (var, desc, unit)
        
        # Check if we have a match in the lookup dictionary
        if lookup_key in lookup_dict:
            # Replace with standardized values
            std_var = lookup_dict[lookup_key]['Standard_Variable']
            std_desc = lookup_dict[lookup_key]['Standard_Description']
            standardized_columns.append((std_var, std_desc, unit))
        else:
            # If no match found, keep the original multi-header
            standardized_columns.append((var, desc, unit))

    # Set the new standardized multi-level columns
    df.columns = pd.MultiIndex.from_tuples(standardized_columns, names=['Standard_Variable', 'Standard_Description', 'Unit'])
    
    # Ensure unique column names within the DataFrame
    df = df.loc[:, ~df.columns.duplicated()]  # Remove duplicated columns
    df.columns = pd.MultiIndex.from_tuples([
        (f"{col[0]}_{i}" if df.columns.duplicated()[i] else col[0], col[1], col[2]) 
        for i, col in enumerate(df.columns)
    ], names=['Standard_Variable', 'Standard_Description', 'Unit'])
    
    return df

# List of CSV files to process
standardized_dfs = []
for file in csv_files:
    df = pd.read_csv(file, header=[0, 1, 2], na_values=[-9999, '-9999'], low_memory=False)
    df = standardize_headers(df)

    # Create a new multi-index for the `data_source` column
    df2 = df.copy()
    df2['data_source', 'File that the data was sourced from', 1] = os.path.basename(file)
    standardized_dfs.append(df2)

# Concatenate all standardized DataFrames without ignoring index
final_df = pd.concat(standardized_dfs, ignore_index=True)

In [44]:
final_df['data_source', 'File that the data was sourced from', 1]

0        2017_SLZ_ACi_comp_data.csv
1        2017_SLZ_ACi_comp_data.csv
2        2017_SLZ_ACi_comp_data.csv
3        2017_SLZ_ACi_comp_data.csv
4        2017_SLZ_ACi_comp_data.csv
                    ...            
91333           smith_2017_data.csv
91334           smith_2017_data.csv
91335           smith_2017_data.csv
91336           smith_2017_data.csv
91337           smith_2017_data.csv
Name: (data_source, File that the data was sourced from, 1), Length: 91338, dtype: object

## Normalize encoding for the data files and units file

In [45]:
# Normalize string encoding
import unicodedata
def normalize_string(s):
    if isinstance(s, str):
        return unicodedata.normalize('NFC', s)
    return s

# Apply normalization to all multiheader rows
final_df.columns = final_df.columns.map(
    lambda x: tuple(normalize_string(item) for item in x)
)

# Apply normalization to bounds values
standard_info = standard_info.map(normalize_string)

In [46]:
# Mix micro issue
def replace_micro_with_mu(s):
    if isinstance(s, str):
        return s.replace('µ', 'μ')
    return s

# Apply normalization to all multiheader rows
final_df.columns = final_df.columns.map(
    lambda x: tuple(replace_micro_with_mu(item) for item in x)
)

# Apply normalization to bounds values
standard_info = standard_info.map(replace_micro_with_mu)

## using the units file (bounds), standardize variable units across all data

In [47]:
import pandas as pd
import pint
import numpy as np

# Initialize pint unit registry
ureg = pint.UnitRegistry()
ureg.setup_matplotlib(True)

# Define units
ureg.define("umol_per_mol = micromole / mole")
ureg.define("mmol_per_mol = millimole / mole")
ureg.define("mol_per_m2_per_s = mole / meter**2 / second")
ureg.define("umol_per_m2_per_s = micromole / meter**2 / second")
ureg.define("mmol_per_m2_per_s = millimole / meter**2 / second")
ureg.define("degree_C = degC")  # Celsius

ureg.define("millimole = 1e-3 * mole")
ureg.define("micromole = 1e-6 * mole")

# Dictionary to map internal names to user-friendly names
unit_aliases = {
    "umol_per_mol": "μmol mol-1",
    "mmol_per_mol": "mmol mol-1",
    "mol_per_m2_per_s": "mol m-2 s-1",
    "mmol_per_m2_per_s": "mmol m-2 s-1",
    "umol_per_m2_per_s": "μmol m-2 s-1",
}

# Reverse mapping for alias lookups
reverse_aliases = {v: k for k, v in unit_aliases.items()}

def normalize_unit(unit):
    if unit == '1':  # Special case for dimensionless unit
        return 1
    return reverse_aliases.get(unit, unit)

# Function to convert a column
def convert_column(column, from_unit, to_unit):
    try:
        # Normalize units
        from_unit = normalize_unit(from_unit)
        to_unit = normalize_unit(to_unit)

        # Handle special case: converting from dimensionless '1' to a physical unit
        if from_unit == 1 and to_unit != 1:
            print(f"Converting from dimensionless '1' to {to_unit}")
            numeric_column = pd.to_numeric(column, errors='coerce')
            return pd.Series(numeric_column, index=column.index)  # Assume data is already in correct scale

        # Handle special case: converting from a physical unit to '1'
        if to_unit == 1 and from_unit != 1:
            print(f"Converting from {from_unit} to dimensionless '1'")
            numeric_column = pd.to_numeric(column, errors='coerce')
            return pd.Series(numeric_column, index=column.index) 

        # Standard numeric conversion
        numeric_column = pd.to_numeric(column, errors='coerce')
        quantity = ureg.Quantity(numeric_column.values, from_unit)
        converted_values = quantity.to(to_unit).magnitude
        return pd.Series(converted_values, index=column.index)

    except pint.DimensionalityError as e:
        print(f"Dimensionality error for column {column.name}: {e}")
    except Exception as e:
        print(f"\tFailed to convert from {from_unit} to {to_unit} for column {column.name}: {e}")
    return column  # Return original column on failure

# Function to standardize units
def standardize_units(df, standard_info):
    updated_columns = []

    for col in df.columns:
        # Extract header information
        std_var, std_desc, unit = col
        normalized_unit = normalize_unit(unit)  # Normalize current unit

        # Find matching variable in standard_info
        match = standard_info.loc[standard_info['Standard_Variable'] == std_var]
        if not match.empty:
            target_unit = normalize_unit(match['Standard_Unit'].values[0])  # Get target unit from standard_info

            # Handle case where the unit in the header needs to change (even without numeric conversion)
            if normalized_unit == 1 and target_unit != 1:
                print(f"Updating unit for {col} from '1' to {target_unit}")
                updated_columns.append((std_var, std_desc, target_unit))
                df[(std_var, std_desc, target_unit)] = df[col]
            elif normalized_unit != target_unit:
                # Handle standard conversion case
                print(f"Converting {col} from {normalized_unit} to {target_unit}")
                converted_column = convert_column(df[col], normalized_unit, target_unit)
                updated_columns.append((std_var, std_desc, target_unit))
                df[(std_var, std_desc, target_unit)] = converted_column  # Add converted column
            else:
                # No changes needed; retain original column
                updated_columns.append((std_var, std_desc, unit))
        else:
            # No match in standard_info; retain original column
            updated_columns.append((std_var, std_desc, unit))

    # Rebuild DataFrame with updated columns
    df = df[updated_columns]
    df.columns = pd.MultiIndex.from_tuples(updated_columns, names=['Standard_Variable', 'Standard_Description', 'Standard_Unit'])

    # Map units to user-friendly names using unit_aliases
    df.columns = pd.MultiIndex.from_tuples([
        (std_var, std_desc, unit_aliases.get(unit, unit))  # Use alias if available, else keep original
        for std_var, std_desc, unit in df.columns
    ], names=['Standard_Variable', 'Standard_Description', 'Standard_Unit'])

    return df

# Apply
final_df_standard = standardize_units(final_df, standard_info)  # Standardize units

Updating unit for ('date', 'Date of observation', '1') from '1' to YYYY-MM-DD
Updating unit for ('time', 'Time of observation', '1') from '1' to HH:MM:SS
Converting ('gsw', 'Stomatal conductance to water vapor per leaf area', 'mol m-2 s-1') from mol_per_m2_per_s to mmol_per_m2_per_s
Converting ('gbw', 'Boundary layer conductance to water vapor per leaf area', 'mol m-2 s-1') from mol_per_m2_per_s to mmol_per_m2_per_s
Converting ('E', 'Transpiration rate of H2O per leaf area', 'mol m-2 s-1') from mol_per_m2_per_s to mmol_per_m2_per_s
Converting ('Tair', 'Air temperature inside the chamber', 'K') from K to degree_C
Converting ('CO2s', 'CO2 concentration in wet air inside chamber', 'ppm') from ppm to umol_per_mol
Converting ('Ci', 'Intercellular CO2 concentration in air', 'ppm') from ppm to umol_per_mol
Updating unit for ('latitudeY', 'Latitude coordinate of the site', '1') from '1' to degree
Updating unit for ('longitudeX', 'Longitude coordinate of the site', '1') from '1' to degree
Conve

  df[(std_var, std_desc, target_unit)] = converted_column  # Add converted column
  df[(std_var, std_desc, target_unit)] = converted_column  # Add converted column
  df[(std_var, std_desc, target_unit)] = converted_column  # Add converted column
  df[(std_var, std_desc, target_unit)] = converted_column  # Add converted column


## Remove old columns that were converted to new units and thus new columns

In [48]:
columns_to_keep = list(standard_info['Standard_Variable'])
first_level = final_df_standard.columns.get_level_values(0)
filtered_df = final_df_standard.loc[:, first_level.isin(columns_to_keep)]
filtered_df

Standard_Variable,siteIdentifier,speciesCode,record,date,time,measurementDevice,Tleaf,A,Ci,RHs,...,replicate,A,gsw,Ci,Qin,CO2r,Oatm,CO2s,date,plantHeight
Standard_Description,Location descriptor of where data was collected,Code used to identify species in data tables,Observation record number,Date of observation,Time of observation,Name of the instrument that collected data,Leaf surface temperature,Net CO2 exchange per leaf area,Intercellular CO2 concentration in air,Relative humidity of air inside the chamber,...,Identification number indicating if a plant was measured more than once,Net CO2 exchange per leaf area,Stomatal conductance to water vapor per leaf area,Intercellular CO2 concentration in air,"In-chamber photosynthetic flux density (PPFD) incident on the leaf, quanta per area",CO2 concentration in wet air entering chamber,The pressure of oxygen in the air,CO2 concentration in wet air inside chamber,Date of observation,The height of the plant measured
Standard_Unit,1,1,1,YYYY-MM-DD,HH:MM:SS,1,degree_C,μmol m-2 s-1,μmol mol-1,%,...,1,μmol m-2 s-1.1,mmol m-2 s-1,μmol mol-1.1,μmol m-2 s-1,μmol mol-1,kPa,μmol mol-1,YYYY-MM-DD.1,m
0,PA-SLZ,TERMAM,BNL11837,,8:25:44,Mariano,28.631668,11.906554,283.221239,78.689293,...,,,,,,,,,,
1,PA-SLZ,TERMAM,BNL11837,,8:27:06,Mariano,28.641735,9.372769,233.030258,78.823975,...,,,,,,,,,,
2,PA-SLZ,TERMAM,BNL11837,,8:28:28,Mariano,28.697920,6.892756,183.434017,79.025185,...,,,,,,,,,,
3,PA-SLZ,TERMAM,BNL11837,,8:29:50,Mariano,28.700821,4.170762,134.618800,79.197121,...,,,,,,,,,,
4,PA-SLZ,TERMAM,BNL11837,,8:31:12,Mariano,28.742250,1.303606,86.790520,79.443336,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91333,Blandy,,Blandy_Mgra_4,5/26/14,,,24.870000,8.320000,396.000000,63.580000,...,4.0,,,,,800.22,,782.81,5/26/14,
91334,Blandy,,Blandy_Mgra_4,5/26/14,,,24.960000,10.800000,500.000000,63.680000,...,4.0,,,,,1000.47,,977.93,5/26/14,
91335,Blandy,,Blandy_Mgra_4,5/26/14,,,25.050000,13.000000,612.000000,63.830000,...,4.0,,,,,1200.12,,1172.96,5/26/14,
91336,Blandy,,Blandy_Mgra_4,5/26/14,,,25.080000,15.800000,795.000000,63.910000,...,4.0,,,,,1500.09,,1466.87,5/26/14,


## See which columns had unit conversions

In [49]:
# Extract first row (level 0) of the multi-index
first_row_list = filtered_df.columns.get_level_values(0)

# Find duplicates
seen = set()
duplicates = set()
for item in first_row_list:
    if item in seen:
        duplicates.add(item)
    else:
        seen.add(item)

# Display duplicates and their associated full multi-index rows
if duplicates:
    print(f"Duplicates found in the first row: {list(duplicates)}")
    print("\nFull multi-index rows for duplicates:")
    for duplicate in duplicates:
        matching_rows = [col for col in filtered_df.columns if col[0] == duplicate]
        print(f"Duplicate: {duplicate}")
        for row in matching_rows:
            print(row)
else:
    print("No duplicates found in the first row.")

Duplicates found in the first row: ['Ci', 'CO2s', 'E', 'date', 'latitudeY', 'gsw', 'A', 'Qin', 'longitudeX', 'Tair', 'plantHeight']

Full multi-index rows for duplicates:
Duplicate: Ci
('Ci', 'Intercellular CO2 concentration in air', 'μmol mol-1')
('Ci', 'Intercellular CO2 concentration in air', 'μmol mol-1')
('Ci', 'Intercellular CO2 concentration in air', 'μmol mol-1')
Duplicate: CO2s
('CO2s', 'CO2 concentration in wet air inside chamber', 'μmol mol-1')
('CO2s', 'CO2 concentration in wet air inside chamber', 'μmol mol-1')
Duplicate: E
('E', 'Transpiration rate of H2O per leaf area', 'mmol m-2 s-1')
('E', 'Transpiration rate of H2O per leaf area', 'mmol m-2 s-1')
Duplicate: date
('date', 'Date of observation', 'YYYY-MM-DD')
('date', 'Date of observation', 'YYYY-MM-DD')
Duplicate: latitudeY
('latitudeY', 'Latitude coordinate of the site', 'degree')
('latitudeY', 'Latitude coordinate of the site', 'degree')
Duplicate: gsw
('gsw', 'Stomatal conductance to water vapor per leaf area', 'mmo

## combine duplicate columns while handling precision differences from overlapping data
## return a set of values if there are very different values during merging

In [50]:
import numpy as np
import pandas as pd

def merge_duplicates_with_tolerance(df, tolerance=1e-4):
    """
    Merges duplicate columns in a DataFrame:
    - If values are different (beyond the tolerance), aggregates into a set.
    - If values are the same (within the tolerance), takes the first non-NaN value.
    - Replaces empty sets with np.nan.

    Args:
        df (pd.DataFrame): Input DataFrame with multi-header columns.
        tolerance (float): Tolerance for approximate equality of numeric values.

    Returns:
        pd.DataFrame: DataFrame with merged duplicate columns.
    """
    # Identify exact duplicate column headers (all levels of the multi-header)
    duplicate_headers = df.columns[df.columns.duplicated()].unique()

    # DataFrame to hold the merged results
    merged_df = df.copy()

    for duplicate in duplicate_headers:
        # Select all columns with the current duplicate header
        duplicate_cols = merged_df.loc[:, [duplicate]]

        # Split numeric and non-numeric data
        numeric_cols = duplicate_cols.select_dtypes(include=[int, float])
        non_numeric_cols = duplicate_cols.select_dtypes(exclude=[int, float])

        # Merge numeric data: check approximate equality
        def merge_numeric_row(row, tolerance):
            row = row.dropna()
            if row.empty:
                return np.nan
            first_value = row.iloc[0]
            if np.all(np.isclose(row, first_value, atol=tolerance)):
                return first_value
            return set(row)
        numeric_merged = numeric_cols.apply(lambda row: merge_numeric_row(row, tolerance), axis=1)

        # Merge non-numeric data: take unique non-NaN values
        def merge_non_numeric_row(row):
            row = row.dropna()
            if row.nunique() > 1:
                return set(row)
            elif not row.empty:
                return row.iloc[0]
            else:
                return np.nan
        non_numeric_merged = non_numeric_cols.apply(merge_non_numeric_row, axis=1)

        # Combine numeric and non-numeric results
        def combine_values(x, y):
            # Handle cases where one or both are NaN
            if pd.isna(x) and pd.isna(y):
                return np.nan
            if pd.isna(x):
                return y
            if pd.isna(y):
                return x

            # Handle cases where one or both are sets
            x = x if isinstance(x, set) else {x}
            y = y if isinstance(y, set) else {y}
            combined = x.union(y)
            return combined if combined else np.nan  # Replace empty set with np.nan

        merged_col = numeric_merged.combine(non_numeric_merged, combine_values)

        # Ensure alignment of merged_col with merged_df
        merged_col = pd.Series(merged_col, index=merged_df.index)

        # Replace duplicate columns with the merged column
        merged_df = merged_df.drop(columns=duplicate_cols.columns)
        merged_df = merged_df.sort_index(axis=1)
        merged_df[duplicate] = merged_col

    return merged_df

merged_df = merge_duplicates_with_tolerance(filtered_df, tolerance=1e-4)

In [68]:
merged_df.columns

MultiIndex([(                       'A', ...),
            (                    'CO2r', ...),
            (                    'CO2s', ...),
            (                      'Ci', ...),
            (                    'CiCa', ...),
            (                       'E', ...),
            (                     'LAI', ...),
            (                    'Oatm', ...),
            (                    'Patm', ...),
            (                     'Qin', ...),
            (                    'Qout', ...),
            (                     'RHr', ...),
            (                     'RHs', ...),
            (                    'Tair', ...),
            (                  'Tblock', ...),
            (                   'Tleaf', ...),
            (                 'VPDleaf', ...),
            (                    'area', ...),
            (         'dataContributor', ...),
            (             'data_source', ...),
            (                    'date', ...),
            (

In [52]:
merged_df.to_csv('../data/check_merged_cols.csv')

## take a moment to see what the data is looking like

In [176]:
from ydata_profiling import ProfileReport

In [177]:
df = merged_df.copy()
df.columns = df.columns.get_level_values(0)
profile = ProfileReport(df, title=f"Merged Data Profiling Report")
profile.to_file('../data/merged_data_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  return spearmanr(a, b)[0]
  return spearmanr(a, b)[0]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## some things I noticed in the report:
- `A` has large tails in its curve; lots of data lying outside of -20 to 120 bounds
    - Min value is -51400 and Max is 97275.6 which is very incorrect
- `CO2s` and `CO2r` have negative values when they shouldn't and distribution is choppy
- `Ci` also has enormous distribution tails
    - Min -4208000 and Max 6727848.8 when limits are 0 to 5000
- `CiCa` is a ratio, ranges from -2.2769324 to 8.5741832; should there be negatives ratios?
- `E` is right skewed; values range from -2.0898413 to 36.2; I doubt respiration can be negative and usually `E` is between 0 to 25 according to chatgpt
- `Oatm` is a constant
- `Patm` has a couple outliers below 50 kPa but mostly looks good
- `Qin` has crazy outliers (-286 to 2002614) given normal between 0 to 5000; I think this is a severe unit conversion issue
- `Qout` looks much better in comparison to `Qin`
- `RHr` and `RHs` look good
- `Tair` is in Kelvin when it should be in Celsius?
- `Tleaf` looks good
- `VPDleaf` mostly okay but has a good amount of data over 5 (max in data is 10.6, but reasonable limit is 5)
- `date` needs to be standardized
- `experimentalManipulation` has "ambient" and "Ambient"
    - Categorical values need to be standardized
- `gbw` appears to have a unit conversion issue
- `gsw` has out of range values (-175.27194 to 3240 when limits are 0 to 1000)
- `measurementDevice`, `plantBiome`, `plantForm`, `plantLeaf`, `plantType` categories need to be standardized
- `season` needs to be calculated when dates are standardized

## identify out-of-bounds data

In [66]:
# Extract the first row (variable names) from the multi-header DataFrame
variable_names = merged_df.columns.get_level_values(0)
bounds_dict = bounds.set_index("variableName").to_dict(orient="index")
columns_to_check = [
    col for col in variable_names
    if col in bounds_dict 
    and pd.notna(bounds_dict[col].get('expectedValueRangeMin')) 
    and pd.notna(bounds_dict[col].get('expectedValueRangeMax'))
]
columns_to_check

['A',
 'CO2r',
 'CO2s',
 'Ci',
 'Patm',
 'Qin',
 'Qout',
 'RHr',
 'RHs',
 'Tair',
 'Tleaf',
 'VPDleaf',
 'gsw']

In [91]:
# Extract the first row (variable names) from the multi-header DataFrame
variable_names = merged_df.columns.get_level_values(0)
bounds_dict = bounds.set_index("variableName").to_dict(orient="index")

# Identify columns in merged_df that match variable names in bounds and have expected ranges
columns_to_check = [
    col for col in variable_names
    if col in bounds_dict 
    and pd.notna(bounds_dict[col].get('expectedValueRangeMin'))  # Ensure min bound exists
    and pd.notna(bounds_dict[col].get('expectedValueRangeMax'))  # Ensure max bound exists
]

# Create a dictionary to store a violations DataFrame for each column
violations_dict = {}

# Check each matching column for violations
for col in columns_to_check:
    min_val = bounds_dict[col]['expectedValueRangeMin']
    max_val = bounds_dict[col]['expectedValueRangeMax']
    
    # Get the data column from merged_df matching the variable name
    column_data = merged_df[col]
    
    # Exclude NaN values before applying the range filter
    valid_data = column_data.dropna()
    
    # Find rows where values are outside the specified range
    out_of_bounds_rows = valid_data[(valid_data < min_val) | (valid_data > max_val)]
    
    # Add to the violations dictionary if there are out-of-bound rows
    if not out_of_bounds_rows.empty:
        # Add the column name as a new column
        out_of_bounds_rows["violated_column"] = col

        # Add additional context columns
        out_of_bounds_rows["dataContributor"] = merged_df["dataContributor"]
        out_of_bounds_rows["data_source"] = merged_df["data_source"]

        # Store the violations for this column
        violations_dict[col] = out_of_bounds_rows.dropna()

# At this point, violations_dict contains one DataFrame for each column with out-of-bound values
# Example: Access violations for a specific column
print(list(violations_dict.keys()))
df = violations_dict['gsw']
df['data_source'].unique()

['A', 'CO2r', 'CO2s', 'Ci', 'Patm', 'Qin', 'Qout', 'RHr', 'RHs', 'Tair', 'Tleaf', 'VPDleaf', 'gsw']


array(['lin_2015_data.csv', 'kumarathunge_2019_data.csv',
       'anderegg_2018_data.csv'], dtype=object)