In [104]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import chardet
import glob
import os
import shutil
from fuzzywuzzy import process

## Overview data

In [155]:
# import data
data_path = 'data/2017_SLZ_BNL_ACi_20190328_20220608183440/2017_SLZ_ACi_comp.csv'
metadata_path = 'data/2017_SLZ_BNL_ACi_20190328_20220608183440/2017_SLZ_ACi_comp_DCD.csv'
data = pd.read_csv(data_path)
metadata = pd.read_csv(metadata_path)

In [156]:
data.columns

Index(['Site', 'Species', 'Sample_ID', 'YYYYMMDD', 'HHMMSS', 'Branch_Number',
       'Leaf_Age', 'Leaf_Age_Revised', 'Machine_name', 'Serial', 'QC', 'Obs',
       'Tleaf', 'Photo', 'Ci', 'CO2S', 'VpdA', 'RH_S', 'Tair', 'PARi', 'Press',
       'Cond', 'BLCond', 'Trmmol', 'FTime', 'Ebal', 'VpdL', 'CTleaf', 'Area',
       'BLC_1', 'StmRat', 'TBlk', 'CO2R', 'H2OR', 'H2OS', 'RH_R', 'Flow',
       'PARo', 'CsMch', 'HsMch', 'StableF', 'BLCslope', 'BLCoffst', 'f_parin',
       'f_parout', 'alphaK', 'Status', 'fda', 'Trans', 'Tair_K', 'Twall_K',
       'R_W_m2', 'Tl_minus_Ta', 'SVTleaf', 'h2o_i', 'h20diff', 'CTair',
       'SVTair', 'CndTotal', 'vp_kPa', 'CndCO2', 'Ci_Pa', 'Ci_Ca', 'RHsfc',
       'C2sfc', 'AHs_Cs'],
      dtype='object')

In [157]:
data.head(3)

Unnamed: 0,Site,Species,Sample_ID,YYYYMMDD,HHMMSS,Branch_Number,Leaf_Age,Leaf_Age_Revised,Machine_name,Serial,...,CTair,SVTair,CndTotal,vp_kPa,CndCO2,Ci_Pa,Ci_Ca,RHsfc,C2sfc,AHs_Cs
0,PA-SLZ,TERMAM,BNL11837,20170216,8:25:44,1,M2,M2,Mariano,PSC-0464,...,28.309216,3.863787,0.195816,2.983829,0.12361,28.223604,0.736047,76.650681,379.126938,0.024072
1,PA-SLZ,TERMAM,BNL11837,20170216,8:27:06,1,M2,M2,Mariano,PSC-0464,...,28.318286,3.865825,0.196425,2.990343,0.123999,23.222696,0.744784,76.771822,308.427569,0.02333
2,PA-SLZ,TERMAM,BNL11837,20170216,8:28:28,1,M2,M2,Mariano,PSC-0464,...,28.356964,3.87453,0.200272,3.001679,0.126452,18.280311,0.760298,76.841053,237.98952,0.022255


In [158]:
metadata.columns = ['Variable', 'Description', 'Units']
metadata.columns

Index(['Variable', 'Description', 'Units'], dtype='object')

In [159]:
metadata.head(3)

Unnamed: 0,Variable,Description,Units
0,Site,Canopy crane location,text
1,Species,STRI species code,alphanumeric
2,Sample_ID,sample number,alphanumeric


## Connect metadata to data

In [160]:
# standardize variables
metadata['Variable'] = (metadata['Variable'].str.strip()
                        .str.lower()
                        .str.replace(' ', '_'))
data.columns = (data.columns.str.strip()
                .str.lower()
                .str.replace(' ', '_'))

In [161]:
# check if variables match
md_columns = metadata['Variable'].to_list()
d_columns = data.columns.to_list()
md_columns == d_columns

False

In [162]:
# check what the differences are
diff = list(set(data.columns.to_list()) ^ set(metadata['Variable'].to_list()))
diff

['ahs_cs',
 'ci/ca',
 'tl_minus_ta',
 'ci_ca',
 'r_w_m2',
 'r(w/m2)',
 'ebal?',
 'tl-ta',
 'ebal',
 'ahs/cs']

In [163]:
convert_dict = {'ci_ca':'ci/ca',
                'ebal?':'ebal',
                'r_w_m2':'r(w/m2)',
                'tl_minus_ta':'tl-ta',
                'ahs_cs':'ahs/cs'}

In [164]:
data = data.rename(columns=convert_dict)
metadata = metadata.replace(convert_dict)
diff = list(set(data.columns.to_list()) ^ set(metadata['Variable'].to_list()))
diff

[]

In [165]:
# creater new header rows
new_header = pd.MultiIndex.from_arrays([
    data.columns,  # Original column names
    metadata.set_index('Variable').loc[data.columns, 'Description'],  # Descriptions
    metadata.set_index('Variable').loc[data.columns, 'Units']  # Units
])

# Step 2: Apply the multi-level header to all_data
data.columns = new_header

In [166]:
data

Unnamed: 0_level_0,site,species,sample_id,yyyymmdd,hhmmss,branch_number,leaf_age,leaf_age_revised,machine_name,serial,...,ctair,svtair,cndtotal,vp_kpa,cndco2,ci_pa,ci/ca,rhsfc,c2sfc,ahs/cs
Description,Canopy crane location,STRI species code,sample number,Measurement date,Measurement local time,Branch ID to differentiate branches of each species,Leaf age as assessed at time of sampling + Leaf count from youngest leaf on stem,Updated leaf age following reassessment against photographs + Leaf count from youngest leaf on stem,Licor 6400XT instrument name,Licor 6400XT instrument serial number,...,Computed chamber air temp,SatVap(Tair),Total conductance,vapor pressure chamber air,Total Conductance to CO2,Intercellular CO2,Intercellular CO2 / Ambient CO2,Surface humidity,Surface CO2,Ball-Berry parameter
Units,text,alphanumeric,alphanumeric,numeric,numeric,integer,alphanumeric,alphanumeric,text,alphanumeric,...,numeric,numeric,numeric,numeric,numeric,Pa,numeric ratio,numeric percentage,μmol mol-1,numeric
0,PA-SLZ,TERMAM,BNL11837,20170216,8:25:44,1,M2,M2,Mariano,PSC-0464,...,28.309216,3.863787,0.195816,2.983829,0.123610,28.223604,0.736047,76.650681,379.126938,0.024072
1,PA-SLZ,TERMAM,BNL11837,20170216,8:27:06,1,M2,M2,Mariano,PSC-0464,...,28.318286,3.865825,0.196425,2.990343,0.123999,23.222696,0.744784,76.771822,308.427569,0.023330
2,PA-SLZ,TERMAM,BNL11837,20170216,8:28:28,1,M2,M2,Mariano,PSC-0464,...,28.356964,3.874530,0.200272,3.001679,0.126452,18.280311,0.760298,76.841053,237.989520,0.022255
3,PA-SLZ,TERMAM,BNL11837,20170216,8:29:50,1,M2,M2,Mariano,PSC-0464,...,28.361144,3.875472,0.202453,3.009167,0.127843,13.415526,0.793715,77.029217,167.623377,0.019166
4,PA-SLZ,TERMAM,BNL11837,20170216,8:31:12,1,M2,M2,Mariano,PSC-0464,...,28.387931,3.881511,0.206393,3.020660,0.130357,8.649342,0.883591,77.164247,97.605067,0.010306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1218,PA-SLZ,GUATDU,BNL12145,20170222,14:01:12,31,Y1,Y1,Derek,PSC-0570,...,29.932375,4.243929,0.246933,3.402782,0.156286,67.361114,0.869760,77.428780,774.330152,0.013890
1219,PA-SLZ,GUATDU,BNL12145,20170222,14:02:58,31,Y1,Y1,Derek,PSC-0570,...,29.915181,4.239737,0.243835,3.393303,0.154301,85.829341,0.883882,77.275124,972.053924,0.011949
1220,PA-SLZ,GUATDU,BNL12145,20170222,14:05:08,31,Y1,Y1,Derek,PSC-0570,...,29.911918,4.238942,0.238757,3.382065,0.151049,123.236254,0.903276,77.008070,1367.942767,0.009330
1221,PA-SLZ,GUATDU,BNL12145,20170222,14:07:27,31,Y1,Y1,Derek,PSC-0570,...,29.958687,4.250350,0.235803,3.378858,0.149157,161.208717,0.916819,76.652911,1764.937564,0.007556


## Clean up headers, descriptions, units

In [167]:
def clean_values(values):
    """
    Cleans the given pandas Index (or Series) of string values by:
    - Stripping leading/trailing whitespace
    - Removing stray quotes
    - Normalizing whitespace and commas
    - Joining related terms
    
    Parameters:
        values (pd.Index or pd.Series): The values to be cleaned.

    Returns:
        pd.Index: The cleaned values.
    """
    cleaned_values = (
        values
        .str.strip()  # Remove leading/trailing whitespace
        .str.replace(r"(^['\"]|['\"]$)", '', regex=True)  # Remove leading/trailing quotes
        .str.replace(r"\s*,\s*", ', ', regex=True)  # Normalize commas with single space after them
        .str.replace(r"\s+", ' ', regex=True)  # Normalize whitespace within strings
        .str.replace(r"' '", '', regex=True)  # Remove isolated single quotes
        .str.replace(r"([a-zA-Z]),([a-zA-Z])", r"\1, \2", regex=True)  # Add space after commas if missing
    )

    # Handle cases like ' evergreen, deciduous '
    cleaned_values = cleaned_values.str.replace(r"['‘’]", '', regex=True)  # Remove any stray single quotes
    cleaned_values = cleaned_values.str.replace(r"\b ,\b", ',', regex=True)  # Fix any stray commas

    return cleaned_values

## 1. Units

In [168]:
# clean units
values = data.columns.get_level_values(2)
cleaned_l2_values = clean_values(values)
data.columns = pd.MultiIndex.from_arrays([
    data.columns.get_level_values(0),
    data.columns.get_level_values(1),
    cleaned_l2_values
])

In [169]:
replacement_l2_vals = {'Kelvin':'K',
                       'W/m2':'W m-2',
                       'alphanumeric':1,
                       'cm2':'cm-2',
                       'degrees Celsius':'degree_C',
                       'integer':1,
                       'mmol H2O m-2 s-1)':'mmol m-2 s-1',
                       'mmol H2O mol-1':'mmol mol-1',
                       'mol H2O m-2 s-1':'mol m-2 s-1',
                       'mol/m2/s':'mol m-2 s-1',
                       'numeric':1,
                       'numeric percentage':'%',
                       'numeric ratio':1,
                       'text':1,
                       'μmol CO2 m-2 s-1':'μmol m-2 s-1',
                       'μmol CO2 mol-1':'μmol mol-1'}

In [170]:
# standardize l2 unit values
# Get the levels from the MultiIndex
level_0 = data.columns.get_level_values(0)
level_1 = data.columns.get_level_values(1)
level_2 = data.columns.get_level_values(2).to_series().replace(replacement_l2_vals)

# Reassign the modified levels back to all_data.columns as a new MultiIndex
data.columns = pd.MultiIndex.from_arrays([level_0, level_1, level_2])

## 2. Descriptions

In [171]:
# clean descriptions
values = data.columns.get_level_values(1)
cleaned_l1_values = clean_values(values)
data.columns = pd.MultiIndex.from_arrays([
    data.columns.get_level_values(0),
    cleaned_l1_values,
    data.columns.get_level_values(2),
])

## 3. Variables

In [172]:
values = data.columns.get_level_values(0)
cleaned_l0_values = clean_values(values)
data.columns = pd.MultiIndex.from_arrays([
    cleaned_l0_values,
    data.columns.get_level_values(1),
    data.columns.get_level_values(2),
])

In [173]:
data

Unnamed: 0_level_0,site,species,sample_id,yyyymmdd,hhmmss,branch_number,leaf_age,leaf_age_revised,machine_name,serial,...,ctair,svtair,cndtotal,vp_kpa,cndco2,ci_pa,ci/ca,rhsfc,c2sfc,ahs/cs
Description,Canopy crane location,STRI species code,sample number,Measurement date,Measurement local time,Branch ID to differentiate branches of each species,Leaf age as assessed at time of sampling + Leaf count from youngest leaf on stem,Updated leaf age following reassessment against photographs + Leaf count from youngest leaf on stem,Licor 6400XT instrument name,Licor 6400XT instrument serial number,...,Computed chamber air temp,SatVap(Tair),Total conductance,vapor pressure chamber air,Total Conductance to CO2,Intercellular CO2,Intercellular CO2 / Ambient CO2,Surface humidity,Surface CO2,Ball-Berry parameter
Units,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,Pa,1,%,μmol mol-1,1
0,PA-SLZ,TERMAM,BNL11837,20170216,8:25:44,1,M2,M2,Mariano,PSC-0464,...,28.309216,3.863787,0.195816,2.983829,0.123610,28.223604,0.736047,76.650681,379.126938,0.024072
1,PA-SLZ,TERMAM,BNL11837,20170216,8:27:06,1,M2,M2,Mariano,PSC-0464,...,28.318286,3.865825,0.196425,2.990343,0.123999,23.222696,0.744784,76.771822,308.427569,0.023330
2,PA-SLZ,TERMAM,BNL11837,20170216,8:28:28,1,M2,M2,Mariano,PSC-0464,...,28.356964,3.874530,0.200272,3.001679,0.126452,18.280311,0.760298,76.841053,237.989520,0.022255
3,PA-SLZ,TERMAM,BNL11837,20170216,8:29:50,1,M2,M2,Mariano,PSC-0464,...,28.361144,3.875472,0.202453,3.009167,0.127843,13.415526,0.793715,77.029217,167.623377,0.019166
4,PA-SLZ,TERMAM,BNL11837,20170216,8:31:12,1,M2,M2,Mariano,PSC-0464,...,28.387931,3.881511,0.206393,3.020660,0.130357,8.649342,0.883591,77.164247,97.605067,0.010306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1218,PA-SLZ,GUATDU,BNL12145,20170222,14:01:12,31,Y1,Y1,Derek,PSC-0570,...,29.932375,4.243929,0.246933,3.402782,0.156286,67.361114,0.869760,77.428780,774.330152,0.013890
1219,PA-SLZ,GUATDU,BNL12145,20170222,14:02:58,31,Y1,Y1,Derek,PSC-0570,...,29.915181,4.239737,0.243835,3.393303,0.154301,85.829341,0.883882,77.275124,972.053924,0.011949
1220,PA-SLZ,GUATDU,BNL12145,20170222,14:05:08,31,Y1,Y1,Derek,PSC-0570,...,29.911918,4.238942,0.238757,3.382065,0.151049,123.236254,0.903276,77.008070,1367.942767,0.009330
1221,PA-SLZ,GUATDU,BNL12145,20170222,14:07:27,31,Y1,Y1,Derek,PSC-0570,...,29.958687,4.250350,0.235803,3.378858,0.149157,161.208717,0.916819,76.652911,1764.937564,0.007556


In [174]:
data.to_csv('data/2017_SLZ_BNL_ACi_20190328_20220608183440/2017_SLZ_ACi_comp_data.csv', index=False)

In [195]:
sorted(data.columns.to_list())

[('ahs/cs', 'Ball-Berry parameter', 1),
 ('alphak', 'Used in the conversion of μmol mol-1 to W m-2', 1),
 ('area', 'Leaf area in chamber', 'cm-2'),
 ('blc_1', 'One sided BLC', 1),
 ('blcoffst', 'offset as function of area', 1),
 ('blcond',
  'Total boundary layer conductance for the leaf (includes stomatal ratio)',
  'mol m-2 s-1'),
 ('blcslope', 'slope as function of area', 1),
 ('branch_number', 'Branch ID to differentiate branches of each species', 1),
 ('c2sfc', 'Surface CO2', 'μmol mol-1'),
 ('ci', 'Intercellular CO2 concentration', 'μmol mol-1'),
 ('ci/ca', 'Intercellular CO2 / Ambient CO2', 1),
 ('ci_pa', 'Intercellular CO2', 'Pa'),
 ('cndco2', 'Total Conductance to CO2', 1),
 ('cndtotal', 'Total conductance', 1),
 ('co2r', 'Reference cell CO2', 'μmol mol-1'),
 ('co2s', 'Sample cell CO2', 'μmol mol-1'),
 ('cond', 'Conductance to H2O', 'mol m-2 s-1'),
 ('csmch', 'Sample CO2 offset', 'μmol mol-1'),
 ('ctair', 'Computed chamber air temp', 1),
 ('ctleaf',
  'Computed leaf temp (C). 

In [205]:
data[('site', 'Canopy crane location', 1)].unique()

array(['PA-SLZ'], dtype=object)