In [23]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import chardet
import glob
import os
import shutil
from fuzzywuzzy import process

---
# Load LCE Data

In [24]:
# Read the LCE site-level measurements and the data dictionary describing them
data_path = '../data/smith_2017/LCE_data_utf8.csv'
meta_path = '../data/smith_2017/LCE_dictionary_utf8.csv'
meta = pd.read_csv(meta_path)
data = pd.read_csv(data_path, index_col=0)

# Stack two metadata rows on top of the measurements: row 0 carries each
# column's 'Description' and row 1 its 'Units', both looked up in the
# dictionary by the column's name under 'Variable'.
data_with_meta = pd.concat(
    [
        *(
            pd.DataFrame(
                [meta.set_index('Variable').loc[data.columns, field].values],
                columns=data.columns,
            )
            for field in ('Description', 'Units')
        ),
        data,
    ],
    ignore_index=True,
)

# Distinct unit strings currently present in the Units row (row 1)
data_with_meta.iloc[1].unique().tolist()

['mm/dd/yy',
 'unitless',
 '¡',
 'µmol/m2/s',
 '¡C',
 'cm2',
 'g',
 'g/g',
 '%',
 'Mpa',
 'm',
 'cm',
 'mm/mm',
 'mm',
 'kPa',
 'mol m-2 s-1']

### Fix unit names

In [25]:
# Lookup table that normalises the raw unit strings found in the Units row
# (row 1) of `data_with_meta`; dimensionless entries are mapped to 1.
replacement_dict = dict([
    ('unitless', 1),
    ('¡', 'degree'),      # presumably a mis-encoded degree sign in the source file
    ('µmol/m2/s', 'µmol m-2 s-1'),
    ('¡C', 'degree_C'),
    # NOTE(review): 'cm-2' reads as inverse area; if this unit belongs to an
    # area column, 'cm2' is probably intended. Confirm before changing: the
    # unit string is part of the column key used when the tables are merged.
    ('cm2', 'cm-2'),
    ('g/g', 1),
    ('Mpa', 'MPa'),
    ('mm/mm', 1),
])

In [26]:
# Normalise the unit strings held in the Units row (row 1).
# NOTE: this uses whichever `replacement_dict` is currently defined in the kernel.
# NOTE: not idempotent; running this cell twice without reloading would treat
# the first two data rows as metadata.
data_with_meta.iloc[1] = data_with_meta.iloc[1].replace(replacement_dict)

# Promote the metadata rows to header levels: (variable, description, unit)
new_header = pd.MultiIndex.from_arrays(
    [data_with_meta.columns, data_with_meta.iloc[0], data_with_meta.iloc[1]]
)
data_with_meta.columns = new_header

# The two metadata rows now live in the header; drop them and renumber from 0
data_with_meta = data_with_meta.iloc[2:].reset_index(drop=True)

# Rebuild the header with every level cast to str and any '/' left in the
# unit level replaced by '-'
data_with_meta.columns = pd.MultiIndex.from_tuples([
    (str(variable), str(description), str(unit).replace('/', '-'))
    for variable, description, unit in data_with_meta.columns
])

In [27]:
# Preview the first three measurement rows under the new three-level header
data_with_meta.head(n=3)

Unnamed: 0_level_0,Date,Year,Day,Location,Lat,Lon,Genus,Species,Rep,Phenology,...,Tavg90,Tavg91,ai,Temperature_gs,Precipitation_gs,VPD_gs,PAR_gs,aci_id,leaf_shape,Cond
Unnamed: 0_level_1,Date of measurement,Year of measurement,Day of year of measurement,Location of measurement,Latitude,Longitude,Genus of individual,Species of individual,Repitition of species within a location,Indicator of deciduousness of individual (evergreen or nonevergreen),...,Average temperature on the day of and n days prior to measuremet; n = 89,Average temperature on the day of and n days prior to measuremet; n = 90,Aridity index (mean annual precipitation / potential evapotranspiration),Average temperature during the growing season from 1901-2015 (growing season = all months where average temperature > 0),Average precipitation during the growing season from 1901-2015 (growing season = all months where average temperature > 0),Average leaf to air vapor pressure deficit during the growing season from 1901-2015 (growing season = all months where average temperature > 0),Average photosynthetically active radiation during the growing season from 1901-2015 (growing season = all months where average temperature > 0),id corresponding to A/Ci curve file,shape of leaf for individual (broadleaf or needleleaf),Mean stomatal conductance for Aci curve
Unnamed: 0_level_2,mm-dd-yy,1,1,1,degree,degree,1,1,1,1,...,degree_C,degree_C,1,degree_C,mm,kPa,µmol m-2 s-1,1,1,mol m-2 s-1
0,5/19/15,2015,139,LaSelva,10.42,-84.02,Albizia,adinocephala,1,Evergreen,...,23.721292,23.713365,2.5834,23.118304,325.055357,0.497027,982.702895,LaSelva_Aadi_1,broadleaf,0.041948
1,5/21/15,2015,141,LaSelva,10.42,-84.02,Alchornea,costaricensis,1,Evergreen,...,23.719903,23.710206,2.5834,23.118304,325.055357,0.497027,982.702895,LaSelva_Acos_1,broadleaf,0.036077
2,5/22/15,2015,142,LaSelva,10.42,-84.02,Annona,papilionella,1,Nonevergreen,...,23.733482,23.729955,2.5834,23.118304,325.055357,0.497027,982.702895,LaSelva_Apap_1,broadleaf,0.100705


---
# Load LCE ACi Curve data

In [28]:
# Load every A/Ci curve CSV found (recursively) under the curves directory
curve_dir = '../data/smith_2017/LCE_ACi_curves'

# glob returns matches in a filesystem-dependent order; sort the paths so the
# row order of `all_curves` (and of anything exported from it) is reproducible.
curve_files = sorted(glob.glob(os.path.join(curve_dir, '**', '*.csv'), recursive=True))
if not curve_files:
    # Fail with an actionable message instead of pd.concat's generic error
    raise FileNotFoundError(f'No A/Ci curve CSV files found under {curve_dir!r}')

# Each file's first column is read as the row index
curves = [pd.read_csv(file, index_col=0) for file in curve_files]
all_curves = pd.concat(curves)

# Remove the 'X' and 'SetLA' columns, which are not carried forward
all_curves = all_curves.drop(columns=['X', 'SetLA'])
all_curves.head(3)

Unnamed: 0,Photo,Cond,Ci,Trmmol,VPD,LA,StmRat,BLCond,Tair,Tleaf,...,CO2R,CO2S,H2OR,H2OS,RH_R,RH_S,Flow,Pari,Press,id
1,0.3,0.0182,168.0,0.323,1.74,6.0,0.5,2.25,23.34,24.74,...,200.03,199.3,13.366,14.003,46.23,48.42,300.2,1801.0,99.5,UMBS_Qrub_12
2,-0.195,0.0226,111.0,0.41,1.78,6.0,0.5,2.25,23.67,24.96,...,99.54,99.84,13.176,13.984,44.7,47.38,300.2,1800.0,99.5,UMBS_Qrub_12
3,-0.514,0.0318,76.5,0.588,1.82,6.0,0.5,2.25,23.76,25.27,...,51.26,52.23,12.998,14.156,43.81,47.71,300.2,1799.0,99.5,UMBS_Qrub_12


In [29]:
# Read the data dictionary for the A/Ci curve files.
# NOTE: this rebinds `meta_path` and `meta`, replacing the site-data dictionary.
meta_path = '../data/smith_2017/LCE_ACi_dictionary_utf8.csv'
meta = pd.read_csv(meta_path)

# Stack two metadata rows on top of the curve data: row 0 carries each
# column's 'Description' and row 1 its 'Units', both looked up in the
# dictionary by the column's name under 'Variable'.
curve_data_with_meta = pd.concat(
    [
        *(
            pd.DataFrame(
                [meta.set_index('Variable').loc[all_curves.columns, field].values],
                columns=all_curves.columns,
            )
            for field in ('Description', 'Units')
        ),
        all_curves,
    ],
    ignore_index=True,
)

# Distinct unit strings currently present in the Units row (row 1)
curve_data_with_meta.iloc[1].unique().tolist()

['µmol/m2/s',
 'mol/m2/s',
 'µmol/mol',
 'mmol/m2/s',
 'kPa',
 'cm2',
 nan,
 '°C',
 'mmol/mol',
 '%',
 'µmol/s']

In [30]:
# Lookup table that normalises the raw unit strings of the A/Ci curve
# dictionary; a missing unit (NaN) is mapped to 1.
# NOTE: this rebinds `replacement_dict`, replacing the site-data mapping.
replacement_dict = dict([
    ('µmol/m2/s', 'µmol m-2 s-1'),
    ('mol/m2/s', 'mol m-2 s-1'),
    ('µmol/mol', 'µmol mol-1'),
    ('mmol/m2/s', 'mmol m-2 s-1'),
    # NOTE(review): 'cm-2' reads as inverse area; if this unit belongs to an
    # area column, 'cm2' is probably intended. Confirm before changing: the
    # unit string is part of the column key used when the tables are merged.
    ('cm2', 'cm-2'),
    (np.nan, 1),
    ('°C', 'degree_C'),
    ('mmol/mol', 'mmol mol-1'),
    ('µmol/s', 'µmol s-1'),
])

In [31]:
# Normalise the unit strings held in the Units row (row 1).
# NOTE: this uses whichever `replacement_dict` is currently defined in the kernel.
# NOTE: not idempotent; running this cell twice without reloading would treat
# the first two data rows as metadata.
curve_data_with_meta.iloc[1] = curve_data_with_meta.iloc[1].replace(replacement_dict)

# Promote the metadata rows to header levels: (variable, description, unit)
new_header = pd.MultiIndex.from_arrays(
    [
        curve_data_with_meta.columns,
        curve_data_with_meta.iloc[0],
        curve_data_with_meta.iloc[1],
    ]
)
curve_data_with_meta.columns = new_header

# The two metadata rows now live in the header; drop them and renumber from 0
curve_data_with_meta = curve_data_with_meta.iloc[2:].reset_index(drop=True)

# Rebuild the header with every level cast to str and any '/' left in the
# unit level replaced by '-'
curve_data_with_meta.columns = pd.MultiIndex.from_tuples([
    (str(variable), str(description), str(unit).replace('/', '-'))
    for variable, description, unit in curve_data_with_meta.columns
])
curve_data_with_meta.head(n=3)

Unnamed: 0_level_0,Photo,Cond,Ci,Trmmol,VPD,LA,StmRat,BLCond,Tair,Tleaf,...,CO2R,CO2S,H2OR,H2OS,RH_R,RH_S,Flow,Pari,Press,id
Unnamed: 0_level_1,Net photosynthetic rate,Stomatal conductance,Intracellular CO2,Transpiration rate,Vapor pressure deifcit,Leaf area,Ratio of stomatal density from one sideo f leaf to other,Boundary layer conductance,Air temprature,Leaf temperature,...,Reference CO2,Sample CO2,Reference H2O,Sample H2O,Reference relative humidity,Sample relative humidity,Flow rate,Photosynthetically active radiation inside the chamber,Atmospheric pressure,Identifier for Aci curve
Unnamed: 0_level_2,µmol m-2 s-1,mol m-2 s-1,µmol mol-1,mmol m-2 s-1,kPa,cm-2,1,mol m-2 s-1,degree_C,degree_C,...,µmol mol-1,µmol mol-1,mmol mol-1,mmol mol-1,%,%,µmol s-1,µmol m-2 s-1,kPa,1
0,0.3,0.0182,168.0,0.323,1.74,6.0,0.5,2.25,23.34,24.74,...,200.03,199.3,13.366,14.003,46.23,48.42,300.2,1801.0,99.5,UMBS_Qrub_12
1,-0.195,0.0226,111.0,0.41,1.78,6.0,0.5,2.25,23.67,24.96,...,99.54,99.84,13.176,13.984,44.7,47.38,300.2,1800.0,99.5,UMBS_Qrub_12
2,-0.514,0.0318,76.5,0.588,1.82,6.0,0.5,2.25,23.76,25.27,...,51.26,52.23,12.998,14.156,43.81,47.71,300.2,1799.0,99.5,UMBS_Qrub_12


In [32]:
# Show every (variable, description, unit) header tuple of the curve table
curve_data_with_meta.columns.tolist()

[('Photo', 'Net photosynthetic rate', 'µmol m-2 s-1'),
 ('Cond', 'Stomatal conductance', 'mol m-2 s-1'),
 ('Ci', 'Intracellular CO2', 'µmol mol-1'),
 ('Trmmol', 'Transpiration rate', 'mmol m-2 s-1'),
 ('VPD', 'Vapor pressure deifcit', 'kPa'),
 ('LA', 'Leaf area', 'cm-2'),
 ('StmRat', 'Ratio of stomatal density from one sideo f leaf to other', '1'),
 ('BLCond', 'Boundary layer conductance', 'mol m-2 s-1'),
 ('Tair', 'Air temprature', 'degree_C'),
 ('Tleaf', 'Leaf temperature', 'degree_C'),
 ('TBlk', 'Block temperature', 'degree_C'),
 ('CO2R', 'Reference CO2', 'µmol mol-1'),
 ('CO2S', 'Sample CO2', 'µmol mol-1'),
 ('H2OR', 'Reference H2O', 'mmol mol-1'),
 ('H2OS', 'Sample H2O', 'mmol mol-1'),
 ('RH_R', 'Reference relative humidity', '%'),
 ('RH_S', 'Sample relative humidity', '%'),
 ('Flow', 'Flow rate', 'µmol s-1'),
 ('Pari',
  'Photosynthetically active radiation inside the chamber',
  'µmol m-2 s-1'),
 ('Press', 'Atmospheric pressure', 'kPa'),
 ('id', 'Identifier for Aci curve', '1')]

---
# Join LCE data to ACi Curve Data

In [33]:
# Attach the site-level record to every A/Ci curve row by matching the curve
# table's `id` column against the site table's `aci_id` column. A left join
# keeps every curve row; columns present in both tables get '_x' / '_y'.
merged = pd.merge(
    curve_data_with_meta,
    data_with_meta,
    how='left',
    left_on=[('id', 'Identifier for Aci curve', '1')],
    right_on=[('aci_id', 'id corresponding to A/Ci curve file', '1')],
    suffixes=('_x', '_y'),
)
merged

Unnamed: 0_level_0,Photo,Cond,Ci,Trmmol,VPD,LA_x,StmRat,BLCond,Tair,Tleaf,...,Tavg90,Tavg91,ai,Temperature_gs,Precipitation_gs,VPD_gs,PAR_gs,aci_id,leaf_shape,Cond
Unnamed: 0_level_1,Net photosynthetic rate,Stomatal conductance,Intracellular CO2,Transpiration rate,Vapor pressure deifcit,Leaf area,Ratio of stomatal density from one sideo f leaf to other,Boundary layer conductance,Air temprature,Leaf temperature,...,Average temperature on the day of and n days prior to measuremet; n = 89,Average temperature on the day of and n days prior to measuremet; n = 90,Aridity index (mean annual precipitation / potential evapotranspiration),Average temperature during the growing season from 1901-2015 (growing season = all months where average temperature > 0),Average precipitation during the growing season from 1901-2015 (growing season = all months where average temperature > 0),Average leaf to air vapor pressure deficit during the growing season from 1901-2015 (growing season = all months where average temperature > 0),Average photosynthetically active radiation during the growing season from 1901-2015 (growing season = all months where average temperature > 0),id corresponding to A/Ci curve file,shape of leaf for individual (broadleaf or needleleaf),Mean stomatal conductance for Aci curve
Unnamed: 0_level_2,µmol m-2 s-1,mol m-2 s-1,µmol mol-1,mmol m-2 s-1,kPa,cm-2,1,mol m-2 s-1,degree_C,degree_C,...,degree_C,degree_C,1,degree_C,mm,kPa,µmol m-2 s-1,1,1,mol m-2 s-1
0,0.3,0.0182,168.0,0.323,1.74,6.0,0.5,2.25,23.34,24.74,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
1,-0.195,0.0226,111.0,0.41,1.78,6.0,0.5,2.25,23.67,24.96,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
2,-0.514,0.0318,76.5,0.588,1.82,6.0,0.5,2.25,23.76,25.27,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
3,4.65,0.0451,213.0,0.798,1.76,6.0,0.5,2.25,22.39,25.12,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
4,5.39,0.0484,198.0,0.842,1.73,6.0,0.5,2.25,22.34,25.06,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7570,8.32,0.0361,396.0,0.496,1.36,6.0,0.5,2.25,23.01,24.87,...,9.875425,9.753717,0.8511,13.313639,84.415825,0.458358,727.120837,Blandy_Mgra_4,broadleaf,0.032431
7571,10.8,0.0379,500.0,0.524,1.37,6.0,0.5,2.25,23.05,24.96,...,9.875425,9.753717,0.8511,13.313639,84.415825,0.458358,727.120837,Blandy_Mgra_4,broadleaf,0.032431
7572,13.0,0.0389,612.0,0.542,1.38,6.0,0.5,2.25,23.08,25.05,...,9.875425,9.753717,0.8511,13.313639,84.415825,0.458358,727.120837,Blandy_Mgra_4,broadleaf,0.032431
7573,15.8,0.0398,795.0,0.554,1.38,6.0,0.5,2.25,23.1,25.08,...,9.875425,9.753717,0.8511,13.313639,84.415825,0.458358,727.120837,Blandy_Mgra_4,broadleaf,0.032431


### Check overlapping columns and adjust names

In [34]:
# Report every header label (at any level) that picked up a merge suffix
col_list = merged.columns.to_list()
for label in (part for column in col_list for part in column):
    if label.endswith(('_x', '_y')):
        print(label)

LA_x
LA_y


In [35]:
# Discard the right-hand copy ('LA_y') of the overlapping column, then remove
# rows that are exact duplicates.
# NOTE(review): the surviving column keeps its suffixed label 'LA_x'; confirm
# whether it should be renamed, and that 'LA_y' really duplicates 'LA_x'.
df = merged.drop(columns=['LA_y'], level=0).drop_duplicates()
df

Unnamed: 0_level_0,Photo,Cond,Ci,Trmmol,VPD,LA_x,StmRat,BLCond,Tair,Tleaf,...,Tavg90,Tavg91,ai,Temperature_gs,Precipitation_gs,VPD_gs,PAR_gs,aci_id,leaf_shape,Cond
Unnamed: 0_level_1,Net photosynthetic rate,Stomatal conductance,Intracellular CO2,Transpiration rate,Vapor pressure deifcit,Leaf area,Ratio of stomatal density from one sideo f leaf to other,Boundary layer conductance,Air temprature,Leaf temperature,...,Average temperature on the day of and n days prior to measuremet; n = 89,Average temperature on the day of and n days prior to measuremet; n = 90,Aridity index (mean annual precipitation / potential evapotranspiration),Average temperature during the growing season from 1901-2015 (growing season = all months where average temperature > 0),Average precipitation during the growing season from 1901-2015 (growing season = all months where average temperature > 0),Average leaf to air vapor pressure deficit during the growing season from 1901-2015 (growing season = all months where average temperature > 0),Average photosynthetically active radiation during the growing season from 1901-2015 (growing season = all months where average temperature > 0),id corresponding to A/Ci curve file,shape of leaf for individual (broadleaf or needleleaf),Mean stomatal conductance for Aci curve
Unnamed: 0_level_2,µmol m-2 s-1,mol m-2 s-1,µmol mol-1,mmol m-2 s-1,kPa,cm-2,1,mol m-2 s-1,degree_C,degree_C,...,degree_C,degree_C,1,degree_C,mm,kPa,µmol m-2 s-1,1,1,mol m-2 s-1
0,0.3,0.0182,168.0,0.323,1.74,6.0,0.5,2.25,23.34,24.74,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
1,-0.195,0.0226,111.0,0.41,1.78,6.0,0.5,2.25,23.67,24.96,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
2,-0.514,0.0318,76.5,0.588,1.82,6.0,0.5,2.25,23.76,25.27,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
3,4.65,0.0451,213.0,0.798,1.76,6.0,0.5,2.25,22.39,25.12,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
4,5.39,0.0484,198.0,0.842,1.73,6.0,0.5,2.25,22.34,25.06,...,12.304135,12.212661,0.9386,11.939913,78.045043,0.366431,815.654972,UMBS_Qrub_12,broadleaf,0.039091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7570,8.32,0.0361,396.0,0.496,1.36,6.0,0.5,2.25,23.01,24.87,...,9.875425,9.753717,0.8511,13.313639,84.415825,0.458358,727.120837,Blandy_Mgra_4,broadleaf,0.032431
7571,10.8,0.0379,500.0,0.524,1.37,6.0,0.5,2.25,23.05,24.96,...,9.875425,9.753717,0.8511,13.313639,84.415825,0.458358,727.120837,Blandy_Mgra_4,broadleaf,0.032431
7572,13.0,0.0389,612.0,0.542,1.38,6.0,0.5,2.25,23.08,25.05,...,9.875425,9.753717,0.8511,13.313639,84.415825,0.458358,727.120837,Blandy_Mgra_4,broadleaf,0.032431
7573,15.8,0.0398,795.0,0.554,1.38,6.0,0.5,2.25,23.1,25.08,...,9.875425,9.753717,0.8511,13.313639,84.415825,0.458358,727.120837,Blandy_Mgra_4,broadleaf,0.032431


In [41]:
# sorted(df.columns.get_level_values(0).to_list())

In [22]:
# Export the merged table; index=False leaves the row index out of the file.
# NOTE(review): this cell's execution count is lower than the cells above it;
# re-run the notebook top to bottom before trusting the exported file.
df.to_csv(path_or_buf='../data/smith_2017/smith_2017_data.csv', index=False)