In [13]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import chardet
import glob
import os
import shutil
from fuzzywuzzy import process

# Load data

In [14]:
# Load the anderegg_2018 site data (LCE) from CSV.
# index_col=0 turns the first CSV column into the row index instead of a data column.
data_path = '../data/anderegg_2018/anderegg_2018_utf8.csv'
data = pd.read_csv(data_path, index_col=0)

In [15]:
# List the column names of `data`; per the author's note they are identical
# to the columns of the lin 2015 dataset.
data.columns

Index(['Pathway', 'Type', 'Plantform', 'Leafspan', 'Tregion', 'Wregion',
       'Wregion2', 'opt', 'Date', 'Time', 'Datacontrib', 'Species', 'Funtype',
       'fitgroup', 'Location', 'RH', 'Tair', 'Tleaf', 'CO2S', 'PARin', 'Patm',
       'BLCond', 'Trmmol', 'SWC', 'SWP', 'latitude', 'longitude',
       'Totalheight', 'LAI', 'sampleheight', 'canopyposition', 'LWP',
       'LWPpredawn', 'Instrument', 'Season', 'GrowthCa', 'GrowthTair',
       'Growthcond', 'Treatment', 'OriginalFile', 'Comments', 'Reference',
       'LightSource', 'Cond', 'Photo', 'VPD', 'Ci'],
      dtype='object')

In [16]:
# Load the lin_2015 CSV with pandas defaults (no index column is set, so the
# file's first column shows up as a regular column).
# NOTE(review): in the displayed output, row 0 holds a text description of
# each column and row 1 its unit, with measurements starting at row 2 --
# confirm against the file before relying on this layout.
metadata_path = '../data/lin_2015/lin_2015_data.csv'
metadata = pd.read_csv(metadata_path)
metadata

Unnamed: 0,Pathway,Type,Plantform,Leafspan,Tregion,Wregion,Wregion2,opt,Date,Time,...,Growthcond,Treatment,OriginalFile,Comments,Reference,LightSource,Cond,Photo,VPD,Ci
0,Photosynthetic pathway,Gymnosperm or angiosperm,Plant life form,Evergreen or Deciduous,Biome,Classification based on aridity index,Moisture index,Growing under ambient or stressed condition,Date of the year when measurement conducted,Time of the day when measurement conducted,...,Growth condition,Experimental treatment,Original file name from data contributor,Anything useful related to the data,Referee associated with the data set,Light source used in the cuvette,Stomatal conductance,Photosynthetic rate,Vapour pressure deficit,Intercelllular [CO2]
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,mol m-2 s-1,µmol m-2 s-1,kPa,ppm
2,C3,angiosperm,shrub,deciduous,Arctic,dry sub-humid,0.546,opt,20130719,17:35:19,...,field,none,JY19_Jorge_Survey,spot measurements at ambient,Rogers unpublished,-9999,0.429072453,10.67373846,0.353032749,343.4368737
3,C3,angiosperm,shrub,deciduous,Arctic,dry sub-humid,0.546,opt,20130719,18:00:26,...,field,none,JY19_Jorge_Survey,spot measurements at ambient,Rogers unpublished,-9999,0.358063524,12.79332032,0.376239308,320.0955589
4,C3,angiosperm,shrub,deciduous,Arctic,dry sub-humid,0.546,opt,20130719,18:13:21,...,field,none,JY19_Jorge_Survey,spot measurements at ambient,Rogers unpublished,-9999,0.35478348,7.150691615,0.391061634,353.2007844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15159,C3,angiosperm,tree,deciduous,temperate,humid,1.055,opt,31-Aug,14:14,...,field,none,none,Automated branch cuvette,Linderson et al_2012_AFM,none,0.305935816,9.4,0.693282673,254.1
15160,C3,angiosperm,tree,deciduous,temperate,humid,1.055,opt,31-Aug,14:18,...,field,none,none,Automated branch cuvette,Linderson et al_2012_AFM,none,0.207549418,7.8,0.973262183,250.1
15161,C3,angiosperm,tree,deciduous,temperate,humid,1.055,opt,31-Aug,14:19,...,field,none,none,Automated branch cuvette,Linderson et al_2012_AFM,none,0.19601083,6.5,1.030555299,265.7
15162,C3,angiosperm,tree,deciduous,temperate,humid,1.055,opt,31-Aug,14:23,...,field,none,none,Automated branch cuvette,Linderson et al_2012_AFM,none,0.238387065,10.7,1.059201764,223.5


In [17]:
# Take a private copy so the frame held in `metadata` is never altered.
units_df = metadata.copy()
# Keep only the first level of the column labels.
units_df.columns = units_df.columns.get_level_values(0)
# The two leading rows are what later cells use as extra header levels.
headers = units_df.head(2)

In [18]:
# Attach the two rows of `headers` as additional column levels on `data`,
# giving each column a (name, row-0 value, row-1 value) key.

# Step 1: Take the plain column names of `data`. `get_level_values(0)` returns
# the names both when the columns are still flat and when they already carry
# the multi-level header, so re-running this cell rebuilds the same header
# instead of reindexing against tuples.
level_0 = data.columns.get_level_values(0)

# Step 2: Reorder the columns of `headers` to match `data`; any name missing
# from `headers` comes back as NaN.
headers_reindexed = headers.reindex(columns=level_0)

# Step 3: Extract the reordered rows of `headers` to use as header levels
level_1 = headers_reindexed.iloc[0].values
level_2 = headers_reindexed.iloc[1].values

# Step 4: Create the multi-level header and assign it to `data`
multi_header = pd.MultiIndex.from_arrays([level_0, level_1, level_2])
data.columns = multi_header

In [19]:
# Display `data` to check the column header after the previous cell.
data

Unnamed: 0_level_0,Pathway,Type,Plantform,Leafspan,Tregion,Wregion,Wregion2,opt,Date,Time,...,Growthcond,Treatment,OriginalFile,Comments,Reference,LightSource,Cond,Photo,VPD,Ci
Unnamed: 0_level_1,Photosynthetic pathway,Gymnosperm or angiosperm,Plant life form,Evergreen or Deciduous,Biome,Classification based on aridity index,Moisture index,Growing under ambient or stressed condition,Date of the year when measurement conducted,Time of the day when measurement conducted,...,Growth condition,Experimental treatment,Original file name from data contributor,Anything useful related to the data,Referee associated with the data set,Light source used in the cuvette,Stomatal conductance,Photosynthetic rate,Vapour pressure deficit,Intercelllular [CO2]
Unnamed: 0_level_2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,mol m-2 s-1,µmol m-2 s-1,kPa,ppm
7660,C3,angiosperm,tree,evergreen,temperate,humid,0.977,opt,20/07/2009,7:03:51,...,Field,Wet site of the Gradient,-9999,-9999,Martin-StPaul et al 2012,-9999.0,0.051325,7.037894,1.830000,165.275368
7661,C3,angiosperm,tree,evergreen,temperate,humid,0.977,opt,28/04/2009,7:27:24,...,Field,Wet site of the Gradient,-9999,-9999,Martin-StPaul et al 2012,-9999.0,0.137396,13.168188,1.360000,225.796292
7662,C3,angiosperm,tree,evergreen,temperate,humid,0.977,opt,27/11/2009,8:52:30,...,Field,Wet site of the Gradient,-9999,-9999,Martin-StPaul et al 2012,-9999.0,0.175809,14.126326,1.310000,251.358395
7663,C3,angiosperm,tree,evergreen,temperate,humid,0.977,opt,21/06/2009,11:12:15,...,Field,Wet site of the Gradient,-9999,-9999,Martin-StPaul et al 2012,-9999.0,0.065359,6.721385,1.910000,215.880258
7664,C3,angiosperm,tree,evergreen,temperate,humid,0.977,opt,20/07/2009,11:30:24,...,Field,Wet site of the Gradient,-9999,-9999,Martin-StPaul et al 2012,-9999.0,0.041581,3.526875,4.370000,236.524749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5453,,,,,,,,,,,...,,,,,,,0.412063,20.327120,2.135457,227.486152
5454,,,,,,,,,,,...,,,,,,,0.266069,19.286368,2.480808,191.230231
5455,,,,,,,,,,,...,,,,,,,0.263950,18.756451,2.241162,196.043212
5456,,,,,,,,,,,...,,,,,,,0.344322,20.703066,2.046599,210.016550


In [21]:
# List every column key of `data` in sorted order.
column_keys = data.columns.to_list()
sorted(column_keys)

[('BLCond', 'Boundary layer conductance', 'mol m-2 s-1'),
 ('CO2S', 'CO2 concentration at leaf surface', 'ppm'),
 ('Ci', 'Intercelllular [CO2]', 'ppm'),
 ('Comments', 'Anything useful related to the data', '1'),
 ('Cond', 'Stomatal conductance', 'mol m-2 s-1'),
 ('Datacontrib', 'Name of the data contributor', '1'),
 ('Date', 'Date of the year when measurement conducted', '1'),
 ('Funtype', 'Plant functional type (optional', '1'),
 ('GrowthCa', 'Growth [CO2]', '1'),
 ('GrowthTair', 'Growth Tair', '1'),
 ('Growthcond', 'Growth condition', '1'),
 ('Instrument', 'Instrument used for measurement', '1'),
 ('LAI', 'Leaf area index', 'm-2 m-2'),
 ('LWP', 'Leaf water potential', 'MPa'),
 ('LWPpredawn', 'Pre-dawn leaf water potential', 'MPa'),
 ('Leafspan', 'Evergreen or Deciduous', '1'),
 ('LightSource', 'Light source used in the cuvette', '1'),
 ('Location', 'Site location', '1'),
 ('OriginalFile', 'Original file name from data contributor', '1'),
 ('PARin', 'Photosynthetically active radiatio

In [30]:
# Distinct entries of the growth CO2 column. The saved output contains both
# 'ambient' and 'Ambient', i.e. the capitalisation is not uniform.
growth_ca_key = ('GrowthCa', 'Growth [CO2]', '1')
data[growth_ca_key].unique()

array(['ambient', nan, 'Ambient'], dtype=object)

In [29]:
# Distinct entries of the growth air-temperature column. The saved output
# contains the string '-9999' next to NaN -- presumably a missing-value
# sentinel; confirm before treating it as such.
growth_tair_key = ('GrowthTair', 'Growth Tair', '1')
data[growth_tair_key].unique()

array(['ambient', '-9999', nan], dtype=object)

In [27]:
# Distinct entries of the growth-condition column. The saved output contains
# both 'Field' and 'field', i.e. the capitalisation is not uniform.
growth_cond_key = ('Growthcond', 'Growth condition', '1')
data[growth_cond_key].unique()

array(['Field', 'field', nan, 'OTC field'], dtype=object)

In [28]:
# Distinct entries of the experimental-treatment column.
treatment_key = ('Treatment', 'Experimental treatment', '1')
data[treatment_key].unique()

array(['Wet site of the Gradient', 'Intermediate site of the gradient',
       'Dry site of Gradient', 'Ambient', 'Irrigation', nan,
       'OTC CO2_amb'], dtype=object)

In [32]:
# Distinct entries of the measurement-date column. The saved output mixes
# several string layouts (e.g. '20/07/2009', '6/29/10') plus NaN and 'none',
# so the values are not in a single date format.
date_key = ('Date', 'Date of the year when measurement conducted', '1')
data[date_key].unique()

array(['20/07/2009', '28/04/2009', '27/11/2009', '21/06/2009',
       '24/08/2009', '5/7/09', '30/09/2009', '12/6/09', '29/04/2009',
       '27/08/2009', '2/12/09', '4/7/09', '29/09/2009', '23/07/2009',
       '24/07/2009', '30/04/2009', '18/06/2009', '2/10/09', '27/04/2009',
       '23/08/2009', '28/11/2009', '21/07/2009', '29/11/2009', '5/4/10',
       '6/1/10', '6/29/10', '8/5/10', '9/1/10', '10/5/10', '4/19/11',
       '5/18/11', '7/18/11', '8/22/11', '10/3/11', '5/6/10', '6/3/10',
       '6/28/10', '6/30/10', '8/2/10', '8/4/10', '8/30/10', '10/4/10',
       '10/6/10', '4/18/11', '4/20/11', '5/16/11', '6/20/11', '7/20/11',
       '8/24/11', '10/5/11', nan, 'none'], dtype=object)

In [36]:
# Distinct entries of the canopy-position column. The middle level of this
# column's key is NaN, hence np.nan in the tuple. The saved output contains
# 'top' and 'Top' as well as the string '-9999'.
canopy_position_key = ('canopyposition', np.nan, '1')
data[canopy_position_key].unique()

array(['top', '-9999', nan, 'Top'], dtype=object)

In [8]:
# Write `data` to CSV in the anderegg_2018 data folder.
# The path uses the same '../data/' prefix as the input files; the previous
# 'data/...' path resolved to a different directory than the one read from.
# index=False leaves the row index (taken from the first column of the input
# CSV) out of the written file.
output_path = '../data/anderegg_2018/anderegg_2018_data.csv'
data.to_csv(output_path, index=False)