In [1]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import chardet
import glob
import os
import shutil
from fuzzywuzzy import process

In [2]:
# Load the measurement table and the variable dictionary that describes its columns.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell echoes the `data`
# read line as the source of a warning — presumably pandas' mixed-type
# DtypeWarning (TODO confirm) — which this option is the documented remedy for.
data = pd.read_csv('../data/kumarathunge_2019/ACi-TGlob_V1.0_utf8.csv', low_memory=False)
metadata = pd.read_csv('../data/kumarathunge_2019/ACi-TGlob_V1.0_metadata_utf8.csv')

  data = pd.read_csv('../data/kumarathunge_2019/ACi-TGlob_V1.0_utf8.csv')


### Fix mismatches between column names and metadata Variable column

In [3]:
# Normalise the data headers so they can be matched against the metadata:
# trim surrounding whitespace, lower-case, and turn spaces into underscores.
trimmed_columns = data.columns.str.strip()
data.columns = trimmed_columns.str.lower().str.replace(' ', '_')

In [4]:
# Drop the metadata row whose Variable is exactly 'species' (matched BEFORE the
# names are normalised), then normalise the remaining names the same way as the
# data headers: trim, lower-case, spaces -> underscores.
# Original author's note on the dropped row: "header doesn't even exist in data".
# NOTE(review): the frame displayed further down does have a `species` column,
# so presumably a differently written entry (e.g. different case) survives this
# filter — confirm against the metadata file.
# Built with .loc + .assign so `metadata` is a fresh frame rather than a filtered
# slice that is then assigned into (avoids SettingWithCopyWarning).
metadata = (
    metadata.loc[metadata['Variable'] != 'species']
    .assign(
        Variable=lambda frame: (frame['Variable'].str.strip()
                                .str.lower()
                                .str.replace(' ', '_'))
    )
)

In [5]:
# Names that appear in only one of the two places (data headers vs. metadata
# 'Variable'). Sorted so the displayed list is the same on every run; a plain
# list(set) has an arbitrary order. key=str keeps the sort safe if a
# non-string entry (e.g. a missing value) ever shows up.
diff = sorted(set(data.columns) ^ set(metadata['Variable']), key=str)
diff

['leafspan', 'lifespan']

In [6]:
# Reconcile the metadata with the data.
# 1) The metadata calls the column 'lifespan'; the data header is 'leafspan'.
metadata['Variable'] = metadata['Variable'].replace({'lifespan': 'leafspan'})

# 2) Fill missing units with 1 (presumably a "no unit" placeholder) and
#    spell the degree symbol out as text.
unit_fixes = {np.nan: 1, '°C': 'degree_C'}
metadata['Units'] = metadata['Units'].replace(unit_fixes)

# 3) These variables get the unit 1 regardless of what the metadata listed.
unitless_variables = ['data_type', 'pft', 'temp_treatment',
                      'co2_treatment', 'water_treatment']
metadata.loc[metadata['Variable'].isin(unitless_variables), 'Units'] = 1

# 4) Keep only the three columns used to build the header.
metadata = metadata[['Variable', 'Description', 'Units']]

In [7]:
# Number of variables described in the metadata (should match the data column count).
metadata['Variable'].size

36

In [8]:
# Number of columns in the data (should match the metadata variable count).
data.shape[1]

36

In [9]:
# Reorder the data columns to follow the metadata's variable order.
ordered_variables = metadata['Variable'].to_list()
data = data[ordered_variables]

# Build a three-level header: (variable name, description, units), looking the
# description and units up by variable name in the metadata.
metadata_by_variable = metadata.set_index('Variable')
new_header = pd.MultiIndex.from_arrays([
    data.columns,
    metadata_by_variable.loc[data.columns, 'Description'],
    metadata_by_variable.loc[data.columns, 'Units'],
])

# Replace the flat column names with the multi-level header.
data.columns = new_header

### Final cleanup

In [10]:
def clean_values(values):
    """
    Tidy the text labels held in a pandas Index or Series.

    Every value is first cast to ``str`` (so numbers come back in their string
    form, e.g. ``1`` -> ``'1'``) and stripped of surrounding whitespace. The
    following regex substitutions are then applied in order:

    1. remove a single quote character (' or ") at the start and at the end
    2. rewrite every comma, with any whitespace around it, as ``', '``
    3. collapse runs of whitespace into one space
    4. insert a space after a comma squeezed between two letters
    5. remove any remaining straight or curly single quotes
    6. tighten a ``' ,'`` that sits directly between two word characters

    Parameters:
        values (pd.Index or pd.Series): The labels to clean.

    Returns:
        The cleaned string values, as produced by the pandas ``.str`` accessor
        for the input (an Index for an Index, a Series for a Series).
    """
    # Ordered (pattern, replacement) pairs; later rules see the output of
    # earlier ones, so the order must not be changed.
    substitutions = (
        (r"(^['\"]|['\"]$)", ''),
        (r"\s*,\s*", ', '),
        (r"\s+", ' '),
        (r"([a-zA-Z]),([a-zA-Z])", r"\1, \2"),
        (r"['‘’]", ''),
        (r"\b ,\b", ','),
    )

    cleaned_values = values.astype(str).str.strip()
    for pattern, replacement in substitutions:
        cleaned_values = cleaned_values.str.replace(pattern, replacement, regex=True)

    return cleaned_values

In [11]:
# Clean the units (header level 2) and rebuild the header with the other two
# levels left as they are.
header_levels = [data.columns.get_level_values(level) for level in range(3)]
values = header_levels[2]
cleaned_l2_values = clean_values(values)
header_levels[2] = cleaned_l2_values
data.columns = pd.MultiIndex.from_arrays(header_levels)

In [12]:
# Clean the descriptions (header level 1) and rebuild the header with the
# other two levels left as they are.
header_levels = [data.columns.get_level_values(level) for level in range(3)]
values = header_levels[1]
cleaned_l1_values = clean_values(values)
header_levels[1] = cleaned_l1_values
data.columns = pd.MultiIndex.from_arrays(header_levels)

In [13]:
# Clean the variable names (header level 0) and rebuild the header with the
# other two levels left as they are.
header_levels = [data.columns.get_level_values(level) for level in range(3)]
values = header_levels[0]
cleaned_l0_values = clean_values(values)
header_levels[0] = cleaned_l0_values
data.columns = pd.MultiIndex.from_arrays(header_levels)

In [14]:
# Display the frame to eyeball the rebuilt header (variable name / description / units).
data

Unnamed: 0_level_0,dataset,data_type,data_contributor,location,leafspan,tregion,type,pft,growth_condition,species,...,tair,tleaf,tblk,co2r,co2s,pari,mat,maxt,mgdd0,reference
Description,unique name given for individual datasets,data grouping tag,name of the individual dataset contributors,site where the dataset collected,Evergreen or Deciduous,Biome,Gymnosperm or angiosperm,plant functional type,plant growth condition,standard species name,...,air temperature,leaf temperature,LICOR block temperature,reference CO2 concentration,CO2 concentration at leaf surface,Photosynthetically active radiation inside leaf cuvette,mean annual air temperature at the species seed source,mean maximum air temperature of the warmest month at the species seed source,mean air temperature during the growing season at the species seed source,reference
Units,1,1,1,1,1,1,1,1,1,1,...,degree_C,degree_C,degree_C,mmol mol-1,mmol mol-1,mmol m-2 s-1,degree_C,degree_C,degree_C,1
0,"Black Spruce, ON, Canada",ACi-T,Danielle A. Way,"Ontario, Canada",Evergreen,Boreal,Gymnosperm,NET_B,Glasshouse,Picea mariana,...,,9.980000,,,380.920000,,0.1,21.8,10.766667,Way and Sage (2008)
1,"Black Spruce, ON, Canada",ACi-T,Danielle A. Way,"Ontario, Canada",Evergreen,Boreal,Gymnosperm,NET_B,Glasshouse,Picea mariana,...,,10.080000,,,280.570000,,0.1,21.8,10.766667,Way and Sage (2008)
2,"Black Spruce, ON, Canada",ACi-T,Danielle A. Way,"Ontario, Canada",Evergreen,Boreal,Gymnosperm,NET_B,Glasshouse,Picea mariana,...,,10.140000,,,91.680000,,0.1,21.8,10.766667,Way and Sage (2008)
3,"Black Spruce, ON, Canada",ACi-T,Danielle A. Way,"Ontario, Canada",Evergreen,Boreal,Gymnosperm,NET_B,Glasshouse,Picea mariana,...,,10.140000,,,47.400000,,0.1,21.8,10.766667,Way and Sage (2008)
4,"Black Spruce, ON, Canada",ACi-T,Danielle A. Way,"Ontario, Canada",Evergreen,Boreal,Gymnosperm,NET_B,Glasshouse,Picea mariana,...,,10.220000,,,381.070000,,0.1,21.8,10.766667,Way and Sage (2008)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61866,"Subalpine Eucalypt, AU-NSW (A)",An-T,Belinda E. Medlyn,"Tumbarumba NSW, Australia",Evergreen,Temperate_S,Angiosperm,BET_TE,Field (NE),Eucalyptus delegatensis,...,16.213333,16.238333,16.273333,368.733333,347.116667,1500.0,8.5,22.9,8.525000,Medlyn et al. (2007)
61867,"Subalpine Eucalypt, AU-NSW (A)",An-T,Belinda E. Medlyn,"Tumbarumba NSW, Australia",Evergreen,Temperate_S,Angiosperm,BET_TE,Field (NE),Eucalyptus delegatensis,...,16.540000,16.470000,16.615000,369.716667,343.816667,1500.0,8.5,22.9,8.525000,Medlyn et al. (2007)
61868,"Subalpine Eucalypt, AU-NSW (A)",An-T,Belinda E. Medlyn,"Tumbarumba NSW, Australia",Evergreen,Temperate_S,Angiosperm,BET_TE,Field (NE),Eucalyptus delegatensis,...,17.191667,17.070000,17.273333,370.266667,345.483333,1499.0,8.5,22.9,8.525000,Medlyn et al. (2007)
61869,"Subalpine Eucalypt, AU-NSW (A)",An-T,Belinda E. Medlyn,"Tumbarumba NSW, Australia",Evergreen,Temperate_S,Angiosperm,BET_TE,Field (NE),Eucalyptus delegatensis,...,17.748095,17.657619,17.883333,362.890476,340.276190,1501.0,8.5,22.9,8.525000,Medlyn et al. (2007)


In [18]:
# All (variable, description, units) header tuples, in sorted order, for review.
sorted(list(data.columns))

[('chamber', 'unique chamber number for whole tree chamber experiments', '1'),
 ('ci', 'intercellular CO2 concentration', 'mmol mol-1'),
 ('co2_treatment', 'experimental CO2 treatments', '1'),
 ('co2r', 'reference CO2 concentration', 'mmol mol-1'),
 ('co2s', 'CO2 concentration at leaf surface', 'mmol mol-1'),
 ('cond', 'stomatal conductance', 'mmol m-2 s-1'),
 ('curve_id',
  'a unique curve number for individual ACi curves in a dataset',
  '1'),
 ('curve_number',
  'a unique curve number for individual ACi curves within the dataset',
  '1'),
 ('data_contributor', 'name of the individual dataset contributors', '1'),
 ('data_type', 'data grouping tag', '1'),
 ('dataset', 'unique name given for individual datasets', '1'),
 ('date', 'data collection date', '1'),
 ('growth_condition', 'plant growth condition', '1'),
 ('leaf_age', 'leaf age', 'years'),
 ('leafspan', 'Evergreen or Deciduous', '1'),
 ('location', 'site where the dataset collected', '1'),
 ('mat', 'mean annual air temperature a

In [19]:
# Distinct raw values of the date column, to inspect which formats occur.
date_column = ('date', 'data collection date', '1')
data[date_column].unique()

array([nan, '9/07/2014', '8/07/2014', '7/07/2014', 'Aug-07', '14/07/1995',
       '12/07/1995', '23/07/1995', '13/07/1995', '15/07/1995',
       '16/07/1995', '18/07/1995', '19/07/1995', '26/07/1994',
       '28/07/1994', '29/07/1994', '1/08/1994', '4/08/1994', '25/07/1994',
       '27/07/1994', '30/07/1994', '2/08/1994', '22/07/1995',
       '17/07/1995', '24/07/1994', '31/07/1994', '3/08/1994',
       '20/07/1995', '12/08/1997', '13/08/1997', '14/08/1997',
       '15/08/1997', '16/08/1997', '17/08/1997', '18/08/1997',
       '20/08/1997', '21/08/1997', '22/08/1997', '3/12/2010',
       '10/12/2010', '9/12/2010', '8/12/2010', '6/12/2010', '2/12/2010',
       '7/12/2010', '10/02/2011', '9/02/2011', '14/02/2011', '11/02/2011',
       '12/02/2011', '15/02/2011', '26/08/2011', '29/08/2011',
       '2/09/2011', '5/09/2011', '1/09/2011', '8/09/2011', '20/04/2011',
       '18/04/2011', '19/04/2011', '7/07/2016', '11/07/2016',
       '12/07/2016', '13/12/2016', '14/12/2016', '15/12/2016',
   

In [15]:
# Write the frame with its multi-level header to CSV, without the row index.
# NOTE(review): the inputs were read from '../data/kumarathunge_2019/' but this
# writes to 'data/kumarathunge_2019/' relative to the working directory —
# confirm the output location is intended.
output_path = 'data/kumarathunge_2019/kumarathunge_2019_data.csv'
data.to_csv(output_path, index=False)