In [1]:
# -*- coding: utf-8 -*-
"""
Update 21/11/24 LL : added option to save as csv
Update 30/10/24 by LL (v4): added check for empty paleoData_values row

Here we use code from sisal3_extractCSVdata.py 
to extract a dataframe with the following columns:

columns=['archiveType', 
        'climateInterpretation_variable',
        'climateInterpretation_variableDetail',
        'datasetId',
        'dataSetName',                                                                                
        'geo_meanElev', 
        'geo_meanLat', 
        'geo_meanLon',
        'year', 'yearUnits',                                                                                         
        'paleoData_variableName',
        'paleoData_units',                                                                                           
        'paleoData_values',
        'paleoData_notes',
        'paleoData_sensorSpecies',
        'originalDataURL',
        'originalDatabase'
]

We save a standardised compact dataframe for concatenation to form DoD2k


Created by Kevin Fan (last edited: September 29th, 2024)
Changes have been made to filtering entityIds by date and how the dataframe is constructed
Mostly cleaning and checking work


---------------------------------
load the SISALv3 database (2024) downloaded on June 3rd 2024 
from https://ora.ox.ac.uk/objects/uuid:1e91e2ac-ca9f-46e5-85f3-8d82d4d3cfd4 | MNE 2024/06/03

---------------------------------
following sisal3_extractCSVdata.py:

Created on Nov 11 2023

This python script reads SISALv3 csv-data in a directory './SISALv3_csv' 
    relative to the path of this file (unpack all your downloaded csv-files 
    there!) and extracts stable isotope, Mg/Ca and growth rate data for all 
    entities, which cover a to be specified period of interest 
    (change lines 70 and 71 according to your needs). 

Only records with more than 'number_of_dating_points' U-Th dated depths
    (line 78) will be accounted for. 
    attention: it may happen that there are enough dated depth available in 
        the requested period, but proxies might not be provided this will 
        result in an empty output, but output nevertheless

The individual data will be plotted and the plots can be saved 
    (comment/uncomment line 225).

The mean and standard deviation of all proxies within your specified period 
    will be determined and saved in a csv file.

There will also be a raw plot for illustrative purposes available.

Feel free to change the code as you see fit.
    
@author: Jens Fohlmeister
---------------------------------
"""




"\nUpdate 21/11/24 LL : added option to save as csv\nUpdate 30/10/24 by LL (v4): added check for empty paleoData_values row\n\nHere we use code from sisal3_extractCSVdata.py \nto extract a dataframe with the following columns:\n\ncolumns=['archiveType', \n        'climateInterpretation_variable',\n        'climateInterpretation_variableDetail',\n        'datasetId',\n        'dataSetName',                                                                                \n        'geo_meanElev', \n        'geo_meanLat', \n        'geo_meanLon',\n        'year', 'yearUnits',                                                                                         \n        'paleoData_variableName',\n        'paleoData_units',                                                                                           \n        'paleoData_values',\n        'paleoData_notes',\n        'paleoData_sensorSpecies',\n        'originalDataURL',\n        'originalDatabase'\n]\n\nWe save a standardised c

# Set up environment

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# import necessary modules
import numpy as np
import pandas as pd
import os
import math


In [4]:
# set up working directory. 
# The default working directory should be the parent folder (compile_proxy_database) so we can access the 'helper' files 
# Make sure this is changing to the correct path!

#wdir = '/home/jupyter-lluecke/compile_proxy_database_v2.0' # working directory, this should work, but doesn't seem to...
if not os.getcwd().endswith('compile_proxy_database_v2.1'):
    os.chdir(os.getcwd()+'/..')
wdir = os.getcwd()
print('working directory: '+wdir)
import functions as f # contains functions for plotting 

working directory: /home/jupyter-lluecke/compile_proxy_database_v2.1


# Load source data and apply corrections

## Read CSVs

In [5]:
# read the sisalv3 csv files

entity = pd.read_csv('sisal/sisalv3_csv/entity.csv')
d13C   = pd.read_csv('sisal/sisalv3_csv/d13C.csv')
d18O   = pd.read_csv('sisal/sisalv3_csv/d18O.csv')
MgCa   = pd.read_csv('sisal/sisalv3_csv/Mg_Ca.csv')
dating = pd.read_csv('sisal/sisalv3_csv/dating.csv')
dating.rename(columns = {'238U_content':'c238U_content','238U_uncertainty':'c238U_uncertainty',
    '232Th_content':'c232Th_content','c232Th_uncertainty':'c232Th_uncertainty',
    '230Th_content':'c230Th_content','c230Th_uncertainty':'c230Th_uncertainty',
    '230Th_232Th_ratio':'a230Th_232Th_ratio','230Th_232Th_ratio_uncertainty':'a230Th_232Th_ratio_uncertainty',
    '230Th_238U_activity':'a230Th_238U_activity','230Th_238U_activity_uncertainty':'a230Th_238U_activity_uncertainty',
    '234U_238U_activity':'a234U_238U_activity','234U_238U_activity_uncertainty':'a234U_238U_activity_uncertainty'},
    inplace = True)
    # it is necessary to rename those columns with a number on first position

entity_link_reference = pd.read_csv('sisal/sisalv3_csv/entity_link_reference.csv')
original_chronology   = pd.read_csv('sisal/sisalv3_csv/original_chronology.csv')
reference             = pd.read_csv('sisal/sisalv3_csv/reference.csv')
sample                = pd.read_csv('sisal/sisalv3_csv/sample.csv')
sisal_chronology      = pd.read_csv('sisal/sisalv3_csv/sisal_chronology.csv')
site                  = pd.read_csv('sisal/sisalv3_csv/site.csv')
# os.chdir('..')


## Filter data out of wanted bounds

In [6]:
###########################################################################
# extract required data from speleothems covering the period of interest
#   + provides all entities, which include non-14C ages and non-events  
#     during the time period
###########################################################################
low = 1950         # defines minimum age [a]

# KF: Filtering data indicies that don't match age and variable requirements
i0 = dating.loc[(dating['corr_age'] <= low) &
                (dating['date_type']!='C14') & (dating['date_type'].str.find('Event')!=0)]
i1 = i0['entity_id'].to_numpy() 
i3 = np.unique(i1) 

### remove all entities with less than 'number_of_dating_points' dated depths
number_of_dating_points = 3
for i in np.arange(0,len(i3)):
    i_dummy = i0.entity_id[i0.entity_id==i3[i]].count()
    if i_dummy < number_of_dating_points:
        i0 = i0[i0.entity_id!=i3[i]]
i1 = i0['entity_id'].to_numpy()
i2 = np.unique(i1)  # provides all entities, which include >= 'number_of_dating_points' 
                    # dated depths during the required time period

# You could speed up the above process by forming a frequency dictionary to begin with and just referencing those as you go intead of remeasuring frequencies.
# However, this only becomes more efficient when i0 gets realy big with a lot of repeated entity IDs - Kevin

###########################################################################


# Create compact dataframe

## Parameter definitions

In [7]:
### define parameters (all of those will be saved in a final file)
site1_id     = np.zeros(len(i2))
site_name1   = ['0']*len(i2)
rock_age1    = ['0']*len(i2)
material1    = ['0']*len(i2)
entity_name1 = ['0']*len(i2)
lon          = np.zeros(len(i2))
lat          = np.zeros(len(i2))
elev         = np.zeros(len(i2))
entity1_id   = np.zeros(len(i2))
mean_C       = np.zeros(len(i2))
mean_O       = np.zeros(len(i2))
mean_GR      = np.zeros(len(i2))
mean_MgCa    = np.zeros(len(i2))
std_C        = np.zeros(len(i2))
std_O        = np.zeros(len(i2))
std_GR       = np.zeros(len(i2))
std_MgCa     = np.zeros(len(i2))

#we need to initialize a publication_DOI array with the length set by the number of publications meeting the selection criteria.
publication_DOI1 = np.zeros(len(i2),dtype='object')

#Check size of daa lists
len(i2)


211

## Parameter/metadata population

In [8]:
# KF:common dataframe.
df = pd.DataFrame(columns=['archiveType', 'dataSetName', 'datasetId', 'geo_meanElev', 
                           'geo_meanLat', 'geo_meanLon', 'originalDataURL', 
                           'paleoData_notes', 'paleoData_proxy', 'paleoData_units',
                           'paleoData_values', 'year', 'yearUnits'])

# KF: Populating common dataframe
for n in np.arange(0,len(i2)): #for every valid unique entity

    dummy = dating.loc[(dating['entity_id'] == i2[n])] # Row associated with valid entity

    ### already some metadata for individual speleothems
    site1_id[n] = entity.site_id[(entity['entity_id'] == i2[n])].to_numpy() 
    entity1_id[n] = entity.entity_id[(entity['entity_id'] == i2[n])].to_numpy()
    entity_name1[n] = entity.entity_name[(entity['entity_id'] == i2[n])].to_list()
    site_name1[n] = site.site_name[(site['site_id'] == site1_id[n])].to_list()
    refID = entity_link_reference.ref_id[(entity_link_reference['entity_id'] == i2[n])].to_list()
    publication_DOI1[n] = reference.publication_DOI[(reference['ref_id'] == refID[0])].to_list()
    if len(publication_DOI1[n])==1: publication_DOI1[n]=publication_DOI1[n][0]
    lon[n] = site.longitude[(site['site_id'] == site1_id[n]).to_numpy()]
    lat[n] = site.latitude[(site['site_id'] == site1_id[n]).to_numpy()]
    elev[n] = site.elevation[(site['site_id'] == site1_id[n]).to_numpy()]
    if dummy.material_dated.dropna().eq('calcite').all():
        material1[n] = 'calcite'
    elif dummy.material_dated.dropna().eq('aragonite').all():
        material1[n] = 'aragonite'
    else:
        material1[n] = 'mixed'
    ### extract isotope data (d18O and d13C) and elements #####################

    idx1 = sample.sample_id[(sample['entity_id']==i2[n])].to_numpy() # sample ids for current entity id
    age = original_chronology.interp_age[original_chronology['sample_id'].isin(idx1)].to_numpy() # interpretation ages from orig. chron. based on current sample ids
    
# Oxygen
    idx2 = original_chronology.sample_id[original_chronology['sample_id'].isin(idx1)].to_numpy() # orig. chron. sample ids based on idx sample ids
    d18O_1 = d18O.d18O_measurement[d18O['sample_id'].isin(idx2)].to_numpy() # d18O measurements corresponding to idx2 sample ids 
    idx3 = d18O.sample_id[d18O['sample_id'].isin(idx2)].to_numpy() # d18O sample ids corresponding to idx2 sample ids
    age18 = original_chronology.interp_age[original_chronology['sample_id'].isin(idx3)].to_numpy() #orig. chron. interpretation ages corresponding to idx3 sample ids
    # KF: Filter dates too low and too high
    filter = list(map((lambda x: True if (x <= low) else False), age18))
    age18 = age18[filter]
    d18O_1 = d18O_1[filter]
    if (len(idx3) < len(idx2) and len(idx3) > 0):
        idx2 = idx3 # KF: if a discrepancy exists, brute force idx3 into idx2 for whatever reason, doing this for the other variables causes an index bounds error for growth rate.
        # probably something to do with the age list manipulation.
        age = original_chronology.interp_age[original_chronology['sample_id'].isin(idx2)].to_numpy() # then set age to the interp. age pertaining to the new idx2 sample ids
    if (len(d18O_1) > 0):
        # Follows the common dictionary format.
        df.loc[len(df)] = ['speleothem', site_name1[n][0], entity1_id[n], elev[n], lat[n], lon[n], publication_DOI1[n], material1[n], 'd18O', 'permil', d18O_1, age18, 'BP']  
       
# Carbon  
    d13C_1 = d13C.d13C_measurement[d13C['sample_id'].isin(idx2)].to_numpy() # d13C measurements for idx sample ids
    idx4 = d13C.sample_id[d13C['sample_id'].isin(idx2)].to_numpy() # d13C sample ids corresponding to idx2 sample ids
    age13 = original_chronology.interp_age[original_chronology['sample_id'].isin(idx4)].to_numpy() # interp ages of d13C sample ids from orig. chron.
    # KF: Filter dates too low and too high
    filter = list(map((lambda x: True if (x<= low) else False), age13))
    age13 = age13[filter]
    d13C_1 = d13C_1[filter]
    #if len(idx4) < len(idx):
    #    idx3 = idx4 
    #    age = original_chronology.interp_age[original_chronology['sample_id'].isin(idx3)].to_numpy()
    if (len(d13C_1) > 0):
        df.loc[len(df)] = ['speleothem', site_name1[n][0], entity1_id[n], elev[n], lat[n], lon[n], publication_DOI1[n], material1[n], 'd13C', 'permil', d13C_1, age13, 'BP']   
      
# Magnesium Calcium
    MgCa_1 = MgCa.Mg_Ca_measurement[MgCa['sample_id'].isin(idx2)].to_numpy() # MgCa measurements corresponding to idx2 sample ids
    idx5 = MgCa.sample_id[MgCa['sample_id'].isin(idx2)].to_numpy() # MgCa sample ids based on idx2 sample ids
    ageMgCa = original_chronology.interp_age[original_chronology['sample_id'].isin(idx5)].to_numpy() # interp ages of MgC samples from orig. chron.
    # KF: Filter dates too low and too high
    filter = list(map((lambda x: True if (x<= low) else False), ageMgCa))
    ageMgCa = ageMgCa[filter]
    MgCa_1 = MgCa_1[filter]
    if (len(MgCa_1) > 0):
        df.loc[len(df)] = ['speleothem', site_name1[n][0], entity1_id[n], elev[n], lat[n], lon[n], publication_DOI1[n], material1[n], 
                           'MgCa', 'mmol/mol', MgCa_1, ageMgCa, 'BP']
    
    ### also growth rate (gr) could be important ##############################
    if len(idx2) != 0:
        isotopeDepth = sample.depth_sample[sample['sample_id'].isin(idx2)].to_numpy()

        # Estalish placeholder arrays
        gr = np.zeros(len(isotopeDepth))
        ageGR = np.zeros(len(isotopeDepth))
        fage_err_gr = np.zeros(len(isotopeDepth))

        # Array population
        for i in np.arange(0,len(gr)-1):
            if entity.depth_ref[(entity['entity_id'] == i2[n])].to_numpy() == 'from top':
                gr[i] = (isotopeDepth[i] - isotopeDepth[i+1]) / (age[i] - age[i+1])
            else:
                gr[i] = -(isotopeDepth[i] - isotopeDepth[i+1]) / (age[i] - age[i+1])
            ageGR[i] = age[i]

            # KF: error checking
            fage_err = (dating['corr_age_uncert_pos'][i] + dating['corr_age_uncert_neg'][i])/dating['corr_age'][i]
            fage_err1 = (dating['corr_age_uncert_pos'][i + 1] + dating['corr_age_uncert_neg'][i + 1])/dating['corr_age'][i + 1]
            fage_err_gr[i] = math.sqrt((fage_err ** 2) + (fage_err1 **2))
                
        gr[-1] = gr[-2]
        if len(np.argwhere(np.isinf(gr))>0): 
            if (np.argwhere(np.isinf(gr))[-1]==len(gr)-1): # if the last value is 'inf'
                gr[np.argwhere(np.isinf(gr))[-1]] = gr[np.argwhere(np.isinf(gr))[-1]-2]
            else:
                gr[np.argwhere(np.isinf(gr))]=gr[np.argwhere(np.isinf(gr))+1] # replace 'inf' values by neighboring values for gr
            while len(np.argwhere(np.isinf(gr))>0): # second iteration for cases where there is very fast growth and initially two successive 'inf' values
                gr[np.argwhere(np.isinf(gr))]=gr[np.argwhere(np.isinf(gr))+1] # replace 'inf' values by neighboring values for gr

        # GR calculation
        for i in np.arange(1,len(gr)-1):
            if gr[i]>1: 
                gr[i]=(gr[i-1]+gr[i+1])/2

        # KF: error masking
        gr[fage_err_gr>0.1]=-9999.99
        
         # KF: Adding growth rate to common frame
        filter = list(map((lambda x: True if (x<= low) else False), ageGR))
        df.loc[len(df)] = ['speleothem', site_name1[n][0], entity1_id[n], elev[n], lat[n], lon[n], publication_DOI1[n], material1[n], "growth rate", "mm/year", gr[filter], ageGR[filter], "BP"]    

        ### determine the according averages  #################################
        #if len(d18O_1)>0:
            #mean_GR[n] = np.mean(gr[np.argwhere((ageGR>=low) & (ageGR<=high))])
            #mean_O[n] = np.mean(d18O_1[np.argwhere((age18>=low) & (age18<=high))])
            #std_GR[n] = np.std(gr[np.argwhere((ageGR>=low) & (ageGR<=high))])
            #std_O[n] = np.std(d18O_1[np.argwhere((age18>=low) & (age18<=high))])
        #if len(d13C_1)>0:
            #mean_C[n] = np.mean(d13C_1[np.argwhere((age13>=low) & (age13<=high))])
            #std_C[n] = np.std(d13C_1[np.argwhere((age13>=low) & (age13<=high))])
        #if len(MgCa_1)>0:
            #mean_MgCa[n] = np.mean(MgCa_1[np.argwhere((ageMgCa>=low) & (ageMgCa<=high))])
            #std_MgCa[n] = np.std(MgCa_1[np.argwhere((ageMgCa>=low) & (ageMgCa<=high))])
        #####################################################################

  site1_id[n] = entity.site_id[(entity['entity_id'] == i2[n])].to_numpy()
  entity1_id[n] = entity.entity_id[(entity['entity_id'] == i2[n])].to_numpy()
  lon[n] = site.longitude[(site['site_id'] == site1_id[n]).to_numpy()]
  lat[n] = site.latitude[(site['site_id'] == site1_id[n]).to_numpy()]
  elev[n] = site.elevation[(site['site_id'] == site1_id[n]).to_numpy()]
  site1_id[n] = entity.site_id[(entity['entity_id'] == i2[n])].to_numpy()
  entity1_id[n] = entity.entity_id[(entity['entity_id'] == i2[n])].to_numpy()
  lon[n] = site.longitude[(site['site_id'] == site1_id[n]).to_numpy()]
  lat[n] = site.latitude[(site['site_id'] == site1_id[n]).to_numpy()]
  elev[n] = site.elevation[(site['site_id'] == site1_id[n]).to_numpy()]
  site1_id[n] = entity.site_id[(entity['entity_id'] == i2[n])].to_numpy()
  entity1_id[n] = entity.entity_id[(entity['entity_id'] == i2[n])].to_numpy()
  lon[n] = site.longitude[(site['site_id'] == site1_id[n]).to_numpy()]
  lat[n] = site.latitude[(site['site_id'] == si

## Data cleaning and format conventions

In [9]:
# KF: adding og dataset name
df.insert(6, 'originalDatabase', ['SISAL v3']*len(df))
df.insert(6, 'geo_siteName', df['dataSetName'])
df.insert(1, 'climateInterpretation_variable', ['N/A']*len(df))
df.insert(1, 'climateInterpretation_variableDetail', ['N/A']*len(df))
df.insert(12, 'paleoData_sensorSpecies', ['N/A']*len(df))
df.loc[df['paleoData_proxy']=='MgCa', 'paleoData_proxy']='Mg/Ca'

# KF: Temp cleaning rows with NAN in year
# There are thirteen of them, hopefully this does not skew data too much. 
length = len(df['year'])
df = df[df['year'].notna()]
df = df[df['year'].map(lambda x: len(x) > 1)]
df = df[df['paleoData_values'].map(lambda x: len(x) > 1)]
df = df[df['paleoData_values'].map(lambda x: not any(pd.isnull(x)))]
print('Number of rows discarded: ', (length - len(df['year'])))

Number of rows discarded:  13


In [10]:
set(df['paleoData_proxy'])

{'Mg/Ca', 'd13C', 'd18O', 'growth rate'}

In [11]:
# assign climateInterpretation_variable

# d18O is temperature and moisture
df.loc[df['paleoData_proxy']=='d18O', 'climateInterpretation_variable']='temperature+moisture' 
df.loc[df['paleoData_proxy']=='d18O', 'climateInterpretation_variableDetail']='temperature+moisture - manually assigned by DoD2k authors for paleoData_proxy = d18O.' 

# Mg/Ca is temperature
df.loc[df['paleoData_proxy']=='Mg/Ca', 'climateInterpretation_variable']='temperature' 
df.loc[df['paleoData_proxy']=='Mg/Ca', 'climateInterpretation_variableDetail']='temperature - manually assigned by DoD2k authors for paleoData_proxy = Mg/Ca' 


In [12]:
# BP 0 Adjustment

def BP2CE(year):
    year = 1950 - year
    if year <= 0:
        year = year - 1
    return year
df['year'] = df['year'].apply(lambda x: (np.array(list(map(lambda y: BP2CE(y), x)))))
df['yearUnits'] = ['CE']*len(df)


In [13]:
# KF: Type-checking

df = df.astype({'archiveType': str, 'dataSetName': str, 'datasetId': str, 'geo_meanElev': float, 'geo_meanLat': float, 'geo_meanLon': float, 'geo_siteName': str, 
                    'originalDatabase': str, 'originalDataURL': str, 'paleoData_notes': str, 'paleoData_proxy': str, 'paleoData_units': str, 'yearUnits': str})
df['year'] = df['year'].map(lambda x: np.array(x, dtype = float))
df['paleoData_values'] = df['paleoData_values'].map(lambda x: np.array(x, dtype = float))


In [14]:
# KF: display the dataframe
df.reset_index(drop= True, inplace= True)
df = df[sorted(df.columns)]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           546 non-null    object 
 1   climateInterpretation_variable        546 non-null    object 
 2   climateInterpretation_variableDetail  546 non-null    object 
 3   dataSetName                           546 non-null    object 
 4   datasetId                             546 non-null    object 
 5   geo_meanElev                          546 non-null    float64
 6   geo_meanLat                           546 non-null    float64
 7   geo_meanLon                           546 non-null    float64
 8   geo_siteName                          546 non-null    object 
 9   originalDataURL                       546 non-null    object 
 10  originalDatabase                      546 non-null    object 
 11  paleoData_notes    

In [15]:
#  check that the datasetId is unique - it currently is not (df has 546 records).
print(len(df.datasetId.unique()))
# make datasetId unique by simply adding index number
df.datasetId=df.apply(lambda x: 'sisal_'+x.datasetId+'_'+str(x.name), axis=1)
# check uniqueness - problem solved.
print(len(df.datasetId.unique()))

200
546


In [16]:
# KF: apply -9999.99 masking to nans
df['paleoData_values'] = df['paleoData_values'].map(lambda x: [-9999.99 if pd.isnull(y) else y for y in x])
df['paleoData_values'] = df['paleoData_values'].map(lambda x: [-9999.99 if y == 'nan' else y for y in x])
df['paleoData_values'] = df['paleoData_values'].map(lambda x: [-9999.99 if np.isnan(y) else y for y in x])

df['year'] = df['year'].map(lambda x: [-9999.99 if pd.isnull(y) else y for y in x])
df['year'] = df['year'].map(lambda x: [-9999.99 if y == 'nan' else y for y in x])
df['year'] = df['year'].map(lambda x: [-9999.99 if np.isnan(y) else y for y in x])

In [17]:
# drop nans 
for ii in df.index:
    dd   = np.array(df.at[ii, 'paleoData_values'])
    mask = dd==-9999.99
    df.at[ii, 'paleoData_values']=dd[~mask]
    df.at[ii, 'year']=np.array(df.at[ii, 'year'])[~mask]

In [18]:
drop_inds = []

for ii in df.index:
    if len(df.at[ii,'year'])==0:
        print('empty', ii, df.at[ii,'year'], df.at[ii,'originalDatabase'])
        print(df.at[ii,'paleoData_values'])
        drop_inds += [ii]
    
    if np.round(np.std(df.at[ii,'paleoData_values']), 9)==0: 
        print(ii, 'std=0')
    elif np.round(np.sum(np.diff(df.at[ii,'paleoData_values'])**2), 9)==0: 
        print(ii, 'diff=0')
    elif np.isnan(np.std(df.at[ii,'paleoData_values'])):
        print(ii, 'std nan')
    else:
        continue
    drop_inds += [ii]
    print(df.iloc[ii]['datasetId'])
print(drop_inds)
df = df.drop(index=np.unique(drop_inds))

76 std=0
sisal_120.0_76
88 diff=0
sisal_128.0_88
291 std=0
sisal_443.0_291
431 std=0
sisal_769.0_431
437 std=0
sisal_771.0_437
[76, 88, 291, 431, 437]


## Save and output dataframe

### save pickle

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 541 entries, 0 to 545
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           541 non-null    object 
 1   climateInterpretation_variable        541 non-null    object 
 2   climateInterpretation_variableDetail  541 non-null    object 
 3   dataSetName                           541 non-null    object 
 4   datasetId                             541 non-null    object 
 5   geo_meanElev                          541 non-null    float64
 6   geo_meanLat                           541 non-null    float64
 7   geo_meanLon                           541 non-null    float64
 8   geo_siteName                          541 non-null    object 
 9   originalDataURL                       541 non-null    object 
 10  originalDatabase                      541 non-null    object 
 11  paleoData_notes         

In [20]:
# KF: Save to pickle
df.to_pickle('sisal/sisal_compact.pkl')


### save csv

In [21]:
# save to a list of csv files (metadata, data, year)
df.name='sisal'
f.write_compact_dataframe_to_csv(df)

METADATA: archiveType, climateInterpretation_variable, climateInterpretation_variableDetail, dataSetName, datasetId, geo_meanElev, geo_meanLat, geo_meanLon, geo_siteName, originalDataURL, originalDatabase, paleoData_notes, paleoData_proxy, paleoData_sensorSpecies, paleoData_units, yearUnits
Saved to /home/jupyter-lluecke/compile_proxy_database_v2.1/sisal/sisal_compact_%s.csv


In [22]:
# load dataframe
print(f.load_compact_dataframe_from_csv('sisal').info())
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           541 non-null    object 
 1   climateInterpretation_variable        541 non-null    object 
 2   climateInterpretation_variableDetail  541 non-null    object 
 3   dataSetName                           541 non-null    object 
 4   datasetId                             541 non-null    object 
 5   geo_meanElev                          541 non-null    float32
 6   geo_meanLat                           541 non-null    float32
 7   geo_meanLon                           541 non-null    float32
 8   geo_siteName                          541 non-null    object 
 9   originalDataURL                       541 non-null    object 
 10  originalDatabase                      541 non-null    object 
 11  paleoData_notes    

# check output

## dataset metadata: dataSetName, datasetId, originalDataURL, originalDatabase

### index

In [23]:
# # check index
print(df.index)

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       536, 537, 538, 539, 540, 541, 542, 543, 544, 545],
      dtype='int64', length=541)


### dataSetName

In [24]:
# # check dataSetName
key = 'dataSetName'
print('%s: '%key)
print(df[key].values)

dataSetName: 
['Bittoo cave' 'Bittoo cave' 'Bittoo cave' 'Kesang cave' 'Kesang cave'
 'Kesang cave' 'Paraiso cave' 'Paraiso cave' 'Paraiso cave' 'Paraiso cave'
 'Paraiso cave' 'Paraiso cave' 'Villars cave' 'Villars cave'
 'Villars cave' 'Cold Air cave' 'Cold Air cave' 'Cold Air cave'
 'Cold Air cave' 'Cold Air cave' 'Cold Air cave' 'Cold Air cave'
 'Cold Air cave' 'Cold Air cave' 'Cold Air cave' 'Cold Air cave'
 'Cold Air cave' 'Cold Air cave' 'Cold Air cave' 'Lancaster Hole'
 'Lancaster Hole' 'Jeita cave' 'Jeita cave' 'Jeita cave' 'Jeita cave'
 'Jeita cave' 'Jeita cave' 'Huangye cave' 'Huangye cave' 'Huangye cave'
 'Huangye cave' 'Huangye cave' 'Huangye cave' 'Lapa grande cave'
 'Lapa grande cave' 'Lapa grande cave' 'Palestina cave' 'Palestina cave'
 'Palestina cave' 'Palestina cave' 'Palestina cave' 'Palestina cave'
 'Okshola cave' 'Okshola cave' 'Okshola cave' 'Tamboril cave'
 'Tamboril cave' 'Tamboril cave' 'Anjokipoty' 'Anjokipoty' 'Anjokipoty'
 'Curupira cave' 'Curupira cave' 'Cu

### datasetId

In [25]:
# # check datasetId

print(len(df.datasetId.unique()))
print(len(df))
key = 'datasetId'
print('%s (starts with): '%key)
print(df[key].values)

541
541
datasetId (starts with): 
['sisal_9.0_0' 'sisal_9.0_1' 'sisal_9.0_2' 'sisal_19.0_3' 'sisal_19.0_4'
 'sisal_19.0_5' 'sisal_20.0_6' 'sisal_20.0_7' 'sisal_20.0_8'
 'sisal_21.0_9' 'sisal_21.0_10' 'sisal_21.0_11' 'sisal_33.0_12'
 'sisal_33.0_13' 'sisal_33.0_14' 'sisal_45.0_15' 'sisal_45.0_16'
 'sisal_45.0_17' 'sisal_46.0_18' 'sisal_46.0_19' 'sisal_46.0_20'
 'sisal_47.0_21' 'sisal_47.0_22' 'sisal_47.0_23' 'sisal_48.0_24'
 'sisal_48.0_25' 'sisal_49.0_26' 'sisal_49.0_27' 'sisal_49.0_28'
 'sisal_51.0_29' 'sisal_51.0_30' 'sisal_58.0_31' 'sisal_58.0_32'
 'sisal_58.0_33' 'sisal_60.0_34' 'sisal_60.0_35' 'sisal_60.0_36'
 'sisal_76.0_37' 'sisal_76.0_38' 'sisal_77.0_39' 'sisal_77.0_40'
 'sisal_78.0_41' 'sisal_78.0_42' 'sisal_90.0_43' 'sisal_90.0_44'
 'sisal_90.0_45' 'sisal_93.0_46' 'sisal_93.0_47' 'sisal_93.0_48'
 'sisal_94.0_49' 'sisal_94.0_50' 'sisal_94.0_51' 'sisal_95.0_52'
 'sisal_95.0_53' 'sisal_95.0_54' 'sisal_97.0_55' 'sisal_97.0_56'
 'sisal_97.0_57' 'sisal_107.0_58' 'sisal_107.0_59' 's

### originalDataURL

In [26]:
# originalDataURL
key = 'originalDataURL'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))

originalDataURL: 
['10.1002/2015GL063826' '10.1002/2015gl065397' '10.1002/2016GL071786'
 '10.1002/jqs.1490' '10.1007/s11430-019-9649-1'
 '10.1016/j.chemgeo.2013.08.026' '10.1016/j.epsl.2004.10.024'
 '10.1016/j.epsl.2005.01.036' '10.1016/j.epsl.2007.10.015'
 '10.1016/j.epsl.2008.07.060' '10.1016/j.epsl.2008.08.018'
 '10.1016/j.epsl.2009.12.017' '10.1016/j.epsl.2009.12.039'
 '10.1016/j.epsl.2010.04.002' '10.1016/j.epsl.2010.08.016'
 '10.1016/j.epsl.2011.05.028' '10.1016/j.epsl.2015.03.015'
 '10.1016/j.epsl.2016.02.050' '10.1016/j.epsl.2016.06.008'
 '10.1016/j.epsl.2017.01.034' '10.1016/j.epsl.2017.07.045'
 '10.1016/j.epsl.2017.11.044' '10.1016/j.epsl.2018.04.001'
 '10.1016/j.epsl.2018.04.048' '10.1016/j.epsl.2018.07.027'
 '10.1016/j.epsl.2019.115717' '10.1016/j.epsl.2019.115737'
 '10.1016/j.gca.2019.12.007' '10.1016/j.gca.2022.03.020'
 '10.1016/j.gloplacha.2019.03.007' '10.1016/j.gloplacha.2020.103266'
 '10.1016/j.jseaes.2010.06.011' '10.1016/j.jseaes.2013.04.015'
 '10.1016/j.jseaes.2017

### originalDatabase

In [27]:
# # originalDataSet
key = 'originalDatabase'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))
# Note: the last two records have missing URLs

originalDatabase: 
['SISAL v3']


## geographical metadata: elevation, latitude, longitude, site name

### geo_meanElev

In [28]:
# check Elevation
key = 'geo_meanElev'
print('%s: '%key)
print(df[key])
print(np.unique(['%d'%kk for kk in df[key] if np.isfinite(kk)]))

geo_meanElev: 
0      3000.0
1      3000.0
2      3000.0
3      2000.0
4      2000.0
        ...  
541    1190.0
542    1190.0
543    1190.0
544    1190.0
545    1190.0
Name: geo_meanElev, Length: 541, dtype: float64
['10' '100' '1000' '1120' '1140' '1160' '1190' '120' '1200' '1240' '1250'
 '1260' '1290' '1300' '131' '1370' '1386' '1400' '1407' '1420' '1440'
 '1460' '1490' '1495' '150' '1530' '162' '165' '1650' '175' '180' '184'
 '1900' '1960' '20' '200' '2000' '2114' '2132' '22' '230' '2347' '239'
 '240' '2400' '250' '2660' '280' '2830' '285' '294' '300' '3000' '306'
 '310' '32' '335' '336' '340' '350' '352' '365' '383' '3850' '393' '400'
 '401' '41' '420' '43' '433' '435' '440' '455' '456' '475' '480' '500'
 '518' '53' '530' '550' '570' '590' '60' '600' '631' '650' '660' '680'
 '70' '700' '72' '730' '85' '860' '870' '934' '940' '965']


### geo_meanLat

In [29]:
# # Latitude
key = 'geo_meanLat'
print('%s: '%key)
print(np.unique(['%d'%kk for kk in df[key]]))

geo_meanLat: 
['-11' '-12' '-13' '-14' '-15' '-16' '-18' '-19' '-21' '-24' '-27' '-31'
 '-32' '-34' '-35' '-38' '-4' '-41' '-5' '-8' '-9' '0' '12' '15' '16' '17'
 '18' '19' '20' '22' '25' '26' '27' '28' '29' '30' '31' '32' '33' '35'
 '36' '37' '38' '39' '4' '40' '41' '42' '43' '44' '45' '46' '50' '51' '54'
 '66' '67' '9']


### geo_meanLon

In [30]:
# # Longitude 
key = 'geo_meanLon'
print('%s: '%key)
print(np.unique(['%d'%kk for kk in df[key]]))

geo_meanLon: 
['-104' '-105' '-111' '-115' '-118' '-123' '-2' '-3' '-37' '-4' '-41'
 '-44' '-46' '-47' '-49' '-55' '-56' '-60' '-65' '-67' '-7' '-75' '-77'
 '-79' '-80' '-83' '-89' '-9' '-90' '-98' '-99' '0' '10' '100' '102' '103'
 '105' '106' '107' '108' '109' '11' '110' '113' '114' '115' '117' '118'
 '120' '128' '13' '14' '148' '15' '159' '167' '17' '171' '177' '21' '22'
 '24' '29' '3' '30' '31' '35' '39' '45' '46' '54' '56' '63' '7' '72' '77'
 '8' '81' '82' '91']


### geo_siteName

In [31]:
# Site Name 
key = 'geo_siteName'
print('%s: '%key)
print(df[key].values)

geo_siteName: 
['Bittoo cave' 'Bittoo cave' 'Bittoo cave' 'Kesang cave' 'Kesang cave'
 'Kesang cave' 'Paraiso cave' 'Paraiso cave' 'Paraiso cave' 'Paraiso cave'
 'Paraiso cave' 'Paraiso cave' 'Villars cave' 'Villars cave'
 'Villars cave' 'Cold Air cave' 'Cold Air cave' 'Cold Air cave'
 'Cold Air cave' 'Cold Air cave' 'Cold Air cave' 'Cold Air cave'
 'Cold Air cave' 'Cold Air cave' 'Cold Air cave' 'Cold Air cave'
 'Cold Air cave' 'Cold Air cave' 'Cold Air cave' 'Lancaster Hole'
 'Lancaster Hole' 'Jeita cave' 'Jeita cave' 'Jeita cave' 'Jeita cave'
 'Jeita cave' 'Jeita cave' 'Huangye cave' 'Huangye cave' 'Huangye cave'
 'Huangye cave' 'Huangye cave' 'Huangye cave' 'Lapa grande cave'
 'Lapa grande cave' 'Lapa grande cave' 'Palestina cave' 'Palestina cave'
 'Palestina cave' 'Palestina cave' 'Palestina cave' 'Palestina cave'
 'Okshola cave' 'Okshola cave' 'Okshola cave' 'Tamboril cave'
 'Tamboril cave' 'Tamboril cave' 'Anjokipoty' 'Anjokipoty' 'Anjokipoty'
 'Curupira cave' 'Curupira cave' 'C

## proxy metadata: archive type, proxy type, interpretation

### archiveType

In [32]:
# now check all the entries bit by bit (can be omitted at a later stage)

# archiveType
key = 'archiveType'
print('%s: '%key)
print(np.unique(df[key]))

archiveType: 
['speleothem']


### paleoData_proxy

In [33]:
# paleoData_proxy
key = 'paleoData_proxy'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))

paleoData_proxy: 
['Mg/Ca' 'd13C' 'd18O' 'growth rate']


### paleoData_notes

In [34]:
# # paleoData_notes
key = 'paleoData_notes'
print('%s: '%key)
print(df[key].values)

paleoData_notes: 
['calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite'
 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite'
 'calcite' 'aragonite' 'aragonite' 'aragonite' 'aragonite' 'aragonite'
 'aragonite' 'aragonite' 'aragonite' 'aragonite' 'aragonite' 'aragonite'
 'aragonite' 'aragonite' 'aragonite' 'calcite' 'calcite' 'calcite'
 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite'
 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite'
 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite'
 'calcite' 'calcite' 'aragonite' 'aragonite' 'aragonite' 'calcite'
 'calcite' 'calcite' 'aragonite' 'aragonite' 'aragonite' 'aragonite'
 'aragonite' 'mixed' 'mixed' 'mixed' 'calcite' 'calcite' 'calcite'
 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite'
 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite'
 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite' 'calcite'


### climateInterpretation_variable

In [35]:
# climate_interpretation
key = 'climateInterpretation_variable'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))


climateInterpretation_variable: 
['N/A' 'temperature' 'temperature+moisture']


### climateInterpretation_variableDetail

In [36]:
# climate_interpretation
key = 'climateInterpretation_variableDetail'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))


climateInterpretation_variableDetail: 
['N/A'
 'temperature - manually assigned by DoD2k authors for paleoData_proxy = Mg/Ca'
 'temperature+moisture - manually assigned by DoD2k authors for paleoData_proxy = d18O.']


### paleoData_sensorSpecies

In [37]:
# climate_interpretation
key = 'paleoData_sensorSpecies'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))


paleoData_sensorSpecies: 
['N/A']


## data 

### paleoData_units

In [38]:
# paleoData_units
key = 'paleoData_units'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))

paleoData_units: 
['mm/year' 'mmol/mol' 'permil']


### paleoData_values

In [39]:
# # paleoData_values
key = 'paleoData_values'

print('%s: '%key)
for ii, vv in enumerate(df[key][:20]):
    try: 
        print('%-30s: %s -- %s'%(df['dataSetName'].iloc[ii][:30], str(np.nanmin(vv)), str(np.nanmax(vv))))
        print(type(vv))
    except: print(df['dataSetName'].iloc[ii], 'NaNs detected.')

paleoData_values: 
Bittoo cave                   : -10.871 -- -5.274
<class 'numpy.ndarray'>
Bittoo cave                   : -8.426 -- 2.907
<class 'numpy.ndarray'>
Bittoo cave                   : 0.023809523809523808 -- 0.5
<class 'numpy.ndarray'>
Kesang cave                   : -9.86 -- -7.15
<class 'numpy.ndarray'>
Kesang cave                   : -7.34 -- -1.85
<class 'numpy.ndarray'>
Kesang cave                   : 0.011538461538461565 -- 0.028750000000000053
<class 'numpy.ndarray'>
Paraiso cave                  : -7.432 -- -5.568
<class 'numpy.ndarray'>
Paraiso cave                  : -10.426 -- -7.532
<class 'numpy.ndarray'>
Paraiso cave                  : 0.07721781220400367 -- 0.9637938429240929
<class 'numpy.ndarray'>
Paraiso cave                  : -6.82 -- -4.73
<class 'numpy.ndarray'>
Paraiso cave                  : -10.03 -- -6.5
<class 'numpy.ndarray'>
Paraiso cave                  : 0.12432379835401446 -- 0.30160359600881986
<class 'numpy.ndarray'>
Villars cave          

### year

In [40]:
# # year
key = 'year'
print('%s: '%key)
for ii, vv in enumerate(df[key][:]):
    try: print('%-30s: %s -- %s'%(df['dataSetName'].iloc[ii][:30], str(np.nanmin(vv)), str(np.nanmax(vv))))
    except: print('NaNs detected.', vv)

year: 
Bittoo cave                   : 11.0 -- 1076.0
Bittoo cave                   : 11.0 -- 1076.0
Bittoo cave                   : 11.0 -- 1950.0
Kesang cave                   : 632.0 -- 1098.0
Kesang cave                   : 632.0 -- 1098.0
Kesang cave                   : 640.0 -- 1950.0
Paraiso cave                  : 7.2082187999999405 -- 1235.592002788
Paraiso cave                  : 7.2082187999999405 -- 1235.592002788
Paraiso cave                  : 7.2082187999999405 -- 1950.0
Paraiso cave                  : 1181.5256421200002 -- 1998.0458382874
Paraiso cave                  : 1181.5256421200002 -- 1998.0458382874
Paraiso cave                  : 1188.3239775030002 -- 1998.0458382874
Villars cave                  : 6.1469999999999345 -- 1987.727
Villars cave                  : 6.1469999999999345 -- 1987.727
Villars cave                  : 6.1469999999999345 -- 1987.727
Cold Air cave                 : 1264.2116 -- 1484.17985
Cold Air cave                 : 1264.2116 -- 1484.1798

### yearUnits

In [41]:
# yearUnits
key = 'yearUnits'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))

yearUnits: 
['CE']
