In [1]:
"""
Update 21/11/24 LL : added option to save as csv
Update 30/10/24 by LL (v4): added check for empty paleoData_values row
Update 29/10/2024 by LL (v4): modified datasetId to create unique identifier for each record.

Load the ch2k database from https://lipdverse.org/CoralHydro2k/current_version/


Here we use the template (from Julien and Feng's lipd2df notebook) 
to extract a dataframe with the following columns:

columns=['archiveType', 
        'climateInterpretation_variable',
        'climateInterpretation_variableDetail',
        'datasetId',
        'dataSetName',
        'geo_meanElev', 
        'geo_meanLat', 
        'geo_meanLon',
        'geo_siteName',
        'year', 'yearUnits',
        'paleoData_proxy',
        'paleoData_units',
        'paleoData_values',
        'paleoData_notes',
        'paleoData_sensorSpecies',
        'originalDataURL',
        'originalDatabase'
]

We save a standardised compact dataframe for concatenation to DoD2k


Initially created by Kevin Fan
"""

"\nUpdate 21/11/24 LL : added option to save as csv\nUpdate 30/10/24 by LL (v4): added check for empty paleoData_values row\nUpdate 29/10/2024 by LL (v4): modified datasetId to create unique identifier for each record.\n\nLoad the ch2k database from https://lipdverse.org/CoralHydro2k/current_version/\n\n\nHere we use the template (from Julien and Feng's lipd2df notebook) \nto extract a dataframe with the following columns:\n\ncolumns=['archiveType', \n        'climateInterpretation_variable',\n        'climateInterpretation_variableDetail',\n        'datasetId',\n        'dataSetName',                                                                                \n        'geo_meanElev', \n        'geo_meanLat', \n        'geo_meanLon',\n        'year', 'yearUnits',                                                                                         \n        'paleoData_variableName',\n        'paleoData_units',                                                                       

# Set up working environment

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Import packages
import pandas as pd
import lipd
import numpy as np
import os

In [4]:
# Set up the working directory.
# The working directory should be the repository root, i.e. the folder from
# which the 'functions' helper module can be imported and from which the
# relative data paths used later in this notebook ('ch2k/...') resolve.
# Make sure this is changing to the correct path!

#wdir = '/home/jupyter-lluecke/compile_proxy_database_v2.0' # working directory, this should work, but doesn't seem to...
_cwd = os.getcwd()
# NOTE(review): assumes the helper module lives in the repository root as
# either functions.py or a functions/ package -- confirm.
_has_helpers = (os.path.isfile(os.path.join(_cwd, 'functions.py'))
                or os.path.isdir(os.path.join(_cwd, 'functions')))
# Step up to the parent folder only when we are not already at the root.
# The original test looked at the folder name alone; with a differently named
# checkout every re-run of this cell climbed one directory higher. Checking
# for the helper module as well makes the cell safe to re-run.
if not (_cwd.endswith('compile_proxy_database_v2.1') or _has_helpers):
    os.chdir(_cwd+'/..')
wdir = os.getcwd()
print('working directory: '+wdir)
import functions as f # helper functions (plotting, csv read/write of the compact dataframe)

working directory: /home/jupyter-mnevans/dod2k


# Load source data and apply corrections

In [5]:
# download and unzip the dataset in LiPD
#!wget https://lipdverse.org/CoralHydro2k/current_version/CoralHydro2k1_0_0.zip
#!unzip CoralHydro2k1_0_0.zip

In [6]:
# Load the LiPD files from the given directory and extract the time series.
#
# The original cell went back to the working directory with a relative
# '/../..' hop after reading, which only works if lipd.readLipd leaves the
# process inside the LiPD folder. Remember the starting directory and restore
# it explicitly instead, so the cell ends in the same place whether or not
# the directory was changed, and even if reading fails midway.
start_dir = os.getcwd()
try:
    D = lipd.readLipd(start_dir+'/ch2k/lipdfiles/')
    TS = lipd.extractTs(D)
finally:
    os.chdir(start_dir)

# number of extracted time series (last expression, so it is displayed)
len(TS)

Disclaimer: LiPD files may be updated and modified to adhere to standards

Found: 179 LiPD file(s)
reading: CH03BUN01.lpd
reading: ZI15MER01.lpd
reading: CO03PAL03.lpd
reading: CO03PAL02.lpd
reading: CA13PEL01.lpd
reading: LI06RAR01.lpd
reading: CO03PAL07.lpd
reading: FL18DTO03.lpd
reading: SM06LKF02.lpd
reading: UR00MAI01.lpd
reading: TU95MAD01.lpd
reading: ZI04IFR01.lpd
reading: RE18CAY01.lpd
reading: KU99HOU01.lpd
reading: OS13NLP01.lpd
reading: EV98KIR01.lpd
reading: LI00RAR01.lpd
reading: NU11PAL01.lpd
reading: MA08DTO01.lpd
reading: AB20MEN03.lpd
reading: CA14TIM01.lpd
reading: KA17RYU01.lpd
reading: MC11KIR01.lpd
reading: AB20MEN09.lpd
reading: HE08LRA01.lpd
reading: DA06MAF01.lpd
reading: SM06LKF01.lpd
reading: NA09MAL01.lpd
reading: SW98STP01.lpd
reading: MU18GSI01.lpd
reading: ZI14HOU01.lpd
reading: FL17DTO02.lpd
reading: DA06MAF02.lpd
reading: SA19PAL02.lpd
reading: CO03PAL01.lpd
reading: ZI16ROD01.lpd
reading: OS13NGP01.lpd
reading: CH98PIR01.lpd
reading: RE19GBR02.lpd
read

# Create compact dataframe

## Dataframe construction

In [7]:
# Fields to collect from every time series; they become the dataframe columns.
col_str = [
    'archiveType',
    'dataSetName',
    'datasetId',
    'geo_meanElev',
    'geo_meanLat',
    'geo_meanLon',
    'geo_siteName',
    'originalDataUrl',
    'paleoData_notes',
    'paleoData_variableName',
    'paleoData_archiveSpecies',
    'paleoData_units',
    'paleoData_values',
    'year',
]

# Empty (all-NaN) frame with one row per entry of TS, filled in the next cell.
df_tmp = pd.DataFrame(columns=col_str, index=range(len(TS)))

In [8]:
# Copy the wanted fields of every data-bearing time series into df_tmp.
# Rows are addressed by the position of the series in TS, so the index labels
# that survive the dropna below still identify the original TS entry.
skip_variables = ['year', 'd18OUncertainty', 'SrCaUncertainty']
for i, ts in enumerate(TS):
    # the time axis and the uncertainty series are not proxy records
    if ts['paleoData_variableName'] in skip_variables:
        continue
    for name in col_str:  # every field listed in col_str
        try:
            df_tmp.loc[i, name] = ts[name]
        except KeyError:
            # Only a missing field is expected here; any other error should
            # surface instead of being silently turned into NaN.
            df_tmp.loc[i, name] = np.nan

# Skipped series leave an all-NaN row behind: drop those rows. Take an explicit
# copy because later cells insert and overwrite columns of df.
df = df_tmp.dropna(how='all').copy()

In [9]:
# Sanity check: which variable names are left after the filter?
set(df['paleoData_variableName'].unique())

{'SrCa', 'SrCa_annual', 'd18O', 'd18O_annual', 'd18O_sw', 'd18O_sw_annual'}

In [10]:
# KF: add the source database name, the year units and placeholder
# climate-interpretation columns. Each value is constant over all rows, so a
# scalar is passed and broadcast by DataFrame.insert.
df.insert(7, 'originalDatabase', 'CoralHydro2k v1.0.0')
df.insert(df.shape[1], 'yearUnits', 'CE')
df.insert(1, 'climateInterpretation_variable', 'N/A')
df.insert(1, 'climateInterpretation_variableDetail', 'N/A')

In [11]:
# Map the LiPD field names onto the column names used in the compact dataframe
column_renames = {
    'paleoData_variableName': 'paleoData_proxy',
    'originalDataUrl': 'originalDataURL',
    'paleoData_archiveSpecies': 'paleoData_sensorSpecies',
}
df = df.rename(columns=column_renames)

In [12]:
# Assign climateInterpretation_variable (and a note on where it came from)
# according to the proxy type. Each entry is
# (proxy names, interpretation, interpretation detail).
interpretation_table = [
    # d18O is temperature and moisture
    (['d18O', 'd18O_annual'],
     'temperature+moisture',
     'temperature+moisture - manually assigned by DoD2k authors for paleoData_proxy = d18O'),
    # d18O_sw is moisture
    (['d18O_sw', 'd18O_sw_annual'],
     'moisture',
     'moisture - manually assigned by DoD2k authors for paleoData_proxy = d18O_sw'),
    # SrCa is temperature
    (['SrCa', 'SrCa_annual'],
     'temperature',
     'temperature - manually assigned by DoD2k authors for paleoData_proxy = Sr/Ca'),
]

for proxy_names, interpretation, detail in interpretation_table:
    rows = np.isin(df['paleoData_proxy'], proxy_names)
    df.loc[rows, 'climateInterpretation_variable'] = interpretation
    df.loc[rows, 'climateInterpretation_variableDetail'] = detail


In [13]:
import re


def _standard_proxy_name(name):
    """Return the proxy name used in the compact dataframe.

    Everything from the last '_annual' onwards is removed, so annual series
    get the same name as the regular ones; any name that then starts with
    'SrCa' becomes 'Sr/Ca' for consistency when concatenating databases.
    """
    annual = re.match(r'(.*)_annual', name)
    if annual:
        name = annual.group(1)
    return 'Sr/Ca' if re.match('SrCa', name) else name


# KF: Extract and exclude sw values
# KF: I was told to exclude the SW data for the concatenated data by Mike
is_sw = df['paleoData_proxy'].isin(['d18O_sw', 'd18O_sw_annual'])
df_sw = df[is_sw]
# explicit copy: columns of df are overwritten below, which must not act on a
# slice of the previous frame
df = df[~is_sw].copy()

# KF: Keep the annual records aside. This has to happen before the '_annual'
# suffix is stripped; the original cell selected them a second time after the
# renaming, which always replaced df_annual with an empty frame.
df_annual = df[df['paleoData_proxy'].isin(['SrCa_annual', 'd18O_annual'])]

# KF: Turn annual measurements into regular and replace SrCa with Sr/Ca
df['paleoData_proxy'] = df['paleoData_proxy'].map(_standard_proxy_name)

# KF: Temp cleaning rows with NAN in year
length = len(df['year'])
df = df[df['year'].notna()]
# the filters stay sequential: len() would fail on a missing year entry
df = df[df['year'].map(lambda x: len(x) > 0)]
# drop records whose value list contains any missing entry
df = df[df['paleoData_values'].map(lambda x: not any(pd.isnull(x)))].copy()
print('Number of rows discarded: ', (length - len(df['year'])))

# KF: Make datasetIds unique by appending the row label (position in TS)
df['datasetId'] = df['datasetId'] + np.array(df.index, dtype = str)

Number of rows discarded:  0


In [14]:
# KF: Type-checking
# Text metadata is cast to str (missing entries become the string 'nan'),
# coordinates to float32, and the two data columns to float32 arrays.
text_columns = ['archiveType', 'dataSetName', 'datasetId', 'geo_siteName',
                'originalDatabase', 'originalDataURL', 'paleoData_notes',
                'paleoData_proxy', 'paleoData_units', 'yearUnits']
coordinate_columns = ['geo_meanElev', 'geo_meanLat', 'geo_meanLon']

column_dtypes = {column: str for column in text_columns}
column_dtypes.update({column: np.float32 for column in coordinate_columns})
df = df.astype(column_dtypes)

for column in ('year', 'paleoData_values'):
    df[column] = df[column].map(lambda seq: np.array(seq, dtype = np.float32))

In [15]:
# Keep only the samples with year >= 1; the same selection is applied to the
# time axis and to the proxy values so both stay aligned.
for label in df.index:
    years = np.array(df.at[label, 'year'], dtype=float)
    values = np.array(df.at[label, 'paleoData_values'], dtype=float)
    from_year_one = years >= 1
    df.at[label, 'year'] = years[from_year_one]
    df.at[label, 'paleoData_values'] = values[from_year_one]

In [16]:
# Display dataframe

# Order the columns alphabetically and renumber the rows 0..n-1. Building a
# new frame avoids an in-place reset on a column-selected frame.
df = df[sorted(df.columns)].reset_index(drop=True)
# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() added a stray 'None' line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           272 non-null    object 
 1   climateInterpretation_variable        272 non-null    object 
 2   climateInterpretation_variableDetail  272 non-null    object 
 3   dataSetName                           272 non-null    object 
 4   datasetId                             272 non-null    object 
 5   geo_meanElev                          229 non-null    float32
 6   geo_meanLat                           272 non-null    float32
 7   geo_meanLon                           272 non-null    float32
 8   geo_siteName                          272 non-null    object 
 9   originalDataURL                       272 non-null    object 
 10  originalDatabase                      272 non-null    object 
 11  paleoData_notes    

In [17]:
# Number of distinct datasetIds before re-labelling (the recorded run shows
# 272 for 272 records, i.e. the ids were already unique at this point).
print(len(df['datasetId'].unique()))
# Put an underscore after the 'ch2k' prefix and append '_<row label>' to every
# id, which guarantees uniqueness.
df['datasetId'] = [
    dataset_id.replace('ch2k','ch2k_')+'_'+str(row_label)
    for row_label, dataset_id in df['datasetId'].items()
]
# Number of distinct datasetIds after re-labelling.
print(len(df['datasetId'].unique()))

272
272


In [18]:
# KF: apply -9999.99 masking to nans
def _mask_missing(samples):
    """Return `samples` as a list with missing entries replaced by -9999.99.

    The three checks of the original cell run in the same order: pandas null
    test, the literal string 'nan', and numpy's NaN test.
    """
    masked = [-9999.99 if pd.isnull(y) else y for y in samples]
    masked = [-9999.99 if y == 'nan' else y for y in masked]
    return [-9999.99 if np.isnan(y) else y for y in masked]


for column in ('paleoData_values', 'year'):
    df[column] = df[column].map(_mask_missing)

In [19]:
# Remove the samples flagged with the -9999.99 sentinel in paleoData_values
# (the selection comes from the values and is applied to the year axis too).
for label in df.index:
    values = np.array(df.at[label, 'paleoData_values'])
    keep = np.logical_not(values == -9999.99)
    df.at[label, 'paleoData_values'] = values[keep]
    df.at[label, 'year'] = np.array(df.at[label, 'year'])[keep]

# Collect the row labels of records that are unusable.
drop_inds = []

# 1) records with no samples left
for pos, (label, years) in enumerate(df['year'].items()):
    if len(years) == 0:
        print('empty', pos, years, df['originalDatabase'].iloc[pos])
        print(df['paleoData_values'].iloc[pos])
        drop_inds.append(label)

# 2) records that are constant, or whose standard deviation is undefined
for pos, row in enumerate(df.paleoData_values):
    if np.std(row) == 0:
        reason = 'std=0'
    elif np.sum(np.diff(row)**2) == 0:
        reason = 'diff=0'
    elif np.isnan(np.std(row)):
        reason = 'std nan'
    else:
        continue
    print(pos, reason)
    label = df.index[pos]
    if label not in drop_inds:
        drop_inds.append(label)

print(drop_inds)
df = df.drop(index=drop_inds)

[]


In [20]:
# Summary of the cleaned dataframe. DataFrame.info() prints by itself and
# returns None, so it is not wrapped in print() (that added a 'None' line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           272 non-null    object 
 1   climateInterpretation_variable        272 non-null    object 
 2   climateInterpretation_variableDetail  272 non-null    object 
 3   dataSetName                           272 non-null    object 
 4   datasetId                             272 non-null    object 
 5   geo_meanElev                          229 non-null    float32
 6   geo_meanLat                           272 non-null    float32
 7   geo_meanLon                           272 non-null    float32
 8   geo_siteName                          272 non-null    object 
 9   originalDataURL                       272 non-null    object 
 10  originalDatabase                      272 non-null    object 
 11  paleoData_notes    

## save compact dataframe

### save pickle

In [21]:
# Final summary of the dataframe that is written to disk in the next cells.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           272 non-null    object 
 1   climateInterpretation_variable        272 non-null    object 
 2   climateInterpretation_variableDetail  272 non-null    object 
 3   dataSetName                           272 non-null    object 
 4   datasetId                             272 non-null    object 
 5   geo_meanElev                          229 non-null    float32
 6   geo_meanLat                           272 non-null    float32
 7   geo_meanLon                           272 non-null    float32
 8   geo_siteName                          272 non-null    object 
 9   originalDataURL                       272 non-null    object 
 10  originalDatabase                      272 non-null    object 
 11  paleoData_notes    

In [22]:
# Save to a pickle file. The path is relative, so it is resolved against the
# current working directory.
df.to_pickle('ch2k/ch2k_compact.pkl')

### save csv

In [23]:
# save to a list of csv files (metadata, data, year)
# NOTE(review): df.name is presumably what the helper uses to build the output
# file names ('ch2k/ch2k_compact_%s.csv' in the recorded output) -- confirm in
# the functions module.
df.name='ch2k'
f.write_compact_dataframe_to_csv(df)

METADATA: archiveType, climateInterpretation_variable, climateInterpretation_variableDetail, dataSetName, datasetId, geo_meanElev, geo_meanLat, geo_meanLon, geo_siteName, originalDataURL, originalDatabase, paleoData_notes, paleoData_proxy, paleoData_sensorSpecies, paleoData_units, yearUnits
Saved to /home/jupyter-mnevans/dod2k/ch2k/ch2k_compact_%s.csv


In [24]:
# Load the dataframe back from the csv files written above and show its
# summary, as a round-trip check against the in-memory frame.
f.load_compact_dataframe_from_csv('ch2k').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           272 non-null    object 
 1   climateInterpretation_variable        272 non-null    object 
 2   climateInterpretation_variableDetail  272 non-null    object 
 3   dataSetName                           272 non-null    object 
 4   datasetId                             272 non-null    object 
 5   geo_meanElev                          229 non-null    float32
 6   geo_meanLat                           272 non-null    float32
 7   geo_meanLon                           272 non-null    float32
 8   geo_siteName                          272 non-null    object 
 9   originalDataURL                       272 non-null    object 
 10  originalDatabase                      272 non-null    object 
 11  paleoData_notes    

# check output

## dataset metadata: dataSetName, datasetId, originalDataURL, originalDatabase

### index

In [25]:
# check index: show the row index of the compact dataframe
print(df.index)

RangeIndex(start=0, stop=272, step=1)


### dataSetName

In [26]:
# check dataSetName: print the data set name of every record
key = 'dataSetName'
print('%s: '%key)
print(df[key].values)

dataSetName: 
['CH03BUN01' 'ZI15MER01' 'ZI15MER01' 'CO03PAL03' 'CO03PAL02' 'CA13PEL01'
 'LI06RAR01' 'CO03PAL07' 'FL18DTO03' 'SM06LKF02' 'SM06LKF02' 'UR00MAI01'
 'TU95MAD01' 'ZI04IFR01' 'ZI04IFR01' 'RE18CAY01' 'RE18CAY01' 'RE18CAY01'
 'RE18CAY01' 'KU99HOU01' 'OS13NLP01' 'EV98KIR01' 'LI00RAR01' 'LI00RAR01'
 'NU11PAL01' 'NU11PAL01' 'MA08DTO01' 'MA08DTO01' 'AB20MEN03' 'CA14TIM01'
 'CA14TIM01' 'KA17RYU01' 'MC11KIR01' 'AB20MEN09' 'HE08LRA01' 'DA06MAF01'
 'SM06LKF01' 'SM06LKF01' 'NA09MAL01' 'SW98STP01' 'MU18GSI01' 'MU18GSI01'
 'ZI14HOU01' 'ZI14HOU01' 'ZI14HOU01' 'ZI14HOU01' 'FL17DTO02' 'FL17DTO02'
 'DA06MAF02' 'SA19PAL02' 'SA19PAL02' 'CO03PAL01' 'ZI16ROD01' 'OS13NGP01'
 'CH98PIR01' 'RE19GBR02' 'RE19GBR02' 'MU18RED04' 'MU18RED04' 'GR13MAD01'
 'XI17HAI01' 'XI17HAI01' 'XI17HAI01' 'XI17HAI01' 'DE14DTO03' 'KL97DAH01'
 'QU06RAB01' 'QU06RAB01' 'DE14DTO01' 'KU00NIN01' 'TU01SIA01' 'RE19GBR01'
 'RE19GBR01' 'GR13MAD02' 'AB20MEN07' 'BR19RED01' 'BR19RED01' 'NU09FAN01'
 'NU09FAN01' 'MU18RED01' 'OS14RIP01' 

### datasetId

In [27]:
# check datasetId: the number of distinct ids should equal the number of rows
key = 'datasetId'
n_distinct_ids = len(df[key].unique())
n_records = len(df)
print(n_distinct_ids)
print(n_records)

# then list every id
print('%s (starts with): '%key)
print(df[key].values)

272
272
datasetId (starts with): 
['ch2k_CH03BUN010_0' 'ch2k_ZI15MER012_1' 'ch2k_ZI15MER014_2'
 'ch2k_CO03PAL036_3' 'ch2k_CO03PAL028_4' 'ch2k_CA13PEL0110_5'
 'ch2k_LI06RAR0112_6' 'ch2k_CO03PAL0714_7' 'ch2k_FL18DTO0316_8'
 'ch2k_SM06LKF0218_9' 'ch2k_SM06LKF0220_10' 'ch2k_UR00MAI0122_11'
 'ch2k_TU95MAD0124_12' 'ch2k_ZI04IFR0126_13' 'ch2k_ZI04IFR0128_14'
 'ch2k_RE18CAY0130_15' 'ch2k_RE18CAY0132_16' 'ch2k_RE18CAY0134_17'
 'ch2k_RE18CAY0136_18' 'ch2k_KU99HOU0140_19' 'ch2k_OS13NLP0142_20'
 'ch2k_EV98KIR0144_21' 'ch2k_LI00RAR0146_22' 'ch2k_LI00RAR0148_23'
 'ch2k_NU11PAL0152_24' 'ch2k_NU11PAL0154_25' 'ch2k_MA08DTO0158_26'
 'ch2k_MA08DTO0160_27' 'ch2k_AB20MEN0362_28' 'ch2k_CA14TIM0164_29'
 'ch2k_CA14TIM0166_30' 'ch2k_KA17RYU0170_31' 'ch2k_MC11KIR0172_32'
 'ch2k_AB20MEN0974_33' 'ch2k_HE08LRA0176_34' 'ch2k_DA06MAF0178_35'
 'ch2k_SM06LKF0180_36' 'ch2k_SM06LKF0182_37' 'ch2k_NA09MAL0184_38'
 'ch2k_SW98STP0186_39' 'ch2k_MU18GSI0188_40' 'ch2k_MU18GSI0190_41'
 'ch2k_ZI14HOU0192_42' 'ch2k_ZI14HOU0194_43

### originalDataURL

In [28]:
# originalDataURL: list the distinct source URLs
# 'this study' should point to the correct URL (PAGES2k)
key = 'originalDataURL'
print('%s: '%key)
print(np.unique(list(df[key])))

originalDataURL: 
['https://doi.org/10.1594/PANGAEA.874078'
 'https://doi.pangaea.de/10.1594/PANGAEA.743953'
 'https://doi.pangaea.de/10.1594/PANGAEA.830601'
 'https://doi.pangaea.de/10.1594/PANGAEA.88199'
 'https://doi.pangaea.de/10.1594/PANGAEA.88200'
 'https://doi.pangaea.de/10.1594/PANGAEA.887712'
 'https://doi.pangaea.de/10.1594/PANGAEA.891094'
 'https://www.ncdc.noaa.gov/paleo/study/1003972'
 'https://www.ncdc.noaa.gov/paleo/study/1003973'
 'https://www.ncdc.noaa.gov/paleo/study/10373'
 'https://www.ncdc.noaa.gov/paleo/study/10425'
 'https://www.ncdc.noaa.gov/paleo/study/10808'
 'https://www.ncdc.noaa.gov/paleo/study/11935'
 'https://www.ncdc.noaa.gov/paleo/study/12278'
 'https://www.ncdc.noaa.gov/paleo/study/12891'
 'https://www.ncdc.noaa.gov/paleo/study/12893'
 'https://www.ncdc.noaa.gov/paleo/study/12994'
 'https://www.ncdc.noaa.gov/paleo/study/13035'
 'https://www.ncdc.noaa.gov/paleo/study/13439'
 'https://www.ncdc.noaa.gov/paleo/study/15238'
 'https://www.ncdc.noaa.gov/paleo

### originalDatabase

In [29]:
# originalDatabase: list the distinct source database labels
# Note: the last two records have missing URLs
key = 'originalDatabase'
print('%s: '%key)
print(np.unique(list(df[key])))

originalDatabase: 
['CoralHydro2k v1.0.0']


## geographical metadata: elevation, latitude, longitude, site name

### geo_meanElev

In [30]:
# check Elevation: show the column, then the distinct finite elevations
# formatted as integers (missing elevations are skipped)
key = 'geo_meanElev'
print('%s: '%key)
print(df[key])
finite_elevations = [kk for kk in df[key] if np.isfinite(kk)]
print(np.unique(['%d'%kk for kk in finite_elevations]))

geo_meanElev: 
0      -3.0
1     -17.0
2     -17.0
3       NaN
4       NaN
       ... 
267   -10.0
268   -10.0
269    -7.0
270    -6.0
271    -6.0
Name: geo_meanElev, Length: 272, dtype: float32
['-1' '-10' '-11' '-12' '-14' '-16' '-17' '-18' '-2' '-25' '-3' '-4' '-5'
 '-6' '-7' '-8' '-9' '0']


### geo_meanLat

In [31]:
# Latitude: distinct values formatted as integers
key = 'geo_meanLat'
print('%s: '%key)
latitude_labels = ['%d'%kk for kk in df[key]]
print(np.unique(latitude_labels))

geo_meanLat: 
['-10' '-11' '-12' '-13' '-14' '-15' '-16' '-17' '-18' '-19' '-2' '-21'
 '-22' '-23' '-28' '-3' '-4' '-5' '-6' '-8' '0' '1' '10' '11' '12' '13'
 '14' '15' '16' '17' '18' '19' '2' '20' '21' '22' '23' '24' '25' '27' '28'
 '3' '32' '4' '5' '7']


### geo_meanLon

In [32]:
# Longitude: distinct values formatted as integers
key = 'geo_meanLon'
print('%s: '%key)
longitude_labels = ['%d'%kk for kk in df[key]]
print(np.unique(longitude_labels))

geo_meanLon: 
['-109' '-114' '-149' '-157' '-159' '-162' '-169' '-174' '-22' '-33' '-60'
 '-61' '-64' '-66' '-67' '-80' '-81' '-82' '-86' '-87' '-88' '-89' '-91'
 '100' '105' '109' '110' '111' '113' '114' '115' '117' '118' '119' '120'
 '122' '123' '124' '130' '134' '142' '143' '144' '145' '146' '147' '148'
 '150' '151' '152' '153' '163' '166' '167' '172' '173' '179' '34' '36'
 '37' '38' '39' '40' '43' '45' '49' '55' '58' '63' '7' '70' '71' '72' '92'
 '96' '98' '99']


### geo_siteName

In [33]:
# Site name: print the site name of every record
key = 'geo_siteName'
print('%s: '%key)
print(df[key].values)

geo_siteName: 
['Bunaken Island, Indonesia' 'Rowley Shoals, Australia'
 'Rowley Shoals, Australia'
 'Palmyra Island, United States Minor Outlying Islands'
 'Palmyra Island, United States Minor Outlying Islands'
 'Cayos Cochinos, Honduras' 'Rarotonga, Cook Islands'
 'Palmyra Island, United States Minor Outlying Islands'
 'Dry Tortugas, Florida, USA' 'Looe Key, Florida, USA'
 'Looe Key, Florida, USA' 'Maiana, Republic of Kiribati'
 'Madang Lagoon, Papua New Guinea' 'Ifaty Reef, Madagascar'
 'Ifaty Reef, Madagascar' 'Little Cayman, Cayman Islands'
 'Little Cayman, Cayman Islands' 'Little Cayman, Cayman Islands'
 'Little Cayman, Cayman Islands' 'Houtman Abrolhos Islands, Australia'
 'Ngeralang, Palau' 'Kiritimati (Christmas) Island, Republic of Kiribati'
 'Rarotonga, Cook Islands' 'Rarotonga, Cook Islands'
 'Palmyra Island, United States Minor Outlying Islands'
 'Palmyra Island, United States Minor Outlying Islands'
 'Dry Tortugas, Florida, USA' 'Dry Tortugas, Florida, USA'
 'Mentawai Isla

## proxy metadata: archive type, proxy type, interpretation

### archiveType

In [34]:
# now check all the entries bit by bit (can be omitted at a later stage)

# archiveType: list the distinct archive types
key = 'archiveType'
print('%s: '%key)
print(np.unique(df[key]))

archiveType: 
['coral']


### paleoData_proxy

In [35]:
# paleoData_proxy: list the distinct proxy types
key = 'paleoData_proxy'
print('%s: '%key)
print(np.unique(list(df[key])))

paleoData_proxy: 
['Sr/Ca' 'd18O']


### paleoData_notes

In [36]:
# paleoData_notes: print the note of every record (missing notes show up as
# the string 'nan')
key = 'paleoData_notes'
print('%s: '%key)
print(df[key].values)

paleoData_notes: 
['This paper did not calibrate the d18O proxy or reconstruct temperature. It instead analyzed variability through time by directly using the d18O proxy.'
 'Sr/Ca-SST recconstructed with composite plus scale method to ERSSTv3b, no regression applied'
 'Sr/Ca-SST recconstructed with composite plus scale method to ERSSTv3b, no regression applied'
 'nan' 'nan' 'nan'
 'Individual coral records that are part of the Rarotonga composite' 'nan'
 'nan' 'nan' 'nan' 'nan' 'monthly correlations with SST not reported'
 'Other calibration slopes are available in Zinke et al. 2004; 1920-1995 samples monthly; 1919-1658 sampled bimonthly'
 'Other calibration slopes are available in Zinke et al. 2004; 1920-1995 samples monthly; 1919-1658 sampled bimonthly'
 'nan' 'nan' 'nan' 'nan'
 '1953-1993 and 1961-1993 calibration periods, first with 0.13 slope, latter -0.17 slope'
 'nan' 'nan'
 'Sr/Ca-SST calibrations listed were found in Linsley et al. 2004. The calibration from Linsley et al. 200

### climateInterpretation_variable

In [37]:
# climateInterpretation_variable: list the distinct interpretations
key = 'climateInterpretation_variable'
print('%s: '%key)
print(np.unique(list(df[key])))


climateInterpretation_variable: 
['temperature' 'temperature+moisture']


### climateInterpretation_variableDetail

In [38]:
# climateInterpretation_variableDetail: list the distinct detail strings
key = 'climateInterpretation_variableDetail'
print('%s: '%key)
print(np.unique(list(df[key])))


climateInterpretation_variableDetail: 
['temperature - manually assigned by DoD2k authors for paleoData_proxy = Sr/Ca'
 'temperature+moisture - manually assigned by DoD2k authors for paleoData_proxy = d18O']


### paleoData_sensorSpecies

In [39]:
# paleoData_sensorSpecies: list the distinct species names
key = 'paleoData_sensorSpecies'
print('%s: '%key)
print(np.unique(list(df[key])))


paleoData_sensorSpecies: 
['Diploastrea heliopora' 'Diploria labyrinthiformis' 'Favia speciosa'
 'Orbicella faveolata' 'Pavona clavus' 'Pavona gigantea'
 'Platygyra lamellina' 'Porites australiensis' 'Porites lobata'
 'Porites lutea' 'Porites solida' 'Porites sp.' 'Pseudodiploria strigosa'
 'Siderastrea radians' 'Siderastrea siderea' 'Siderastrea sp.'
 'Siderastrea stellata' 'Solenastrea bournoni']


## data 

### paleoData_units

In [40]:
# paleoData_units: list the distinct measurement units
key = 'paleoData_units'
print('%s: '%key)
print(np.unique(list(df[key])))

paleoData_units: 
['mmol/mol' 'permil']


### paleoData_values

In [41]:
# paleoData_values: print the value range and container type of the first 20
# records
key = 'paleoData_values'

print('%s: '%key)
# .iloc makes the "first 20 rows" selection explicitly positional
for ii, vv in enumerate(df[key].iloc[:20]):
    try:
        print('%-30s: %s -- %s'%(df['dataSetName'].iloc[ii][:30], str(np.nanmin(vv)), str(np.nanmax(vv))))
        print(type(vv))
    except (TypeError, ValueError):
        # np.nanmin/np.nanmax raise ValueError on an empty record and
        # TypeError on non-numeric content; other errors are no longer hidden
        print(df['dataSetName'].iloc[ii], 'NaNs detected.')

paleoData_values: 
CH03BUN01                     : -5.757999897003174 -- -4.651800155639648
<class 'numpy.ndarray'>
ZI15MER01                     : 8.801589965820312 -- 9.006901741027832
<class 'numpy.ndarray'>
ZI15MER01                     : 8.801589965820312 -- 9.006901741027832
<class 'numpy.ndarray'>
CO03PAL03                     : -5.380000114440918 -- -4.110000133514404
<class 'numpy.ndarray'>
CO03PAL02                     : -5.295000076293945 -- -4.3379998207092285
<class 'numpy.ndarray'>
CA13PEL01                     : -4.789000034332275 -- -3.4670000076293945
<class 'numpy.ndarray'>
LI06RAR01                     : -5.130000114440918 -- -3.819999933242798
<class 'numpy.ndarray'>
CO03PAL07                     : -5.510000228881836 -- -4.440000057220459
<class 'numpy.ndarray'>
FL18DTO03                     : 8.890999794006348 -- 9.47599983215332
<class 'numpy.ndarray'>
SM06LKF02                     : -4.730000019073486 -- -3.0
<class 'numpy.ndarray'>
SM06LKF02                     

### year

In [42]:
# year: print the covered time span of the first 20 records
key = 'year'
print('%s: '%key)
# .iloc makes the "first 20 rows" selection explicitly positional
for ii, vv in enumerate(df[key].iloc[:20]):
    try:
        print('%-30s: %s -- %s'%(df['dataSetName'].iloc[ii][:30], str(np.nanmin(vv)), str(np.nanmax(vv))))
    except (TypeError, ValueError):
        # np.nanmin/np.nanmax raise ValueError on an empty record and
        # TypeError on non-numeric content; other errors are no longer hidden
        print('NaNs detected.', vv)

year: 
CH03BUN01                     : 1860.0 -- 1990.5799560546875
ZI15MER01                     : 1891.0 -- 2009.0
ZI15MER01                     : 1891.0 -- 2009.0
CO03PAL03                     : 1317.1700439453125 -- 1406.489990234375
CO03PAL02                     : 1149.0799560546875 -- 1220.2049560546875
CA13PEL01                     : 1922.0899658203125 -- 2006.469970703125
LI06RAR01                     : 1906.8800048828125 -- 1999.75
CO03PAL07                     : 1635.02001953125 -- 1666.47998046875
FL18DTO03                     : 1997.64599609375 -- 2012.2080078125
SM06LKF02                     : 1960.969970703125 -- 2002.6300048828125
SM06LKF02                     : 1960.969970703125 -- 2002.6300048828125
UR00MAI01                     : 1840.0 -- 1994.5
TU95MAD01                     : 1922.5419921875 -- 1991.2919921875
ZI04IFR01                     : 1659.625 -- 1995.625
ZI04IFR01                     : 1688.625 -- 1995.625
RE18CAY01                     : 1887.0400390625 -- 2

### yearUnits

In [43]:
# yearUnits: list the distinct units of the time axis
key = 'yearUnits'
print('%s: '%key)
print(np.unique(list(df[key])))

yearUnits: 
['CE']
