In [1]:
"""

load Iso 2k from csv (generated using load_iso2k.R from iso2k1_0_1.RData

source for R code and Rdata:  https://www.ncei.noaa.gov/pub/data/paleo/reconstructions/iso2k/ (accessed 5/11/24)

author: Lucie Luecke, created 5/11/24

Here we use extract a dataframe with the following columns:

columns=['archiveType', 
        'climateInterpretation_variable',
        'climateInterpretation_variableDetail',
        'datasetId',
        'dataSetName',                                                                                
        'geo_meanElev', 
        'geo_meanLat', 
        'geo_meanLon',
        'year', 'yearUnits',                                                                                         
        'paleoData_variableName',
        'paleoData_units',                                                                                           
        'paleoData_values',
        'paleoData_notes',
        'paleoData_sensorSpecies',
        'originalDataURL',
        'originalDatabase'
]

We save a standardised compact dataframe for concatenation to DoD2k

21/11/2024 LL: added csv saving of compact dataframe, removed redundant output.

"""


"\n\nload Iso 2k from csv (generated using load_iso2k.R from iso2k1_0_1.RData\n\nsource for R code and Rdata:  https://www.ncei.noaa.gov/pub/data/paleo/reconstructions/iso2k/ (accessed 5/11/24)\n\nauthor: Lucie Luecke, created 5/11/24\n\nHere we use extract a dataframe with the following columns:\n\ncolumns=['archiveType', \n        'climateInterpretation_variable',\n        'climateInterpretation_variableDetail',\n        'datasetId',\n        'dataSetName',                                                                                \n        'geo_meanElev', \n        'geo_meanLat', \n        'geo_meanLon',\n        'year', 'yearUnits',                                                                                         \n        'paleoData_variableName',\n        'paleoData_units',                                                                                           \n        'paleoData_values',\n        'paleoData_notes',\n        'paleoData_sensorSpecies',\n        'origi

# Set up working environment

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature 
from matplotlib.gridspec import GridSpec as GS
import scipy.io as sio
from copy import deepcopy as dc

In [4]:
# set up working directory. 
# The default working directory should be the parent folder (compile_proxy_database) so we can access the 'helper' files 
# Make sure this is changing to the correct path!

#wdir = '/home/jupyter-lluecke/compile_proxy_database_v2.0' # working directory, this should work, but doesn't seem to...
if not os.getcwd().endswith('compile_proxy_database_v2.1'):
    os.chdir(os.getcwd()+'/..')
wdir = os.getcwd()
print('working directory: '+wdir)
import functions as f # contains functions for plotting 

working directory: /home/jupyter-mnevans/dod2k


In [5]:

#%run -i functions.py'
import functions as f # Lucie's functions


# load the source data and apply corrections

## load main database and auxiliary data

In [6]:
data_dir   = 'iso2k/iso2k_csv/'

all_files  = f.fns(data_dir, end='.csv', print_dir=True)
all_files  = [ff for ff in all_files if 'checkpoint' not in ff]

['dataSetName.csv', 'originalDataURL.csv', 'paleoData_units.csv', 'geo_siteName.csv', 'geo_meanElev.csv', 'paleoData_values.csv', 'yearUnits.csv', 'archiveType.csv', 'paleoData_proxy.csv', 'datasetId.csv', 'paleoData_sensorSpecies.csv', 'year.csv', 'geo_meanLat.csv', 'geo_meanLon.csv', 'paleoData_notes.csv', 'climateInterpretation_variable.csv']


In [7]:
dt = {'dataSetName': str, 'originalDataURL': str, 'paleoData_units': str, 'geo_siteName': str, 
      'geo_meanElev': float, 'paleoData_values': False, 'yearUnits': str, 
      'archiveType': str, 'paleoData_proxy': str, 'datasetId': str, 
      'paleoData_sensorSpecies': str, 'year': False, 'geo_meanLat': float, 
      'geo_meanLon': float, 'climateInterpretation_variable': str}

In [8]:
data = {}
data_num = {}

for filename in all_files:
    print(filename)
    var = filename.replace('.csv','')
    row = f.read_csv(data_dir+var, header=False)
    if filename in ['year.csv', 'paleoData_values.csv']:
        data_num[var] = row
    else:
        data[var] = [rr[0] for rr in row]

archiveType.csv
climateInterpretation_variable.csv
dataSetName.csv
datasetId.csv
geo_meanElev.csv
geo_meanLat.csv
geo_meanLon.csv
geo_siteName.csv
originalDataURL.csv
paleoData_notes.csv
paleoData_proxy.csv
paleoData_sensorSpecies.csv
paleoData_units.csv
paleoData_values.csv
year.csv
yearUnits.csv


# create compact dataframe

## define compact dataframe and populate with metadata

In [9]:
df = pd.DataFrame(data)

In [10]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596 entries, 0 to 595
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   archiveType                     596 non-null    object
 1   climateInterpretation_variable  596 non-null    object
 2   dataSetName                     596 non-null    object
 3   datasetId                       596 non-null    object
 4   geo_meanElev                    596 non-null    object
 5   geo_meanLat                     596 non-null    object
 6   geo_meanLon                     596 non-null    object
 7   geo_siteName                    596 non-null    object
 8   originalDataURL                 596 non-null    object
 9   paleoData_notes                 596 non-null    object
 10  paleoData_proxy                 596 non-null    object
 11  paleoData_sensorSpecies         596 non-null    object
 12  paleoData_units                 596 non-null    ob

## rename columns and entries according to standard terminology

In [11]:
# archiveType naming convention changes
df['archiveType'] = (df['archiveType']).str.lower()
df.loc[df['archiveType'] == 'lakesediment', 'archiveType'] = 'lake sediment'
df.loc[df['archiveType'] == 'marinesediment', 'archiveType'] = 'marine sediment'
df.loc[df['archiveType'] == 'wood', 'archiveType'] = 'tree'
df.loc[df['archiveType'] == 'glacierice', 'archiveType'] = 'glacier ice'
df.loc[df['archiveType'] == 'groundice', 'archiveType'] = 'ground ice'
df.loc[df['archiveType'] == 'terrestrialsediment', 'archiveType'] = 'terrestrial sediment'
df.loc[df['archiveType'] == 'molluskshells', 'archiveType'] = 'mollusk shells'


# originalDataURL replace references to 'this compilation' with Iso2k DOI
df['originalDataURL'] = (df['originalDataURL']).str.lower()
df.loc[df['originalDataURL'] == 'this compilation', 'originalDataURL'] = 'https://www.ncei.noaa.gov/access/paleo-search/study/29593'

# yearUnits convert AD to CE
df['yearUnits'] = (df['yearUnits']).str.lower()
df.loc[df['yearUnits'] == 'ad', 'yearUnits'] = 'CE'


In [12]:
df.datasetId=df.apply(lambda x: 'iso2k_'+x.datasetId, axis=1)

In [13]:
# Insert columns for the original source database
df.insert(7, "originalDatabase", ["Iso2k v1.0.1"]*len(df))

In [14]:
df['geo_meanElev'] = df['geo_meanElev'].apply(lambda x: x if x!='NA' else np.nan)

In [15]:
df = df.astype({'archiveType': str, 
                'dataSetName': str, 'datasetId': str, 
                'geo_meanElev': np.float32, 
                'geo_meanLat': np.float32, 
                'geo_meanLon': np.float32, 
                'geo_siteName': str, 
                'originalDatabase': str, 'originalDataURL': str, 'paleoData_notes': str,
                'paleoData_proxy': str, 'paleoData_units': str, 'yearUnits': str
               })


In [16]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596 entries, 0 to 595
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   archiveType                     596 non-null    object 
 1   climateInterpretation_variable  596 non-null    object 
 2   dataSetName                     596 non-null    object 
 3   datasetId                       596 non-null    object 
 4   geo_meanElev                    592 non-null    float32
 5   geo_meanLat                     596 non-null    float32
 6   geo_meanLon                     596 non-null    float32
 7   originalDatabase                596 non-null    object 
 8   geo_siteName                    596 non-null    object 
 9   originalDataURL                 596 non-null    object 
 10  paleoData_notes                 596 non-null    object 
 11  paleoData_proxy                 596 non-null    object 
 12  paleoData_sensorSpecies         596 

In [17]:
df = df.rename(columns={'climateInterpretation_variable':'climateInterpretation_variableDetail'})

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596 entries, 0 to 595
Data columns (total 15 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           596 non-null    object 
 1   climateInterpretation_variableDetail  596 non-null    object 
 2   dataSetName                           596 non-null    object 
 3   datasetId                             596 non-null    object 
 4   geo_meanElev                          592 non-null    float32
 5   geo_meanLat                           596 non-null    float32
 6   geo_meanLon                           596 non-null    float32
 7   originalDatabase                      596 non-null    object 
 8   geo_siteName                          596 non-null    object 
 9   originalDataURL                       596 non-null    object 
 10  paleoData_notes                       596 non-null    object 
 11  paleoData_proxy    

In [19]:
civ = {}
civ_r = {'moisture': [], 'temperature': [], 'temperature+moisture': [], 
         'N/A': [], 'NOT temperature NOT moisture': []}
for key in set(df['climateInterpretation_variableDetail']):
    lkey = key.lower()
    moisture = np.any(['precip' in lkey, lkey=='m', lkey=='p', lkey=='p_e',
                      'moisture' in lkey, 'hydro' in lkey, 'discharge' in lkey,
                       'lake overflow' in lkey, 'pdsi' in lkey, 'aridity' in lkey,
                        'p_isotope' in lkey, 'p_amount' in lkey, 'monsoon' in lkey, 
                       'd18o_seawater' in lkey, 'salinity' in lkey,
                       'water avail' in lkey, 'groundwater' in lkey,
                       'd18osw' in lkey, 'storm track' in lkey,
                       'rain' in lkey, 'p/e' in lkey, 'e/p' in lkey, 'evaporation' in lkey])
    temperature = np.any(['temp' in lkey, lkey=='t', 't_air' in lkey, 't_water' in lkey, 
                           'sst' in lkey])
    if 'not interpreted to reflect climate' in lkey:
        civ[key]='NOT temperature NOT moisture'
        civ_r['NOT temperature NOT moisture'] += [key]
        continue
    if moisture & temperature:
        civ[key] = 'temperature+moisture'
        civ_r['temperature+moisture']+=[key]
    elif moisture:
        civ[key]='moisture'
        civ_r['moisture']+=[key]
    elif temperature:
        civ[key]='temperature'
        civ_r['temperature']+=[key]
    else:
        civ[key]='N/A'
        civ_r['N/A'] += [key]
        print('not in civ:', key)
print(civ)
for tm, keys in civ_r.items():
    print(tm)
    for key in np.sort(keys):
        print('  ', '-', key)

not in civ: atmospheric circulation
not in civ: PDO
not in civ: Solar insolation
not in civ: d18O
not in civ: S
not in civ: AMO
not in civ: not clear
not in civ: amount effect
not in civ: NULL
not in civ: I_E
not in civ: atmospheric circluation
not in civ: NAO index
not in civ: Source and intensity
not in civ: strength
not in civ: the position between the ecotone of desiduous forests and prairy
not in civ: surface pressure
not in civ: source region
{'Mean Annual Precipitation': 'moisture', '?18O.precipitation': 'moisture', 'rainfall source area': 'moisture', 'precipitation d18O': 'moisture', 'd18O_seawater': 'moisture', 'atmospheric circulation': 'N/A', 'P_amount, RH': 'moisture', 'Precipitation_amount, humidity': 'moisture', 'PDO': 'N/A', 'P_E': 'moisture', 'Solar insolation': 'N/A', 'Moisture flux from the subtropical North Pacific (driven by SST anomalies in the Kuroshio Extension region)': 'temperature+moisture', 'T': 'temperature', 'Precipitation amount (as a function of temperatu

In [20]:
df['climateInterpretation_variable'] = 'N/A'

In [21]:
df['climateInterpretation_variable']=df['climateInterpretation_variableDetail'].apply(lambda x: civ[x])


In [22]:
set(df['climateInterpretation_variable'])

{'N/A',
 'NOT temperature NOT moisture',
 'moisture',
 'temperature',
 'temperature+moisture'}

## add numeric data to dataframe

In [23]:
# add numeric data to dataframe
df['year']             = data_num['year']
df['paleoData_values'] = data_num['paleoData_values']

In [24]:
# mask missing values 
df['paleoData_values'] = df['paleoData_values'].map(lambda x: np.array([-9999.99 if y == 'NA' else float(y) for y in x]))
df['year']             = df['year'].map(lambda x: np.array([-9999.99 if y == 'NA' else float(y) for y in x]))


In [25]:
# mask nans and exclude from dataframe
drop_inds = []
for ii in df.index:
    # print(df.at[ii, 'year'].shape[0]==1)
    if (df.at[ii, 'year'].shape[0]==1)|(len(df.at[ii, 'year'].shape)==0)|(len(df.at[ii, 'paleoData_values'].shape)==0):
        drop_inds+=[df.index[ii]]
        continue
    dd=f.convert_to_nparray(df.at[ii, 'paleoData_values'])
    dy=f.convert_to_nparray(df.at[ii, 'year'])
    
    df.at[ii, 'paleoData_values']=dd.data[(~dd.mask)&(~dy.mask)]
    df.at[ii, 'year']=np.array(df.at[ii, 'year'])[(~dd.mask)&(~dy.mask)]
    if np.any(df.at[ii, 'year']==-9999.99):
        print('-9999.99 detected.')
print(drop_inds, [df.at[ii, 'year'] for ii in drop_inds])
df = df.drop(index=np.unique(drop_inds))

[64, 65, 66, 67, 68, 69] [array([-9999.99]), array([-9999.99]), array([-9999.99]), array([-9999.99]), array([-9999.99]), array([-9999.99])]


In [26]:
# check year is in units CE

BP_records = []
problem_records = []
problem_record_IDs = []
key = 'year'
print('%s: '%key)
vmin=2000
vmax=1
for ii, vv in enumerate(df[key]):
    if np.any(np.array(vv)>2020):
        if np.max(1950-vv)<2020:
            BP_records+=[df.index[ii]]
        else:
            print('%-20s: %10s -- %5s'%(df['dataSetName'].iloc[ii][:30], str(1950-np.round(np.nanmin(vv), 1)),
                                                str(1950-np.round(np.nanmax(vv), 1)))+' %s'%df['yearUnits'].iloc[ii])
            print('%-20s: %10s -- %5s'%(df['dataSetName'].iloc[ii][:30], str(np.round(np.nanmin(vv), 1)),
                                                str(np.round(np.nanmax(vv), 1)))+' %s'%df['yearUnits'].iloc[ii])
            print('------>>>>>  PROBLEM RECORD  <<<<------')
            problem_records += [df.index[ii]]
            problem_record_IDs += [df.at[df.index[ii], 'datasetId']]
            print(df.loc[df.index[ii]][['year', 'yearUnits', 'datasetId', 'originalDataURL']])
            
print('Problem records: ', problem_records)
print('Convert to BP: ', BP_records)

year: 
LS91HOMI            :     2055.5 -- -9598.3 CE
LS91HOMI            :     -105.5 -- 11548.3 CE
------>>>>>  PROBLEM RECORD  <<<<------
year               [-105.5, -58.0, 8.0, 63.4000000000001, 96.4000...
yearUnits                                                         CE
datasetId                                             iso2k_LS91HOMI
originalDataURL          https://www.ncdc.noaa.gov/paleo/study/23092
Name: 313, dtype: object
SP99DEUS            :     7726.0 -- -90.0 CE
SP99DEUS            :    -5776.0 -- 2040.0 CE
------>>>>>  PROBLEM RECORD  <<<<------
year               [2040.0, 1988.0, 1936.0, 1885.0, 1833.0, 1781....
yearUnits                                                         CE
datasetId                                          iso2k_SP99DEUS01A
originalDataURL    https://www.ncdc.noaa.gov/paleo-search/study/5425
Name: 456, dtype: object
Problem records:  [np.int64(313), np.int64(456)]
Convert to BP:  [np.int64(238), np.int64(240), np.int64(244), np.int64(245), 

In [27]:
# convert records to BP
for ii in BP_records:
    df.at[ii, 'year'] = 1950 - df.at[ii, 'year']

In [28]:
# drop problem records
df = df.drop(index=np.unique(problem_records))

In [29]:
# mask Common Era and drop rows with no entries
drop_inds = []
drop_IDs = []
for ii in df.index:
    
    year = np.array(df.at[ii, 'year'], dtype=float)
    vals = np.array(df.at[ii, 'paleoData_values'], dtype=float)
    # mask Common Era
    df.at[ii, 'year']             = year[year>=1]
    df.at[ii, 'paleoData_values'] = vals[year>=1]

    try:
        # print('length: %-20s, %-5s, %-5s'%(df.at[ii, 'datasetId'], ii, str(len(df.at[ii, 'year']))))
        # drop rows with no entries
        if (len(df.at[ii, 'year'].shape)==0):
            print('%10s, %10s'%(str(ii), 'drop nearly empty rows', df.at[ii, 'year']))
            # df.at[ii, 'year']=[df.at[ii, 'year']
            drop_inds+=[ii]
            drop_IDs+=[df.at[ii, 'datasetId']]
        elif (len(df.at[ii, 'year'])==0)|(len(df.at[ii, 'paleoData_values'])==0):
            print(ii, 'drop empty rows', df.at[ii, 'year'])
            drop_inds+=[ii]
            drop_IDs+=[df.at[ii, 'datasetId']]
    except TypeError:
        df.at[ii, 'year']=[df.at[ii, 'year']]
        print(df.at[ii, 'year'] )
        # drop_inds+=[ii]
        # drop_IDs+=[df.at[ii, 'datasetId']]
        
        
print(drop_inds)
# print(df.at[drop_inds[0],year])
# df = df.drop(index=np.unique(drop_inds))

346 drop empty rows []
370 drop empty rows []
430 drop empty rows []
431 drop empty rows []
[array(23.)]
518 drop empty rows []
[346, 370, 430, 431, 518]


In [30]:

df = df.drop(index=np.unique(drop_inds))

In [31]:
drop_inds = []

for ii in df.index:
    if len(df.at[ii,'year'])==0:
        print('empty', ii, df.at[ii,'year'], df.at[ii,'originalDatabase'])
        print(df.at[ii,'paleoData_values'])
        drop_inds += [ii]
    
    if np.std(df.at[ii,'paleoData_values'])==0: 
        print(ii, 'std=0')
    elif np.sum(np.diff(df.at[ii,'paleoData_values'])**2)==0: 
        print(ii, 'diff=0')
    elif np.isnan(np.std(df.at[ii,'paleoData_values'])):
        print(ii, 'std nan')
    else:
        continue
    drop_inds += [ii]
print(drop_inds)
# df = df.drop(index=np.unique(drop_inds))

458 std=0
[458]


In [32]:
df = df.drop(index=np.unique(drop_inds))

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 582 entries, 0 to 595
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           582 non-null    object 
 1   climateInterpretation_variableDetail  582 non-null    object 
 2   dataSetName                           582 non-null    object 
 3   datasetId                             582 non-null    object 
 4   geo_meanElev                          578 non-null    float32
 5   geo_meanLat                           582 non-null    float32
 6   geo_meanLon                           582 non-null    float32
 7   originalDatabase                      582 non-null    object 
 8   geo_siteName                          582 non-null    object 
 9   originalDataURL                       582 non-null    object 
 10  paleoData_notes                       582 non-null    object 
 11  paleoData_proxy         

## save compact dataframe

### save pickle

In [34]:
# # save to a pickle file (security: is it better to save to csv?)
df = df[sorted(df.columns)]
df.to_pickle('iso2k/iso2k_compact.pkl')

### save csv

In [35]:
# save to a list of csv files (metadata, data, year)
df.name='iso2k'
f.write_compact_dataframe_to_csv(df)

METADATA: archiveType, climateInterpretation_variable, climateInterpretation_variableDetail, dataSetName, datasetId, geo_meanElev, geo_meanLat, geo_meanLon, geo_siteName, originalDataURL, originalDatabase, paleoData_notes, paleoData_proxy, paleoData_sensorSpecies, paleoData_units, yearUnits
Saved to /home/jupyter-mnevans/dod2k/iso2k/iso2k_compact_%s.csv


In [36]:
# load dataframe
print(f.load_compact_dataframe_from_csv('iso2k').info())
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 582 entries, 0 to 581
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           582 non-null    object 
 1   climateInterpretation_variable        582 non-null    object 
 2   climateInterpretation_variableDetail  582 non-null    object 
 3   dataSetName                           582 non-null    object 
 4   datasetId                             582 non-null    object 
 5   geo_meanElev                          578 non-null    float32
 6   geo_meanLat                           582 non-null    float32
 7   geo_meanLon                           582 non-null    float32
 8   geo_siteName                          582 non-null    object 
 9   originalDataURL                       582 non-null    object 
 10  originalDatabase                      582 non-null    object 
 11  paleoData_notes    

# check output

## dataset metadata: dataSetName, datasetId, originalDataURL, originalDatabase

### index

In [37]:
# # check index
print(df.index)

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       586, 587, 588, 589, 590, 591, 592, 593, 594, 595],
      dtype='int64', length=582)


### dataSetName

In [38]:
# # check dataSetName
key = 'dataSetName'
print('%s: '%key)
print(df[key].values)

dataSetName: 
['CO00COKY' 'CO00DRBE' 'CO00FERA' 'CO00KUNI' 'CO00URMA' 'CO01TUNG'
 'CO01TUNG' 'CO02KUBE' 'CO03CHBA' 'CO03CHBU' 'CO03COPM' 'CO04BAFI'
 'CO04KIVA' 'CO04LIFI' 'CO04LIRA' 'CO04MGNG' 'CO04PFCH' 'CO04PFRE'
 'CO04ZIMG' 'CO05ASGU' 'CO05BAFI' 'CO05KUBE' 'CO06DATZ' 'CO06LIFI'
 'CO06MOPE' 'CO06MOTO' 'CO06QUNG' 'CO07CAFR' 'CO08ABSU' 'CO08GOBE'
 'CO08HEVE' 'CO08KIPR' 'CO09NAKY' 'CO09NUTB' 'CO10HEIG' 'CO11NUPM'
 'CO12GOVA' 'CO13CABL' 'CO13CAHN' 'CO13CAHN' 'CO13COTB' 'CO13COXM'
 'CO13DESC01A' 'CO13HENG' 'CO13HENG' 'CO13HENG' 'CO14CABT' 'CO14CATI'
 'CO14OSPA' 'CO14OSPA' 'CO14WUCL' 'CO14ZIHO' 'CO14ZIMG' 'CO14ZIMG'
 'CO15ABSP' 'CO17DESC01A' 'CO17DESC02A' 'CO17DESC03A' 'CO17DESC04A'
 'CO17LITW01A' 'CO17MUMA' 'CO17RAPA' 'CO17RAPA' 'CO17WUBO1A' 'CO17XISC01A'
 'CO18DATO01A' 'CO18HECO01A' 'CO18HECO01B' 'CO18RELC01A' 'CO92SHPU'
 'CO93COTW' 'CO94DUUR' 'CO94HEAQ' 'CO94LISE' 'CO95TUNG' 'CO96QUVA'
 'CO96SWBB' 'CO96SWFB' 'CO97CHSY' 'CO98BOFP' 'CO98CHPI' 'CO98EVXM'
 'CO98QUNC' 'CO98SWPR' 'CO99DRGB' '

### datasetId

In [39]:
# # check datasetId

print(len(df.datasetId.unique()))
print(len(df))
key = 'datasetId'
print('%s (starts with): '%key)
print(df[key].values)

582
582
datasetId (starts with): 
['iso2k_Ocean2kHR_001' 'iso2k_Ocean2kHR_124' 'iso2k_Ocean2kHR_019'
 'iso2k_CO00KUNI01Ab' 'iso2k_Ocean2kHR_177' 'iso2k_Ocean2kHR_140'
 'iso2k_Ocean2kHR_141' 'iso2k_Ocean2kHR_102' 'iso2k_Ocean2kHR_006'
 'iso2k_Ocean2kHR_171' 'iso2k_Ocean2kHR_139' 'iso2k_CO04BAFI00A'
 'iso2k_Ocean2kHR_174' 'iso2k_Ocean2kHR_157' 'iso2k_CO04LIRA01A'
 'iso2k_CO04MGNG01A' 'iso2k_CO04PFCH02A' 'iso2k_Ocean2kHR_013'
 'iso2k_Ocean2kHR_020' 'iso2k_Ocean2kHR_166' 'iso2k_Ocean2kHR_142'
 'iso2k_Ocean2kHR_105' 'iso2k_Ocean2kHR_007' 'iso2k_Ocean2kHR_156'
 'iso2k_Ocean2kHR_107' 'iso2k_Ocean2kHR_110' 'iso2k_Ocean2kHR_144'
 'iso2k_Ocean2kHR_193' 'iso2k_Ocean2kHR_010' 'iso2k_Ocean2kHR_094'
 'iso2k_Ocean2kHR_096' 'iso2k_CO08KIPR01A' 'iso2k_Ocean2kHR_012'
 'iso2k_CO09NUTB01B' 'iso2k_Ocean2kHR_097' 'iso2k_Ocean2kHR_160'
 'iso2k_Ocean2kHR_154' 'iso2k_CO13CABL01A' 'iso2k_CO13CAHN01A'
 'iso2k_CO13CAHN02A' 'iso2k_CO13COTB00A' 'iso2k_CO13COXM03A'
 'iso2k_WEB396032ca' 'iso2k_CO13HENG01A' 'iso2k_CO1

### originalDataURL

In [40]:
# originalDataURL
key = 'originalDataURL'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))
# 'this study' should point to the correct URL (PAGES2k)

originalDataURL: 
['http://doi.pangaea.de/10.1594/pangaea.676709'
 'http://doi.pangaea.de/10.1594/pangaea.711835'
 'http://doi.pangaea.de/10.1594/pangaea.735717'
 'http://doi.pangaea.de/10.1594/pangaea.738188'
 'http://doi.pangaea.de/10.1594/pangaea.776444'
 'http://doi.pangaea.de/10.1594/pangaea.780423'
 'http://www.iceandclimate.nbi.ku.dk/data/vinther_etal_2010_data_02feb2010.xls'
 'http://www.ncdc.noaa.gov/paleo/study/5445'
 'http://www1.ncdc.noaa.gov/pub/data/paleo/paleolimnology/europe/germany/ammersee_1999.txt'
 'http://www1.ncdc.noaa.gov/pub/data/paleo/paleolimnology/europe/uk/lough-na-shade2010.txt'
 'http://www1.ncdc.noaa.gov/pub/data/paleo/treering/isotope/asia/russia/'
 'http://www1.ncdc.noaa.gov/pub/data/paleo/treering/isotope/southamerica/'
 'https://agupubs.onlinelibrary.wiley.com/action/downloadsupplement?doi=10.1002%2f2016jc012458&file=jgrc22050-sup-0002-2016jc012458-ds01.xlsx'
 'https://ars-els-cdn-com.libezp.lib.lsu.edu/content/image/1-s2.0-s1367912016304138-mmc1.xlsx

### originalDatabase

In [41]:
# # originalDataSet
key = 'originalDatabase'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))
# Note: the last two records have missing URLs

originalDatabase: 
['Iso2k v1.0.1']


## geographical metadata: elevation, latitude, longitude, site name

### geo_meanElev

In [42]:
# check Elevation
key = 'geo_meanElev'
print('%s: '%key)
print(df[key])
print(np.unique(['%d'%kk for kk in df[key] if np.isfinite(kk)]))

geo_meanElev: 
0        -6.0
1       -25.0
2        -6.0
3        -3.0
4         6.0
        ...  
591    -884.0
592   -1295.0
593    -459.0
594    2493.0
595    2171.0
Name: geo_meanElev, Length: 582, dtype: float32
['-1' '-10' '-102' '-1048' '-11' '-1169' '-1185' '-12' '-1205' '-1245'
 '-1250' '-1295' '-13' '-136' '-1389' '-14' '-143' '-1490' '-16' '-1649'
 '-1727' '-18' '-1895' '-2' '-20' '-2114' '-2259' '-2382' '-2391' '-25'
 '-250' '-2543' '-3' '-316' '-3620' '-366' '-373' '-395' '-3975' '-4'
 '-459' '-5' '-503' '-531' '-547' '-594' '-6' '-620' '-657' '-694' '-695'
 '-7' '-790' '-8' '-84' '-852' '-875' '-884' '-9' '-90' '-968' '0' '10'
 '1000' '1006' '1030' '105' '1051' '1054' '110' '111' '1126' '113' '1140'
 '1156' '1196' '12' '1200' '1239' '124' '1244' '1250' '1260' '1300' '132'
 '1340' '1350' '1354' '1363' '1370' '14' '1400' '1407' '1420' '1439' '150'
 '1542' '158' '1583' '160' '1600' '1620' '1626' '1639' '1640' '1697' '170'
 '1700' '1713' '1730' '1742' '1746' '1770' '178' '179

### geo_meanLat

In [43]:
# # Latitude
key = 'geo_meanLat'
print('%s: '%key)
print(np.unique(['%d'%kk for kk in df[key]]))

geo_meanLat: 
['-1' '-10' '-11' '-12' '-13' '-15' '-16' '-17' '-18' '-19' '-20' '-21'
 '-22' '-23' '-24' '-27' '-28' '-3' '-32' '-36' '-37' '-39' '-4' '-41'
 '-44' '-46' '-5' '-50' '-51' '-64' '-66' '-67' '-69' '-7' '-70' '-71'
 '-72' '-73' '-74' '-75' '-76' '-77' '-78' '-79' '-8' '-80' '-82' '-83'
 '-84' '-86' '-89' '-9' '0' '1' '10' '11' '12' '13' '14' '16' '17' '18'
 '19' '2' '20' '21' '22' '23' '24' '25' '27' '28' '29' '3' '30' '31' '32'
 '33' '34' '35' '36' '37' '38' '39' '4' '40' '41' '42' '43' '44' '45' '46'
 '47' '48' '49' '5' '50' '51' '52' '53' '54' '55' '57' '58' '6' '60' '61'
 '62' '63' '64' '65' '66' '67' '68' '69' '7' '70' '71' '72' '73' '75' '76'
 '77' '78' '79' '8' '80' '81']


### geo_meanLon

In [44]:
# # Longitude 
key = 'geo_meanLon'
print('%s: '%key)
print(np.unique(['%d'%kk for kk in df[key]]))

geo_meanLon: 
['-1' '-104' '-105' '-107' '-109' '-110' '-111' '-113' '-114' '-116'
 '-117' '-118' '-119' '-12' '-120' '-122' '-123' '-124' '-13' '-133'
 '-134' '-136' '-138' '-140' '-145' '-146' '-148' '-149' '-153' '-157'
 '-159' '-16' '-160' '-161' '-162' '-17' '-174' '-19' '-2' '-20' '-22'
 '-24' '-26' '-3' '-35' '-36' '-37' '-38' '-39' '-4' '-41' '-42' '-43'
 '-44' '-45' '-46' '-48' '-49' '-50' '-51' '-53' '-54' '-57' '-6' '-60'
 '-61' '-62' '-64' '-66' '-67' '-68' '-69' '-70' '-71' '-73' '-74' '-75'
 '-76' '-77' '-79' '-8' '-80' '-82' '-83' '-84' '-85' '-86' '-87' '-88'
 '-89' '-9' '-91' '-92' '-94' '-95' '-97' '-99' '0' '1' '10' '100' '101'
 '102' '104' '105' '106' '107' '108' '11' '110' '112' '113' '115' '117'
 '119' '12' '120' '122' '123' '124' '125' '126' '13' '133' '134' '14'
 '140' '141' '143' '144' '145' '148' '15' '151' '152' '153' '158' '159'
 '162' '165' '166' '167' '169' '17' '172' '173' '179' '18' '2' '20' '24'
 '25' '27' '29' '3' '30' '31' '32' '34' '35' '37' '39' '4'

### geo_siteName

In [45]:
# Site Name 
key = 'geo_siteName'
print('%s: '%key)
print(df[key].values)

geo_siteName: 
['Malindi Marine Park' 'Northeast Breakers, Bermuda'
 'Ras Umm Sidd, Red Sea' 'Ningaloo Reef, West Australia'
 'Maiana Atoll, Republic of Kiribati' 'Deplik Tabat Reef, Madang Lagoon'
 'Deplik Tabat Reef, Madang Lagoon' 'North East Breakers, Bermuda'
 'Lombok Strait, Bali, Indonesia' 'Bunaken Island' 'Palmyra (composite)'
 'Savusavu Bay, Fiji' 'Malo Channel, Espiritu Santo Island, Vanuatu'
 'Savusavu Bay' 'Rarotonga, Cook Islands, South Pacific'
 'Papua New Guinea' 'Peros Banhos Atoll'
 'La Reunion, Southwestern Indian Ocean' 'Ifaty Reef, SW Madagascar'
 'Guam Coral' 'Savusavu Bay, Fiji' 'North East Breakers, Bermuda'
 'Mafia Archipelago, Tanzania' 'Savusavu Bay, Fiji'
 'Pedra de Lume, Cape Verde Islands' 'Buccoo Reef, Tobago' 'Rabaul'
 'Coral Sea' 'Mantawai Islands, West Sumatra, Indonesia'
 'Bermuda south shore' 'Cayo Sal, Los Roques Archipelago, Venezuela'
 'Turrumote Reef, La Parguera, Puerto Rico' 'Malindi' 'Fanning Island'
 'Guadeloupe' 'Palmyra' 'Vanuatu' 'Belize' 

## proxy metadata: archive type, proxy type, interpretation

### archiveType

In [46]:
# now check all the entries bit by bit (can be omitted at a later stage)

# archiveType
key = 'archiveType'
print('%s: '%key)
print(np.unique(df[key]))

archiveType: 
['coral' 'glacier ice' 'ground ice' 'lake sediment' 'marine sediment'
 'mollusk shells' 'sclerosponge' 'speleothem' 'terrestrial sediment'
 'tree']


### paleoData_proxy

In [47]:
# paleoData_proxy
key = 'paleoData_proxy'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))

paleoData_proxy: 
['d18O' 'd2H']


### paleoData_notes

In [48]:
# # paleoData_notes
key = 'paleoData_notes'
print('%s: '%key)
print(df[key].values)

paleoData_notes: 
['; climateInterpretation_seasonality changed - was originally -12 -11 1 2 3 4 5 6 7 8 9 10; archiveType changed - was originally Coral (CO)'
 '; climateInterpretation_seasonality changed - was originally -12 -11 -10 -9 -8 1 2 3 4 5 6 7; archiveType changed - was originally Coral (CO)'
 '; climateInterpretation_seasonality changed - was originally bimonthly; archiveType changed - was originally Coral (CO)'
 '; archiveType changed - was originally Coral (CO)'
 '; climateInterpretation_seasonality changed  was originally bimonthly; archiveType changed  was originally Coral (CO); archiveType changed  was originally Coral (CO)'
 '; climateInterpretation_seasonality changed - was originally seasonal; archiveType changed - was originally Coral (CO); archiveType changed - was originally Coral (CO)'
 '; climateInterpretation_seasonality changed - was originally seasonal; archiveType changed - was originally Coral (CO); archiveType changed - was originally Coral (CO)'
 '; clim

### climateInterpretation_variable

In [49]:
# climate_interpretation
key = 'climateInterpretation_variable'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))


climateInterpretation_variable: 
['N/A' 'NOT temperature NOT moisture' 'moisture' 'temperature'
 'temperature+moisture']


### climateInterpretation_variableDetail

In [50]:
# climate_interpretation
key = 'climateInterpretation_variableDetail'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))


climateInterpretation_variableDetail: 
['?18O.precipitation' 'AMO' 'Aridity' 'Asian summer monsoon' 'E/P'
 'E:P (groundwater \\fluid balance\\")"' 'East African Monsoon strength'
 'Effective Moisture' 'Estuary salinity' 'Evaporation' 'I_E' 'M'
 'Mean Annual Precipitation'
 'Moisture flux from the subtropical North Pacific (driven by SST anomalies in the Kuroshio Extension region)'
 'Monsoon strength' 'NAO index' 'NULL' 'P' 'P/E' 'PDO' 'PDSI' 'P_E'
 'P_amount' 'P_amount and temperature' 'P_amount, P_E' 'P_amount, RH'
 'P_amount, RH, T_air, P_E' 'P_amount, T_air' 'P_isotope'
 'Precipitation amount'
 'Precipitation amount (as a function of temperature)'
 'Precipitation source' 'Precipitation_amount, SAM'
 'Precipitation_amount, Temperature_air' 'Precipitation_amount, humidity'
 'S' 'Solar insolation' 'Source and intensity' 'Storm track, Pacific SST'
 'T' 'T_air' 'T_air, P_amount' 'T_air, P_amount, drought index SPEI'
 'T_air, RH, P_amount' 'T_water' 'Temperature and Precipitation amount'


### paleoData_sensorSpecies

In [51]:
# climate_interpretation
key = 'paleoData_sensorSpecies'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))


paleoData_sensorSpecies: 
['Ceratoporella nicholsoni' 'Diploria labyrinthiformis'
 'Diploria strigosa' 'Hydnophora microconos, Porites lobata' 'NA' 'NULL'
 'NaN' 'P. australiensis, possibly P. lobata'
 'Pavona clavus, Pavona gigantea' 'Porites' 'Porites lobata'
 'Porites lutea' 'Porites sp.' 'Siderastrea radians' 'Siderastrea siderea'
 'heliopora' 'labyrinthiformis' 'lamellina' 'lobata' 'lutea' 'nicholsoni']


## data 

### paleoData_units

In [52]:
# paleoData_units
key = 'paleoData_units'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))

paleoData_units: 
['permil']


### paleoData_values

In [53]:
# # paleoData_values
key = 'paleoData_values'

print('%s: '%key)
for ii, vv in enumerate(df[key][:20]):
    try: 
        print('%-30s: %s -- %s'%(df['dataSetName'].iloc[ii][:30], str(np.nanmin(vv)), str(np.nanmax(vv))))
        print(type(vv))
    except: print(df['dataSetName'].iloc[ii], 'NaNs detected.')

paleoData_values: 
CO00COKY                      : -4.705 -- -4.1
<class 'numpy.ndarray'>
CO00DRBE                      : -3.45 -- -2.69
<class 'numpy.ndarray'>
CO00FERA                      : -4.19 -- -2.48
<class 'numpy.ndarray'>
CO00KUNI                      : -5.25 -- -3.5
<class 'numpy.ndarray'>
CO00URMA                      : -5.3044 -- -3.7523
<class 'numpy.ndarray'>
CO01TUNG                      : -5.515 -- -4.344
<class 'numpy.ndarray'>
CO01TUNG                      : -5.484 -- -4.132
<class 'numpy.ndarray'>
CO02KUBE                      : -4.24 -- -2.53
<class 'numpy.ndarray'>
CO03CHBA                      : -5.782 -- -4.127
<class 'numpy.ndarray'>
CO03CHBU                      : -5.758 -- -4.6518
<class 'numpy.ndarray'>
CO03COPM                      : -5.75 -- -4.1
<class 'numpy.ndarray'>
CO04BAFI                      : -0.26 -- 0.36
<class 'numpy.ndarray'>
CO04KIVA                      : -5.43 -- -4.15
<class 'numpy.ndarray'>
CO04LIFI                      : -5.41 -- -4.56
<

### year

In [54]:
# # year
key = 'year'
print('%s: '%key)
for ii, vv in enumerate(df[key][:20]):
    try: print('%-30s: %s -- %s'%(df['dataSetName'].iloc[ii][:30], str(np.nanmin(vv)), str(np.nanmax(vv))))
    except: print('NaNs detected.', vv)

year: 
CO00COKY                      : 1801.0 -- 1994.0
CO00DRBE                      : 1856.0 -- 1920.0
CO00FERA                      : 1751.083 -- 1995.583
CO00KUNI                      : 1879.08 -- 1994.92
CO00URMA                      : 1840.0 -- 1994.5
CO01TUNG                      : 1880.792 -- 1993.042
CO01TUNG                      : 1884.542 -- 1993.042
CO02KUBE                      : 1519.21 -- 1604.21
CO03CHBA                      : 1782.0 -- 1990.0
CO03CHBU                      : 1860.0 -- 1990.58
CO03COPM                      : 928.13 -- 1998.37
CO04BAFI                      : 1940.0 -- 2000.0
CO04KIVA                      : 1928.54 -- 1992.46
CO04LIFI                      : 1781.5 -- 1996.5
CO04LIRA                      : 1726.78 -- 1996.91
CO04MGNG                      : 1910.0 -- 1997.0
CO04PFCH                      : 1961.125 -- 1995.625
CO04PFRE                      : 1832.0 -- 1995.67
CO04ZIMG                      : 1659.625 -- 1995.625
CO05ASGU                      :

### yearUnits

In [55]:
# yearUnits
key = 'yearUnits'
print('%s: '%key)
print(np.unique([kk for kk in df[key]]))

yearUnits: 
['CE']
