In [1]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jul  3 12:53:15 2023

@author: lluecke

Create a common database from multiple collated databases
    - PAGES2k (load_pages2k.ipynb)
    - FE23 (Breitenmoser 14) (load_fe23.ipynb)
    - SISAL v3 (load_sisal.ipynb)
    - CH2k (load_ch2k.ipynb)
    - Iso2k (load_iso2k.ipynb)

The merged database may contain duplicate records, so please run the
duplicate detection files on the output.

We obtain a common dataframe with the following columns:
columns=['archiveType',
        'climateInterpretation_variable',
        'climateInterpretation_variableDetail',
        'dataSetName',
        'datasetId',
        'geo_meanElev',
        'geo_meanLat',
        'geo_meanLon',
        'geo_siteName',
        'originalDataURL',
        'originalDatabase',
        'paleoData_notes',
        'paleoData_proxy',
        'paleoData_sensorSpecies',
        'paleoData_units',
        'paleoData_values',
        'year',
        'yearUnits'
]


"""


"\nCreated on Mon Jul  3 12:53:15 2023\n\n@author: lluecke\n\nCreate a common database from multiple collated databases\n    - PAGES2k (load_pages2k.ipynb)\n    - FE23 (Breitenmoser 14) (load_fe23.ipynb)\n    - SISAL v3 (load_sisal.ipynb)\n    - CH2k (load_ch2k.ipynb)\n    - Iso2k (load_iso2k.ipynb)\n\n\nWe obtain a common dataframe with the following columns:\ncolumns=['archiveType', \n        'climateInterpretation_variable',\n        'climateInterpretation_variableDetail',\n        'datasetId',\n        'dataSetName',                                                                                \n        'geo_meanElev', \n        'geo_meanLat', \n        'geo_meanLon',\n        'year', 'yearUnits',                                                                                         \n        'paleoData_variableName',\n        'paleoData_units',                                                                                           \n        'paleoData_values',\n        'paleoD

# set up working environment

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature 
from matplotlib.gridspec import GridSpec as GS
from copy import deepcopy as dc

In [4]:
# Make the repository root the working directory before importing the
# project-local `functions` module and before using relative data paths.
#
# The previous check compared against the literal suffix
# 'compile_proxy_database_v2.0'. The repository is now checked out as
# '..._v2.1', so the check never matched and every re-run of this cell moved
# one directory further up. Matching the version-independent repository name
# keeps the cell idempotent across version bumps.
# NOTE(review): assumes the notebook sits exactly one level below the
# repository root -- confirm.
REPO_DIR_NAME = 'compile_proxy_database'
if REPO_DIR_NAME not in os.path.basename(os.getcwd()):
    os.chdir(os.path.join(os.getcwd(), os.pardir))
wdir = os.getcwd()
print('working directory: '+wdir)
# Imported here rather than with the other imports because it has to happen
# after the working directory has been set.
import functions as f

working directory: /home/jupyter-lluecke/compile_proxy_database_v2.1


# load compact dataframes

In [5]:
# Read the compact dataframe of every source database from its csv files and
# print the column index of each one, so the schemas can be compared by eye.

dataset_names = ['pages2k', 'fe23', 'ch2k', 'iso2k', 'sisal']

dfs = []
for dn in dataset_names:
    compact_df = f.load_compact_dataframe_from_csv(dn)
    dfs.append(compact_df)
    print(compact_df.columns)
    print('------------')
    

Index(['archiveType', 'climateInterpretation_variable',
       'climateInterpretation_variableDetail', 'dataSetName', 'datasetId',
       'geo_meanElev', 'geo_meanLat', 'geo_meanLon', 'geo_siteName',
       'originalDataURL', 'originalDatabase', 'paleoData_notes',
       'paleoData_proxy', 'paleoData_sensorSpecies', 'paleoData_units',
       'paleoData_values', 'year', 'yearUnits'],
      dtype='object')
------------
Index(['archiveType', 'climateInterpretation_variable',
       'climateInterpretation_variableDetail', 'dataSetName', 'datasetId',
       'geo_meanElev', 'geo_meanLat', 'geo_meanLon', 'geo_siteName',
       'originalDataURL', 'originalDatabase', 'paleoData_notes',
       'paleoData_proxy', 'paleoData_sensorSpecies', 'paleoData_units',
       'paleoData_values', 'year', 'yearUnits'],
      dtype='object')
------------
Index(['archiveType', 'climateInterpretation_variable',
       'climateInterpretation_variableDetail', 'dataSetName', 'datasetId',
       'geo_meanElev', 'geo

In [6]:
# Create the DoD dataframe, starting from the first database in
# `dataset_names`; the remaining databases are appended in the next cell.
dn = dataset_names[0]
df = pd.read_pickle('%s/%s_compact.pkl'%(dn, dn))
print(dn, set(df['originalDatabase']))
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that emitted a stray "None" line).
df.info(verbose=False)

pages2k {'PAGES2k v2.0.0 (Ocn_103 updated with Dee et al. 2020)'}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692 entries, 0 to 691
Columns: 18 entries, archiveType to yearUnits
dtypes: float64(3), object(15)
memory usage: 97.4+ KB
None


In [7]:
# Load the remaining databases and concatenate them onto the common dataframe.
# The frames are collected first and concatenated once, instead of growing
# `df` with pd.concat on every iteration.
# NOTE: this cell extends the `df` created in the previous cell, so re-running
# it without re-running that cell appends the same databases again.
frames = [df]
for dn in dataset_names[1:]:
    add_df = pd.read_pickle('%s/%s_compact.pkl'%(dn, dn))
    print(dn, set(add_df['originalDatabase']))
    # info() prints its own summary and returns None; passing it to print()
    # put the summary before the label and added a trailing "None".
    add_df.info(verbose=False)
    frames.append(add_df)
    print('------------')
df = pd.concat(frames, ignore_index=True)

print(set(df['originalDatabase']))
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2754 entries, 0 to 2753
Columns: 18 entries, archiveType to yearUnits
dtypes: float64(3), object(15)
memory usage: 387.4+ KB
fe23 {'FE23 (Breitenmoser et al. (2014))'} None
------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Columns: 18 entries, archiveType to yearUnits
dtypes: float32(3), object(15)
memory usage: 35.2+ KB
ch2k {'CoralHydro2k v1.0.0'} None
------------
<class 'pandas.core.frame.DataFrame'>
Index: 582 entries, 0 to 595
Columns: 18 entries, archiveType to yearUnits
dtypes: float32(3), object(15)
memory usage: 79.6+ KB
iso2k {'Iso2k v1.0.1'} None
------------
<class 'pandas.core.frame.DataFrame'>
Index: 541 entries, 0 to 545
Columns: 18 entries, archiveType to yearUnits
dtypes: float64(3), object(15)
memory usage: 80.3+ KB
sisal {'SISAL v3'} None
------------
{'SISAL v3', 'FE23 (Breitenmoser et al. (2014))', 'PAGES2k v2.0.0 (Ocn_103 updated with Dee et al. 2020)', 'CoralHydro2k v1.0.0', 'Is

In [8]:
# Report every record whose 'year' entry has length zero and collect the
# index labels of those records in `drop_inds`.
print(df.shape)
drop_inds = []
for ii, (years, source_db) in enumerate(zip(df['year'], df['originalDatabase'])):
    if len(years) != 0:
        continue
    print('empty', ii, years, source_db)
    drop_inds.append(df.index[ii])
print(drop_inds)
# The drop itself is left disabled, as before:
# df = df.drop(index=drop_inds)

(4841, 18)
[]


# save pickle

In [9]:
# Save the concatenated dataframe as the DoD2k compact pickle.
# NOTE(review): `name` is set as a plain attribute on the DataFrame and is
# overwritten with 'dod2k' before the csv export further down; whether it is
# preserved inside the pickle is not shown here -- confirm if anything relies on it.
df.name='DoD2k_v1'
# The path is relative to the current working directory.
df.to_pickle('dod2k/dod2k_compact.pkl')

In [10]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which adds a stray "None").
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4841 entries, 0 to 4840
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           4841 non-null   object 
 1   climateInterpretation_variable        4841 non-null   object 
 2   climateInterpretation_variableDetail  4841 non-null   object 
 3   dataSetName                           4841 non-null   object 
 4   datasetId                             4841 non-null   object 
 5   geo_meanElev                          4750 non-null   float64
 6   geo_meanLat                           4841 non-null   float64
 7   geo_meanLon                           4841 non-null   float64
 8   geo_siteName                          4841 non-null   object 
 9   originalDataURL                       4841 non-null   object 
 10  originalDatabase                      4841 non-null   object 
 11  paleoData_notes  

# save csv

In [11]:
# Save to a list of csv files (metadata, data, year).
# NOTE(review): the helper presumably derives the output file names from
# `df.name` (the reported path is dod2k/dod2k_compact_%s.csv) -- confirm in
# the `functions` module.
df.name='dod2k'
f.write_compact_dataframe_to_csv(df)

METADATA: archiveType, climateInterpretation_variable, climateInterpretation_variableDetail, dataSetName, datasetId, geo_meanElev, geo_meanLat, geo_meanLon, geo_siteName, originalDataURL, originalDatabase, paleoData_notes, paleoData_proxy, paleoData_sensorSpecies, paleoData_units, yearUnits
Saved to /home/jupyter-lluecke/compile_proxy_database_v2.1/dod2k/dod2k_compact_%s.csv


In [12]:
# Reload the dataframe from the csv files just written and show its summary
# next to the in-memory one, so the round trip can be compared by eye.
# info() prints its own summary and returns None, so it is not wrapped in
# print() (which added a stray "None" after each summary).
reloaded_df = f.load_compact_dataframe_from_csv('dod2k')
reloaded_df.info()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4841 entries, 0 to 4840
Data columns (total 18 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           4841 non-null   object 
 1   climateInterpretation_variable        4841 non-null   object 
 2   climateInterpretation_variableDetail  4841 non-null   object 
 3   dataSetName                           4841 non-null   object 
 4   datasetId                             4841 non-null   object 
 5   geo_meanElev                          4750 non-null   float32
 6   geo_meanLat                           4841 non-null   float32
 7   geo_meanLon                           4841 non-null   float32
 8   geo_siteName                          4841 non-null   object 
 9   originalDataURL                       4841 non-null   object 
 10  originalDatabase                      4841 non-null   object 
 11  paleoData_notes  