# Load CoralHydro2k (ch2k) data from LiPD files and create/save a compact, uniform pandas DataFrame. From: https://github.com/CoralHydro2k/ch2k-notebooks/blob/main/CH2k_Filter_Example_Python_Updated.ipynb ("Converting LiPD files to a `pandas.DataFrame`")

This notebook demonstrates how to convert a collection of LiPD files to a `pandas.DataFrame`.

In [1]:
import pandas as pd
import lipd
import numpy as np
import os

In [2]:
# download and unzip the dataset in LiPD
#!wget https://lipdverse.org/CoralHydro2k/current_version/CoralHydro2k1_0_0.zip
#!unzip CoralHydro2k1_0_0.zip

In [3]:
# Directory holding the .lpd files. The default is the original location; set the
# CH2K_LIPD_DIR environment variable to run this notebook against another copy.
LIPD_DIR = os.environ.get(
    'CH2K_LIPD_DIR',
    '/home/jupyter-mnevans/compile_proxy_database_v2.0/ch2k/lipdfiles/',
)

# read the LiPD files found in LIPD_DIR, then extract the timeseries entries from them
D = lipd.readLipd(LIPD_DIR)
TS = lipd.extractTs(D)

# number of extracted timeseries entries
len(TS)

Disclaimer: LiPD files may be updated and modified to adhere to standards

Found: 179 LiPD file(s)
reading: CH03BUN01.lpd
reading: ZI15MER01.lpd
reading: CO03PAL03.lpd
reading: CO03PAL02.lpd
reading: CA13PEL01.lpd
reading: LI06RAR01.lpd
reading: CO03PAL07.lpd
reading: FL18DTO03.lpd
reading: SM06LKF02.lpd
reading: UR00MAI01.lpd
reading: TU95MAD01.lpd
reading: ZI04IFR01.lpd
reading: RE18CAY01.lpd
reading: KU99HOU01.lpd
reading: OS13NLP01.lpd
reading: EV98KIR01.lpd
reading: LI00RAR01.lpd
reading: NU11PAL01.lpd
reading: MA08DTO01.lpd
reading: AB20MEN03.lpd
reading: CA14TIM01.lpd
reading: KA17RYU01.lpd
reading: MC11KIR01.lpd
reading: AB20MEN09.lpd
reading: HE08LRA01.lpd
reading: DA06MAF01.lpd
reading: SM06LKF01.lpd
reading: NA09MAL01.lpd
reading: SW98STP01.lpd
reading: MU18GSI01.lpd
reading: ZI14HOU01.lpd
reading: FL17DTO02.lpd
reading: DA06MAF02.lpd
reading: SA19PAL02.lpd
reading: CO03PAL01.lpd
reading: ZI16ROD01.lpd
reading: OS13NGP01.lpd
reading: CH98PIR01.lpd
reading: RE19GBR02.lpd
read

608

In [4]:
# Fields kept in the compact table; each name is used both as a DataFrame column
# and as the key looked up in every timeseries dict.
col_str = [
    'datasetId', 'dataSetName', 'archiveType',
    'geo_meanElev', 'geo_meanLat', 'geo_meanLon',
    'year', 'yearUnits',
    'paleoData_variableName', 'paleoData_units',
    'paleoData_values', 'paleoData_notes',
]

# empty (all-NaN) frame with one row per entry of TS, to be filled in later
n_series = len(TS)
df_tmp = pd.DataFrame(columns=col_str, index=range(n_series))

In [5]:
# check the keys of the first timeseries entry, i.e. the fields that can be listed in col_str
TS[0].keys()

dict_keys(['mode', 'time_id', 'archiveType', 'context', 'createdBy', 'datasetId', 'dataSetName', 'lipdverseLink', 'maxYear', 'minYear', 'originalDataUrl', 'changelog', 'pub1_author', 'pub1_citation', 'pub1_doi', 'pub1_firstauthor', 'pub1_journal', 'pub1_title', 'pub1_year', 'pub2_year', 'pub3_year', 'geo_meanLon', 'geo_meanLat', 'geo_meanElev', 'geo_type', 'geo_description', 'geo_ocean', 'geo_ocean2', 'geo_secondarySiteName', 'geo_siteName', '@context', 'lipdVersion', 'tableType', 'paleoData_tableName', 'paleoData_filename', 'paleoData_missingValue', 'year', 'yearUnits', 'paleoData_analyticalError', 'paleoData_analyticalErrorUnits', 'paleoData_archiveSpecies', 'paleoData_ch2kCoreCode', 'paleoData_coralExtensionRate', 'paleoData_coralExtensionRateNotes', 'paleoData_coralHydro2kGroup', 'paleoData_coreName', 'paleoData_isAnomaly', 'paleoData_isComposite', 'paleoData_jcpCorrected', 'paleoData_jcpMeasured', 'paleoData_jcpUsed', 'paleoData_measurementTableName', 'paleoData_notes', 'paleoData

In [6]:
# Copy the fields listed in col_str from each retained timeseries into df_tmp.
# The row label is the position of the series in TS, so rows belonging to skipped
# series are never written and stay all-NaN.
skip_variables = ['year', 'd18OUncertainty', 'SrCaUncertainty']
for i, ts in enumerate(TS):
    # entries with these variable names are filtered out of the compact table
    if ts['paleoData_variableName'] in skip_variables:
        continue
    for name in col_str:
        # A field missing from this series becomes NaN. dict.get replaces the former
        # bare `except:`, which also hid any unrelated error as NaN.
        # .at sets exactly one cell, so list-valued fields are stored whole.
        df_tmp.at[i, name] = ts.get(name, np.nan)

# drop the rows that are entirely NaN (the skipped series)
df = df_tmp.dropna(how='all')
df

Unnamed: 0,datasetId,dataSetName,archiveType,geo_meanElev,geo_meanLat,geo_meanLon,year,yearUnits,paleoData_variableName,paleoData_units,paleoData_values,paleoData_notes
0,ch2kCH03BUN01,CH03BUN01,coral,-3.0,1.5,124.83,"[1860.0, 1860.08, 1860.17, 1860.25, 1860.33, 1...",AD,d18O,permil,"[-5.2043, -5.1091, -5.0987, -5.1878, -5.1646, ...",This paper did not calibrate the d18O proxy or...
2,ch2kZI15MER01,ZI15MER01,coral,-17.0,-17.1,119.6,"[1891.0, 1892.0, 1893.0, 1894.0, 1895.0, 1896....",AD,SrCa,mmol/mol,"[8.968891401, 8.918738941, 8.912204036, 8.9483...",Sr/Ca-SST recconstructed with composite plus s...
4,ch2kZI15MER01,ZI15MER01,coral,-17.0,-17.1,119.6,"[1891.0, 1892.0, 1893.0, 1894.0, 1895.0, 1896....",AD,SrCa_annual,mmol/mol,"[8.968891401, 8.918738941, 8.912204036, 8.9483...",Sr/Ca-SST recconstructed with composite plus s...
6,ch2kCO03PAL03,CO03PAL03,coral,,5.87,-162.13,"[1317.17, 1317.29, 1317.41, 1317.53, 1317.65, ...",AD,d18O,permil,"[-4.79, -4.73, -4.66, -4.66, -4.78, -4.69, -4....",
8,ch2kCO03PAL02,CO03PAL02,coral,,5.87,-162.13,"[1149.08, 1149.2225, 1149.365, 1149.5075, 1149...",AD,d18O,permil,"[-4.631, -4.724, -4.709, -4.707, -4.912, -4.83...",
...,...,...,...,...,...,...,...,...,...,...,...,...
598,ch2kSA19PAL01,SA19PAL01,coral,-10.0,5.878,-162.142,"[1983.58, 1983.67, 1983.75, 1983.83, 1983.92, ...",AD,d18O,permil,"[-5.21, -5.23, -5.17, -5.13, -5.11, -5.05, -5....",
600,ch2kSA19PAL01,SA19PAL01,coral,-10.0,5.878,-162.142,"[1980.75, 1980.83, 1980.92, 1981.0, 1981.08, 1...",AD,SrCa,mmol/mol,"[8.94, 9.08, 9.17, 9.16, 9.16, 9.17, 9.17, 9.1...",
602,ch2kCH97BVB01,CH97BVB01,coral,-7.0,-4.6162,55.817,"[1846.54, 1846.63, 1846.71, 1846.79, 1846.88, ...",AD,d18O,permil,"[-4.483, -4.581, -4.685, -4.751, -4.767, -4.70...",
604,ch2kRA20TAI01,RA20TAI01,coral,-6.0,21.9,120.7,"[1788.04, 1788.13, 1788.21, 1788.29, 1788.38, ...",AD,d18O,permil,"[-5.40874, nan, nan, nan, nan, nan, -4.43476, ...",Monthly Sr/Ca data available from 1788-2013; M...


In [7]:
# double check which distinct paleoData_variableName values remain in df after filtering
set(df['paleoData_variableName'])

{'SrCa', 'SrCa_annual', 'd18O', 'd18O_annual', 'd18O_sw', 'd18O_sw_annual'}

In [8]:
# save the compact DataFrame to a pickle file
# NOTE(review): the path is relative to the current working directory. The `!pwd`
# output of the next cell shows that to be the LiPD file directory, so the pickle
# lands in its parent directory -- confirm this is intended.
df.to_pickle('../ch2k_compact.pkl')

In [9]:
# show the current working directory and confirm the pickle file was written
!pwd
!ls -lst ../ch2k_compact.pkl

/home/jupyter-mnevans/compile_proxy_database_v2.0/ch2k/lipdfiles
3564 -rw-r--r-- 1 jupyter-mnevans jupyter-mnevans 3649130 Jun  5 13:32 ../ch2k_compact.pkl
