# Converting LiPD files to a `pandas.DataFrame`

This notebook demonstrates how to convert a collection of LiPD files to a `pandas.DataFrame`.

In [33]:
import pandas as pd
import lipd
import numpy as np
import os

In [34]:
# Download and unzip the dataset (LiPD format). Run this once, then comment it out again for later debugging runs.
#!wget https://lipdverse.org/CoralHydro2k/current_version/CoralHydro2k1_0_0.zip
#!unzip CoralHydro2k1_0_0.zip

In [35]:
# Load the LiPD files from the dataset directory.
# Per the original author's note, lipd.readLipd has a bug with path switching:
# not every arbitrary path works. Changing into the directory first and then
# reading '.' does work.
LIPD_DIR = '/research/da22/dbc23/compile_proxy_database_v2.0/ch2k/lipdfiles/'
os.chdir(LIPD_DIR)
# Print the working directory so the check is actually visible; a bare
# os.getcwd() in the middle of a cell is evaluated and silently discarded.
print(os.getcwd())
lipds = lipd.readLipd('.')
# extract the time series from the loaded LiPD data
ts_list = lipd.extractTs(lipds)

Disclaimer: LiPD files may be updated and modified to adhere to standards

Found: 179 LiPD file(s)
reading: CA13DIA01.lpd
reading: HE10GUA01.lpd
reading: AB15BHB01.lpd
reading: AL16PUR01.lpd
reading: CH03BUN01.lpd
reading: HE13MIS02.lpd
reading: TU01SIA01.lpd
reading: BO14HTI02.lpd
reading: BA04FIJ01.lpd
reading: MA08DTO01.lpd
reading: DR00NBB01.lpd
reading: PF04PBA01.lpd
reading: TA18TAS01.lpd
reading: MU18RED02.lpd
reading: BR19RED01.lpd
reading: OS14RIP01.lpd
reading: LI99CLI01.lpd
reading: MO20WOA01.lpd
reading: CO03PAL01.lpd
reading: MC04PNG01.lpd
reading: MC11KIR01.lpd
reading: DE16RED01.lpd
reading: CO03PAL06.lpd
reading: MU18GSI01.lpd
reading: MU18NPI01.lpd
reading: KI14PAR01.lpd
reading: AB20MEN06.lpd
reading: QU06RAB01.lpd
reading: FE09OGA01.lpd
reading: RI10PBL01.lpd
reading: AB20MEN09.lpd
reading: KR20SAR01.lpd
reading: AB20MEN07.lpd
reading: DE13HAI01.lpd
reading: KL97DAH01.lpd
reading: KU99HOU01.lpd
reading: CO03PAL05.lpd
reading: CA13SAP01.lpd
reading: CA07FLI01.lpd
read

In [36]:
# Set up an empty (all-NaN) DataFrame: one row per extracted time series and
# one column per field that should be kept in the compact table.

col_str = (
    ['datasetId', 'dataSetName', 'archiveType']          # dataset identity
    + ['geo_meanElev', 'geo_meanLat', 'geo_meanLon']     # location
    + ['year', 'yearUnits']                              # time axis
    + ['paleoData_variableName', 'paleoData_units']      # measured variable
    + ['paleoData_values', 'paleoData_notes']
    + ['originalDataUrl']                                # provenance
)

df_tmp = pd.DataFrame(columns=col_str, index=range(len(ts_list)))

In [37]:
# List the keys available on the first time series entry.
# Each entry carries many more metadata fields than the columns in col_str.
# That is intentional: col_str keeps only a subset so that the resulting
# DataFrame stays compact, without the many extraneous metadata columns.
ts_list[0].keys()

dict_keys(['mode', 'time_id', 'archiveType', 'context', 'createdBy', 'datasetId', 'dataSetName', 'lipdverseLink', 'maxYear', 'minYear', 'originalDataUrl', 'changelog', 'pub1_author', 'pub1_citation', 'pub1_doi', 'pub1_firstauthor', 'pub1_journal', 'pub1_title', 'pub1_year', 'pub2_year', 'pub3_year', 'geo_meanLon', 'geo_meanLat', 'geo_meanElev', 'geo_type', 'geo_description', 'geo_ocean', 'geo_ocean2', 'geo_secondarySiteName', 'geo_siteName', '@context', 'lipdVersion', 'tableType', 'paleoData_tableName', 'paleoData_filename', 'paleoData_missingValue', 'year', 'yearUnits', 'paleoData_analyticalError', 'paleoData_analyticalErrorUnits', 'paleoData_archiveSpecies', 'paleoData_ch2kCoreCode', 'paleoData_coralExtensionRate', 'paleoData_coralHydro2kGroup', 'paleoData_coreName', 'paleoData_isAnomaly', 'paleoData_isComposite', 'paleoData_jcpCorrected', 'paleoData_jcpMeasured', 'paleoData_jcpUsed', 'paleoData_measurementTableName', 'paleoData_samplingNotes', 'paleoData_samplingResolution', 'paleoD

In [44]:
# Loop over the time series and keep those wanted for the global temperature
# analysis, copying the col_str fields of each kept entry into row i of df_tmp.
excluded_variables = ['year', 'd18OUncertainty', 'SrCaUncertainty']
for i, ts in enumerate(ts_list):  # i is the df_tmp row that matches this entry
    # Skip the time axis and the uncertainty series; their rows stay all-NaN
    # and are removed by dropna below.
    if ts['paleoData_variableName'] in excluded_variables:
        continue
    # copy each field of the compact column set into this row
    for name in col_str:
        # A field that is missing from this entry becomes NaN. dict.get is used
        # instead of a bare `except:` so that only a missing key is tolerated
        # and any other error still surfaces.
        # .at addresses exactly one cell, so list-valued fields such as
        # 'year' and 'paleoData_values' are stored whole in that cell.
        df_tmp.at[i, name] = ts.get(name, np.nan)

# drop the rows that are entirely NaN (those not for global temperature analysis)
df = df_tmp.dropna(how='all')
# Last expression of the cell, so the notebook displays it. The display
# truncates the middle rows; to view all of them, use e.g.
#   with pd.option_context('display.max_rows', None): display(df)
df

Unnamed: 0,datasetId,dataSetName,archiveType,geo_meanElev,geo_meanLat,geo_meanLon,year,yearUnits,paleoData_variableName,paleoData_units,paleoData_values,paleoData_notes,originalDataUrl
0,ch2kCA13DIA01,CA13DIA01,coral,-6.0,16.064,-86.951,"[1940.0, 1940.09, 1940.17, 1940.35, 1940.44, 1...",AD,d18O,permil,"[-3.957, -3.94, -3.803, -3.794, -4.054, -3.979...",,https://www.ncdc.noaa.gov/paleo/study/17378
2,ch2kHE10GUA01,HE10GUA01,coral,-1.7,16.2,-61.49,"[1895.7, 1895.78, 1895.87, 1895.95, 1896.03, 1...",AD,d18O,permil,"[-4.022, -3.887, -3.807, -3.794, -3.8, -3.81, ...",This calibration data is taken from the top 40...,https://www.ncdc.noaa.gov/paleo/study/12893
4,ch2kHE10GUA01,HE10GUA01,coral,-1.7,16.2,-61.49,"[1895.7, 1895.78, 1895.87, 1895.95, 1896.03, 1...",AD,SrCa,mmol/mol,"[8.838, 8.914, 8.95, 8.935, 8.956, 8.954, 8.92...",This calibration data is taken from the top 40...,https://www.ncdc.noaa.gov/paleo/study/12893
6,ch2kHE10GUA01,HE10GUA01,coral,-1.7,16.2,-61.49,"[1896.0, 1897.0, 1898.0, 1899.0, 1900.0, 1901....",AD,SrCa_annual,mmol/mol,"[8.86, 8.876, 8.895, 8.913, 8.897, 8.9, 8.946,...",This calibration data is taken from the top 40...,https://www.ncdc.noaa.gov/paleo/study/12893
8,ch2kHE10GUA01,HE10GUA01,coral,-1.7,16.2,-61.49,"[1896.0, 1897.0, 1898.0, 1899.0, 1900.0, 1901....",AD,d18O_annual,permil,"[-4.02, -4.098, -4.168, -4.129, -4.132, -4.124...",This calibration data is taken from the top 40...,https://www.ncdc.noaa.gov/paleo/study/12893
...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,ch2kNU09KIR01,NU09KIR01,coral,-9.0,1.8667,-157.4,"[1972.13, 1972.21, 1972.29, 1972.38, 1972.46, ...",AD,SrCa,mmol/mol,"[9.08, 9.06, 9.06, 8.94, 9.0, 8.99, 9.0, 8.98,...",,https://www.ncdc.noaa.gov/paleo/study/8609
600,ch2kNU09KIR01,NU09KIR01,coral,-9.0,1.8667,-157.4,"[1972.13, 1972.21, 1972.29, 1972.38, 1972.46, ...",AD,d18O_sw,permil,"[1.01, 1.19, 0.84, 1.25, 0.77, 0.71, 0.66, 0.7...",,https://www.ncdc.noaa.gov/paleo/study/8609
602,ch2kZI14IFR02,ZI14IFR02,coral,-1.8,-23.1573,43.5882,"[1882.0, 1883.0, 1884.0, 1885.0, 1886.0, 1887....",AD,d18O,permil,"[-4.477, -4.221, -4.426, -4.488, -4.421, -4.47...",Published slopes based on composite coral resu...,https://www.ncdc.noaa.gov/paleo/study/16438
604,ch2kZI14IFR02,ZI14IFR02,coral,-1.8,-23.1573,43.5882,"[1882.0, 1883.0, 1884.0, 1885.0, 1886.0, 1887....",AD,d18O_annual,permil,"[-4.477, -4.221, -4.426, -4.488, -4.421, -4.47...",Published slopes based on composite coral resu...,https://www.ncdc.noaa.gov/paleo/study/16438


In [45]:
# sanity check: which distinct variable names ended up in df?
set(df['paleoData_variableName'].tolist())

{'SrCa', 'SrCa_annual', 'd18O', 'd18O_annual', 'd18O_sw', 'd18O_sw_annual'}

In [46]:
# same kind of check for provenance: the distinct 'originalDataUrl' values in df
set(df['originalDataUrl'].tolist())

{'https://doi.org/10.1594/PANGAEA.874078',
 'https://doi.pangaea.de/10.1594/PANGAEA.743953',
 'https://doi.pangaea.de/10.1594/PANGAEA.830601',
 'https://doi.pangaea.de/10.1594/PANGAEA.88199',
 'https://doi.pangaea.de/10.1594/PANGAEA.88200',
 'https://doi.pangaea.de/10.1594/PANGAEA.887712',
 'https://doi.pangaea.de/10.1594/PANGAEA.891094',
 'https://www.ncdc.noaa.gov/paleo/study/1003972',
 'https://www.ncdc.noaa.gov/paleo/study/1003973',
 'https://www.ncdc.noaa.gov/paleo/study/10373',
 'https://www.ncdc.noaa.gov/paleo/study/10425',
 'https://www.ncdc.noaa.gov/paleo/study/10808',
 'https://www.ncdc.noaa.gov/paleo/study/11935',
 'https://www.ncdc.noaa.gov/paleo/study/12278',
 'https://www.ncdc.noaa.gov/paleo/study/12891',
 'https://www.ncdc.noaa.gov/paleo/study/12893',
 'https://www.ncdc.noaa.gov/paleo/study/12994',
 'https://www.ncdc.noaa.gov/paleo/study/13035',
 'https://www.ncdc.noaa.gov/paleo/study/13439',
 'https://www.ncdc.noaa.gov/paleo/study/15238',
 'https://www.ncdc.noaa.gov/pal

In [48]:
# Save the compact DataFrame to a pickle file.
# On the security question: unpickling a file from an untrusted source can
# execute arbitrary code, so only load pickles you produced yourself.
# NOTE(review): CSV would avoid that risk, but the list-valued cells
# ('year', 'paleoData_values') would presumably be written as strings and
# need parsing on reload - confirm before switching formats.
pd.to_pickle(df, '/research/da22/dbc23/compile_proxy_database_v2.0/ch2k/ch2k_compact.pkl')

In [49]:
# Confirm that the pickle file exists at the path it was saved to.
# NOTE(review): the saved output of this cell lists ch2k.pkl rather than
# ch2k_compact.pkl, so it looks stale - re-run the cell to refresh it.
!ls '/research/da22/dbc23/compile_proxy_database_v2.0/ch2k/ch2k_compact.pkl'

/research/da22/dbc23/compile_proxy_database_v2.0/ch2k/ch2k.pkl
