This notebook reads a compact dataframe and filters it for specific records (e.g. moisture-sensitive records only). 
The filtered dataset is saved in a separate directory and can be loaded for further analysis, plotting, etc.

Author: Lucie Luecke

Date produced: 21/01/2025

Input: 
reads dataframe with the following keys:
['archiveType', 'climateInterpretation_variable', 'climateInterpretation_variableDetail',
 'dataSetName', 'geo_meanElev', 'geo_meanLat', 'geo_meanLon', 
 'originalDataURL', 'originalDatabase', 'paleoData_TSid', 'paleoData_notes', 'paleoData_proxy', 
 'paleoData_units', 'paleoData_values', 'paleoData_variableName', 'year', 'yearUnits', (optional: 'duplicateDetails')]

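Modify the working directory and database name as needed (the dataframe is currently read from csv via `f.load_compact_dataframe_from_csv`; an alternative that reads a pickle is included as a commented-out line).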

In [1]:
# enable IPython's autoreload extension; mode 2 reloads imported modules before each cell is executed,
# so edits to local modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature 
from matplotlib.gridspec import GridSpec as GS
import scipy.io as sio
from copy import deepcopy as dc

In [3]:
# choose working directory
# NOTE: this is a machine-specific absolute path - adjust it to the location of the project on your system
wdir = '/home/jupyter-lluecke/compile_proxy_database_v2.1'
os.chdir(wdir)
print(wdir)
# imported after os.chdir - presumably so that the project-local functions.py in wdir can be found (confirm).
# In this notebook the module is used for loading/writing the compact dataframes.
import functions as f # contains functions for plotting 

/home/jupyter-lluecke/compile_proxy_database_v2.1


In [4]:
# Select the database to read.
# Any compact dataframe works (e.g. PAGES2k), as long as it follows the column notation
# listed at the top of this notebook.
db_name = 'dod2k_dupfree_dupfree'
# db_name = 'dod2k_dupfree'
# db_name = 'dod2k'

# Load the compact dataframe from csv and tag it with the database name
# (df.name is used further below to build the output directory and filenames).
df = f.load_compact_dataframe_from_csv(db_name)
# alternative: read the dataframe from pickle instead
# df = pd.read_pickle('%s/%s_compact.pkl'%(db_name, db_name))
df.name = db_name

# show which source databases are present in the dataframe
print(df['originalDatabase'].unique())

['FE23 (Breitenmoser et al. (2014))' 'CoralHydro2k v1.0.0'
 'dod2k_composite_standardised' 'Iso2k v1.0.1'
 'PAGES2k v2.0.0 (Ocn_103 updated with Dee et al. 2020)' 'SISAL v3']


In [5]:
# summary of the loaded dataframe (columns, non-null counts, dtypes).
# DataFrame.info() prints the summary itself and returns None, so it is not wrapped in print()
# (that would append a stray "None" line to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4516 entries, 0 to 4515
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           4516 non-null   object 
 1   climateInterpretation_variable        4516 non-null   object 
 2   climateInterpretation_variableDetail  4516 non-null   object 
 3   dataSetName                           4516 non-null   object 
 4   datasetId                             4516 non-null   object 
 5   duplicateDetails                      4516 non-null   object 
 6   geo_meanElev                          4433 non-null   float32
 7   geo_meanLat                           4516 non-null   float32
 8   geo_meanLon                           4516 non-null   float32
 9   geo_siteName                          4516 non-null   object 
 10  originalDataURL                       4516 non-null   object 
 11  originalDatabase 

## filter dataframe for specific record types

In [6]:
# Filter the dataframe by metadata, e.g. for temperature or moisture records.
# Only one filter is active; the commented-out lines are alternatives to pick from.
# Note that df is overwritten by the filter, so reload the dataframe before switching to another filter.


# ---> climateInterpretation_variable
# e.g.

# # filter for >>moisture<< sensitive records only (also include records which are moisture and temperature sensitive)
# df = df.loc[(df['climateInterpretation_variable']=='moisture')|(df['climateInterpretation_variable']=='temperature+moisture')]

# # filter for >>exclusively moisture<< sensitive records only (without t+m)  -- currently active
is_moisture_only = df['climateInterpretation_variable'].eq('moisture')
df = df[is_moisture_only]

# # filter for >>temperature<< sensitive records only (also include records which are moisture and temperature sensitive)
# df = df.loc[(df['climateInterpretation_variable']=='temperature')|(df['climateInterpretation_variable']=='temperature+moisture')]



# ---> archiveType and paleoData_proxy
# e.g.

# # filter for specific proxy type, e.g. archiveType='speleothem' and paleoData_proxy='d18O'
# df = df.loc[(df['archiveType']=='speleothem')&(df['paleoData_proxy']=='d18O')]


# ---> paleoData_proxy only
# e.g. 

# df = df.loc[(df['paleoData_proxy']=='MXD')]

# etc.

In [7]:
# df needs name reassigned as it gets lost otherwise after assigning new value to df (through the filtering above)

# for the exclusively-moisture filter that is active above, the suffix "_M" is appended to the database name;
# df.name is used below for the output directory and filenames, so choose a different suffix for a different filter
df.name = db_name + "_M" 
print(df.name)

dod2k_dupfree_dupfree_M


In [8]:
# summary of the filtered dataframe (columns, non-null counts, dtypes).
# DataFrame.info() prints the summary itself and returns None, so it is not wrapped in print()
# (that would append a stray "None" line to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1073 entries, 2 to 3409
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           1073 non-null   object 
 1   climateInterpretation_variable        1073 non-null   object 
 2   climateInterpretation_variableDetail  1073 non-null   object 
 3   dataSetName                           1073 non-null   object 
 4   datasetId                             1073 non-null   object 
 5   duplicateDetails                      1073 non-null   object 
 6   geo_meanElev                          1068 non-null   float32
 7   geo_meanLat                           1073 non-null   float32
 8   geo_meanLon                           1073 non-null   float32
 9   geo_siteName                          1073 non-null   object 
 10  originalDataURL                       1073 non-null   object 
 11  originalDatabase      

## save filtered dataframe

In [9]:
# create the output directory <current working directory>/<df.name> if it does not exist yet.
# exist_ok=True replaces the separate os.path.exists() check, so there is no gap between
# checking and creating; an error is still raised if the path exists but is not a directory.
path = '/'+df.name
os.makedirs(os.getcwd()+path, exist_ok=True)

In [10]:
# save as pickle to <df.name>/<df.name>_compact.pkl (relative to the working directory).
# The filename pattern previously contained a stray 'l' ('%sl_compact.pkl'), which produced
# '<name>l_compact.pkl' and did not match the '<name>_compact' naming used elsewhere in this notebook.
df.to_pickle('%s/%s_compact.pkl'%(df.name, df.name))

In [11]:
# save csv
# NOTE(review): only df is passed to the helper, so it presumably derives the output directory and
# filename from df.name (the path printed below points into <df.name>/) - confirm in functions.py
f.write_compact_dataframe_to_csv(df)

METADATA: archiveType, climateInterpretation_variable, climateInterpretation_variableDetail, dataSetName, datasetId, duplicateDetails, geo_meanElev, geo_meanLat, geo_meanLon, geo_siteName, originalDataURL, originalDatabase, paleoData_notes, paleoData_proxy, paleoData_sensorSpecies, paleoData_units, yearUnits
Saved to /home/jupyter-lluecke/compile_proxy_database_v2.1/dod2k_dupfree_dupfree_M/dod2k_dupfree_dupfree_M_compact_%s.csv


In [12]:
# load dataframe
# round-trip check: reload the dataframe from csv using df.name and display its summary;
# the number of entries should match the filtered dataframe
f.load_compact_dataframe_from_csv(df.name).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1073 entries, 0 to 1072
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   archiveType                           1073 non-null   object 
 1   climateInterpretation_variable        1073 non-null   object 
 2   climateInterpretation_variableDetail  1073 non-null   object 
 3   dataSetName                           1073 non-null   object 
 4   datasetId                             1073 non-null   object 
 5   duplicateDetails                      1073 non-null   object 
 6   geo_meanElev                          1068 non-null   float32
 7   geo_meanLat                           1073 non-null   float32
 8   geo_meanLon                           1073 non-null   float32
 9   geo_siteName                          1073 non-null   object 
 10  originalDataURL                       1073 non-null   object 
 11  originalDatabase 