This notebook walks through the columns of a compact dataframe one by one and prints each column's values and the Python type(s) of its entries, as a sanity check of the data and metadata.

# Set up working environment

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (e.g. the local `functions` helper) are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature 
from matplotlib.gridspec import GridSpec as GS
import scipy.io as sio
from copy import deepcopy as dc

In [3]:
# choose working directory
# The repository root defaults to the original location, but can be overridden
# with the DOD2K_WDIR environment variable so the notebook is not tied to one
# user's home directory.
wdir = os.environ.get('DOD2K_WDIR', '/home/jupyter-lluecke/compile_proxy_database_v2.1')
os.chdir(wdir)
print(wdir)
# imported after the chdir -- presumably `functions` is a module located in wdir (TODO confirm)
import functions as f # contains functions for plotting 

/home/jupyter-lluecke/compile_proxy_database_v2.1


# read dataframe

In [4]:
# read dataframe 

# Select the compact database to inspect: exactly one db_name line stays uncommented.
# db_name = 'dod2k'
db_name = 'dod2k_dupfree'
# db_name = 'ch2k'
# db_name = 'fe23'
# db_name = 'iso2k'
# db_name = 'pages2k'
# db_name = 'sisal'


# load dataframe
df = f.load_compact_dataframe_from_csv(db_name)
# alternative (disabled): read the pickled compact dataframe instead
# databasedir    = '%s/%s_compact.pkl'%(db_name, db_name)
# df = pd.read_pickle(databasedir)

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
df.info()
# store the database name on the frame as a plain attribute
df.name = db_name


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4517 entries, 0 to 4516
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   DuplicateDetails                      4517 non-null   object 
 1   archiveType                           4517 non-null   object 
 2   climateInterpretation_variable        4517 non-null   object 
 3   climateInterpretation_variableDetail  4517 non-null   object 
 4   dataSetName                           4517 non-null   object 
 5   datasetId                             4517 non-null   object 
 6   geo_meanElev                          4434 non-null   float32
 7   geo_meanLat                           4517 non-null   float32
 8   geo_meanLon                           4517 non-null   float32
 9   geo_siteName                          4517 non-null   object 
 10  originalDataURL                       4517 non-null   object 
 11  originalDatabase 

In [5]:
# Disabled preprocessing step, kept for reference: for every record it would
# convert paleoData_values with f.convert_to_nparray and keep only the unmasked
# entries of both paleoData_values and year.
# for ii in df.index:
#     # if type(df.at[ii, 'paleoData_values'])==np.ma.core.MaskedArray: continue
#     dd=f.convert_to_nparray(df.at[ii, 'paleoData_values'])
#     df.at[ii, 'paleoData_values']=dd.data[~dd.mask]
#     df.at[ii, 'year']=df.at[ii, 'year'][~dd.mask]

# check output

## dataset metadata: dataSetName, datasetId, originalDataURL, originalDatabase

### index

In [6]:
# check index: print the row index of the loaded dataframe
print(df.index)

RangeIndex(start=0, stop=4517, step=1)


### dataSetName

In [7]:
# dataSetName: print the stored names, then the distinct Python type(s) of the entries
key = 'dataSetName'
print(f'{key}: ')
print(df[key].values)
print(np.unique(list(map(lambda entry: str(type(entry)), df[key]))))

dataSetName: 
['africa_keny001' 'africa_keny002' 'africa_morc001' ...
 'northamerica_usa_me017, northamerica_usa_me018'
 'northamerica_usa_mo, northamerica_usa_mo009'
 'northamerica_usa_mt112, northamerica_usa_mt113']
["<class 'str'>"]


### datasetId

In [8]:
# check datasetId
key = 'datasetId'

# number of distinct ids vs. number of records: equal counts mean every id is unique
print(len(df[key].unique()))
print(len(df))
# The full ids are printed here, so the label must not claim to show only the
# prefixes (those are printed separately in the last line).
print('%s: '%key)
print(df[key].values)
print(np.unique([str(type(dd)) for dd in df[key]]))
# distinct prefixes, i.e. the text in front of the first underscore of each id
print('datasetId starts with: ', np.unique([dd.split('_')[0] for dd in df[key]]))

4517
4517
datasetId (starts with): 
['FE23_africa_keny001' 'FE23_africa_keny002' 'FE23_africa_morc001' ...
 'dod2k_composite_st_FE23_northamerica_usa_me017_FE23_northamerica_usa_me018'
 'dod2k_composite_st_FE23_northamerica_usa_mo_FE23_northamerica_usa_mo009'
 'dod2k_composite_st_FE23_northamerica_usa_mt112_FE23_northamerica_usa_mt113']
["<class 'str'>"]
datasetId starts with:  ['FE23' 'ch2k' 'dod2k' 'iso2k' 'pages2k' 'sisal']


### originalDataURL

In [9]:
# originalDataURL: distinct entries, the distinct entries containing 'this',
# and the distinct Python type(s) of the entries
key = 'originalDataURL'
print('{}: '.format(key))
print(np.unique(list(df[key])))
print(np.unique(list(filter(lambda url: 'this' in url, df[key]))))
print(np.unique(list(map(lambda url: str(type(url)), df[key]))))
# 'this study' should point to the correct URL (PAGES2k)

originalDataURL: 
['10.1002/2015GL063826' '10.1002/2015gl065397' '10.1002/2016GL071786' ...
 'requested from AJO' 'unpublished' 'www.ncdc.noaa.gov/paleo/study/2474']
[]
["<class 'str'>"]


### originalDatabase

In [10]:
# originalDatabase: distinct source databases and the distinct Python type(s) of the entries
key = 'originalDatabase'
print(f'{key}: ')
print(np.unique(list(df[key])))
print(np.unique(list(map(lambda entry: str(type(entry)), df[key]))))
# Note: the last two records have missing URLs
# NOTE(review): this remark is about URLs rather than originalDatabase -- confirm it is still current

originalDatabase: 
['CoralHydro2k v1.0.0' 'FE23 (Breitenmoser et al. (2014))' 'Iso2k v1.0.1'
 'PAGES2k v2.0.0 (Ocn_103 updated with Dee et al. 2020)' 'SISAL v3'
 'dod2k_composite_standardised']
["<class 'str'>"]


## geographical metadata: elevation, latitude, longitude, site name

### geo_meanElev

In [11]:
# geo_meanElev: the column itself, the distinct finite elevations formatted as
# integers ('%d' truncates towards zero), and the Python type(s) of the entries
key = 'geo_meanElev'
print(f'{key}: ')
print(df[key])
print(np.unique(list(map(lambda elev: '%d' % elev, filter(np.isfinite, df[key])))))
print(np.unique(list(map(lambda elev: str(type(elev)), df[key]))))

geo_meanElev: 
0       2010.0
1       2010.0
2       2200.0
3       1700.0
4       2200.0
         ...  
4512    1890.0
4513    1798.0
4514      50.0
4515     325.0
4516    2500.0
Name: geo_meanElev, Length: 4517, dtype: float32
['-1' '-10' '-1011' ... '991' '994' '995']
["<class 'float'>"]


### geo_meanLat

In [12]:
# Latitude
key = 'geo_meanLat'
print('%s: '%key)
# '%d' cannot format NaN/inf (it raises), so non-finite entries are counted and
# reported instead of aborting the cell; geo_meanElev is filtered the same way.
n_nonfinite = sum(1 for kk in df[key] if not np.isfinite(kk))
if n_nonfinite:
    print('%d non-finite %s entries skipped'%(n_nonfinite, key))
# distinct latitudes, truncated towards zero to integers by '%d'
print(np.unique(['%d'%kk for kk in df[key] if np.isfinite(kk)]))
print(np.unique([str(type(dd)) for dd in df[key]]))

geo_meanLat: 
['-10' '-11' '-12' '-13' '-14' '-15' '-16' '-17' '-18' '-19' '-2' '-20'
 '-21' '-22' '-23' '-24' '-25' '-26' '-27' '-28' '-29' '-3' '-31' '-32'
 '-33' '-34' '-35' '-36' '-37' '-38' '-39' '-4' '-40' '-41' '-42' '-43'
 '-44' '-45' '-46' '-47' '-5' '-50' '-51' '-53' '-54' '-6' '-64' '-66'
 '-67' '-69' '-7' '-70' '-71' '-72' '-73' '-74' '-75' '-76' '-77' '-78'
 '-79' '-8' '-80' '-82' '-83' '-84' '-86' '-89' '-9' '0' '1' '10' '11'
 '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '21' '22' '23' '24' '25'
 '26' '27' '28' '29' '3' '30' '31' '32' '33' '34' '35' '36' '37' '38' '39'
 '4' '40' '41' '42' '43' '44' '45' '46' '47' '48' '49' '5' '50' '51' '52'
 '53' '54' '55' '56' '57' '58' '59' '6' '60' '61' '62' '63' '64' '65' '66'
 '67' '68' '69' '7' '70' '71' '72' '73' '75' '76' '77' '78' '79' '8' '80'
 '81' '82' '9']
["<class 'float'>"]


### geo_meanLon

In [13]:
# Longitude
key = 'geo_meanLon'
print('%s: '%key)
# '%d' cannot format NaN/inf (it raises), so non-finite entries are counted and
# reported instead of aborting the cell; geo_meanElev is filtered the same way.
n_nonfinite = sum(1 for kk in df[key] if not np.isfinite(kk))
if n_nonfinite:
    print('%d non-finite %s entries skipped'%(n_nonfinite, key))
# distinct longitudes, truncated towards zero to integers by '%d'
print(np.unique(['%d'%kk for kk in df[key] if np.isfinite(kk)]))
print(np.unique([str(type(dd)) for dd in df[key]]))

geo_meanLon: 
['-1' '-10' '-100' '-101' '-102' '-103' '-104' '-105' '-106' '-107' '-108'
 '-109' '-110' '-111' '-112' '-113' '-114' '-115' '-116' '-117' '-118'
 '-119' '-12' '-120' '-121' '-122' '-123' '-124' '-125' '-126' '-127'
 '-128' '-129' '-13' '-130' '-131' '-132' '-133' '-134' '-135' '-136'
 '-137' '-138' '-139' '-140' '-141' '-142' '-143' '-144' '-145' '-146'
 '-147' '-148' '-149' '-150' '-151' '-152' '-153' '-154' '-157' '-159'
 '-16' '-160' '-161' '-162' '-163' '-169' '-17' '-174' '-18' '-19' '-2'
 '-20' '-22' '-24' '-26' '-27' '-3' '-33' '-35' '-36' '-37' '-38' '-39'
 '-4' '-41' '-42' '-43' '-44' '-45' '-46' '-47' '-48' '-49' '-5' '-50'
 '-51' '-53' '-54' '-55' '-56' '-57' '-58' '-6' '-60' '-61' '-62' '-63'
 '-64' '-65' '-66' '-67' '-68' '-69' '-7' '-70' '-71' '-72' '-73' '-74'
 '-75' '-76' '-77' '-78' '-79' '-8' '-80' '-81' '-82' '-83' '-84' '-85'
 '-86' '-87' '-88' '-89' '-9' '-90' '-91' '-92' '-93' '-94' '-95' '-96'
 '-97' '-98' '-99' '0' '1' '10' '100' '101' '102' '103'

### geo_siteName

In [14]:
# geo_siteName: print the stored site names, then the distinct Python type(s) of the entries
key = 'geo_siteName'
print('{}: '.format(key))
print(df[key].values)
print(np.unique(list(map(lambda site: str(type(site)), df[key]))))

geo_siteName: 
['RagatiForestStationNyeriDistrict' 'RagatiForestStationNyeriDistrict'
 'Tounfite' ... 'COMPOSITE: IronboundIsland + IronboundIslandLongCores'
 'COMPOSITE: JeffersonCo. + JeffersonCountyMissouri'
 'COMPOSITE: YellowMountainRidge1 + YellowMountainRidge1-EntireBarkTrees']
["<class 'str'>"]


## proxy metadata: archive type, proxy type, interpretation

### archiveType

In [15]:
# now check all the entries bit by bit (can be omitted at a later stage)

# archiveType: distinct archive types and the distinct Python type(s) of the entries
key = 'archiveType'
print(f'{key}: ')
print(np.unique(df[key]))
print(np.unique(list(map(lambda archive: str(type(archive)), df[key]))))

archiveType: 
['bivalve' 'borehole' 'coral' 'documents' 'glacier ice' 'ground ice'
 'hybrid' 'lake sediment' 'marine sediment' 'mollusk shells'
 'sclerosponge' 'speleothem' 'terrestrial sediment' 'tree']
["<class 'str'>"]


### paleoData_proxy

In [16]:
# paleoData_proxy: distinct proxy types and the distinct Python type(s) of the entries
key = 'paleoData_proxy'
print('{}: '.format(key))
print(np.unique(list(df[key])))
print(np.unique(list(map(lambda proxy: str(type(proxy)), df[key]))))

paleoData_proxy: 
['BSi' 'Documentary' 'MXD' 'Mg/Ca' 'Sr/Ca' 'TEX86' 'TRW' 'alkenone'
 'borehole' 'calcification' 'calcification rate' 'chironomid'
 'chrysophyte' 'd13C' 'd18O' 'd2H' 'diatom' 'dynocist MAT' 'foram Mg/Ca'
 'foram d18O' 'foraminifera' 'growth rate' 'historic' 'hybrid' 'melt'
 'midge' 'planktonic foraminifera' 'pollen' 'reflectance'
 'sed accumulation' 'varve property' 'varve thickness']
["<class 'str'>"]


### paleoData_notes

In [17]:
# paleoData_notes: print the stored notes, then the distinct Python type(s) of the entries
key = 'paleoData_notes'
print(f'{key}: ')
print(df[key].values)
print(np.unique(list(map(lambda note: str(type(note)), df[key]))))

paleoData_notes: 
['Investigator: Stahle' 'Investigator: Stahle' 'Investigator: Stockton'
 ...
 'FE23_northamerica_usa_me017: Investigator: Cook, FE23_northamerica_usa_me018: Investigator: Cook'
 'FE23_northamerica_usa_mo: Investigator: UniversityofChicagoTreeRingLaboratory, FE23_northamerica_usa_mo009: Investigator: Bell'
 'FE23_northamerica_usa_mt112: Investigator: King, FE23_northamerica_usa_mt113: Investigator: Waggoner']
["<class 'str'>"]


### climateInterpretation_variable

In [18]:
# climateInterpretation_variable: distinct interpretation classes and the
# distinct Python type(s) of the entries
key = 'climateInterpretation_variable'
print('{}: '.format(key))
print(np.unique(list(df[key])))
print(np.unique(list(map(lambda variable: str(type(variable)), df[key]))))


climateInterpretation_variable: 
['N/A' 'NOT temperature NOT moisture' 'moisture' 'temperature'
 'temperature+moisture']
["<class 'str'>"]


### climateInterpretation_variableDetail

In [19]:
# climateInterpretation_variableDetail: distinct detail strings and the
# distinct Python type(s) of the entries
key = 'climateInterpretation_variableDetail'
print(f'{key}: ')
print(np.unique(list(df[key])))
print(np.unique(list(map(lambda detail: str(type(detail)), df[key]))))


climateInterpretation_variableDetail: 
['?18O.precipitation' 'AMO' 'Aridity' 'Asian summer monsoon' 'E/P'
 'E:P (groundwater \\fluid balance\\")"' 'East African Monsoon strength'
 'Effective Moisture' 'Estuary salinity' 'Evaporation'
 'FE23_northamerica_usa_ak054: moisture, FE23_northamerica_usa_ak056: N/A'
 'FE23_northamerica_usa_co037: moisture, FE23_northamerica_usa_co039: moisture'
 'FE23_northamerica_usa_co040: moisture, FE23_northamerica_usa_co043: moisture'
 'FE23_northamerica_usa_me017: moisture, FE23_northamerica_usa_me018: temperature+moisture'
 'FE23_northamerica_usa_mo: moisture, FE23_northamerica_usa_mo009: moisture'
 'FE23_northamerica_usa_mt112: N/A, FE23_northamerica_usa_mt113: N/A'
 'I_E' 'M' 'Mean Annual Precipitation'
 'Moisture flux from the subtropical North Pacific (driven by SST anomalies in the Kuroshio Extension region)'
 'Monsoon strength' 'N/A' 'NAO index' 'NULL' 'P' 'P/E' 'PDO' 'PDSI' 'P_E'
 'P_amount' 'P_amount and temperature' 'P_amount, P_E' 'P_amount, RH

### paleoData_sensorSpecies

In [20]:
# paleoData_sensorSpecies: distinct species entries and the distinct Python
# type(s) of the entries
key = 'paleoData_sensorSpecies'
print('{}: '.format(key))
print(np.unique(list(df[key])))
print(np.unique(list(map(lambda species: str(type(species)), df[key]))))


paleoData_sensorSpecies: 
['ABAL' 'ABAM' 'ABBA' 'ABBO' 'ABCE' 'ABCI' 'ABCO' 'ABLA' 'ABMA' 'ABPI'
 'ABPN' 'ABPR' 'ABSB' 'ABSP' 'ACRU' 'ACSH' 'ADHO' 'ADUS' 'AGAU' 'ARAR'
 'ATCU' 'ATSE' 'AUCH' 'BEPU' 'CABU' 'CADE' 'CADN' 'CARO' 'CDAT' 'CDBR'
 'CDDE' 'CDLI' 'CEAN' 'CESP' 'CHLA' 'CHNO' 'Ceratoporella nicholsoni'
 'DABI' 'DACO' 'Diploastrea heliopora' 'Diploria labyrinthiformis'
 'Diploria strigosa' 'FAGR' 'FASY' 'FICU' 'FRNI' 'HABI'
 'Hydnophora microconos, Porites lobata' 'JGAU' 'JUEX' 'JUFO' 'JUOC'
 'JUPH' 'JUPR' 'JURE' 'JUSC' 'JUSP' 'JUVI' 'LADE' 'LAGM' 'LALA' 'LALY'
 'LAOC' 'LASI' 'LGFR' 'LIBI' 'LITU' 'Montastraea faveolata' 'N/A' 'NOBE'
 'NOGU' 'NOME' 'NOPU' 'NOSO' 'NULL' 'NaN' 'Orbicella faveolata' 'PCAB'
 'PCEN' 'PCGL' 'PCGN' 'PCMA' 'PCOB' 'PCOM' 'PCPU' 'PCRU' 'PCSH' 'PCSI'
 'PCSM' 'PCSP' 'PHAL' 'PHAS' 'PHGL' 'PHTR' 'PIAL' 'PIAM' 'PIAR' 'PIBA'
 'PIBN' 'PIBR' 'PICE' 'PICL' 'PICO' 'PIEC' 'PIED' 'PIFL' 'PIHA' 'PIHR'
 'PIJE' 'PIKO' 'PILA' 'PILE' 'PILO' 'PIMO' 'PIMU' 'PIMZ' 'PINI' 'PIPA'


## data 

### paleoData_units

In [21]:
# paleoData_units: distinct unit strings and the distinct Python type(s) of the entries
key = 'paleoData_units'
print(f'{key}: ')
print(np.unique(list(df[key])))
print(np.unique(list(map(lambda unit: str(type(unit)), df[key]))))

paleoData_units: 
['N/A' 'cm' 'cm/yr' 'dark_sum' 'degC' 'g.cm-2.a-1' 'index' 'mm' 'mm/year'
 'mm/yr' 'mmol/mol' 'percent' 'permil' 'standardized_anomalies' 'z-scores']
["<class 'str'>"]


### paleoData_values

In [22]:
# paleoData_values: value range and container type of the first 20 records,
# followed by the distinct container types over all records
key = 'paleoData_values'

print('%s: '%key)
for ii, vv in enumerate(df[key].iloc[:20]):
    name = df['dataSetName'].iloc[ii]
    try:
        # nanmin/nanmax skip NaNs; they raise ValueError on empty input and
        # TypeError on non-numeric input. Only those are caught, so interrupts
        # and unrelated bugs are no longer swallowed by a bare except.
        vmin, vmax = np.nanmin(vv), np.nanmax(vv)
    except (TypeError, ValueError) as err:
        # report the actual reason: NaNs alone do not make nanmin/nanmax fail
        print(name, 'min/max could not be computed:', err)
    else:
        print('%-30s: %s -- %s'%(name[:30], str(vmin), str(vmax)))
        print(type(vv))
print(np.unique([str(type(dd)) for dd in df[key]]))

paleoData_values: 
africa_keny001                : 0.4 -- 1.423
<class 'numpy.ndarray'>
africa_keny002                : 0.499 -- 1.631
<class 'numpy.ndarray'>
africa_morc001                : -0.014 -- 2.226
<class 'numpy.ndarray'>
africa_morc002                : 0.323 -- 1.587
<class 'numpy.ndarray'>
africa_morc003                : 0.004 -- 1.617
<class 'numpy.ndarray'>
africa_morc011                : 0.005 -- 2.094
<class 'numpy.ndarray'>
africa_morc012                : 0.435 -- 1.866
<class 'numpy.ndarray'>
africa_morc013                : 0.166 -- 1.389
<class 'numpy.ndarray'>
africa_morc014                : -0.025 -- 2.012
<class 'numpy.ndarray'>
africa_safr001                : 0.485 -- 2.129
<class 'numpy.ndarray'>
africa_zimb001                : 0.15 -- 2.415
<class 'numpy.ndarray'>
africa_zimb002                : 0.178 -- 2.044
<class 'numpy.ndarray'>
africa_zimb003                : 0.24 -- 2.701
<class 'numpy.ndarray'>
asia_chin004                  : 0.337 -- 1.525
<class 'numpy

### year

In [23]:
# year: covered time span of the first 20 records, followed by the distinct
# container types over all records
key = 'year'
print('%s: '%key)
for ii, vv in enumerate(df[key].iloc[:20]):
    try:
        # nanmin/nanmax skip NaNs; they raise ValueError on empty input and
        # TypeError on non-numeric input. Only those are caught, so interrupts
        # and unrelated bugs are no longer swallowed by a bare except.
        first, last = np.nanmin(vv), np.nanmax(vv)
    except (TypeError, ValueError) as err:
        # report the actual reason together with the offending entry
        print('min/max could not be computed (%s):'%err, vv)
    else:
        print('%-30s: %s -- %s'%(df['dataSetName'].iloc[ii][:30], str(first), str(last)))
print(np.unique([str(type(dd)) for dd in df[key]]))

year: 
africa_keny001                : 1944.0 -- 1993.0
africa_keny002                : 1950.0 -- 1994.0
africa_morc001                : 1360.0 -- 1983.0
africa_morc002                : 1686.0 -- 1984.0
africa_morc003                : 1755.0 -- 1984.0
africa_morc011                : 1598.0 -- 1984.0
africa_morc012                : 1813.0 -- 1984.0
africa_morc013                : 1854.0 -- 1984.0
africa_morc014                : 1200.0 -- 1984.0
africa_safr001                : 1665.0 -- 1976.0
africa_zimb001                : 1925.0 -- 1994.0
africa_zimb002                : 1877.0 -- 1997.0
africa_zimb003                : 1880.0 -- 1996.0
asia_chin004                  : 1593.0 -- 1989.0
asia_chin005                  : 935.0 -- 1993.0
asia_chin006                  : 966.0 -- 1993.0
asia_indi001                  : 1888.0 -- 1981.0
asia_indi002                  : 1798.0 -- 1980.0
asia_indi003                  : 1752.0 -- 1980.0
asia_indi004                  : 1800.0 -- 1981.0
["<class 'numpy

### yearUnits

In [24]:
# yearUnits: distinct unit labels of the year axis and the distinct Python type(s) of the entries
key = 'yearUnits'
print('{}: '.format(key))
print(np.unique(list(df[key])))
print(np.unique(list(map(lambda unit: str(type(unit)), df[key]))))

yearUnits: 
['CE']
["<class 'str'>"]
