In [1]:
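# Create a small test database for the duplicate search from the first records of a source database
# (the cells below load `all_merged`; ch2k is the smallest individual database).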

In [2]:
%load_ext autoreload
%autoreload 2

import sys
import os
from pathlib import Path

# Locate the repository root so that project imports and relative data paths
# resolve the same way whether the notebook is started from the repo root
# or from a folder directly below it (e.g. notebooks/).
current_dir = Path.cwd().resolve()

# The root is the first of (current folder, its parent) that is named 'dod2k'.
for candidate in (current_dir, current_dir.parent):
    if candidate.name == 'dod2k':
        repo_root = candidate
        break
else:
    raise Exception('Please review the repo root structure (see first cell).')

# Switch the working directory and extend the import path only when required.
repo_root_str = str(repo_root)
if os.getcwd() != repo_root_str:
    os.chdir(repo_root)
if repo_root_str not in sys.path:
    sys.path.insert(0, repo_root_str)

print(f"Repo root: {repo_root}")
if os.getcwd() == repo_root_str:
    print("Working directory matches repo root. ")

Repo root: /home/jupyter-lluecke/dod2k_v2.0/dod2k
Working directory matches repo root. 


In [3]:
import pandas as pd
import numpy as np
import datetime

from dod2k_utilities import ut_functions as utf # contains utility functions
from dod2k_utilities import ut_duplicate_search as dup # contains utility functions

### Load dataset

Define the dataset which needs to be screened for duplicates. Input files for the duplicate detection mechanism need to be compact dataframes (`pandas` dataframes with standardised columns and entry formatting). 

The function `load_compact_dataframe_from_csv` loads the dataframe from the `csv` files in `data/DB/`, where `DB` is the name of the database. The database name (`db_name`) can be one of
- `pages2k`
- `ch2k`
- `iso2k`
- `sisal`
- `fe23`

for the individual databases, or 

- `all_merged`

to load the merged database of all individual databases. It can also be the name of any user-defined compact dataframe.

In [4]:
# Load the compact dataframe of the database selected via db_name.
db_name = 'all_merged'
df = utf.load_compact_dataframe_from_csv(db_name)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

# Tag the dataframe with the database name as a plain attribute.
# NOTE(review): the project helpers presumably read this attribute to build
# file paths - confirm in ut_functions.
df.name = db_name


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5320 entries, 0 to 5319
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   archiveType                    5320 non-null   object 
 1   dataSetName                    5320 non-null   object 
 2   datasetId                      5320 non-null   object 
 3   geo_meanElev                   5221 non-null   float32
 4   geo_meanLat                    5320 non-null   float32
 5   geo_meanLon                    5320 non-null   float32
 6   geo_siteName                   5320 non-null   object 
 7   interpretation_direction       5320 non-null   object 
 8   interpretation_seasonality     5320 non-null   object 
 9   interpretation_variable        5320 non-null   object 
 10  interpretation_variableDetail  5320 non-null   object 
 11  originalDataURL                5320 non-null   object 
 12  originalDatabase               5320 non-null   o

In [5]:
m = 100  # number of leading records of `df` used as the source block
n = 1    # number of copies of that block to stack

# Create synthetic duplicates: stack n copies of the first m records.
# ignore_index=True yields a clean 0..len-1 RangeIndex directly, so no separate
# reset_index / index reassignment is needed afterwards.
source_block = df.iloc[:m]
dupdf = pd.concat([source_block] * n, ignore_index=True)

# Give every row a unique datasetId that records which source row it came from
# and which copy it belongs to. The actual block length is used (rather than m)
# so the labels stay correct when `df` holds fewer than m records.
block_len = len(source_block)
dupdf['datasetId'] = [
    f"record_{row % block_len}_copy_{row // block_len}"
    for row in range(len(dupdf))
]

In [6]:
# Check the row index of the synthetic dataframe (a plain 0..len-1 range is expected after the reset).
dupdf.index

RangeIndex(start=0, stop=100, step=1)

In [7]:
# Show the generated identifiers (record_<source row>_copy_<copy number>).
dupdf['datasetId']

0      record_0_copy_0
1      record_1_copy_0
2      record_2_copy_0
3      record_3_copy_0
4      record_4_copy_0
            ...       
95    record_95_copy_0
96    record_96_copy_0
97    record_97_copy_0
98    record_98_copy_0
99    record_99_copy_0
Name: datasetId, Length: 100, dtype: object

## Save duplicate-free dataframe

In [8]:
# Record what the first entry of paleoData_values looks like in memory, so it
# can be compared with the same entry after the save/reload round trip.
print("=== BEFORE SAVE ===")
values_before = dupdf['paleoData_values'].iloc[0]
preview_before = values_before[:5]
print("Type of paleoData_values[0]:", type(values_before))
print("Dtype of paleoData_values[0]:", values_before.dtype)
print("Shape:", values_before.shape)
print("First 5 values:", preview_before)
print("String repr:", repr(str(preview_before)))


=== BEFORE SAVE ===
Type of paleoData_values[0]: <class 'numpy.ndarray'>
Dtype of paleoData_values[0]: float32
Shape: (1220,)
First 5 values: [-33.32873 -35.6732  -33.1574  -34.2854  -34.4031 ]
String repr: '[-33.32873 -35.6732  -33.1574  -34.2854  -34.4031 ]'


In [9]:
# Reorder the columns alphabetically by name.
ordered_columns = sorted(dupdf.columns)
dupdf = dupdf.loc[:, ordered_columns]

# Name of the test database; the matching folder data/<name>/ is created if it
# does not exist yet.
dupdf.name = 'dup_test'
os.makedirs(f'data/{dupdf.name}/', exist_ok=True)

In [10]:
# Summarise the column-sorted dataframe and print the name assigned to it.
dupdf.info()
print(dupdf.name)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   archiveType                    100 non-null    object 
 1   dataSetName                    100 non-null    object 
 2   datasetId                      100 non-null    object 
 3   geo_meanElev                   100 non-null    float32
 4   geo_meanLat                    100 non-null    float32
 5   geo_meanLon                    100 non-null    float32
 6   geo_siteName                   100 non-null    object 
 7   interpretation_direction       100 non-null    object 
 8   interpretation_seasonality     100 non-null    object 
 9   interpretation_variable        100 non-null    object 
 10  interpretation_variableDetail  100 non-null    object 
 11  originalDataURL                100 non-null    object 
 12  originalDatabase               100 non-null    obje

In [11]:
# NOTE(review): this repeats the dupdf.info() call of the previous cell with
# no change to dupdf in between - candidate for removal.
dupdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   archiveType                    100 non-null    object 
 1   dataSetName                    100 non-null    object 
 2   datasetId                      100 non-null    object 
 3   geo_meanElev                   100 non-null    float32
 4   geo_meanLat                    100 non-null    float32
 5   geo_meanLon                    100 non-null    float32
 6   geo_siteName                   100 non-null    object 
 7   interpretation_direction       100 non-null    object 
 8   interpretation_seasonality     100 non-null    object 
 9   interpretation_variable        100 non-null    object 
 10  interpretation_variableDetail  100 non-null    object 
 11  originalDataURL                100 non-null    object 
 12  originalDatabase               100 non-null    obje

### save pickle

In [12]:
# Save the synthetic test dataframe as a pickle at data/<name>/<name>_compact.pkl,
# where <name> is dupdf.name (the data/<name>/ folder must already exist).
dupdf.to_pickle(f'data/{dupdf.name}/{dupdf.name}_compact.pkl')

### save csv

In [13]:
# Save the dataframe as a set of csv files (metadata, data, year) via the project helper.
# NOTE(review): the output location appears to be derived from dupdf.name
# (data/<name>/<name>_compact_%s.csv according to the helper's log) - confirm in ut_functions.
utf.write_compact_dataframe_to_csv(dupdf)

METADATA: datasetId, archiveType, dataSetName, geo_meanElev, geo_meanLat, geo_meanLon, geo_siteName, interpretation_direction, interpretation_seasonality, interpretation_variable, interpretation_variableDetail, originalDataURL, originalDatabase, paleoData_notes, paleoData_proxy, paleoData_sensorSpecies, paleoData_units, paleoData_variableName, yearUnits
Saved to /home/jupyter-lluecke/dod2k_v2.0/dod2k/data/dup_test/dup_test_compact_%s.csv


In [14]:
# Reload the dataframe from the csv files just written and summarise it.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
utf.load_compact_dataframe_from_csv(dupdf.name).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   archiveType                    100 non-null    object 
 1   dataSetName                    100 non-null    object 
 2   datasetId                      100 non-null    object 
 3   geo_meanElev                   100 non-null    float32
 4   geo_meanLat                    100 non-null    float32
 5   geo_meanLon                    100 non-null    float32
 6   geo_siteName                   100 non-null    object 
 7   interpretation_direction       100 non-null    object 
 8   interpretation_seasonality     100 non-null    object 
 9   interpretation_variable        100 non-null    object 
 10  interpretation_variableDetail  100 non-null    object 
 11  originalDataURL                100 non-null    object 
 12  originalDatabase               100 non-null    obje

In [15]:
# Display the in-memory dataframe (pandas abbreviates the rendering, eliding middle rows and columns).
dupdf

Unnamed: 0,archiveType,dataSetName,datasetId,geo_meanElev,geo_meanLat,geo_meanLon,geo_siteName,interpretation_direction,interpretation_seasonality,interpretation_variable,...,originalDataURL,originalDatabase,paleoData_notes,paleoData_proxy,paleoData_sensorSpecies,paleoData_units,paleoData_values,paleoData_variableName,year,yearUnits
0,GlacierIce,Ant-WDC05A.Steig.2013,record_0_copy_0,1806.000000,-79.459999,-112.089996,WDC05A,positive,Annual,temperature,...,https://www1.ncdc.noaa.gov/pub/data/paleo/page...,PAGES 2k v2.2.0,; climateInterpretation_seasonality changed - ...,d18O,,permil,"[-33.32873, -35.6732, -33.1574, -34.2854, -34....",d18O,"[2005.0, 2004.0, 2003.0, 2002.0, 2001.0, 2000....",CE
1,Wood,NAm-MtLemon.Briffa.2002,record_1_copy_0,2700.000000,32.500000,-110.800003,Mt. Lemon,,,,...,https://www1.ncdc.noaa.gov/pub/data/paleo/page...,PAGES 2k v2.2.0,,ring width,PSME,cm,"[2.76, 2.91, 1.88, 2.51, 2.5, 1.79, 0.915, 0.6...",ring width,"[1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573....",CE
2,Wood,NAm-MtLemon.Briffa.2002,record_2_copy_0,2700.000000,32.500000,-110.800003,Mt. Lemon,,,,...,https://www1.ncdc.noaa.gov/pub/data/paleo/page...,PAGES 2k v2.2.0,,ring width,PSME,,"[1.141, 1.198, 0.881, 1.091, 1.097, 0.873, 0.6...",ring width,"[1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573....",CE
3,Wood,NAm-MtLemon.Briffa.2002,record_3_copy_0,2700.000000,32.500000,-110.800003,Mt. Lemon,,,,...,https://www1.ncdc.noaa.gov/pub/data/paleo/page...,PAGES 2k v2.2.0,,residual chronology,PSME,,"[1.116, 1.152, 0.768, 1.151, 1.075, 0.811, 0.7...",residual chronology,"[1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573....",CE
4,Wood,NAm-MtLemon.Briffa.2002,record_4_copy_0,2700.000000,32.500000,-110.800003,Mt. Lemon,,,,...,https://www1.ncdc.noaa.gov/pub/data/paleo/page...,PAGES 2k v2.2.0,,ARSTAN,PSME,,"[1.143, 1.223, 0.876, 1.1, 1.126, 0.874, 0.679...",ARSTAN,"[1568.0, 1569.0, 1570.0, 1571.0, 1572.0, 1573....",CE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Coral,Ocn-Rarotonga_d18O2R.Linsley.2006,record_95_copy_0,-18.299999,-21.237801,-159.827805,Rarotonga,,,,...,https://www1.ncdc.noaa.gov/pub/data/paleo/page...,PAGES 2k v2.2.0,,d13C,lutea,permil,"[-2.82, -2.91, -3.01, -3.27, -3.12, -2.84, -2....",d13C,"[1996.91, 1996.78, 1996.66, 1996.53, 1996.41, ...",CE
96,Wood,Asi-KYRG014.Solomina.2013,record_96_copy_0,69.000000,42.419998,78.970001,KYRG014,positive,Summer,temperature,...,https://www1.ncdc.noaa.gov/pub/data/paleo/page...,PAGES 2k v2.2.0,,ring width,,,"[0.885, 1.266, 0.865, 0.979, 1.262, 1.032, 1.2...",ring width,"[1551.0, 1552.0, 1553.0, 1554.0, 1555.0, 1556....",CE
97,Coral,Ocn-Lombok.Charles.2003,record_97_copy_0,-3.000000,-8.247300,115.575699,Lombok,negative,subannual,temperature,...,https://www1.ncdc.noaa.gov/pub/data/paleo/page...,PAGES 2k v2.2.0,; climateInterpretation_seasonality changed - ...,d18O,,permil,"[-5.284, -5.114, -5.333, -5.365, -5.237, -5.36...",d18O,"[1990.0, 1989.92, 1989.83, 1989.75, 1989.67, 1...",CE
98,LakeSediment,Arc-HudsonLake.Clegg.2011,record_98_copy_0,657.000000,61.900002,-145.660004,Hudson Lake,positive,Jul,temperature,...,https://www1.ncdc.noaa.gov/pub/data/paleo/page...,PAGES 2k v2.2.0,,chironomid,,degC,"[12.4427, 11.8305, 11.9809, 12.1493, 12.684, 1...",temperature,"[1996.8, 1982.85, 1963.95, 1952.0, 1934.4, 190...",CE


In [16]:
# Round-trip check: reload the database that was written to csv and compare
# the first record's value array with the in-memory original.
dupdf_reloaded = utf.load_compact_dataframe_from_csv(dupdf.name)

values_saved = dupdf['paleoData_values'].iloc[0]
values_reloaded = dupdf_reloaded['paleoData_values'].iloc[0]

print("\n=== AFTER RELOAD ===")
print("Type of paleoData_values[0]:", type(values_reloaded))
print("Dtype of paleoData_values[0]:", values_reloaded.dtype)
print("Shape:", values_reloaded.shape)
print("First 5 values:", values_reloaded[:5])
print("String repr:", repr(str(values_reloaded[:5])))

# Direct comparison. equal_nan=True is passed to both checks so that missing
# values (NaN) at matching positions count as equal; without it the exact
# comparison would report a mismatch for any array containing NaN.
print("\n=== COMPARISON ===")
print("Arrays equal?:", np.array_equal(values_saved, values_reloaded, equal_nan=True))
print("Arrays allclose?:", np.allclose(values_saved, values_reloaded, equal_nan=True))


=== AFTER RELOAD ===
Type of paleoData_values[0]: <class 'numpy.ndarray'>
Dtype of paleoData_values[0]: float32
Shape: (1220,)
First 5 values: [-33.32873 -35.6732  -33.1574  -34.2854  -34.4031 ]
String repr: '[-33.32873 -35.6732  -33.1574  -34.2854  -34.4031 ]'

=== COMPARISON ===
Arrays equal?: True
Arrays allclose?: True


In [17]:
# Show the current working directory (the setup cell switches it to the repo root).
os.getcwd()

'/home/jupyter-lluecke/dod2k_v2.0/dod2k'