# Identify duplicate rows (except for density) in source data

10/30, 5/8,7/2023

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
data_pth = Path(".")

## Look for duplicate density emof records in the emof table

In [3]:
emof_df = pd.read_csv(data_pth / 'aligned_csvs' / 'DwC_emof.csv')

In [4]:
emof_density_df = emof_df[emof_df['measurementTypeID'] == "http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL01/"]

There they are, duplicates!

In [5]:
emof_density_valuecounts_df = pd.DataFrame(
    emof_density_df.occurrenceID.value_counts()
).rename(columns={'occurrenceID':'count'})

emof_density_valuecounts_df.head()

Unnamed: 0,count
f6ef5d18-03ac-4d21-979e-ed69470f29ef,2
4b8fff24-04e2-418c-b1be-44d471b25610,2
6feab017-be10-415b-865a-70cdccaf9bc6,2
5bf63e21-448a-4672-974c-47c60b67d4fc,2
938734f0-be6c-4aee-aad9-767a11d66eca,2


**There it is, the 30 duplicates!**

In [6]:
len(emof_density_valuecounts_df.query("count > 1"))

30

An example:

In [7]:
emof_density_df[emof_density_df['occurrenceID'] == '73aa9fe2-b55b-4aaa-8ac5-d7f3eaff21ac']

Unnamed: 0,eventID,occurrenceID,measurementType,measurementTypeID,measurementValue,measurementValueID,measurementUnit,measurementUnitID


The problem is that there are records in the source table that are practically duplicates, in everything **except the `density` values!**

## Look for such duplicates by loading the source csv, dropping `density` (and `time`), and identifying duplicates in the dataframe.

In [8]:
sourcecsvdata_pth = data_pth / "sourcedata" / "bcodmo_dataset_682074_data.csv"

source_df = pd.read_csv(sourcecsvdata_pth, skiprows=[1])

source_dropcols_df = source_df.drop(columns=['density', 'time'])

In [9]:
len(source_dropcols_df[source_dropcols_df.duplicated()])

17

In [10]:
source_dropcols_df[source_dropcols_df.duplicated()]

Unnamed: 0,station,latitude,longitude,day_night,species,date,time_start,life_history_stage,sample_code,mesh_size,depth_max,depth_min,FWC_DS
982,DB,47.378,-123.117,Day,OITHONA_SPINIROSTRIS,20131003,14:11,Copepodite,20131003DBDm2_200,200,56.0,30.0,DS
1008,DB,47.378,-123.117,Day,OSTRACODA,20120712,16:23,Unknown,20120712DBDm1_200,200,70.0,45.0,DS
1289,DB,47.378,-123.117,Day,PSEUDOCALANUS,20131003,14:11,Female;_Adult,20131003DBDm4_200,200,16.0,12.0,DS
1847,DB,47.378,-123.117,Night,EUPHAUSIA_PACIFICA,20120905,23:19,F3;_Furcilia_3,20120905DBBNm4_335,335,20.0,9.0,DS
1852,DB,47.378,-123.117,Night,EUPHAUSIA_PACIFICA,20120905,23:19,F3;_Furcilia_3,20120905DBBNm5_335,335,9.0,0.0,DS
3599,HP,47.581,-122.986,Night,PSEUDOCALANUS,20131001,20:57,Copepodite,20131001HPNm1_200,200,103.0,0.0,FWC
3807,UN,47.812,-122.807,Day,BIVALVIA,20130930,13:54,Unknown,20130930UNDm5_200,200,6.0,0.0,DS
4397,UN,47.812,-122.807,Day,GASTROPODA,20120902,15:13,Unknown,20120902UNDm5_200,200,7.0,0.0,DS
4438,UN,47.812,-122.807,Day,GRAPSIDAE,20120709,17:30,Z3;_Zoea_III,20120709UNDm4_335,335,8.0,0.0,DS
4440,UN,47.812,-122.807,Day,GRAPSIDAE,20120709,17:30,Z4;_Zoea_IV,20120709UNDm4_335,335,8.0,0.0,DS


Distill to the bare minimum information needed to describe (convey to others) these duplicates

In [11]:
cols = ['sample_code', 'species', 'life_history_stage']

source_duplicated_df = source_dropcols_df[source_dropcols_df.duplicated()][cols].sort_values(by=cols)

In [12]:
len(source_duplicated_df)

17

**Hmm, that's only 17, NOT 30!** Could it be that some of the duplication problems actually made it into the occurrence table, and the generation of the `occurrenceID` values?

In [13]:
source_duplicated_df

Unnamed: 0,sample_code,species,life_history_stage
4438,20120709UNDm4_335,GRAPSIDAE,Z3;_Zoea_III
4440,20120709UNDm4_335,GRAPSIDAE,Z4;_Zoea_IV
4937,20120709UNDm4_335,PAGURIDAE,Z1;_Zoea_I
5360,20120709UNDm4_335,THYSANOESSA_RASCHII,F1;_Furcilia_1
6449,20120709UNNm4_200,ONCAEA,Copepodite
1008,20120712DBDm1_200,OSTRACODA,Unknown
4397,20120902UNDm5_200,GASTROPODA,Unknown
1847,20120905DBBNm4_335,EUPHAUSIA_PACIFICA,F3;_Furcilia_3
1852,20120905DBBNm5_335,EUPHAUSIA_PACIFICA,F3;_Furcilia_3
5005,20121001UNDm2_200,PARACALANUS_PARVUS,Female;_Adult


## Look for duplicate occurrence records in the occurrence table

In [14]:
occurrence_df = pd.read_csv(data_pth / 'aligned_csvs' / 'DwC_occurrence.csv')

len(occurrence_df)

6871

In [15]:
occurrence_dropoccurrenceID_df = occurrence_df.drop(columns=['occurrenceID'])

In [16]:
occurrence_dropoccurrenceID_df.head()

Unnamed: 0,eventID,basisOfRecord,occurrenceStatus,sex,lifeStage,dwciri:sex,dwciri:lifeStage,scientificName,scientificNameID,taxonRank,kingdom,phylum,class,order,family,genus,scientificNameAuthorship,verbatimIdentification
0,20120611UNDm1_200,MaterialSample,present,female,adult,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,Acartia clausi,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
1,20120611UNDm1_200,MaterialSample,present,male,adult,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,Acartia clausi,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
2,20120611UNDm1_200,MaterialSample,present,indeterminate,copepodites C5,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,Acartia clausi,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
3,20120611UNDm1_200,MaterialSample,present,indeterminate,medusae,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,Aglantha,urn:lsid:marinespecies.org:taxname:117212,Genus,Animalia,Cnidaria,Hydrozoa,Trachymedusae,Rhopalonematidae,Aglantha,"Haeckel, 1879",AGLANTHA
4,20120611UNDm1_200,MaterialSample,present,indeterminate,veliger,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,Bivalvia,urn:lsid:marinespecies.org:taxname:105,Class,Animalia,Mollusca,Bivalvia,,,,"Linnaeus, 1758",BIVALVIA


In [17]:
cols = ['eventID', 'verbatimIdentification', 'scientificName', 'sex', 'lifeStage']

occurrence_duplicated_df = occurrence_dropoccurrenceID_df[occurrence_dropoccurrenceID_df.duplicated()][cols].sort_values(by=cols)

len(occurrence_duplicated_df)

17

**The same 17 duplicates from the source file.** Makes sense.

In [18]:
occurrence_duplicated_df

Unnamed: 0,eventID,verbatimIdentification,scientificName,sex,lifeStage
1196,20120709UNDm4_335,GRAPSIDAE,Grapsidae,indeterminate,zoeae Z3
1198,20120709UNDm4_335,GRAPSIDAE,Grapsidae,indeterminate,zoeae Z4
1200,20120709UNDm4_335,PAGURIDAE,Paguridae,indeterminate,zoeae Z1
1204,20120709UNDm4_335,THYSANOESSA_RASCHII,Thysanoessa raschii,indeterminate,furcilia F1
1441,20120709UNNm4_200,ONCAEA,Oncaea,indeterminate,copepodites
1595,20120712DBDm1_200,OSTRACODA,Ostracoda,indeterminate,unknown
2364,20120902UNDm5_200,GASTROPODA,Gastropoda,indeterminate,unknown
2693,20120905DBBNm4_335,EUPHAUSIA_PACIFICA,Euphausia pacifica,indeterminate,furcilia F3
2712,20120905DBBNm5_335,EUPHAUSIA_PACIFICA,Euphausia pacifica,indeterminate,furcilia F3
3100,20121001UNDm2_200,PARACALANUS_PARVUS,Paracalanus parvus,female,adult
