In [1]:
import re
import sys
import time
sys.path.append('../src')

import pandas as pd
from ranges.sheets import SheetParser
from ranges.specimen import Specimen, ReviewNeededException

In [2]:
file_name = "../data/ranges/Peromyscus boylii.csv"

# Read every column as text, replace missing cells with empty strings, and
# convert the sheet to a list with one dict per row. Chaining the steps keeps
# `accession_data` bound to a single kind of object (the list of records).
accession_data = (
    pd.read_csv(file_name, dtype=str)
    .fillna("")
    .to_dict(orient="records")
)

# The column check below inspects the first record, so a sheet with no rows
# cannot be validated. Fail here with an explicit message instead of letting
# `accession_data[0]` raise an unrelated IndexError.
if not accession_data:
    raise ValueError(f"Sheet is empty: {file_name}")

# Report (but do not abort on) any columns that SheetParser says are missing.
missing_columns = SheetParser.verify_columns_exist(accession_data[0].keys())
if len(missing_columns) > 0:
    print(f"Missing columns in {file_name}", missing_columns)

In [3]:
# Build a Specimen from every raw record. Records whose construction raises are
# set aside as [exception args, guid] pairs so they can be reviewed by hand.
specimens = []
review_needed = []
for raw_record in accession_data:
    try:
        specimen = Specimen.from_raw_record(raw_record)
    except Exception as error:
        review_needed.append([error.args, raw_record['guid']])
    else:
        specimens.append(specimen)

# Show the records that still need manual review as the cell output.
review_needed

[[('MVZ:Mamm:55095', 'reported missing on a note in skin tray'),
  'MVZ:Mamm:55095'],
 [('MVZ:Mamm:183927', 'check tag, is tl 162?'), 'MVZ:Mamm:183927'],
 [('MVZ:Mamm:216445', 'skin tag says emb 2R-1L instead of 2R 4L'),
  'MVZ:Mamm:216445'],
 [('MVZ:Mamm:202038', 'skin tag says weight = 20 g'), 'MVZ:Mamm:202038'],
 [('MVZ:Mamm:216473', 'skin tag says 1R 2L instead of 2R 1L'),
  'MVZ:Mamm:216473'],
 [('MVZ:Mamm:141264', 'Double check tag'), 'MVZ:Mamm:141264'],
 [('MVZ:Mamm:141287', 'Double check tag'), 'MVZ:Mamm:141287'],
 [('MVZ:Mamm:141282', 'Double check tag'), 'MVZ:Mamm:141282'],
 [('MVZ:Mamm:132804', 'Double check tag'), 'MVZ:Mamm:132804'],
 [('MVZ:Mamm:132805', 'Double check tag'), 'MVZ:Mamm:132805']]

In [4]:
# Collect the exported attribute records of every specimen into two flat lists.
# A specimen whose export raises is printed (error plus its dict form) and the
# loop moves on to the next one.
attributes, unitless_attributes = [], []
for specimen in specimens:
    try:
        with_units, without_units = specimen.export_attributes()
        attributes += with_units
        unitless_attributes += without_units
    except Exception as error:
        print(error, specimen.to_dict())

In [5]:
def eliminate_duplicates(attributes):
    """Return `attributes` without repeated (guid, attribute_type) entries.

    The first record seen for each (guid, attribute_type) pair is kept, in its
    original order. Every later record with the same pair is dropped and a
    message naming the guid and attribute type is printed.

    Args:
        attributes: iterable of dict-like records, each providing the keys
            "guid" and "attribute_type".

    Returns:
        A new list holding the retained records; the input is not modified.

    Raises:
        KeyError: if a record lacks "guid" or "attribute_type".
    """
    existing_records = set()

    result = []
    for attribute in attributes:
        # Use a tuple of the two fields rather than one joined string: joining
        # with "_" is ambiguous when a field itself contains "_" (for example
        # ("x_y", "z") and ("x", "y_z") would both become "x_y_z").
        key = (str(attribute["guid"]), str(attribute["attribute_type"]))
        if key in existing_records:
            print("Duplicate entries found for guid:", attribute["guid"], "attribute:", attribute["attribute_type"])
        else:
            existing_records.add(key)
            result.append(attribute)

    return result

# Check for and eliminate duplicates: for each list, keep the first record of
# every guid/attribute_type combination; each dropped repeat is printed.
attributes = eliminate_duplicates(attributes)
unitless_attributes = eliminate_duplicates(unitless_attributes)

In [6]:
# Show the deduplicated attribute records as a table (the cell's output).
pd.DataFrame(attributes)

Unnamed: 0,guid,attribute_type,attribute_value,attribute_units,attribute_date,attribute_remark,attribute_determiner
0,MVZ:Mamm:17933,total length,196,mm,,,Walter P. Taylor
1,MVZ:Mamm:17933,tail length,93,mm,,,Walter P. Taylor
2,MVZ:Mamm:17933,hind foot with claw,22,mm,,,Walter P. Taylor
3,MVZ:Mamm:17933,ear from notch,18.5,mm,,,Walter P. Taylor
4,MVZ:Mamm:17934,total length,185,mm,,,Walter P. Taylor
...,...,...,...,...,...,...,...
9839,MVZ:Mamm:75971,tail length,96,mm,,,Seth B. Benson
9840,MVZ:Mamm:75971,hind foot with claw,22,mm,,,Seth B. Benson
9841,MVZ:Mamm:75971,ear from notch,18,mm,,,Seth B. Benson
9842,MVZ:Mamm:75971,ear from crown,16,mm,,,Seth B. Benson


In [7]:
# Show the deduplicated unitless attribute records as a table (the cell's output).
pd.DataFrame(unitless_attributes)

Unnamed: 0,guid,attribute_type,attribute_value,attribute_date,attribute_determiner
0,MVZ:Mamm:17933,reproductive data,Testes large,,Walter P. Taylor
1,MVZ:Mamm:17934,reproductive data,Testes large,,Walter P. Taylor
2,MVZ:Mamm:17946,reproductive data,Testes large,,Walter P. Taylor
3,MVZ:Mamm:17935,reproductive data,Testes large,,Tracy I. Storer
4,MVZ:Mamm:17936,reproductive data,ad.,,Walter P. Taylor
...,...,...,...,...,...
939,MVZ:Mamm:75970,reproductive data,no emb,,Seth B. Benson
940,MVZ:Mamm:75965,reproductive data,no emb,,Seth B. Benson
941,MVZ:Mamm:75967,unformatted measurements,"""total length"": ""184+"", ""tail length"": ""86+""",,Seth B. Benson
942,MVZ:Mamm:75967,reproductive data,no emb,,Seth B. Benson


In [8]:
# Save data to files
import os

# exist_ok=True creates the output directory when it is absent and is a no-op
# when it already exists, so no separate (race-prone) existence check is needed.
os.makedirs("../out", exist_ok=True)

# Attribute records collected in `attributes` -> numerical_attributes.csv
csv_dataframe = pd.DataFrame.from_records(attributes)
csv_dataframe.to_csv("../out/numerical_attributes.csv", index=False)

# Attribute records collected in `unitless_attributes` -> text_attributes.csv
csv_dataframe = pd.DataFrame.from_records(unitless_attributes)
csv_dataframe.to_csv("../out/text_attributes.csv", index=False)