In [1]:
import csv
import json
import logging
import os
from typing import IO, TextIO

import pandas as pd

from dadis_client.client import DadisClient

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(message)s')

In [2]:
API_KEY = os.environ["DADIS_API_KEY"]
client = DadisClient(api_key=API_KEY, prod=True)

VBO_BREED_FILENAME = "data/dadisbreedcountry.tsv"

In [3]:
def full_local_match_workflow(
    input_filename: str,
    output_filename: str,
    dadis_api_key: str) -> pd.DataFrame:
    """
    Perform the full matching workflow:

    - Read VBO data from input_filename
    - Match to DADIS to get DADIS ids
    - Save to a new TSV file at output_filename
    """
    client = DadisClient(api_key=dadis_api_key)
    logger.info(f"Reading VBO entries from {input_filename}")
    vbo_data = read_vbo_data(input_filename)
    logger.info(f"Matching to DADIS data")
    matched_breeds = match_vbo_breeds(vbo_data=vbo_data, client=client)

    logger.info(f"Writing output file to {output_filename}:")
    output_file = create_output_tsv(
        input_filename=input_filename, 
        output_filename=output_filename, 
        extra_cols=[
            "dadis_breed_id",
            "dadis_transboundary_id",
            "dadis_update_date",
        ]
    )
    matched_breeds.to_csv(output_file, sep="\t", index=False, header=False)
    output_file.close()
    logger.info("Output written.")
    return matched_breeds


def read_vbo_data(filename: str) -> pd.DataFrame:
    vbo_breeds = pd.read_table(filename, sep="\t", skiprows=[1], low_memory=False).convert_dtypes()
    logger.warning("Fixing swapped column names: dadis_species_name, dadis_country. Remove this code when input data is fixed")
    country = vbo_breeds["dadis_species_name"].copy()
    species = vbo_breeds["dadis_country"].copy()
    vbo_breeds["dadis_country"] = country
    vbo_breeds["dadis_species_name"] = species
    return vbo_breeds


def get_dadis_species(client: DadisClient) -> pd.DataFrame:
    resp = client.get_all_species()
    all_species = []
    for s in resp.response:
        species = {"dadis_species_id": s.id, "dadis_species_name": s.name["en"]}
        all_species.append(species)
    return pd.DataFrame.from_records(all_species)


def get_dadis_all_breeds(client: DadisClient) -> pd.DataFrame:
    resp = client.get_all_breeds()
    df = (
        pd.DataFrame.from_records([
            breed.model_dump() for breed in resp.response
        ])
        .convert_dtypes()
        .rename(columns={
            "id": "dadis_breed_id",
            "name": "dadis_breed_name",
            "iso3": "dadis_iso3_code",
            "speciesId": "dadis_species_id",
            "transboundaryId": "dadis_transboundary_id",
            "updatedAt": "dadis_update_date"
        })
    )
    df["dadis_update_date"] = df["dadis_update_date"].map(
        lambda d: pd.to_datetime(d, unit="ms")
    )
    # Merge species information
    species_df = get_dadis_species(client)
    df = df.merge(species_df, how="left", on="dadis_species_id")
    return df

def match_vbo_breeds(vbo_data: pd.DataFrame, client: DadisClient) -> pd.DataFrame:
    """
    Match VBO breed entries to DADIS, based on breed name, species, and country (ISO3 code)
    """
    logger.info("Fetching DADIS breeds")
    dadis_all = get_dadis_all_breeds(client=client)
    merged = vbo_data.merge(
        dadis_all,
        how="left",
        left_on=["dadis_name", "dadis_species_name", "dadis_iso3_code"],
        right_on=["dadis_breed_name", "dadis_species_name", "dadis_iso3_code"],
        sort=False,
        indicator=True
    )
    n_matched = merged["_merge"].eq("both").sum()
    n_total = len(merged["_merge"])
    logger.info(f"{n_matched} / {n_total} VBO breeds successfully matched to DADIS IDs")
    merged = merged.drop(columns=["_merge", "dadis_breed_name", "dadis_species_id"])
    return merged


def create_output_tsv(input_filename: str, output_filename: str, extra_cols: list[str] = None) -> TextIO:
    """
    Copy the 2 header lines from the input file to the output file. Return
    a file object for the output file, so pandas can write the rest of the file
    """
    file_out = open(output_filename, "w")
    csv_out = csv.writer(file_out, dialect="excel-tab")
    with open(input_filename) as file_in:
        csv_in = csv.reader(file_in, dialect="excel-tab")
        for index, line in enumerate(range(2)):
            header = next(csv_in)
            if extra_cols is not None:
                if index == 0:
                    header += extra_cols
                if index == 1:
                    header += ['' for i in range(len(extra_cols))]
            csv_out.writerow(header)
    return file_out

## Test full workflow

In [4]:
output = full_local_match_workflow(
    input_filename=VBO_BREED_FILENAME,
    output_filename="example_local_breed_output.tsv",
    dadis_api_key=API_KEY
)

Reading VBO entries from data/dadisbreedcountry.tsv
Fixing swapped column names: dadis_species_name, dadis_country. Remove this code when input data is fixed
Matching to DADIS data
Fetching DADIS breeds
14537 / 15143 VBO breeds successfully matched to DADIS IDs
Writing output file to example_local_breed_output.tsv:
Output written.


# Step-by-step workflow for reference/debugging

## DADIS data

In [5]:
dadis_all = get_dadis_all_breeds(client)
dadis_all.head()

Unnamed: 0,dadis_breed_id,dadis_breed_name,dadis_iso3_code,dadis_species_id,dadis_transboundary_id,dadis_update_date,dadis_species_name
0,ecf7d217-63a9-4b4f-aa6c-7afb49091604,Donkey,AFG,3,,2023-06-06 15:13:17,Ass
1,5be063e7-258a-48a8-bd8a-cb3f99bb5222,Afghan,AFG,7,,2023-06-06 15:05:18,Cattle
2,6c1cb9f9-e8b1-4127-b10f-c9cb4d312390,Kandahari,AFG,7,,2021-12-17 08:00:07,Cattle
3,e53309e3-8952-4b4a-864f-37ece264b93e,Konari,AFG,7,,2021-12-17 08:00:10,Cattle
4,00374759-f2aa-4943-9eb2-96e32f181869,Kunari,AFG,7,,2021-12-17 08:00:08,Cattle


In [6]:
dadis_all.query('dadis_breed_name.str.contains("Huacaya")')

Unnamed: 0,dadis_breed_id,dadis_breed_name,dadis_iso3_code,dadis_species_id,dadis_transboundary_id,dadis_update_date,dadis_species_name
323,da0f5e4b-2dc4-4119-bb9a-6d87d88b2a3c,Huacaya,AUS,1,1-1,2022-05-10 08:48:11,Alpaca
1505,1fb630e0-c8e7-4bfc-bbdb-d8d0db22538b,Huacaya,BOL,1,1-1,2022-10-12 14:02:06,Alpaca
1556,78b28cda-58ec-4193-ac62-94a38063fbc2,Huacayas,BOL,23,,2021-12-17 08:09:22,Guinea pig
1987,242da190-ecdf-4b8e-beaf-355de1d2b0d5,Huacaya,CAN,1,1-1,2021-12-17 08:11:51,Alpaca
2262,6778e416-e0ac-42f7-9749-27edce9c1890,Huacaya,CHL,1,1-1,2022-10-12 15:02:29,Alpaca
11183,8a3a635d-0483-445a-a9ed-157d58146e25,Huacaya,PER,1,1-1,2022-10-13 01:02:08,Alpaca


## VBO breed data

In [7]:
vbo_breeds = pd.read_table(VBO_BREED_FILENAME, sep="\t", skiprows=[1], low_memory=False).convert_dtypes()
# TODO: temp fix for wrong/swapped column names
country = vbo_breeds["dadis_species_name"].copy()
species = vbo_breeds["dadis_country"].copy()
vbo_breeds["dadis_country"] = country
vbo_breeds["dadis_species_name"] = species
vbo_breeds.head()

Unnamed: 0,vbo_id,term_label,to_be_ignored,breed_name_to_be_used_for_label,species_to_be_used_for_label,internal_term_merge_indicator_vboid_into_vboid,internal_date_and_issue_separated_by_pipe_yymmdd_dash_issue,GH_issue_NOT_merge_obsolete,parent_ID,source_for_parents,...,replacement_term,replacement_label,comment.1,GH_issue,obsolescence_reason,curator_note,dadis_name,dadis_species_name,dadis_country,dadis_iso3_code
0,VBO:0016872,"obsolete Alaska, United States of America (Nor...",,"Alaska, United States of America",North American deer mouse,,231211-135,https://github.com/monarch-initiative/vertebra...,,,...,,,,https://github.com/monarch-initiative/vertebra...,domain entity does not exist,,Alaska,North American deer mouse,United States of America,USA
1,VBO:0016854,"obsolete Rocky Mountain, United States of Amer...",,"Rocky Mountain, United States of America",Bighorn sheep,,231211-135,https://github.com/monarch-initiative/vertebra...,,,...,,,,https://github.com/monarch-initiative/vertebra...,domain entity does not exist,,Rocky Mountain,Bighorn sheep,United States of America,USA
2,VBO:0001723,"Alpakka, Finland (Alpaca)",,"Alpakka, Finland",Alpaca,,231213-139,https://github.com/monarch-initiative/vertebra...,VBO:0000038,https://www.fao.org/dad-is,...,,,,,,,Alpakka,Alpaca,Finland,FIN
3,VBO:0001724,"Huacaya, Australia (Alpaca)",,"Huacaya, Australia",Alpaca,,,,VBO:0000039,https://www.fao.org/dad-is,...,,,,,,,Huacaya,Alpaca,Australia,AUS
4,VBO:0001725,"Huacaya, Bolivia (Plurinational State of) (Alp...",,"Huacaya, Bolivia (Plurinational State of)",Alpaca,,,,VBO:0000039,https://www.fao.org/dad-is,...,,,,,,,Huacaya,Alpaca,Bolivia (Plurinational State of),BOL


In [8]:
# TODO: ignore obsolete terms? or will these be ignored anyway because the term label won't match?
match_data = vbo_breeds[["vbo_id", "term_label", "dadis_name", "dadis_species_name", "dadis_iso3_code"]]
match_data.head()

Unnamed: 0,vbo_id,term_label,dadis_name,dadis_species_name,dadis_iso3_code
0,VBO:0016872,"obsolete Alaska, United States of America (Nor...",Alaska,North American deer mouse,USA
1,VBO:0016854,"obsolete Rocky Mountain, United States of Amer...",Rocky Mountain,Bighorn sheep,USA
2,VBO:0001723,"Alpakka, Finland (Alpaca)",Alpakka,Alpaca,FIN
3,VBO:0001724,"Huacaya, Australia (Alpaca)",Huacaya,Alpaca,AUS
4,VBO:0001725,"Huacaya, Bolivia (Plurinational State of) (Alp...",Huacaya,Alpaca,BOL


In [9]:
merged = match_data.merge(
    dadis_all,
    how="left",
    left_on=["dadis_name", "dadis_species_name", "dadis_iso3_code"],
    right_on=["dadis_breed_name", "dadis_species_name", "dadis_iso3_code"],
    sort=False,
    indicator=True
)
merged.head()

Unnamed: 0,vbo_id,term_label,dadis_name,dadis_species_name,dadis_iso3_code,dadis_breed_id,dadis_breed_name,dadis_species_id,dadis_transboundary_id,dadis_update_date,_merge
0,VBO:0016872,"obsolete Alaska, United States of America (Nor...",Alaska,North American deer mouse,USA,,,,,NaT,left_only
1,VBO:0016854,"obsolete Rocky Mountain, United States of Amer...",Rocky Mountain,Bighorn sheep,USA,,,,,NaT,left_only
2,VBO:0001723,"Alpakka, Finland (Alpaca)",Alpakka,Alpaca,FIN,4036812f-308c-4b11-8402-eaf171a818a1,Alpakka,1.0,,2023-09-14 00:03:28,both
3,VBO:0001724,"Huacaya, Australia (Alpaca)",Huacaya,Alpaca,AUS,da0f5e4b-2dc4-4119-bb9a-6d87d88b2a3c,Huacaya,1.0,1-1,2022-05-10 08:48:11,both
4,VBO:0001725,"Huacaya, Bolivia (Plurinational State of) (Alp...",Huacaya,Alpaca,BOL,1fb630e0-c8e7-4bfc-bbdb-d8d0db22538b,Huacaya,1.0,1-1,2022-10-12 14:02:06,both


In [10]:
n_matched = merged["_merge"].eq("both").sum()
n_matched
n_total = len(merged["_merge"])
n_total

15143

In [11]:
merged["_merge"].value_counts()

_merge
both          14537
left_only       606
right_only        0
Name: count, dtype: int64

In [12]:
merged.query('_merge == "left_only"')

Unnamed: 0,vbo_id,term_label,dadis_name,dadis_species_name,dadis_iso3_code,dadis_breed_id,dadis_breed_name,dadis_species_id,dadis_transboundary_id,dadis_update_date,_merge
0,VBO:0016872,"obsolete Alaska, United States of America (Nor...",Alaska,North American deer mouse,USA,,,,,NaT,left_only
1,VBO:0016854,"obsolete Rocky Mountain, United States of Amer...",Rocky Mountain,Bighorn sheep,USA,,,,,NaT,left_only
108,VBO:0001829,"Zamorano-Leonés-Spain, Spain (Ass)",Zamorano-Leonés-Spain,Ass,ESP,,,,,NaT,left_only
156,VBO:0001877,"Ass & mules (no breed indication), Luxembourg ...",Ass & mules (no breed indication),Ass,LUX,,,,,NaT,left_only
160,VBO:0001881,"Burro, Mexico (Ass)",Burro,Ass,MEX,,,,,NaT,left_only
...,...,...,...,...,...,...,...,...,...,...,...
15138,VBO:0016870,"Japanese Draft, Japan (Horse)",Japanese Draft,Horse,JPN,,,,,NaT,left_only
15139,VBO:0016871,"Kentucky Mountain Saddle, United States of Ame...",Kentucky Mountain Saddle,Horse,USA,,,,,NaT,left_only
15140,VBO:0016873,"Essex, United Kingdom of Great Britain and Nor...",Essex,Pig,GBR,,,,,NaT,left_only
15141,VBO:0016874,"Minnesota, United States of America (Pig)",Minnesota,Pig,USA,,,,,NaT,left_only


## Matching

In [13]:
matched = match_vbo_breeds(vbo_data=vbo_breeds, client=client)
matched.head()

Fetching DADIS breeds
14537 / 15143 VBO breeds successfully matched to DADIS IDs


Unnamed: 0,vbo_id,term_label,to_be_ignored,breed_name_to_be_used_for_label,species_to_be_used_for_label,internal_term_merge_indicator_vboid_into_vboid,internal_date_and_issue_separated_by_pipe_yymmdd_dash_issue,GH_issue_NOT_merge_obsolete,parent_ID,source_for_parents,...,GH_issue,obsolescence_reason,curator_note,dadis_name,dadis_species_name,dadis_country,dadis_iso3_code,dadis_breed_id,dadis_transboundary_id,dadis_update_date
0,VBO:0016872,"obsolete Alaska, United States of America (Nor...",,"Alaska, United States of America",North American deer mouse,,231211-135,https://github.com/monarch-initiative/vertebra...,,,...,https://github.com/monarch-initiative/vertebra...,domain entity does not exist,,Alaska,North American deer mouse,United States of America,USA,,,NaT
1,VBO:0016854,"obsolete Rocky Mountain, United States of Amer...",,"Rocky Mountain, United States of America",Bighorn sheep,,231211-135,https://github.com/monarch-initiative/vertebra...,,,...,https://github.com/monarch-initiative/vertebra...,domain entity does not exist,,Rocky Mountain,Bighorn sheep,United States of America,USA,,,NaT
2,VBO:0001723,"Alpakka, Finland (Alpaca)",,"Alpakka, Finland",Alpaca,,231213-139,https://github.com/monarch-initiative/vertebra...,VBO:0000038,https://www.fao.org/dad-is,...,,,,Alpakka,Alpaca,Finland,FIN,4036812f-308c-4b11-8402-eaf171a818a1,,2023-09-14 00:03:28
3,VBO:0001724,"Huacaya, Australia (Alpaca)",,"Huacaya, Australia",Alpaca,,,,VBO:0000039,https://www.fao.org/dad-is,...,,,,Huacaya,Alpaca,Australia,AUS,da0f5e4b-2dc4-4119-bb9a-6d87d88b2a3c,1-1,2022-05-10 08:48:11
4,VBO:0001725,"Huacaya, Bolivia (Plurinational State of) (Alp...",,"Huacaya, Bolivia (Plurinational State of)",Alpaca,,,,VBO:0000039,https://www.fao.org/dad-is,...,,,,Huacaya,Alpaca,Bolivia (Plurinational State of),BOL,1fb630e0-c8e7-4bfc-bbdb-d8d0db22538b,1-1,2022-10-12 14:02:06
