In [1]:
import csv
import json
import os
from typing import IO, TextIO

import pandas as pd

from dadis_client.client import DadisClient

In [2]:
# The DADIS API key is read from the environment rather than hard-coded;
# a KeyError is raised here if DADIS_API_KEY is not set.
API_KEY = os.environ["DADIS_API_KEY"]

In [3]:
# Module-level client used by the step-by-step debugging cells further down.
# NOTE(review): prod=True presumably selects the production DADIS API - confirm
# against DadisClient.
client = DadisClient(api_key=API_KEY, prod=True)

In [4]:
def full_matching_workflow(
    input_filename: str,
    output_filename: str,
    dadis_api_key: str) -> pd.DataFrame:
    """
    Perform the full matching workflow:

    - Read VBO data from input_filename
    - Match to DADIS to get dadis_transboundary_id
    - Save to a new TSV file at output_filename

    input_filename is a TSV with two header lines; both are copied to the
    output file (extended for the new column) before the data rows.

    Returns the VBO data with the 'dadis_transboundary_id' column added
    (the same dataframe that is written to output_filename).
    """
    # NOTE(review): this client is created without prod=True, unlike the
    # module-level client in the notebook - confirm which endpoint is intended.
    client = DadisClient(api_key=dadis_api_key)
    vbo_data = read_vbo_data(input_filename)
    matched_breeds = match_vbo_breeds(vbo_data=vbo_data, client=client)

    print(f"Writing output file to {output_filename}:")
    # The context manager guarantees the handle is flushed and closed even
    # if writing the data rows fails part-way through.
    with create_output_tsv(
        input_filename=input_filename,
        output_filename=output_filename,
        extra_cols=["dadis_transboundary_id"],
    ) as output_file:
        # The header lines were already written by create_output_tsv
        matched_breeds.to_csv(output_file, sep="\t", index=False, header=False)
    print("Output written.")
    return matched_breeds
    

def read_vbo_data(filename: str) -> pd.DataFrame:
    """
    Load a tab-separated VBO file into a dataframe.

    The first line is used as the column header and the second line of the
    file (row index 1) is skipped. Columns are converted with
    DataFrame.convert_dtypes().
    """
    raw_table = pd.read_table(filename, skiprows=[1])
    return raw_table.convert_dtypes()


def get_dadis_species(client: DadisClient) -> pd.DataFrame:
    """
    Fetch all species from DADIS and return them as a dataframe with the
    columns 'dadis_species_id' and 'dadis_species_name' (the "en" entry of
    each species name).
    """
    species_response = client.get_all_species()
    records = [
        {"dadis_species_id": entry.id, "dadis_species_name": entry.name["en"]}
        for entry in species_response.response
    ]
    return pd.DataFrame.from_records(records)


def get_canonical_dadis_transboundary(client: DadisClient) -> pd.DataFrame:
    """
    Fetch the canonical name DADIS holds for each transboundary breed.

    The result has the breed id and name renamed to 'dadis_transboundary_id'
    and 'dadis_breed_name', plus 'dadis_species_id' and the species name
    joined in from get_dadis_species().
    """
    names_response = client.get_all_transboundary_names()
    breed_records = [breed.model_dump() for breed in names_response.response]
    canonical = pd.DataFrame.from_records(breed_records).rename(
        columns={"speciesId": "dadis_species_id"}
    )
    species = get_dadis_species(client)
    # Left join so breeds are kept even if their species id is not found
    return (
        canonical
        .merge(species, how="left", on="dadis_species_id")
        .rename(columns={"id": "dadis_transboundary_id", "name": "dadis_breed_name"})
    )

def get_all_dadis_transboundary(client: DadisClient) -> pd.DataFrame:
    """
    Fetch every name recorded for DADIS transboundary breeds, since some
    VBO entries may use a non-canonical name.

    Rows are de-duplicated on (species id, breed name, transboundary id),
    the species name is joined in, and the result is sorted by
    transboundary id and breed name.
    """
    column_names = {
        "speciesId": "dadis_species_id",
        "name": "dadis_breed_name",
        "transboundaryId": "dadis_transboundary_id",
        "id": "dadis_breed_id",
        "iso3": "dadis_iso3_code",
    }
    unique_key = ["dadis_species_id", "dadis_breed_name", "dadis_transboundary_id"]

    breeds_response = client.get_all_transboundary_breeds()
    breed_records = [breed.model_dump() for breed in breeds_response.response]
    breeds = (
        pd.DataFrame.from_records(breed_records)
        .rename(columns=column_names)
        .drop_duplicates(subset=unique_key)
    )
    species = get_dadis_species(client)
    with_species = breeds.merge(species, how="left", on="dadis_species_id")
    return with_species.sort_values(["dadis_transboundary_id", "dadis_breed_name"])


def get_simple_matches(vbo_data: pd.DataFrame, client: DadisClient) -> pd.DataFrame:
    """
    Match VBO entries to DADIS transboundary breeds on their canonical
    names (breed name plus species name) and return the matches.

    Rows flagged as 'duplicate' in 'to_be_ignored' are excluded before
    matching. Unmatched VBO entries are kept with missing DADIS values.
    """
    match_columns = ["vbo_id", "term_label", "dadis_name", "dadis_species_name"]
    canonical_names = get_canonical_dadis_transboundary(client=client)
    candidates = vbo_data.query("to_be_ignored != 'duplicate'")[match_columns]
    merged = candidates.merge(
        canonical_names,
        how="left",
        left_on=["dadis_name", "dadis_species_name"],
        right_on=["dadis_breed_name", "dadis_species_name"],
        # sort=False keeps the row order of the VBO data
        sort=False,
    )
    merged = merged.drop(columns=["dadis_breed_name", "dadis_species_name"])
    return merged.convert_dtypes()


def get_extra_matches(vbo_data: pd.DataFrame, client: DadisClient) -> pd.DataFrame:
    """
    Match VBO entries to DADIS transboundary breeds using every name DADIS
    records for a breed, not only the canonical one.

    Rows flagged as 'duplicate' in 'to_be_ignored' are excluded before
    matching. VBO ids still left with two or more rows after de-duplicating
    on (vbo_id, dadis_transboundary_id) are printed and removed from the
    result. The merge indicator column ('_merge') is kept in the output.
    """
    match_columns = ["vbo_id", "term_label", "dadis_name", "dadis_species_name"]
    all_names = get_all_dadis_transboundary(client=client)
    candidates = vbo_data.query("to_be_ignored != 'duplicate'")[match_columns]
    merged = candidates.merge(
        all_names,
        how="left",
        left_on=["dadis_name", "dadis_species_name"],
        right_on=["dadis_breed_name", "dadis_species_name"],
        indicator=True,
        # Need to ensure we use sort=False so order stays consistent with original
        sort=False,
    )
    merged = merged.drop(columns=["dadis_breed_name", "dadis_species_name"])
    merged = merged.convert_dtypes()
    merged = merged.drop_duplicates(subset=["vbo_id", "dadis_transboundary_id"])

    # Any vbo_id that still occurs more than once is ambiguous
    id_counts = merged["vbo_id"].value_counts()
    ambiguous_ids = id_counts[id_counts >= 2].index.tolist()
    is_ambiguous = merged["vbo_id"].isin(ambiguous_ids)
    print("The following entries matched against multiple DADIS entries - will not be updated")
    print(merged.loc[is_ambiguous, "vbo_id"])
    return merged.loc[~is_ambiguous, :]


def match_vbo_breeds(vbo_data: pd.DataFrame, client: DadisClient) -> pd.DataFrame:
    """
    Match VBO entries to DADIS by canonical name first, falling back to any
    other transboundary name where no canonical match was found.

    Prints progress and a matched/total summary. Returns a copy of vbo_data
    with the 'dadis_transboundary_id' column added.
    """
    print("Matching to canonical DADIS names")
    canonical_matches = get_simple_matches(vbo_data, client)
    print("Matching to other DADIS names")
    other_matches = get_extra_matches(vbo_data, client)

    fallback_columns = ["vbo_id", "dadis_transboundary_id", "dadis_species_id"]
    combined = canonical_matches.merge(
        other_matches[fallback_columns],
        how="left",
        on="vbo_id",
        # Canonical columns keep their names; fallback columns get "_extra"
        suffixes=(None, "_extra"),
        sort=False,
    )
    combined = combined.convert_dtypes()

    # Keep the canonical id where present, otherwise take the fallback id
    canonical_ids = combined["dadis_transboundary_id"]
    fallback_ids = combined["dadis_transboundary_id_extra"]
    combined["dadis_transboundary_id"] = canonical_ids.where(canonical_ids.notna(), fallback_ids)

    n_total = combined.shape[0]
    n_matched = combined["dadis_transboundary_id"].notna().sum()
    print(f"{n_matched} / {n_total} VBO entries matched with DADIS")

    id_lookup = combined[["vbo_id", "dadis_transboundary_id"]]
    return vbo_data.merge(id_lookup, on="vbo_id", how="left", sort=False)


def create_output_tsv(input_filename: str, output_filename: str, extra_cols: list[str] = None) -> TextIO:
    """
    Copy the 2 header lines from the input file to the output file. Return
    a file object for the output file, so pandas can write the rest of the file

    If extra_cols is given, the column names are appended to the first
    header line and one empty cell per extra column to the second.

    The input is read completely before the output is opened, so a missing
    or too-short input never leaves a truncated output file or an open
    handle behind. The caller is responsible for closing the returned file
    (it can be used as a context manager).

    Raises ValueError if the input file has fewer than 2 lines.
    """
    n_header_lines = 2
    # newline="" is what the csv module expects for files it reads or writes
    with open(input_filename, newline="") as file_in:
        csv_in = csv.reader(file_in, dialect="excel-tab")
        # range() comes first in zip so no third row is consumed from the reader
        headers = [row for _, row in zip(range(n_header_lines), csv_in)]
    if len(headers) < n_header_lines:
        raise ValueError(
            f"Expected {n_header_lines} header lines in {input_filename}, found {len(headers)}"
        )

    if extra_cols:
        headers[0] = headers[0] + list(extra_cols)
        headers[1] = headers[1] + [""] * len(extra_cols)

    file_out = open(output_filename, "w", newline="")
    try:
        # Use os.linesep (pandas' default line terminator for to_csv) so the
        # header lines and the data rows written later end the same way.
        csv_out = csv.writer(file_out, dialect="excel-tab", lineterminator=os.linesep)
        csv_out.writerows(headers)
    except BaseException:
        # Do not leak the handle if writing the header fails
        file_out.close()
        raise
    return file_out

# Test full updating process

In [5]:
# End-to-end run: read the VBO TSV, match against DADIS and write the output TSV
result = full_matching_workflow(
    input_filename="data/dadistransbound.tsv",
    output_filename="example_transboundary_output.tsv",
    dadis_api_key=API_KEY,
)

Matching to canonical DADIS names
Matching to other DADIS names
The following entries matched against multiple DADIS entries - will not be updated
19      VBO:0000056
20      VBO:0000056
67      VBO:0000104
68      VBO:0000104
144     VBO:0000184
145     VBO:0000184
168     VBO:0000207
169     VBO:0000207
177     VBO:0000215
178     VBO:0000215
227     VBO:0000265
228     VBO:0000265
235     VBO:0000272
236     VBO:0000272
279     VBO:0000315
280     VBO:0000315
286     VBO:0000321
287     VBO:0000321
355     VBO:0000389
356     VBO:0000389
389     VBO:0000422
390     VBO:0000422
414     VBO:0000446
415     VBO:0000446
614     VBO:0000645
615     VBO:0000645
677     VBO:0000717
678     VBO:0000717
711     VBO:0000751
712     VBO:0000751
911     VBO:0000951
912     VBO:0000951
1107    VBO:0001146
1108    VBO:0001146
1123    VBO:0001161
1124    VBO:0001161
1212    VBO:0001249
1213    VBO:0001249
1217    VBO:0001253
1218    VBO:0001253
1423    VBO:0001461
1424    VBO:0001461
1504    VBO:0

In [6]:
result.head()

Unnamed: 0,vbo_id,term_label,to_be_ignored,breed_name_to_be_used_for_label,species_to_be_used_for_label,internal_term_merge_indicator_vboid_into_vboid,internal_date_and_issue_separated_by_pipe_yymmdd_dash_issue,gh_issue_NOT_merge_obsolete,parent_ID,source_for_parents,...,comment.1,GH_issue,obsolescence_reason,curator_note,dadis_name,dadis_species_name,dadis_country,dadis_iso3_code,in_subset_transboundary,dadis_transboundary_id
0,VBO:0000012,Dromedary Bactrian Camel,,,,,,,NCBITaxon:9836,,...,,,,,,,,,,
1,VBO:0000038,,,Alpaca,Alpaca,,231120-126|231212-137,https://github.com/monarch-initiative/vertebra...,NCBITaxon:30538,https://www.fao.org/dad-is,...,,,,,Alpaca,Alpaca,,,http://purl.obolibrary.org/obo/vbo#transboundary,
2,VBO:0000039,Huacaya (Alpaca),,Huacaya,Alpaca,,231120-126,https://github.com/monarch-initiative/vertebra...,NCBITaxon:30538,https://www.fao.org/dad-is,...,,,,,Huacaya,Alpaca,,,http://purl.obolibrary.org/obo/vbo#transboundary,1-1
3,VBO:0000040,Suri (Alpaca),,Suri,Alpaca,,231120-126,https://github.com/monarch-initiative/vertebra...,NCBITaxon:30538,https://www.fao.org/dad-is,...,,,,,Suri,Alpaca,,,http://purl.obolibrary.org/obo/vbo#transboundary,1-2
4,VBO:0000041,,,American Bison,American Bison,,231120-126|231212-137,https://github.com/monarch-initiative/vertebra...,NCBITaxon:9901,https://www.fao.org/dad-is,...,,,,,American Bison,American Bison,,,http://purl.obolibrary.org/obo/vbo#transboundary,2-1


In [7]:
result.loc[result["vbo_id"] == "VBO:0000991", :]

Unnamed: 0,vbo_id,term_label,to_be_ignored,breed_name_to_be_used_for_label,species_to_be_used_for_label,internal_term_merge_indicator_vboid_into_vboid,internal_date_and_issue_separated_by_pipe_yymmdd_dash_issue,gh_issue_NOT_merge_obsolete,parent_ID,source_for_parents,...,comment.1,GH_issue,obsolescence_reason,curator_note,dadis_name,dadis_species_name,dadis_country,dadis_iso3_code,in_subset_transboundary,dadis_transboundary_id
936,VBO:0000991,Icelandic Horse (Horse),,Icelandic Horse,Horse,VBO:0000990 into VBO:0000991,230207-75|231120-126,,NCBITaxon:9796,https://www.fao.org/dad-is,...,,,,,Icelandic Horse,Horse,,,http://purl.obolibrary.org/obo/vbo#transboundary,24-73
937,VBO:0000991,,duplicate,Iceland Pony,Horse,VBO:0000990 into VBO:0000991,230207-75|231120-126,,,,...,,https://github.com/monarch-initiative/vertebra...,,,Iceland Pony,Horse,,,http://purl.obolibrary.org/obo/vbo#transboundary,24-73


# Step-by-step process: for debugging

Note: the functions redefined in this section are copies of those in the workflow cell above and may not be fully up to date with them.

In [8]:
dadis_species = get_dadis_species(client)
dadis_species.head()

Unnamed: 0,dadis_species_id,dadis_species_name
0,1,Alpaca
1,2,American Bison
2,3,Ass
3,4,Bactrian camel
4,5,Buffalo


In [9]:
dadis_canonical = get_canonical_dadis_transboundary(client=client)
dadis_canonical.head()

Unnamed: 0,dadis_transboundary_id,dadis_species_id,dadis_breed_name,dadis_species_name
0,1-1,1,Huacaya,Alpaca
1,1-2,1,Suri,Alpaca
2,2-1,2,American Bison,American Bison
3,3-1,3,Asino Sardo,Ass
4,3-2,3,Balkan Donkey,Ass


In [10]:
dadis_all = get_all_dadis_transboundary(client=client)
dadis_all.head()

Unnamed: 0,dadis_breed_id,dadis_breed_name,dadis_iso3_code,dadis_species_id,dadis_transboundary_id,updatedAt,dadis_species_name
147,da0f5e4b-2dc4-4119-bb9a-6d87d88b2a3c,Huacaya,AUS,1,1-1,1652172491000,Alpaca
148,32354ff8-1f74-4e90-abb5-657753427508,Suri,AUS,1,1-2,1652172491000,Alpaca
617,56378f94-0804-411b-a7bd-a9b0a43bf187,Sury,BOL,1,1-2,1665583332000,Alpaca
1091,ba06f2a8-caca-4dce-8c22-8599b349b85e,Test,DEU,1,1-2,1678781404000,Alpaca
242,0c273dfe-f832-431b-938a-b9235a6dbffc,Chital,AUS,11,11-1,1652172493000,Deer


In [11]:
dadis_all.dtypes

dadis_breed_id            object
dadis_breed_name          object
dadis_iso3_code           object
dadis_species_id           int64
dadis_transboundary_id    object
updatedAt                  int64
dadis_species_name        object
dtype: object

## VBO transboundary data

In [12]:
VBO_TRANSBOUNDARY_FILENAME = "data/dadistransbound.tsv"

In [13]:
# Load the VBO data with the same helper the full workflow uses
vbo_transboundary = read_vbo_data(VBO_TRANSBOUNDARY_FILENAME)
# Problematic entry: VBO:0000991 - Icelandic Horse/Iceland Pony appears twice.
# The second row is flagged to_be_ignored == "duplicate" and is excluded from
# matching by get_simple_matches / get_extra_matches, so it is not dropped here.
vbo_transboundary.head()

Unnamed: 0,vbo_id,term_label,to_be_ignored,breed_name_to_be_used_for_label,species_to_be_used_for_label,internal_term_merge_indicator_vboid_into_vboid,internal_date_and_issue_separated_by_pipe_yymmdd_dash_issue,gh_issue_NOT_merge_obsolete,parent_ID,source_for_parents,...,replacement_label,comment.1,GH_issue,obsolescence_reason,curator_note,dadis_name,dadis_species_name,dadis_country,dadis_iso3_code,in_subset_transboundary
0,VBO:0000012,Dromedary Bactrian Camel,,,,,,,NCBITaxon:9836,,...,,,,,,,,,,
1,VBO:0000038,,,Alpaca,Alpaca,,231120-126|231212-137,https://github.com/monarch-initiative/vertebra...,NCBITaxon:30538,https://www.fao.org/dad-is,...,,,,,,Alpaca,Alpaca,,,http://purl.obolibrary.org/obo/vbo#transboundary
2,VBO:0000039,Huacaya (Alpaca),,Huacaya,Alpaca,,231120-126,https://github.com/monarch-initiative/vertebra...,NCBITaxon:30538,https://www.fao.org/dad-is,...,,,,,,Huacaya,Alpaca,,,http://purl.obolibrary.org/obo/vbo#transboundary
3,VBO:0000040,Suri (Alpaca),,Suri,Alpaca,,231120-126,https://github.com/monarch-initiative/vertebra...,NCBITaxon:30538,https://www.fao.org/dad-is,...,,,,,,Suri,Alpaca,,,http://purl.obolibrary.org/obo/vbo#transboundary
4,VBO:0000041,,,American Bison,American Bison,,231120-126|231212-137,https://github.com/monarch-initiative/vertebra...,NCBITaxon:9901,https://www.fao.org/dad-is,...,,,,,,American Bison,American Bison,,,http://purl.obolibrary.org/obo/vbo#transboundary


In [14]:
vbo_transboundary.columns

Index(['vbo_id', 'term_label', 'to_be_ignored',
       'breed_name_to_be_used_for_label', 'species_to_be_used_for_label',
       'internal_term_merge_indicator_vboid_into_vboid',
       'internal_date_and_issue_separated_by_pipe_yymmdd_dash_issue',
       'gh_issue_NOT_merge_obsolete', 'parent_ID', 'source_for_parents',
       'source_for_parents.1', 'parent_ID.1', 'source_for_parents.2',
       'source_for_parents.3', 'most_common_name',
       'synonym_type_most_common_name', 'source_for_most_common_name',
       'source_for_most_common_name.1', 'synonym_dadis_name_not_used_in_label',
       'source_for_synonym_dadis', 'source_for_synonym_dadis.1', 'synonym_1',
       'source_for_synonym_1', 'source_for_synonym_1.1', 'synonym_2',
       'source_for_synonym_2', 'source_for_synonym_2.1', 'synonym_3',
       'source_for_synonym_3', 'source_for_synonym_3.1', 'synonym_4',
       'source_for_synonym_4', 'source_for_synonym_4.1', 'synonym_5',
       'source_for_synonym_5', 'source_for_synon

In [15]:
vbo_transboundary.loc[vbo_transboundary["vbo_id"] == "VBO:0000991", :]

Unnamed: 0,vbo_id,term_label,to_be_ignored,breed_name_to_be_used_for_label,species_to_be_used_for_label,internal_term_merge_indicator_vboid_into_vboid,internal_date_and_issue_separated_by_pipe_yymmdd_dash_issue,gh_issue_NOT_merge_obsolete,parent_ID,source_for_parents,...,replacement_label,comment.1,GH_issue,obsolescence_reason,curator_note,dadis_name,dadis_species_name,dadis_country,dadis_iso3_code,in_subset_transboundary
936,VBO:0000991,Icelandic Horse (Horse),,Icelandic Horse,Horse,VBO:0000990 into VBO:0000991,230207-75|231120-126,,NCBITaxon:9796,https://www.fao.org/dad-is,...,,,,,,Icelandic Horse,Horse,,,http://purl.obolibrary.org/obo/vbo#transboundary
937,VBO:0000991,,duplicate,Iceland Pony,Horse,VBO:0000990 into VBO:0000991,230207-75|231120-126,,,,...,,,https://github.com/monarch-initiative/vertebra...,,,Iceland Pony,Horse,,,http://purl.obolibrary.org/obo/vbo#transboundary


In [16]:
vbo_transboundary.shape

(1674, 91)

In [17]:
def get_simple_matches(vbo_data: pd.DataFrame, client: DadisClient) -> pd.DataFrame:
    """
    Match VBO entries to DADIS transboundary breeds on their canonical
    names (breed name plus species name) and return the matches.

    Rows flagged as 'duplicate' in 'to_be_ignored' are excluded before
    matching. Unmatched VBO entries are kept with missing DADIS values.

    NOTE(review): this re-defines the function of the same name from cell
    In [4]; the later definition shadows the earlier one.
    """
    match_columns = ["vbo_id", "term_label", "dadis_name", "dadis_species_name"]
    canonical_names = get_canonical_dadis_transboundary(client=client)
    candidates = vbo_data.query("to_be_ignored != 'duplicate'")[match_columns]
    merged = candidates.merge(
        canonical_names,
        how="left",
        left_on=["dadis_name", "dadis_species_name"],
        right_on=["dadis_breed_name", "dadis_species_name"],
        # sort=False keeps the row order of the VBO data
        sort=False,
    )
    merged = merged.drop(columns=["dadis_breed_name", "dadis_species_name"])
    return merged.convert_dtypes()


def get_extra_matches(vbo_data: pd.DataFrame, client: DadisClient) -> pd.DataFrame:
    """
    Match VBO entries to DADIS transboundary breeds using every name DADIS
    records for a breed, not only the canonical one.

    Rows flagged as 'duplicate' in 'to_be_ignored' are excluded before
    matching. VBO ids still left with two or more rows after de-duplicating
    on (vbo_id, dadis_transboundary_id) are printed and removed from the
    result. The merge indicator column ('_merge') is kept in the output.

    NOTE(review): this re-defines the function of the same name from cell
    In [4]; the later definition shadows the earlier one.
    """
    match_columns = ["vbo_id", "term_label", "dadis_name", "dadis_species_name"]
    all_names = get_all_dadis_transboundary(client=client)
    candidates = vbo_data.query("to_be_ignored != 'duplicate'")[match_columns]
    merged = candidates.merge(
        all_names,
        how="left",
        left_on=["dadis_name", "dadis_species_name"],
        right_on=["dadis_breed_name", "dadis_species_name"],
        indicator=True,
        # Need to ensure we use sort=False so order stays consistent with original
        sort=False,
    )
    merged = merged.drop(columns=["dadis_breed_name", "dadis_species_name"])
    merged = merged.convert_dtypes()
    merged = merged.drop_duplicates(subset=["vbo_id", "dadis_transboundary_id"])

    # Any vbo_id that still occurs more than once is ambiguous
    id_counts = merged["vbo_id"].value_counts()
    ambiguous_ids = id_counts[id_counts >= 2].index.tolist()
    is_ambiguous = merged["vbo_id"].isin(ambiguous_ids)
    print("The following entries matched against multiple DADIS entries - will not be updated")
    print(merged.loc[is_ambiguous, "vbo_id"])
    return merged.loc[~is_ambiguous, :]


def match_vbo_breeds(vbo_data: pd.DataFrame, client: DadisClient) -> pd.DataFrame:
    """
    Match VBO entries to DADIS by canonical name first, falling back to any
    other transboundary name where no canonical match was found.

    Prints progress and a matched/total summary. Returns a copy of vbo_data
    with the 'dadis_transboundary_id' column added.

    NOTE(review): this re-defines the function of the same name from cell
    In [4]; the later definition shadows the earlier one.
    """
    print("Matching to canonical DADIS names")
    canonical_matches = get_simple_matches(vbo_data, client)
    print("Matching to other DADIS names")
    other_matches = get_extra_matches(vbo_data, client)

    fallback_columns = ["vbo_id", "dadis_transboundary_id", "dadis_species_id"]
    combined = canonical_matches.merge(
        other_matches[fallback_columns],
        how="left",
        on="vbo_id",
        # Canonical columns keep their names; fallback columns get "_extra"
        suffixes=(None, "_extra"),
        sort=False,
    )
    combined = combined.convert_dtypes()

    # Keep the canonical id where present, otherwise take the fallback id
    canonical_ids = combined["dadis_transboundary_id"]
    fallback_ids = combined["dadis_transboundary_id_extra"]
    combined["dadis_transboundary_id"] = canonical_ids.where(canonical_ids.notna(), fallback_ids)

    n_total = combined.shape[0]
    n_matched = combined["dadis_transboundary_id"].notna().sum()
    print(f"{n_matched} / {n_total} VBO entries matched with DADIS")

    id_lookup = combined[["vbo_id", "dadis_transboundary_id"]]
    return vbo_data.merge(id_lookup, on="vbo_id", how="left", sort=False)

In [18]:
simple_matches = get_simple_matches(vbo_data=vbo_transboundary, client=client)
simple_matches.head()

Unnamed: 0,vbo_id,term_label,dadis_name,dadis_transboundary_id,dadis_species_id
0,VBO:0000012,Dromedary Bactrian Camel,,,
1,VBO:0000038,,Alpaca,,
2,VBO:0000039,Huacaya (Alpaca),Huacaya,1-1,1.0
3,VBO:0000040,Suri (Alpaca),Suri,1-2,1.0
4,VBO:0000041,,American Bison,2-1,2.0


In [19]:
simple_matches.loc[simple_matches["vbo_id"] == "VBO:0000991", :]

Unnamed: 0,vbo_id,term_label,dadis_name,dadis_transboundary_id,dadis_species_id
936,VBO:0000991,Icelandic Horse (Horse),Icelandic Horse,24-73,24


In [20]:
extra_matches = get_extra_matches(vbo_data=vbo_transboundary, client=client)
extra_matches.head()

The following entries matched against multiple DADIS entries - will not be updated
19      VBO:0000056
20      VBO:0000056
67      VBO:0000104
68      VBO:0000104
144     VBO:0000184
145     VBO:0000184
168     VBO:0000207
169     VBO:0000207
177     VBO:0000215
178     VBO:0000215
227     VBO:0000265
228     VBO:0000265
235     VBO:0000272
236     VBO:0000272
279     VBO:0000315
280     VBO:0000315
286     VBO:0000321
287     VBO:0000321
355     VBO:0000389
356     VBO:0000389
389     VBO:0000422
390     VBO:0000422
414     VBO:0000446
415     VBO:0000446
614     VBO:0000645
615     VBO:0000645
677     VBO:0000717
678     VBO:0000717
711     VBO:0000751
712     VBO:0000751
911     VBO:0000951
912     VBO:0000951
1107    VBO:0001146
1108    VBO:0001146
1123    VBO:0001161
1124    VBO:0001161
1212    VBO:0001249
1213    VBO:0001249
1217    VBO:0001253
1218    VBO:0001253
1423    VBO:0001461
1424    VBO:0001461
1504    VBO:0001541
1505    VBO:0001541
Name: vbo_id, dtype: string


Unnamed: 0,vbo_id,term_label,dadis_name,dadis_breed_id,dadis_iso3_code,dadis_species_id,dadis_transboundary_id,updatedAt,_merge
0,VBO:0000012,Dromedary Bactrian Camel,,,,,,,left_only
1,VBO:0000038,,Alpaca,,,,,,left_only
2,VBO:0000039,Huacaya (Alpaca),Huacaya,da0f5e4b-2dc4-4119-bb9a-6d87d88b2a3c,AUS,1.0,1-1,1652172491000.0,both
3,VBO:0000040,Suri (Alpaca),Suri,32354ff8-1f74-4e90-abb5-657753427508,AUS,1.0,1-2,1652172491000.0,both
4,VBO:0000041,,American Bison,,,,,,left_only


In [21]:
extra_matches.loc[extra_matches["vbo_id"] == "VBO:0000991", :]

Unnamed: 0,vbo_id,term_label,dadis_name,dadis_breed_id,dadis_iso3_code,dadis_species_id,dadis_transboundary_id,updatedAt,_merge
952,VBO:0000991,Icelandic Horse (Horse),Icelandic Horse,,,,,,left_only


In [22]:
# Step-by-step version of the combination done inside match_vbo_breeds:
# attach the fallback ("_extra") ids to the canonical matches by vbo_id
all_matches = simple_matches.merge(
    extra_matches[["vbo_id", "dadis_transboundary_id", "dadis_species_id"]],
    how="left",
    on="vbo_id",
    suffixes=(None, "_extra"),
    sort=False,
).convert_dtypes()
# Keep the canonical id where present, otherwise take the fallback id
simple_ids = all_matches["dadis_transboundary_id"]
extra_ids = all_matches["dadis_transboundary_id_extra"]
all_matches["dadis_transboundary_id"] = simple_ids.where(simple_ids.notna(), extra_ids)


In [23]:
all_matches.loc[all_matches["vbo_id"] == "VBO:0000991", :]

Unnamed: 0,vbo_id,term_label,dadis_name,dadis_transboundary_id,dadis_species_id,dadis_transboundary_id_extra,dadis_species_id_extra
936,VBO:0000991,Icelandic Horse (Horse),Icelandic Horse,24-73,24,,


In [24]:
# Run the complete matching on the debugging dataframe, print the shape of
# the result and display its first rows.
vbo_matched = match_vbo_breeds(vbo_data=vbo_transboundary, client=client)
print(vbo_matched.shape)
vbo_matched.head()

Matching to canonical DADIS names
Matching to other DADIS names
The following entries matched against multiple DADIS entries - will not be updated
19      VBO:0000056
20      VBO:0000056
67      VBO:0000104
68      VBO:0000104
144     VBO:0000184
145     VBO:0000184
168     VBO:0000207
169     VBO:0000207
177     VBO:0000215
178     VBO:0000215
227     VBO:0000265
228     VBO:0000265
235     VBO:0000272
236     VBO:0000272
279     VBO:0000315
280     VBO:0000315
286     VBO:0000321
287     VBO:0000321
355     VBO:0000389
356     VBO:0000389
389     VBO:0000422
390     VBO:0000422
414     VBO:0000446
415     VBO:0000446
614     VBO:0000645
615     VBO:0000645
677     VBO:0000717
678     VBO:0000717
711     VBO:0000751
712     VBO:0000751
911     VBO:0000951
912     VBO:0000951
1107    VBO:0001146
1108    VBO:0001146
1123    VBO:0001161
1124    VBO:0001161
1212    VBO:0001249
1213    VBO:0001249
1217    VBO:0001253
1218    VBO:0001253
1423    VBO:0001461
1424    VBO:0001461
1504    VBO:0

Unnamed: 0,vbo_id,term_label,to_be_ignored,breed_name_to_be_used_for_label,species_to_be_used_for_label,internal_term_merge_indicator_vboid_into_vboid,internal_date_and_issue_separated_by_pipe_yymmdd_dash_issue,gh_issue_NOT_merge_obsolete,parent_ID,source_for_parents,...,comment.1,GH_issue,obsolescence_reason,curator_note,dadis_name,dadis_species_name,dadis_country,dadis_iso3_code,in_subset_transboundary,dadis_transboundary_id
0,VBO:0000012,Dromedary Bactrian Camel,,,,,,,NCBITaxon:9836,,...,,,,,,,,,,
1,VBO:0000038,,,Alpaca,Alpaca,,231120-126|231212-137,https://github.com/monarch-initiative/vertebra...,NCBITaxon:30538,https://www.fao.org/dad-is,...,,,,,Alpaca,Alpaca,,,http://purl.obolibrary.org/obo/vbo#transboundary,
2,VBO:0000039,Huacaya (Alpaca),,Huacaya,Alpaca,,231120-126,https://github.com/monarch-initiative/vertebra...,NCBITaxon:30538,https://www.fao.org/dad-is,...,,,,,Huacaya,Alpaca,,,http://purl.obolibrary.org/obo/vbo#transboundary,1-1
3,VBO:0000040,Suri (Alpaca),,Suri,Alpaca,,231120-126,https://github.com/monarch-initiative/vertebra...,NCBITaxon:30538,https://www.fao.org/dad-is,...,,,,,Suri,Alpaca,,,http://purl.obolibrary.org/obo/vbo#transboundary,1-2
4,VBO:0000041,,,American Bison,American Bison,,231120-126|231212-137,https://github.com/monarch-initiative/vertebra...,NCBITaxon:9901,https://www.fao.org/dad-is,...,,,,,American Bison,American Bison,,,http://purl.obolibrary.org/obo/vbo#transboundary,2-1


In [25]:
vbo_matched["vbo_id"].value_counts()

vbo_id
VBO:0000991    2
VBO:0000142    1
VBO:0000040    1
VBO:0000041    1
VBO:0000042    1
              ..
VBO:0016849    1
VBO:0016850    1
VBO:0016851    1
VBO:0016852    1
VBO:0016853    1
Name: count, Length: 1673, dtype: Int64

In [26]:
vbo_matched.loc[vbo_matched["vbo_id"] == "VBO:0000991", :]

Unnamed: 0,vbo_id,term_label,to_be_ignored,breed_name_to_be_used_for_label,species_to_be_used_for_label,internal_term_merge_indicator_vboid_into_vboid,internal_date_and_issue_separated_by_pipe_yymmdd_dash_issue,gh_issue_NOT_merge_obsolete,parent_ID,source_for_parents,...,comment.1,GH_issue,obsolescence_reason,curator_note,dadis_name,dadis_species_name,dadis_country,dadis_iso3_code,in_subset_transboundary,dadis_transboundary_id
936,VBO:0000991,Icelandic Horse (Horse),,Icelandic Horse,Horse,VBO:0000990 into VBO:0000991,230207-75|231120-126,,NCBITaxon:9796,https://www.fao.org/dad-is,...,,,,,Icelandic Horse,Horse,,,http://purl.obolibrary.org/obo/vbo#transboundary,24-73
937,VBO:0000991,,duplicate,Iceland Pony,Horse,VBO:0000990 into VBO:0000991,230207-75|231120-126,,,,...,,https://github.com/monarch-initiative/vertebra...,,,Iceland Pony,Horse,,,http://purl.obolibrary.org/obo/vbo#transboundary,24-73


### Test TSV output with double header

In [27]:
# Create file with just the 2 header lines, then write the DF from pandas
# (not including the header). The with-block closes the handle returned by
# create_output_tsv so the data is flushed to disk.
with create_output_tsv(
    input_filename=VBO_TRANSBOUNDARY_FILENAME,
    output_filename="output_test.tsv",
    extra_cols=["dadis_transboundary_id"],
) as output_tsv:
    vbo_transboundary.to_csv(output_tsv, sep="\t", index=False, header=False)