In [1]:
import pandas as pd
import re
import json
from typing import List
from sssom import Mapping, MappingSet
from IPython.core.display import HTML
import yaml
from typing import List
from sssom import Mapping, MappingSet
from sssom.sssom_document import MappingSetDocument
from sssom.parsers import to_mapping_set_dataframe
from sssom.writers import write_table

head_count=200

# Inputs
pato_labels_file="../ontology/reports/pato.tsv"
uberon_labels_file="../ontology/reports/uberon.tsv"
efo_labels_file="../ontology/reports/efo.tsv"
oba_labels_file="../ontology/reports/oba.tsv"
cl_labels_file="../ontology/reports/cl.tsv"
go_labels_file="../ontology/reports/go.tsv"
vt_labels_file="../ontology/reports/vt.tsv"
chebi_labels_file="../ontology/reports/chebi.tsv"

oba_efo_mapping_file="../mappings/oba-efo.sssom.tsv"
oba_vt_mapping_file="../mappings/oba-vt.sssom.tsv"
oba_efo_mapping_metadata_file="../mappings/oba-efo.sssom.yml"
oba_vt_mapping_metadata_file="../mappings/oba-vt.sssom.yml"
oba_efo_exclusion_file="../mappings/oba-efo-mapping-exclusions.sssom.tsv" # Contains all terms we decided not to map for one reason or another
oba_vt_exclusion_file="../mappings/oba-vt-mapping-exclusions.sssom.tsv" # Contains all terms we decided not to map for one reason or another

# Outputs
oba_vt_mapping_candidates_file="../mappings/oba-vt-unreviewed.sssom.tsv" #VT mappings identified by pipeline but not reviewed
oba_vt_mapping_missed_file="../mappings/oba-vt-missed.sssom.tsv" #VT mappings identified by looking at OBA IRIs (no need for review)
oba_vt_mapping_unmapped_file="../mappings/oba-vt-unmapped.sssom.tsv" #VT terms that have not been mapped so far (excluding reviewed and candidate mappings)
oba_vt_dosdp_candidates_file="../mappings/oba-vt-unreviewed.dosdp.tsv" # VT terms with candidate DOSDP mappings

oba_efo_mapping_candidates_file="../mappings/oba-efo-unreviewed.sssom.tsv" # see above vt analog
oba_efo_mapping_unmapped_file="../mappings/oba-efo-unmapped.sssom.tsv" # see above vt analog
oba_efo_dosdp_candidates_file="../mappings/oba-efo-unreviewed.dosdp.tsv" # see above vt analog

# Generic printing function; all printouts are passed through here
def echo(string):
    #print(string)
    pass


# OBA Alignment Work

Make sure that you update the source data

```
sh run.sh make prepare_oba_alignment
```

We are doing three seperate things here:

1. Aligning EFO with OBA. This involves matching EFO classes to VT and OBA classes with SSSOM
2. Trying to patternise EFO classes by matching qualities and entities in its labels
3. Aligning VT with OBA by trying to patternise VT classes that have not been included in OBA so far


## Reading all data

The only input to this simple process is a table with labels and exact synonyms for all participating ontologies:

For matching potential DOSDP patterns:

* PATO (Qualities)
* UBERON (Entities)
* CL (Entities)
* GO (Entities)

For matching traits in general:

* VT
* EFO
* OBA

In [40]:
def save_tsv(df, filename):
    if "index" in df.columns:
        df = df.drop("index", axis=1, errors='ignore')
    df.to_csv(filename, sep="\t", index=False)


def mentions_token(value, list_of_strings):
    for l in list_of_strings:
        if l in value:
            #echo(f"value: {value}: {l}")
            return True
    return False


def read_labels(fn, prefix, labels_must_match=[]):
    labels = pd.read_csv(fn, sep="\t", low_memory=False)
    labels = labels.astype(str)
    labels.columns = ['id', 'predicate', 'value', 'type']
    labels = labels[['id', 'predicate', 'value']]
    labels['id']=labels['id'].str.replace('<http://purl.obolibrary.org/obo/','', regex=False)
    labels['id']=labels['id'].str.replace('<http://www.ebi.ac.uk/efo/','', regex=False)
    labels['id']=labels['id'].str.replace('>','', regex=False)
    labels['id']=labels['id'].str.replace('_',':', regex=False)
    labels['predicate']=labels['predicate'].str.replace('<http://www.w3.org/2000/01/rdf-schema#','rdfs:', regex=False)
    labels['predicate']=labels['predicate'].str.replace('<http://www.geneontology.org/formats/oboInOwl#','oboInOwl:', regex=False)
    labels['predicate']=labels['predicate'].str.replace('>','', regex=False)
    labels=labels[labels['id'].str.contains(prefix)]
    if labels_must_match:
        vt_efo_mapping_unmapped[vt_efo_mapping_unmapped['value']].head()
        labels=labels[labels['value'].apply(lambda v: mentions_token(v, labels_must_match))]
        #labels=labels[labels['value'].str.contains('|'.join(labels_must_match))]
    return labels

In [3]:
efo_labels=read_labels(efo_labels_file, "EFO")
efo_labels

Unnamed: 0,id,predicate,value
0,EFO:0010138,rdfs:label,nitrite measurement
1,EFO:0020165,rdfs:label,AT-rich interactive domain-containing protein ...
2,EFO:0008214,rdfs:label,lymphotactin measurement
3,EFO:0005273,rdfs:label,sleep depth
4,EFO:0010773,rdfs:label,CD5 measurement
...,...,...,...
4530,EFO:0005117,oboInOwl:hasExactSynonym,"Regulated on Activation, Normal T cell Express..."
4531,EFO:0009272,oboInOwl:hasExactSynonym,VCA-IgG seropositivity
4532,EFO:0009272,oboInOwl:hasExactSynonym,VCA seropositivity
4533,EFO:0010381,oboInOwl:hasExactSynonym,PC 36:3


In [4]:
vt_labels=read_labels(vt_labels_file, "VT")
vt_labels

Unnamed: 0,id,predicate,value
0,VT:0003477,rdfs:label,nerve fiber response trait
1,VT:0003799,rdfs:label,macrophage migration trait
2,VT:0005208,rdfs:label,iris stroma morphology trait
3,VT:0010765,rdfs:label,head and neck integrity trait
4,VT:0010508,rdfs:label,neurocranium mass
...,...,...,...
5991,VT:0002295,oboInOwl:hasExactSynonym,pulmonary circulation
5992,VT:0000953,oboInOwl:hasExactSynonym,oligoglia morphology trait
5993,VT:0005463,oboInOwl:hasExactSynonym,CD4+ T cell physiology trait
5994,VT:0005463,oboInOwl:hasExactSynonym,CD4+ T cell function


#### Tokens to reduce search space

Label tokens created to reduce search space for relevant entities. There is no point to searching through 100000 CHEBI enties to match every label if only 1000 relevant chebi entities theoretically exist. For CHEBI for example, this reduces the search space by 2/3.

In [5]:
# VT tokens
vt_label_tokens = []
[vt_label_tokens.extend(l) for l in vt_labels['value'].str.split()]

# EFO tokens
efo_label_tokens = []
[efo_label_tokens.extend(l) for l in efo_labels['value'].str.split()]
efo_label_tokens.append('quality')

label_tokens = list(set(vt_label_tokens+efo_label_tokens))
label_tokens = [token for token in label_tokens if len(token)>3]
len(label_tokens)

5511

#### Loading tables from EQ identification

In [6]:
pato_labels=read_labels(pato_labels_file, "PATO", label_tokens)
pato_labels

Unnamed: 0,id,predicate,value
4,PATO:0001519,rdfs:label,sound quality
13,PATO:0002215,rdfs:label,falciform
19,PATO:0000142,rdfs:label,obsolete substance
29,PATO:0000764,rdfs:label,passive
37,PATO:0002303,rdfs:label,decreased object quality
...,...,...,...
20278,PATO:0002045,oboInOwl:hasExactSynonym,dendriform
20310,PATO:0001624,oboInOwl:hasExactSynonym,having decreased function
20311,PATO:0001624,oboInOwl:hasExactSynonym,partial functionality
20312,PATO:0001624,oboInOwl:hasExactSynonym,low functionality


In [7]:
chebi_labels=read_labels(chebi_labels_file, "CHEBI", label_tokens)
chebi_labels

Unnamed: 0,id,predicate,value
0,CHEBI:96359,rdfs:label,"3-(1,3-benzodioxol-5-yl)-1-[[(4S,5S)-2-[(2R)-1..."
2,CHEBI:91284,rdfs:label,alpha-D-Man-(1->2)-alpha-D-Man-(1->2)-alpha-D-...
4,CHEBI:37429,rdfs:label,aldonolactone phosphate
7,CHEBI:153383,rdfs:label,"N-[(3R,4R,5S,6R)-5-[(2S,3R,4R,5S,6R)-3-Acetami..."
8,CHEBI:184015,rdfs:label,8-allyloxypsoralen
...,...,...,...
254253,CHEBI:157941,oboInOwl:hasExactSynonym,(2S)-2-[[(2S)-2-[[(2S)-2-aminopropanoyl]amino]...
254255,CHEBI:77795,oboInOwl:hasExactSynonym,"1-(2,4-difluorophenyl)-6-fluoro-7-[(3S)-3-meth..."
254257,CHEBI:189596,oboInOwl:hasExactSynonym,[(2R)-3-henicosanoyloxy-2-hydroxypropyl] 2-(tr...
254258,CHEBI:179100,oboInOwl:hasExactSynonym,"methyl 2-hydroxy-2-methyl-1,3,5,6,7,8-hexahydr..."


In [8]:
uberon_labels=read_labels(uberon_labels_file, "UBERON", label_tokens)
uberon_labels

Unnamed: 0,id,predicate,value
1,UBERON:0007232,rdfs:label,2 cell stage
2,UBERON:0007713,rdfs:label,fourth sacral spinal ganglion
3,UBERON:4300020,rdfs:label,anal fin basal cartilage
6,UBERON:0009657,rdfs:label,artery of lip
7,UBERON:0002370,rdfs:label,thymus
...,...,...,...
48701,UBERON:0008874,oboInOwl:hasExactSynonym,arbor alveolaris
48702,UBERON:0008874,oboInOwl:hasExactSynonym,primary pulmonary lobule
48704,UBERON:0008874,oboInOwl:hasExactSynonym,respiratory lobule
48706,UBERON:0001999,oboInOwl:hasExactSynonym,Hyrtl's muscle


In [9]:
cl_labels=read_labels(cl_labels_file, "CL", label_tokens)
cl_labels

Unnamed: 0,id,predicate,value
6,CL:1000380,rdfs:label,type 1 vestibular sensory cell of epithelium o...
17,CL:0005021,rdfs:label,mesenchymal lymphangioblast
18,CL:0000397,rdfs:label,ganglion interneuron
20,CL:1000291,rdfs:label,myocyte of posterior internodal tract
21,CL:0000878,rdfs:label,central nervous system macrophage
...,...,...,...
46342,CL:0000935,oboInOwl:hasExactSynonym,"CD4-negative, CD8-negative, alpha-beta intraep..."
46343,CL:0000935,oboInOwl:hasExactSynonym,"CD4-negative, CD8-negative, alpha-beta intraep..."
46344,CL:0000935,oboInOwl:hasExactSynonym,"CD4-negative, CD8-negative, alpha-beta intraep..."
46447,CL:0002670,oboInOwl:hasExactSynonym,type I spiral ligament fibrocyte


In [10]:
go_labels=read_labels(go_labels_file, "GO", label_tokens)
go_labels

Unnamed: 0,id,predicate,value
0,GO:0071389,rdfs:label,cellular response to mineralocorticoid stimulus
1,GO:0007561,rdfs:label,imaginal disc eversion
2,GO:0051685,rdfs:label,maintenance of ER location
3,GO:0034275,rdfs:label,kynurenic acid metabolic process
4,GO:0060870,rdfs:label,cell wall disassembly involved in floral organ...
...,...,...,...
122608,GO:0015997,oboInOwl:hasExactSynonym,ubiquinone formation monooxygenase activity
122609,GO:0015997,oboInOwl:hasExactSynonym,ubiquinone anabolism monooxygenase activity
122610,GO:0015997,oboInOwl:hasExactSynonym,coenzyme Q biosynthesis monooxygenase activity
122611,GO:0015997,oboInOwl:hasExactSynonym,coenzyme Q biosynthetic process monooxygenase ...


In [11]:
oba_labels=read_labels(oba_labels_file, "OBA")
oba_labels

Unnamed: 0,id,predicate,value
0,OBA:0005640,rdfs:label,philtrum amount
10,OBA:VT0009921,rdfs:label,transitional stage T3 B cell morphology trait
15,OBA:VT0010453,rdfs:label,abdominal wall mass
22,OBA:VT0005208,rdfs:label,iris stroma morphology trait
24,OBA:VT0001148,rdfs:label,testes size trait
...,...,...,...
73419,OBA:0002099,oboInOwl:hasExactSynonym,amount of sensory perception of smell
73420,OBA:0003173,oboInOwl:hasExactSynonym,symmetry of ear
73422,OBA:0003002,oboInOwl:hasExactSynonym,2-D shape of phalanx of manus
73425,OBA:1000249,oboInOwl:hasExactSynonym,quality of basicranium


### Loading mappings and exclusion file

In [12]:
from sssom.parsers import parse_sssom_table
with open(oba_efo_mapping_metadata_file, 'r') as stream:
    oba_efo_meta=yaml.safe_load(stream)
with open(oba_vt_mapping_metadata_file, 'r') as stream:
    oba_vt_meta=yaml.safe_load(stream)

efo_mapping_exclusions=pd.read_csv(oba_efo_exclusion_file,sep="\t")
vt_mapping_exclusions=pd.read_csv(oba_vt_exclusion_file,sep="\t")
oba_efo_sssom = parse_sssom_table(oba_efo_mapping_file, prefix_map=oba_efo_meta['curie_map'], meta=oba_efo_meta)
oba_vt_sssom = parse_sssom_table(oba_vt_mapping_file, prefix_map=oba_vt_meta['curie_map'], meta=oba_vt_meta)

In [13]:
oba_vt_sssom.df

Unnamed: 0,subject_id,predicate_id,object_id,match_type
0,VT:0000000,skos:exactMatch,OBA:VT0000000,HumanCurated
1,VT:0000002,skos:exactMatch,OBA:VT0000002,HumanCurated
2,VT:0000003,skos:exactMatch,OBA:VT0000003,HumanCurated
3,VT:0000005,skos:exactMatch,OBA:VT0000005,HumanCurated
4,VT:0000006,skos:exactMatch,OBA:VT0000006,HumanCurated
...,...,...,...,...
1965,VT:3000004,skos:exactMatch,OBA:VT3000004,HumanCurated
1966,VT:4000007,skos:exactMatch,OBA:VT4000007,HumanCurated
1967,VT:4000009,skos:exactMatch,OBA:VT4000009,HumanCurated
1968,VT:4000011,skos:exactMatch,OBA:VT4000011,HumanCurated


In [14]:
oba_efo_sssom.df

Unnamed: 0,subject_id,subject_label,subject_category,predicate_id,object_id,object_label,object_category,match_type,subject_source,object_source,mapping_date,confidence
0,EFO:0004301,blood viscosity,measurement,skos:exactMatch,OBA:VT3000004,blood viscosity trait,quality,HumanCurated,EFO,OBA,2021-09-10,1.0
1,EFO:0004325,blood pressure,measurement,skos:exactMatch,OBA:VT0000183,blood pressure trait,quality,HumanCurated,EFO,OBA,2021-09-10,1.0
2,EFO:0004832,optic disc size measurement,measurement,skos:exactMatch,OBA:VT0006216,optic disc size trait,quality,HumanCurated,EFO,OBA,2021-09-10,1.0


## Library of functions used for matching below

You dont have to read the library for now, skip it. It contains all methods needed for the matching process

In [15]:
# These are words we consider noise for the sake of this alignment process. Its probably worth adding more
stopwords = ["of", "in"]
replacements = {
    'measurement': 'quality',
    'trait': 'quality',
}

# Function that describes what we consider a "whole word" match in terms of regex. Probably
# Can be simplified.
def whole_word_regex(stopwords):
    stopwords_regex = []
    for stopword in stopwords:
        stopword = re.escape(stopword)
        stopwords_regex.append(f"[ .;,:]{stopword}[ .;,:]")
        stopwords_regex.append(f"[ ]{stopword}$")
        stopwords_regex.append(f"^{stopword}[ .;,:]")
        stopwords_regex.append(f"^{stopword}$")
    return stopwords_regex

# The label is prepared: lower cased, trimmed, stop words removed.
# Note this should probably do stemming, lemmatisation and _proper_ stop word removal (and, or, of) as well
def prepare_label(value, stopwords=[], replacements=replacements):
    stopwords_regex = whole_word_regex(stopwords)                           
    if isinstance(value, str):
        for regex_value in stopwords_regex:
            value = re.sub(regex_value, "", value)
        for replace_this in replacements:
            value = value.replace(replace_this, replacements[replace_this])
        value = value.lower()
        value = value.strip()
        return value
    else: 
        return ""


# Determines if the "value_to_match" is contained in label _as a whole word_. So active would not be 
# matched in a label which says inactive.
def whole_word_match(label, value_to_match, min_match_size=3):
    if value_to_match in label:
        whole_word_regexes = whole_word_regex([value_to_match,])
        for regex_value in whole_word_regexes:
            if(re.search(regex_value, label)):
                #echo(f"Match found: {regex_value} in {label}")
                return True
    return False

# the value (probably an OBA, VT or EFO label) is matched against everything in the dataframe df. 
# strict_word_order=False would split the words of the label into a list, then sorting them, which allows
# matching words which are simply changed in order, like "cell count" vs "count of cell"
def get_matches(value, df, strict_word_order=True, exact_only=True, min_match_size=3):
    matches = []
    if not strict_word_order:
        # This sorts the words in the string before attempting to match
        value = " ".join(sorted(value.split(" ")))
    
    for index, row in df.iterrows():
        curie = row['id']
        predicate = row['predicate']
        label = prepare_label(row['value'])
        if len(label)>min_match_size:
            if not strict_word_order:
                label = " ".join(sorted(label.split(" ")))
            if label==value:
                matches.append({"match_string": label, "object_id": curie , "object_label": row['value'], 'object_match_field' : predicate, 'predicate_id': 'skos:exactMatch' })
            elif not exact_only and whole_word_match(value, label, min_match_size):
                matches.append({"match_string": label, "object_id": curie , "object_label": row['value'], 'object_match_field' : predicate, 'predicate_id': 'skos:relatedMatch' })
    
    return matches

def print_matches(matches):
    if matches:
        echo("Matches:")
        for q in matches:
            l = q['object_label']
            echo(f"  * {l}")
    
# Remove previous matches from label, with the goal of speeding up matching
# For example, if a PATO quality is matched, we removed it from the label 
def reduce_label(label, matches):
    label_reduced = label
    for q in matches:
        ### exceptions to this: we dont want to remove "cell" or similar ever from the
        # Label as this may cause issues, like not matching a cell type when you have
        # bipolar cell morphology and cell morphology as the quality
        q_label = q['match_string']
        q_label = q_label.replace("cell","")     
        label_reduced = label_reduced.replace(q_label,"")
    label_reduced = label_reduced.strip()
    return label_reduced

def to_sssom_df(list_of_mappings, name):
    prefix_map={
        'OBA': 'http://purl.obolibrary.org/obo/OBA_',
        'GO': 'http://purl.obolibrary.org/obo/GO_',
        'UBERON': 'http://purl.obolibrary.org/obo/UBERON_',
        'CL': 'http://purl.obolibrary.org/obo/CL_',
        'VT': 'http://purl.obolibrary.org/obo/VT_',
        'PATO': 'http://purl.obolibrary.org/obo/PATO_',
        'EFO': 'http://www.ebi.ac.uk/efo/EFO_',
        'oboInOwl': 'http://www.geneontology.org/formats/oboInOwl#',
        'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', 
    }
    
    ## Create empty mapping set
    mapping_set_id = f"https://w3id.org/sssom/commons/phenotype/{name}"
    license = "https://creativecommons.org/licenses/by/4.0/"
    ms = MappingSet(mapping_set_id=mapping_set_id, license=license)

    ## Add mappings to Mapping Set
    ms.mappings = list_of_mappings
    
    ## Create Mapping set document
    doc = MappingSetDocument(mapping_set=ms, prefix_map=prefix_map)
    msdf = to_mapping_set_dataframe(doc)
    return msdf.df

def align_ontology(df_input, pato_labels=[], cl_labels=[], uberon_labels=[], go_labels=[], chebi_labels=[], head_count=0, alignment_id="unspecified"):
    # Preprocess input
    df = df_input.copy()

    if head_count > 0:
        df = df.head(head_count)
    df = df.reset_index()  # make sure indexes pair with number of rows
        
    # Declare output variables
    mlist: List[Mapping] = []

    dosdp_matches = []

    for index, row in df.iterrows():
        curie = row['id']
        mdict = {}
        
        cell_types = []
        bp_mf_ccs = []
        anatomical_entities = []
        chebi_entities = []
        
        
        predicate = row['predicate']
        label = prepare_label(row['value'])
        
        mdict['subject_id'] = curie
        mdict['subject_match_field'] = predicate
        mdict['subject_label'] = label
        mdict['match_type'] = 'Lexical'

        echo("")
        echo("--------------------------------")
        echo(f"{curie}: {label}")

        echo("")
        echo("Matching OBA")
        traits_oba = get_matches(label, oba_labels, strict_word_order=True, exact_only=True)

        if traits_oba:
            echo("")
            echo("Outcomes")
            for matches in [traits_oba]:
                for matches_mdict in matches:
                    echo(f"    {matches_mdict['object_id']} ({json.dumps(matches_mdict, sort_keys=True, indent=4)})")
                    matches_mdict.update(mdict)
                    mlist.append(Mapping(**matches_mdict))
        
        echo("")
        echo("Matching PATO")
        qualities = get_matches(label, pato_labels, exact_only=False)
        print_matches(qualities)

        if not qualities:
            echo(f"No quality matches for {label}, skipping rest..")
            continue

        # Make Label wo quality reference
        skip_next = False

        label_reduced = reduce_label(label, qualities)

        if len(label_reduced) < 3:
            echo(f"Label ({label_reduced}) now (after PATO) has less than 3 characters, skipping rest.")
            skip_next = True

        if not skip_next:
            echo("")
            echo("Matching CL")
            cell_types = get_matches(label_reduced, cl_labels, strict_word_order=True, exact_only=False)
            print_matches(cell_types)

            label_reduced = reduce_label(label_reduced, cell_types)

            if len(label_reduced) < 3:
                echo(f"Label ({label_reduced}) now (after CL) has less than 3 characters, skipping rest.")
                skip_next = True

        if not skip_next:
            echo("")
            echo("Matching Uberon")
            anatomical_entities = get_matches(label_reduced, uberon_labels, strict_word_order=True, exact_only=False)
            print_matches(anatomical_entities)

            label_reduced = reduce_label(label_reduced, anatomical_entities)

            if len(label_reduced) < 3:
                echo(f"Label ({label_reduced}) now (after Uberon) has less than 3 characters, skipping GO.")
                skip_next = True

        if not skip_next:
            echo("")
            echo("Matching CHEBI")
            chebi_entities = get_matches(label_reduced, chebi_labels, strict_word_order=True, exact_only=False)
            print_matches(chebi_entities)

            label_reduced = reduce_label(label_reduced, chebi_entities)

            if len(label_reduced) < 3:
                echo(f"Label ({label_reduced}) now (after CHEBI) has less than 3 characters, skipping GO.")
                skip_next = True

        if not skip_next:
            echo("")
            echo("Matching GO")
            bp_mf_ccs = get_matches(label_reduced, go_labels, exact_only=False)
            print_matches(bp_mf_ccs)

        unique_qualities = [qmatches['object_label'] for qmatches in qualities]
        unique_qualities = list(set(unique_qualities))
        for qmatches in qualities:
            pattern_dict = mdict.copy()
            pattern_dict['quality'] = qmatches['object_id']
            pattern_dict['quality_label'] = qmatches['object_label']
            if qmatches['object_label'] == "quality" and len(unique_qualities) > 1:
                echo("There is a match to PATO:quality, but other qualities exist.. Skipping this one.")
                continue
            for matches_list in [anatomical_entities, cell_types, bp_mf_ccs]:
                largest_e = []
                cache_es = {}
                for ematches in matches_list:
                    process = True
                    if len(largest_e) > 0:
                        for e in largest_e:
                            if e in ematches['object_label']:
                                # Bigger one found
                                largest_e.remove(e)
                                process = True
                                break
                            elif ematches['object_label'] in e:
                                process = False
                                break

                    if process:
                        largest_e.append(ematches['object_label'])
                        pattern_dict['attribute'] = ematches['object_id']
                        pattern_dict['attribute_label'] = ematches['object_label']
                        np = pattern_dict.copy()
                        cache_es[ematches['object_label']] = np
                for e in largest_e:
                    dosdp_matches.append(cache_es[e])
    
    df_mappings = pd.DataFrame(dosdp_matches)
    df_sssom = to_sssom_df(mlist, alignment_id)
    
    if len(df_mappings)>0:
        post_mapping_unmapped = df[~df['id'].isin(df_mappings['subject_id'])]
    else:
        post_mapping_unmapped = df

    return df_mappings, df_sssom, post_mapping_unmapped


## EFO - OBA alignment

In the following, we will attempt an EFO alignment with OBA. At the same time, we will try to patternise EFO classes, with the goal of including these newly patternised EFO classes straight into OBA and mapping them back to EFO.

The process goes like this:

For all measurement terms T in EFO, 

1. Try to match T to OBA (for the purpose of having a straight mapping)
3. Try to match a PATO term.
4. If 3 was successful, we also try to match an UBERON, CL and/or GO term.
5. EFO-OBA mappings are exported as SSSOM
6. EFO to EQ mappings are exported as a DOSDP pattern file with a bit of metadata

### Methods for the process

The central ideas in the mapping process are:

1. We remove noisy words like "trait" or "measurement" from our labels prior to alignment (we also do the usual preprocessing like lower casing etc)
2. For EFO-OBA/VT we take a naive process which attempts to only match _exactly_ the preprocessed labels
3. For the EFO-EQ patternisation we search for occurrences of E's in the label of the EFO term


### Prepare unmapped EFO elements

In [16]:
# We only care about unmapped elements!
# We look at all the efo classes that are neither mentioned in the oba to efo mappings we already have
# nor in the file with the exclusions

efo_unmapped = efo_labels[~efo_labels['id'].isin(oba_efo_sssom.df['subject_id'])].copy()
if len(efo_mapping_exclusions)>0:
    efo_unmapped = efo_unmapped[~efo_unmapped['id'].isin(efo_mapping_exclusions['subject_id'])].copy()
efo_unmapped

Unnamed: 0,id,predicate,value
0,EFO:0010138,rdfs:label,nitrite measurement
1,EFO:0020165,rdfs:label,AT-rich interactive domain-containing protein ...
2,EFO:0008214,rdfs:label,lymphotactin measurement
3,EFO:0005273,rdfs:label,sleep depth
4,EFO:0010773,rdfs:label,CD5 measurement
...,...,...,...
4530,EFO:0005117,oboInOwl:hasExactSynonym,"Regulated on Activation, Normal T cell Express..."
4531,EFO:0009272,oboInOwl:hasExactSynonym,VCA-IgG seropositivity
4532,EFO:0009272,oboInOwl:hasExactSynonym,VCA seropositivity
4533,EFO:0010381,oboInOwl:hasExactSynonym,PC 36:3


### Mapping

In [17]:
efo_dosdp_candidates, efo_mapping_candidates, oba_efo_mapping_unmapped = align_ontology(df_input=efo_unmapped, pato_labels=pato_labels, cl_labels=cl_labels, uberon_labels=uberon_labels, go_labels=go_labels, chebi_labels=chebi_labels, head_count=head_count)

# export the entirely unmapped data for future reference

save_tsv(efo_dosdp_candidates, oba_efo_dosdp_candidates_file)
save_tsv(efo_mapping_candidates, oba_efo_mapping_candidates_file)
save_tsv(oba_efo_mapping_unmapped, oba_efo_mapping_unmapped_file)

### Results

In [18]:
display(HTML("<h4>DOSDP mapping candidates</h4>"))
display(HTML(efo_dosdp_candidates.to_html()))

display(HTML("<h4>SSSOM mapping candidates</h4>"))
display(HTML(efo_mapping_candidates.to_html()))

display(HTML("<h4>Top 10 unmapped candidates</h4>"))
display(HTML(oba_efo_mapping_unmapped.head(10).to_html()))

Unnamed: 0,subject_id,subject_match_field,subject_label,match_type,quality,quality_label,attribute,attribute_label
0,EFO:0005273,rdfs:label,sleep depth,Lexical,PATO:0001595,depth,GO:0030431,sleep
1,EFO:0007719,rdfs:label,carotid artery external diameter quality,Lexical,PATO:0001334,diameter,UBERON:0005396,carotid artery
2,EFO:0009230,rdfs:label,reticulocyte corpuscular hemoglobin distribution width,Lexical,PATO:0000921,width,CL:0000558,reticulocyte
3,EFO:0009230,rdfs:label,reticulocyte corpuscular hemoglobin distribution width,Lexical,PATO:0000060,distribution,CL:0000558,reticulocyte
4,EFO:0020760,rdfs:label,t-cell surface glycoprotein cd4 quality,Lexical,PATO:0000001,quality,CL:0000084,T-cell
5,EFO:0020760,rdfs:label,t-cell surface glycoprotein cd4 quality,Lexical,PATO:0000001,quality,GO:0009986,cell surface
6,EFO:0020354,rdfs:label,ephrin type-b receptor 4 quality,Lexical,PATO:0000001,quality,GO:0005106,ephrin
7,EFO:0007579,rdfs:label,blood chromium quality,Lexical,PATO:0000001,quality,UBERON:0000178,blood
8,EFO:0009238,rdfs:label,immature plasma cell count,Lexical,PATO:0001501,immature,CL:0000786,plasma cell
9,EFO:0004767,rdfs:label,visceral:subcutaneous adipose tissue ratio,Lexical,PATO:0001470,ratio,UBERON:0001013,adipose tissue


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,match_type,subject_match_field,object_match_field,match_string
0,EFO:0004884,breast size,skos:exactMatch,OBA:1000025,breast size,Lexical,rdfs:label,rdfs:label,breast size
1,EFO:0006936,optic disc quality,skos:exactMatch,OBA:1000929,optic disc quality,Lexical,rdfs:label,rdfs:label,optic disc quality


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,match_type,subject_match_field,object_match_field,match_string
0,EFO:0004884,breast size,skos:exactMatch,OBA:1000025,breast size,Lexical,rdfs:label,rdfs:label,breast size
1,EFO:0006936,optic disc quality,skos:exactMatch,OBA:1000929,optic disc quality,Lexical,rdfs:label,rdfs:label,optic disc quality


#### Distribution of PATO qualities matched

This can help identifying the next set of entities to work on.

In [19]:
if len(efo_dosdp_candidates)>0:
    df_quals = efo_dosdp_candidates['quality_label'].value_counts().rename_axis('unique_values').reset_index(name='counts')
    echo(df_quals.head(20))

#### Matches involving "Volume"

See https://github.com/obophenotype/bio-attribute-ontology/issues/97

In [20]:
if len(efo_dosdp_candidates)>0:
    df_vol=efo_dosdp_candidates[efo_dosdp_candidates['quality_label']=='volume']
    display(HTML(df_vol.to_html()))

Unnamed: 0,subject_id,subject_match_field,subject_label,match_type,quality,quality_label,attribute,attribute_label
13,EFO:0010326,rdfs:label,precuneus cortex volume quality@en,Lexical,PATO:0000918,volume,UBERON:0006093,precuneus cortex
16,EFO:0009405,rdfs:label,parasubiculum volume,Lexical,PATO:0000918,volume,UBERON:0004683,parasubiculum
21,EFO:0600045,rdfs:label,pancreas volume,Lexical,PATO:0000918,volume,UBERON:0001264,pancreas
26,EFO:0010330,rdfs:label,superior parietal cortex volume quality@en,Lexical,PATO:0000918,volume,UBERON:0006094,superior parietal cortex
49,EFO:0010317,rdfs:label,paracentral lobule volume quality@en,Lexical,PATO:0000918,volume,UBERON:0035933,paracentral lobule


## VT - OBA alignment

### Prepare unmapped VT terms

In [21]:

# Check if some have been missed (IRI analysis)
vt_oba_mapping = oba_labels.copy()
vt_oba_mapping['subject_id'] = vt_oba_mapping[vt_oba_mapping['id'].str.contains(":VT")]['id'].str.replace("OBA:VT","VT:")
echo(len(vt_oba_mapping))

# Identify those that have an VT mappings based on the OBO IRI
vt_oba_mapping = vt_oba_mapping[~vt_oba_mapping['subject_id'].isna()]
echo(len(vt_oba_mapping))

# Get the set of mappings that are not yet in the official OBA-VT sssom mapping file
vt_id_mapped = vt_oba_mapping[~vt_oba_mapping['subject_id'].isin(oba_vt_sssom.df['subject_id'])].copy()

# exclude all VT terms already fully mapped in the sssom file
vt_unmapped = vt_labels[~vt_labels['id'].isin(oba_vt_sssom.df['subject_id'])].copy()
echo(len(vt_unmapped))

# exclude all VT terms that are trivially mapped through ID mapping
vt_unmapped = vt_unmapped[~vt_unmapped['id'].isin(vt_oba_mapping['subject_id'])].copy()
echo(len(vt_unmapped))

### Removed excluded terms from unmapped VT set
if len(vt_mapping_exclusions)>0:
    vt_unmapped = vt_unmapped[~vt_unmapped.id.isin(mapping_exclusions['subject_id'])].copy()

# export already mapped VT ids that are not in sssom yet as SSSOM
vt_id_mapped['predicate_id']="skos:exactMatch"
vt_id_mapped['object_id']=vt_id_mapped['id']
vt_id_mapped['match_type']="HumanCurated"
vt_id_mapped = vt_id_mapped[['subject_id','predicate_id','object_id','match_type']].copy().drop_duplicates()
save_tsv(vt_id_mapped, oba_vt_mapping_missed_file)


### Mapping

In [22]:
vt_dosdp_candidates, vt_mapping_candidates, vt_efo_mapping_unmapped = align_ontology(df_input=vt_unmapped, pato_labels=pato_labels, cl_labels=cl_labels, uberon_labels=uberon_labels, go_labels=go_labels, chebi_labels=chebi_labels, head_count=head_count)

# export the entirely unmapped data for future reference

save_tsv(vt_dosdp_candidates, oba_vt_dosdp_candidates_file)
save_tsv(vt_mapping_candidates, oba_vt_mapping_candidates_file)
save_tsv(vt_efo_mapping_unmapped, oba_vt_mapping_unmapped_file)


### Results

In [24]:
display(HTML("<h4>DOSDP mapping candidates</h4>"))
display(HTML(efo_dosdp_candidates.to_html()))

display(HTML("<h4>SSSOM mapping candidates</h4>"))
display(HTML(efo_mapping_candidates.to_html()))

display(HTML("<h4>Top 10 unmapped candidates</h4>"))
display(HTML(vt_efo_mapping_unmapped.head(10).to_html()))

Unnamed: 0,subject_id,subject_match_field,subject_label,match_type,quality,quality_label,attribute,attribute_label
0,EFO:0005273,rdfs:label,sleep depth,Lexical,PATO:0001595,depth,GO:0030431,sleep
1,EFO:0007719,rdfs:label,carotid artery external diameter quality,Lexical,PATO:0001334,diameter,UBERON:0005396,carotid artery
2,EFO:0009230,rdfs:label,reticulocyte corpuscular hemoglobin distribution width,Lexical,PATO:0000921,width,CL:0000558,reticulocyte
3,EFO:0009230,rdfs:label,reticulocyte corpuscular hemoglobin distribution width,Lexical,PATO:0000060,distribution,CL:0000558,reticulocyte
4,EFO:0020760,rdfs:label,t-cell surface glycoprotein cd4 quality,Lexical,PATO:0000001,quality,CL:0000084,T-cell
5,EFO:0020760,rdfs:label,t-cell surface glycoprotein cd4 quality,Lexical,PATO:0000001,quality,GO:0009986,cell surface
6,EFO:0020354,rdfs:label,ephrin type-b receptor 4 quality,Lexical,PATO:0000001,quality,GO:0005106,ephrin
7,EFO:0007579,rdfs:label,blood chromium quality,Lexical,PATO:0000001,quality,UBERON:0000178,blood
8,EFO:0009238,rdfs:label,immature plasma cell count,Lexical,PATO:0001501,immature,CL:0000786,plasma cell
9,EFO:0004767,rdfs:label,visceral:subcutaneous adipose tissue ratio,Lexical,PATO:0001470,ratio,UBERON:0001013,adipose tissue


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,match_type,subject_match_field,object_match_field,match_string
0,EFO:0004884,breast size,skos:exactMatch,OBA:1000025,breast size,Lexical,rdfs:label,rdfs:label,breast size
1,EFO:0006936,optic disc quality,skos:exactMatch,OBA:1000929,optic disc quality,Lexical,rdfs:label,rdfs:label,optic disc quality


Unnamed: 0,index,id,predicate,value
6,14,VT:0006230,rdfs:label,iris stroma cell quantity
14,27,VT:0002327,rdfs:label,respiratory function trait
15,29,VT:0002911,rdfs:label,inhibitory postsynaptic potential
22,49,VT:0001964,rdfs:label,auditory threshold
23,50,VT:0010060,rdfs:label,muscle fatty acid cis-9-C16:1 amount
25,52,VT:0010543,rdfs:label,catecholamine amount
26,54,VT:0010696,rdfs:label,preovulatory follicle quantity
28,60,VT:0003148,rdfs:label,cochlear coil quantity
33,70,VT:1000720,rdfs:label,nonfunctional nipple quantity
37,78,VT:0003424,rdfs:label,neuronal precursor differentiation trait


In [32]:
vt_efo_mapping_unmapped


Unnamed: 0,index,id,predicate,value
6,14,VT:0006230,rdfs:label,iris stroma cell quantity
14,27,VT:0002327,rdfs:label,respiratory function trait
15,29,VT:0002911,rdfs:label,inhibitory postsynaptic potential
22,49,VT:0001964,rdfs:label,auditory threshold
23,50,VT:0010060,rdfs:label,muscle fatty acid cis-9-C16:1 amount
25,52,VT:0010543,rdfs:label,catecholamine amount
26,54,VT:0010696,rdfs:label,preovulatory follicle quantity
28,60,VT:0003148,rdfs:label,cochlear coil quantity
33,70,VT:1000720,rdfs:label,nonfunctional nipple quantity
37,78,VT:0003424,rdfs:label,neuronal precursor differentiation trait


For how to best deal with this complex curation worfklow, see OBA documentation.