In [22]:
import pandas as pd
import sys
import os
!pip install curies
!pip install sssom -U
import curies
from sssom.parsers import from_sssom_dataframe
from sssom.context import get_default_metadata
from sssom.writers import write_table


Collecting argparse>=1.4.0
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)


Installing collected packages: argparse
Successfully installed argparse-1.4.0


In [2]:
profile = sys.argv[1]
profile = "all"
stopwords = ['abnormally','abnormal','aberrant','variant']
outdir = "../curation/data"
uphenorelease_dir = "../curation/upheno-release/{}/".format(profile)

## IN
upheno_mapping_logical = os.path.join(uphenorelease_dir,"upheno_mapping_logical.csv")
upheno_species_lexical_file = os.path.join(uphenorelease_dir,"upheno_species_lexical.csv")

## OUT
upheno_mapping_all = os.path.join(uphenorelease_dir,"upheno_mapping_all.csv")
upheno_mapping_lexical = os.path.join(uphenorelease_dir,"upheno_mapping_lexical.csv")
upheno_mapping_lexical_template = os.path.join(uphenorelease_dir,"upheno_mapping_lexical_template.csv")
upheno_mapping_problematic = os.path.join(uphenorelease_dir,"upheno_mapping_problematic.csv")

## Load lexical data
df = pd.read_csv(upheno_species_lexical_file)
df.columns = ['iri','p','label']

## Load logical mappings
dfl1 = pd.read_csv(upheno_mapping_logical)[['p1','p2']]
dfl1.columns = ['subject_id','object_id']
dfl2 = dfl1.copy()
dfl2.columns = ['object_id','subject_id']
dfl = pd.concat([dfl1, dfl2], ignore_index=True, sort =False)
dfl = dfl.drop_duplicates()
dfl['mapping_justification']="semapv:LogicalReasoning"

## Prepare dataframe for labels
df_label = df[df['p']=="http://www.w3.org/2000/01/rdf-schema#label"][['iri','label']]
df_label.columns = ['iri','label']

In [3]:
df.head()

Unnamed: 0,iri,p,label
0,http://purl.obolibrary.org/obo/UPHENO_0001001,http://www.w3.org/2000/01/rdf-schema#label,phenotype
1,http://purl.obolibrary.org/obo/UPHENO_0001001,http://www.w3.org/2000/01/rdf-schema#label,Phenotype
2,http://purl.obolibrary.org/obo/UPHENO_0001003,http://www.w3.org/2000/01/rdf-schema#label,phenotype by ontology source
3,http://purl.obolibrary.org/obo/UPHENO_0001005,http://www.w3.org/2000/01/rdf-schema#label,abnormal phenotype by ontology source
4,http://purl.obolibrary.org/obo/MP_0001417,http://www.w3.org/2000/01/rdf-schema#label,decreased exploration in new environment (MPO)


In [4]:
dfl.head()

Unnamed: 0,subject_id,object_id,mapping_justification
0,http://purl.obolibrary.org/obo/MP_0011461,http://purl.obolibrary.org/obo/HP_0011279,semapv:LogicalReasoning
1,http://purl.obolibrary.org/obo/ZP_0003906,http://purl.obolibrary.org/obo/ZP_0009649,semapv:LogicalReasoning
2,http://purl.obolibrary.org/obo/DDPHENO_0000005,http://purl.obolibrary.org/obo/WBPhenotype_000...,semapv:LogicalReasoning
3,http://purl.obolibrary.org/obo/XPO_0131732,http://purl.obolibrary.org/obo/ZP_0102753,semapv:LogicalReasoning
4,http://purl.obolibrary.org/obo/HP_0012091,http://purl.obolibrary.org/obo/MP_0002693,semapv:LogicalReasoning


In [5]:
df_label.head()

Unnamed: 0,iri,label
0,http://purl.obolibrary.org/obo/UPHENO_0001001,phenotype
1,http://purl.obolibrary.org/obo/UPHENO_0001001,Phenotype
2,http://purl.obolibrary.org/obo/UPHENO_0001003,phenotype by ontology source
3,http://purl.obolibrary.org/obo/UPHENO_0001005,abnormal phenotype by ontology source
4,http://purl.obolibrary.org/obo/MP_0001417,decreased exploration in new environment (MPO)


In [6]:
# Preprocess labels. The most important aspect to this the stopword removal. this is done by matching a stopword
# that means 'abnormal', removing it and then adding the actual prefix 'abnormal'. For example, "cell morphology, aberrant"
# will become 'abnormal cell morphology'. Other than that, most special characters other than space and the ' tick-mark
# Are removed

def apply_stopword(x, stopword):
    if x:
        if stopword in x:
            x = "abnormal "+x.replace(stopword, '')
    return x

def preprocess_labels(df, stopwords):
    df['label'] = df['label'].astype(str)
    df['label_pp'] = df['label'].str.replace(r"[(][A-Z]+[)]", "")
    df['label_pp'] = df['label_pp'].str.lower()
    df['label_pp'] = df['label_pp'].str.replace(r"[^0-9a-z' ]", "")

    for stopword in stopwords:
        df['label_pp'] = df['label_pp'].apply(lambda x: apply_stopword(x,stopword))

    df['label_pp'] = df['label_pp'].str.strip()
    df['label_pp'] = df['label_pp'].str.replace(r"[ ]+", " ")
    df=df[~df['iri'].astype(str).str.startswith('http://purl.obolibrary.org/obo/UPHENO_')]
    df=df[df['label_pp']!=""]
    d=df[['iri','label_pp']]
    d.columns=['iri','label']
    d=d.drop_duplicates()
    return d

d = preprocess_labels(df,stopwords)
l = df_label[~df_label['iri'].astype(str).str.startswith('http://purl.obolibrary.org/obo/UPHENO_')]
print(len(d))

  df['label_pp'] = df['label'].str.replace(r"[(][A-Z]+[)]", "")
  df['label_pp'] = df['label_pp'].str.replace(r"[^0-9a-z' ]", "")
  df['label_pp'] = df['label_pp'].str.replace(r"[ ]+", " ")


174040


In [7]:
dd=d.groupby('label')['iri'].apply(list).to_dict()

In [8]:
# This step is a complicated hack that tries to get rid of them of the false exact synonyms. 
# The idea is this: if there is an exact synonym between two terms within an ontology, we get rid of the link. 
# Sometimes, however, a synonym is shared between more than one term within and ontology and across: 
# These cases need to be

import re

def get_dupes(a):
    seen = {}
    dupes = []

    for x in a:
        if x not in seen:
            seen[x] = 1
        else:
            if seen[x] == 1:
                dupes.append(x)
            seen[x] += 1
    return dupes

cases = dict()
cases_internal = dict()
i = 0

exclude_synonyms = dict()

for label in dd:
    iris = dd.get(label)
    onts = [re.sub('[_][0-9]+', '', iri.replace("http://purl.obolibrary.org/obo/","")) for iri in iris]
    if len(onts)>1:
        if len(onts) != len(set(onts)):
            if len(set(onts))>1:
                cases[label] = iris
                print("-----------------------")
                print(label)
                print(iris)
                dupes = get_dupes(onts)
                for dupe in dupes:
                    for iri in iris:
                        if dupe in iri:
                            if label not in exclude_synonyms:
                                exclude_synonyms[label]=[]
                            exclude_synonyms[label].append(iri)
            else:
                cases_internal[label] = iris
                for iri in iris:
                    if label not in exclude_synonyms:
                        exclude_synonyms[label]=[]
                    exclude_synonyms[label].append(iri)


print(len(cases_internal))
print(len(cases))
print(len(dd))

-----------------------
asd
['http://purl.obolibrary.org/obo/HP_0000729', 'http://purl.obolibrary.org/obo/HP_0001631', 'http://purl.obolibrary.org/obo/MP_0010403']
39
1
169326


In [9]:
x = d

In [10]:
# Remove all those IRIs that contained duplicates determined in the previous step
d=x
print(len(d))
for label in exclude_synonyms:
    for iri in exclude_synonyms[label]:
        d = d[~((d['iri']==iri) & (d['label']==label))]
print(len(d))
d = pd.merge(d,l,on=['iri','label'],how='outer')
print(len(d))

174040
173960
273897


In [11]:
dd=d.groupby('label')['iri'].apply(list).to_dict()

In [12]:
def pairwise(t):
    it = iter(t)
    return zip(it,it)

def invert_dol_nonunique(d):
    newdict = {}
    for k in d:
        for v in d[k]:
            newdict.setdefault(v, []).append(k)
    return newdict

def merge_label_equivalent_cliques(dd_rv):
    merge_labels = dict()
    for iri in dd_rv:
        labels_to_merge = dd_rv.get(iri)
        if len(labels_to_merge)>1:
            for lab in labels_to_merge:
                if lab not in merge_labels:
                    merge_labels[lab] = []
                merge_labels[lab] = list(set(merge_labels[lab]+labels_to_merge))
    return merge_labels

dd_rv = invert_dol_nonunique(dd)
merge_labels = merge_label_equivalent_cliques(dd_rv)

In [13]:
l[l['iri']=="http://purl.obolibrary.org/obo/HP_0011138"]

Unnamed: 0,iri,label
30282,http://purl.obolibrary.org/obo/HP_0011138,Abnormality of skin adnexa morphology (HPO)


In [24]:
converter = curies.get_obo_converter()

def compute_mappings(dd,l,meta):
    data = []
    done = set()
    for label in dd:
        if label in done:
            continue
        done.add(label)
        iris = dd.get(label)
        if label in merge_labels:
            for lab in merge_labels[label]:
                iris.extend(dd.get(lab))
                done.add(lab)
        iris = list(set(iris))
        if len(iris)>1:
            #print(iris)
            pairs = pairwise(iris)
            for pair in pairs:
                curie1 = pair[0]
                curie2 = pair[1]
                data.append([curie1, curie2])
                data.append([curie2, curie1])
    df_mappings =  pd.DataFrame.from_records(data)
    df_mappings = df_mappings.drop_duplicates()
    df_mappings['mapping_justification'] = 'semapv:LexicalMatching'
    df_mappings['predicate_id'] = 'skos:exactMatch'
    df_mappings.columns = ['subject_id','object_id','mapping_justification','predicate_id']
    df_maps = pd.merge(df_mappings,l,  how='left', left_on=['subject_id'], right_on=['iri'])
    df_maps=df_maps.drop('iri',axis=1)
    df_maps = pd.merge(df_maps, l,  how='left', left_on=['object_id'], right_on=['iri'])
    df_maps=df_maps.drop('iri',axis=1)
    df_maps['subject_source']=["obo:"+re.sub('[_][0-9]+', '', iri.replace("http://purl.obolibrary.org/obo/","")).lower() for iri in df_maps['subject_id'].values]
    df_maps['object_source']=["obo:"+re.sub('[_][0-9]+', '', iri.replace("http://purl.obolibrary.org/obo/","")).lower() for iri in df_maps['object_id'].values]
    df_maps['subject_id']=[converter.compress(iri) for iri in df_maps['subject_id'].values]
    df_maps['object_id']=[converter.compress(iri) for iri in df_maps['object_id'].values]
    df_maps.rename(columns={'label_x': 'subject_label', 'label_y': 'object_label'}, inplace=True)
    df_maps = df_maps[df_maps['subject_id'].notna()]
    df_maps = df_maps[df_maps['object_id'].notna()]
    df_maps = df_maps[df_maps['predicate_id'].notna()]
    default_metadata = get_default_metadata()
    msdf = from_sssom_dataframe(df=df_maps, meta=meta, prefix_map=default_metadata.prefix_map)
    return msdf

meta = {
            "mapping_set_id": 'http://w3id.org/sssom/commons/monarch/upheno_lexical.custom.sssom.tsv',
            "mapping_set_description": 'A manual matching pipeline maintained by the Monarch Initiative.',
            "license": 'https://creativecommons.org/publicdomain/zero/1.0/'
       }
df_mapping_msdf = compute_mappings(dd,l,meta)

df_mapping = df_mapping_msdf.df

print(len(df_mapping))


df_mapping.head()

7444


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,subject_source,object_source
0,APO:0000045,sterile (APO),skos:exactMatch,FBcv:0000364,sterile (DPO),semapv:LexicalMatching,obo:apo,obo:fbcv
1,APO:0000045,sterile (APO),skos:exactMatch,MP:0001924,infertility (MPO),semapv:LexicalMatching,obo:apo,obo:mp
2,APO:0000080,stress resistance (APO),skos:exactMatch,FYPO:0002046,resistance to stress (FYPO),semapv:LexicalMatching,obo:apo,obo:fypo
3,APO:0000082,osmotic stress resistance (APO),skos:exactMatch,FYPO:0000851,resistance to osmotic stress (FYPO),semapv:LexicalMatching,obo:apo,obo:fypo
4,APO:0000083,oxidative stress resistance (APO),skos:exactMatch,FYPO:0007161,resistance to oxidative stress (FYPO),semapv:LexicalMatching,obo:apo,obo:fypo


In [28]:
df_mapping_msdf.clean_prefix_map()
with open(upheno_mapping_lexical, "w") as file:
    write_table(df_mapping_msdf, file)

In [None]:
## Step to investigate why there are mappings of terms within the same ontology.. 
## Since exact synonyms and labels were used, no such mapping should exist
## We drop them

w=df_mapping[df_mapping['subject_source']==df_mapping['object_source']]
df_maps = df_mapping[df_mapping['subject_source']!=df_mapping['object_source']]
print(len(w))
w.to_csv(upheno_mapping_problematic,index=False)
#df_maps
# print(df_mapping[df_mapping['subject_id']=="http://purl.obolibrary.org/obo/ZP_0006897"])
df_mapping_template = df_mapping[['subject_id','object_id']].copy()
df_mapping_template.columns = ['Ontology ID','EquivalentClasses']

df_mapping_template.loc[-1] = ['ID', 'AI obo:UPHENO_0000002']  # adding a row
df_mapping_template.index = df_mapping_template.index + 1  # shifting index
df_mapping_template.sort_index(inplace=True) 

#df_mapping.to_csv(upheno_mapping_lexical,index=False)
df_mapping_template.to_csv(upheno_mapping_lexical_template,index=False)

In [None]:
# Merging the logical mappings with the lexical ones for comparison
#print(df_maps.head())
df_m = pd.merge(df_maps[['subject_id','object_id','mapping_justification']], dfl,  how='outer', on=['subject_id','object_id'])
df_m = pd.merge(df_m,l,  how='left', left_on=['subject_id'], right_on=['iri'])
df_m=df_m.drop('iri',1)
df_m = pd.merge(df_m, l,  how='left', left_on=['object_id'], right_on=['iri'])
df_m=df_m.drop('iri',1)
df_m['mapping_justification'] = df_m["mapping_justification_x"].astype(str)+"-" + df_m["mapping_justification_y"].astype(str)
df_m['mapping_justification'] = df_m['mapping_justification'].str.replace("-nan", "")
df_m['mapping_justification'] = df_m['mapping_justification'].str.replace("nan-", "")
df_m=df_m.drop('mapping_justification_x',1)
df_m=df_m.drop('mapping_justification_y',1)

#print(df_m['mapping_justification'].value_counts(normalize=True))
#print(df_m['mapping_justification'].value_counts())

df_m.to_csv(upheno_mapping_all,index=False)

df_m.rename(columns={'label_x': 'subject_label', 'label_y': 'object_label'}, inplace=True)

df_m