In [1]:
# Start from a clean interactive namespace; -f skips the confirmation prompt.
%reset -f

In [61]:
from oaklib import OntologyResource
from oaklib.implementations import SqlImplementation
from oaklib.utilities.lexical.lexical_indexer import (
    create_lexical_index,
    lexical_index_to_sssom,
    load_mapping_rules,
    save_lexical_index,
)
from sssom.parsers import parse_sssom_table
from sssom import compare_dataframes
from sssom.parsers import split_dataframe
from sssom.util import MappingSetDataFrame, filter_prefixes, invert_mappings
from sssom.io import get_metadata_and_prefix_map, filter_file
from sssom.writers import write_table

import pandas as pd
import yaml


In [3]:
# Input locations, all relative to the directory this notebook runs in.

# SSSOM config: read both for mapping-set metadata and for the
# "subject_prefixes" list used when filtering the lexical matches.
config = "../ontology/metadata/mondo.sssom.config.yml"

# Matching rules handed to load_mapping_rules().
rules = "../ontology/config/mondo-match-rules.yaml"

# Existing Mondo mapping set, parsed with parse_sssom_table().
mondo_sssom = "../ontology/tmp/mondo.sssom.tsv"

In [9]:
%%time

meta = get_metadata_and_prefix_map(config)

with open(config, "r") as f:
    yml = yaml.safe_load(f)
    
mappings_msdf = parse_sssom_table(mondo_sssom)

print(len(mappings_msdf.df))
mappings_msdf.df.head()



71450
CPU times: user 34.8 s, sys: 190 ms, total: 35 s
Wall time: 37.1 s


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification
0,MONDO:0000001,disease or disorder,skos:exactMatch,SCTID:64572001,,semapv:UnspecifiedMatching
1,MONDO:0000001,disease or disorder,skos:exactMatch,Orphanet:377788,,semapv:UnspecifiedMatching
2,MONDO:0000001,disease or disorder,skos:exactMatch,UMLS:C0012634,,semapv:UnspecifiedMatching
3,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C2991,Disease or Disorder,semapv:UnspecifiedMatching
4,MONDO:0000001,disease or disorder,skos:exactMatch,MESH:D004194,,semapv:UnspecifiedMatching
...,...,...,...,...,...,...
71445,MONDO:8000030,obsolete morphological anomaly,skos:exactMatch,Orphanet:377791,,semapv:UnspecifiedMatching
71446,MONDO:8000031,obsolete subtype of a disorder,skos:exactMatch,Orphanet:557494,,semapv:UnspecifiedMatching
71447,MONDO:8000032,obsolete malformation syndrome,skos:exactMatch,Orphanet:377789,,semapv:UnspecifiedMatching
71448,MONDO:8000033,obsolete group of disorders,skos:exactMatch,Orphanet:557492,,semapv:UnspecifiedMatching


In [10]:
%%time

ontology_resource = OntologyResource(slug='../ontology/tmp/merged.db', local=True)
oi = SqlImplementation(ontology_resource)

ruleset = load_mapping_rules(rules)
syn_rules = [x.synonymizer for x in ruleset.rules if x.synonymizer]
lexical_index = create_lexical_index(oi=oi, synonym_rules=syn_rules)


CPU times: user 7min 53s, sys: 15.5 s, total: 8min 8s
Wall time: 8min 10s


In [50]:
%%time

msdf = lexical_index_to_sssom(oi, lexical_index, ruleset=load_mapping_rules(rules), meta=meta)
print(len(msdf.df))
msdf.df.head()

1358017
CPU times: user 21min 16s, sys: 1min 14s, total: 22min 30s
Wall time: 23min 10s


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
0,<http://identifiers.org/hgnc/10001>,RGS5,skos:exactMatch,OMIM:603276,RGS5,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,rgs5
1,<http://identifiers.org/hgnc/10001>,RGS5,skos:exactMatch,OMIM:603276,RGS5,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasExactSynonym,rgs5
2,<http://identifiers.org/hgnc/10004>,RGS9,skos:exactMatch,OMIM:604067,RGS9,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,rgs9
3,<http://identifiers.org/hgnc/10004>,RGS9,skos:exactMatch,OMIM:604067,RGS9,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasExactSynonym,rgs9
4,<http://identifiers.org/hgnc/10006>,RHAG,skos:exactMatch,OMIM:180297,RHAG,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,rhag


In [51]:
# Count mappings per subject prefix: the prefix is whatever precedes the first
# ':' in subject_id (so bracketed IRIs show up as '<http' / '<https').
msdf.df['subject_id'].str.split(':').map(lambda parts: parts[0]).to_frame().value_counts()

subject_id
NCBITaxon     735219
Orphanet      338365
MONDO         126067
DOID          114602
ICD10CM        15557
<http           7814
NCIT            6342
OMIM            5136
ICD10WHO        3165
HP              2451
<https          1600
CHEBI            758
GO               410
CL               267
ENVO              86
FOODON            37
SYMP              28
PO                27
BFO               10
GENO              10
MFOMD              9
HsapDv             8
NBO                7
PATO               6
CARO               5
MF                 5
SO                 5
MAXO               4
IAO                4
ECTO               2
MPATH              2
DISDRIV            2
OGMS               1
OBI                1
RO                 1
ECO                1
ExO                1
UBERON             1
obo                1
dtype: int64

In [55]:
# Inspect the subjects whose identifier contains "http", with the number of
# mappings each one has.
msdf.df.loc[msdf.df['subject_id'].str.contains("http"), 'subject_id'].value_counts().to_frame()

Unnamed: 0,subject_id
<https://www.omim.org/phenotypicSeries/PS606215>,10
<https://www.omim.org/phenotypicSeries/PS115430>,9
<https://www.omim.org/phenotypicSeries/PS268000>,9
<https://www.omim.org/phenotypicSeries/PS163950>,8
<https://www.omim.org/phenotypicSeries/PS142623>,8
...,...
<http://purl.bioontology.org/ontology/STY/T092>,1
<https://www.omim.org/phenotypicSeries/PS236670>,1
<http://purl.bioontology.org/ontology/STY/T098>,1
<http://purl.bioontology.org/ontology/STY/T100>,1


In [53]:
# Count mappings per object prefix: the prefix is whatever precedes the first
# ':' in object_id.
msdf.df['object_id'].str.split(':').map(lambda parts: parts[0]).to_frame().value_counts()

object_id
NCBITaxon    734783
Orphanet     394151
MONDO        109898
OMIM          57337
DOID          26010
NCIT          20152
ICD10WHO       8599
ICD10CM        3552
SYMP           1105
UBERON          974
HP              544
obo             230
CHEBI           201
GO              123
CL               99
PATO             99
PO               37
SO               36
ENVO             27
FOODON           20
NBO              13
TRANS             4
dct               4
RO                3
OGMS              3
OBI               3
MPATH             3
MFOMD             2
UPHENO            1
BFO               1
MF                1
MAXO              1
rdfs              1
dtype: int64

In [56]:
%%time

prefix_of_interest = yml["subject_prefixes"]
msdf.df = filter_prefixes(df=msdf.df, filter_prefixes=prefix_of_interest, features=["subject_id", "object_id"])
print(len(msdf.df))
msdf.df.head()

606566
CPU times: user 9min 57s, sys: 25.5 s, total: 10min 23s
Wall time: 10min 43s


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
10456,DOID:0001816,angiosarcoma,skos:exactMatch,Orphanet:263413,Angiosarcoma,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,angiosarcoma
10457,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,hemangiosarcoma
10458,DOID:0001816,angiosarcoma,skos:closeMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.5,oio:hasDbXref,oio:hasDbXref,icdo:9120/3
10459,DOID:0001816,angiosarcoma,skos:closeMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.5,oio:hasDbXref,oio:hasDbXref,mesh:d006394
10460,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasExactSynonym,angiosarcoma


In [57]:
# Add an empty predicate_modifier column before inverting.
# NOTE(review): presumably invert_mappings expects this column to exist - confirm.
msdf.df['predicate_modifier'] = ""

# Invert mappings so that MONDO terms end up on the subject side.
# NOTE(review): merge_inverted=False appears to restrict the result to the
# MONDO-subject rows (the row count drops sharply) - confirm against sssom docs.
msdf.df = invert_mappings(
    msdf.df,
    subject_prefix="MONDO",
    merge_inverted=False,
)
print(len(msdf.df))
msdf.df.head()

148117


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,predicate_modifier
146808,MONDO:0000001,disease,skos:exactMatch,NCIT:C2991,Disease or Disorder,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,disease or disorder,
146809,MONDO:0000001,disease,skos:exactMatch,NCIT:C25457,Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,condition,
146811,MONDO:0000001,disease,skos:exactMatch,Orphanet:377788,Disease,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,disease,
146814,MONDO:0000001,disease,skos:exactMatch,Orphanet:557493,disorder,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,disorder,
146815,MONDO:0000001,disease,skos:exactMatch,Orphanet:377788,Disease,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,disease,


In [58]:
# Remove from the lexical matches the mappings contained in mappings_msdf
# (the mapping set parsed from mondo_sssom). The call modifies msdf itself;
# its return value is not used.
# NOTE(review): which columns decide that two mappings are "the same" is
# defined by sssom, not visible here - confirm before relying on the count.
msdf.remove_mappings(mappings_msdf)
print(len(msdf.df))
msdf.df.head()

75593


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,predicate_modifier
0,MONDO:0000001,disease,skos:exactMatch,NCIT:C25457,Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,condition,
1,MONDO:0000001,disease,skos:exactMatch,Orphanet:557493,disorder,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,disorder,
2,MONDO:0000001,disease,skos:exactMatch,NCIT:C156809,Medical Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,medical condition,
3,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,NCIT:C113211,Hypocortisolemia,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,hypocortisolemia,
4,MONDO:0000022,nocturnal enuresis,skos:exactMatch,OMIM:600631,"enuresis, nocturnal, 1",semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,bedwetting,


In [59]:
# Spot check: remaining candidate mappings whose object is a DOID term.
msdf.df.loc[msdf.df['object_id'].str.startswith("DOID")]

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,predicate_modifier
891,MONDO:0004990,breast tumor luminal A or B,skos:narrowMatch,DOID:0060548,luminal breast carcinoma A,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasBroadSynonym,breast tumor luminal,
892,MONDO:0004990,breast tumor luminal A or B,skos:narrowMatch,DOID:0060548,luminal breast carcinoma A,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasBroadSynonym,luminal breast cancer,
32677,MONDO:0016982,angiosarcoma,skos:closeMatch,DOID:0001816,angiosarcoma,semapv:MappingInversion,oaklib,0.5,oio:hasDbXref,oio:hasDbXref,icdo:9120/3,
32678,MONDO:0016982,angiosarcoma,skos:closeMatch,DOID:0001816,angiosarcoma,semapv:MappingInversion,oaklib,0.5,oio:hasDbXref,oio:hasDbXref,mesh:d006394,
32679,MONDO:0003022,pediatric angiosarcoma,skos:narrowMatch,DOID:0001816,angiosarcoma,semapv:MappingInversion,oaklib,0.8,oio:hasBroadSynonym,rdfs:label,angiosarcoma,
...,...,...,...,...,...,...,...,...,...,...,...,...
70395,MONDO:0004944,neurosyphilis,skos:closeMatch,DOID:9988,tertiary neurosyphilis,semapv:MappingInversion,oaklib,0.5,oio:hasDbXref,oio:hasDbXref,mesh:d009494,
70396,MONDO:0004944,neurosyphilis,skos:closeMatch,DOID:9988,tertiary neurosyphilis,semapv:MappingInversion,oaklib,0.5,oio:hasDbXref,oio:hasDbXref,icd10cm:a52.3,
70397,MONDO:0015691,hypereosinophilic syndrome,skos:closeMatch,DOID:999,hypereosinophilic syndrome,semapv:MappingInversion,oaklib,0.5,oio:hasRelatedSynonym,oio:hasExactSynonym,eosinophilia,
70398,MONDO:0004946,hypoglycemia,skos:closeMatch,DOID:9993,hypoglycemia,semapv:MappingInversion,oaklib,0.5,oio:hasDbXref,oio:hasDbXref,mesh:d007003,


In [62]:
# Write the remaining candidate mappings to mondo_unmapped.tsv in the current
# working directory; an existing file of that name is overwritten ("w" mode).
with open("mondo_unmapped.tsv", "w", encoding="utf8") as f:
    write_table(msdf, f)