In [1]:
%reset -f

In [2]:
import pandas as pd
from sssom.parsers import parse_sssom_table
from sssom import compare_dataframes
from sssom.parsers import split_dataframe
from sssom.util import MappingSetDataFrame
from os.path import join
from oaklib import OntologyResource
from oaklib.implementations import SqlImplementation
import textdistance

In [3]:
lexmatch_file = "../mappings/mondo-sources-all-lexical.sssom.tsv"
mondo_sssom = "../ontology/tmp/mondo.sssom.tsv"
dir_name = "dataframes"
exclusion_files = [
    "../ontology/reports/doid_term_exclusions.txt",
    "../ontology/reports/omim_term_exclusions.txt",
    "../ontology/reports/ordo_term_exclusions.txt",   
    "../ontology/reports/ncit_term_exclusions.txt",
    "../ontology/reports/icd10cm_term_exclusions.txt"
]


In [4]:
# Functions

def add_distance(df, col_name, txt_dist_pkg):
    df\
    .insert(\
            len(df.columns),\
            col_name,\
            df.apply\
            (\
                lambda x: txt_dist_pkg\
                 (\
                    x.subject_label.lower(), x.object_label.lower()\
                    if pd.notnull(x.object_label) else "99"\
                 ),\
                axis=1,\
            )\
           )

def print_prefixes(df):
    object_prefixes = df['object_id'].str.split(':').apply(lambda x: x[0] ).drop_duplicates()
    subject_prefixes = df['subject_id'].str.split(':').apply(lambda x: x[0] ).drop_duplicates()
    predicate_ids = df['predicate_id'].drop_duplicates()
    

    print(f"subject_prefixes:\n {subject_prefixes} \n \
          object_prefixes:\n {object_prefixes} \n \
          predicate_ids:\n {predicate_ids} ")

    
def flip_predicate(predicate_id):
    flip_dict = {
        "skos:closeMatch": "skos:closeMatch",
        "skos:relatedMatch": "skos:relatedMatch",
        "skos:narrowMatch" : "skos:broadMatch",
        "skos:broadMatch" : "skos:narrowMatch",
        "skos:exactMatch" : "skos:exactMatch"
    }
    
    return flip_dict[predicate_id]

def compare_and_comment_df(mondo_df, lex_df):
    df = compare_dataframes(mondo_df, lex_df).combined_dataframe
    df['comment'] = df['comment'].str.replace('UNIQUE_1', "MONDO_MAPPINGS")
    df['comment'] = df['comment'].str.replace('UNIQUE_2', "LEXMATCH")
    return df

def get_unmapped_df(comparison_df):
#     mappings = ["LEXMATCH", "MONDO_MAPPINGS"]
#     unmapped_df = comparison_df[
#         (comparison_df['comment'].str.contains("|".join(mappings)))
#     ]
    unmapped_lex_df = comparison_df[
        comparison_df['object_id'].str.contains("|".join(in_lex_but_not_mondo_list)) &
        comparison_df['comment'].str.contains("LEXMATCH")
    ]
    
    unmapped_mondo_df = comparison_df[
        comparison_df['object_id'].str.contains("|".join(in_mondo_but_not_lex_list)) &
        comparison_df['comment'].str.contains("MONDO_MAPPINGS")
    ]
    
    new_df = pd.concat([unmapped_lex_df, unmapped_mondo_df], axis=0)
    return new_df

def export_unmatched_exact(unmapped_df, match_type, fn):
        
    unmapped_exact = unmapped_df[(unmapped_df['comment'] == match_type) & (unmapped_df['predicate_id'] == 'skos:exactMatch')]
    unmapped_exact = replace_by_mondo_preds(unmapped_exact)
    robot_row_dict = {
        "subject_id":["ID"],
        "predicate_id": [">A oboInOwl:source"],
        "object_id": ["A oboInOwl:hasDbXref"],
        "object_label": [">A sssom:object_label"]
    }
    column_seq = unmapped_exact.columns
    unmapped_exact = pd.concat([pd.DataFrame.from_dict(robot_row_dict, orient='columns'),unmapped_exact] , axis=0)
#     unmapped_exact.index += 1
#     unmapped_exact = unmapped_exact.sort_index()
    unmapped_exact = unmapped_exact[column_seq]
    unmapped_exact.to_csv(join(dir_name, fn), sep='\t', index = False)
    return unmapped_exact.head()

def make_msdf(df, prefix_map, meta):
    combined_msdf = MappingSetDataFrame(df=df, prefix_map=prefix_map, metadata=meta)
    return combined_msdf

def mapped_curie_list(df):
    all_id_df = pd.DataFrame(pd.concat([df['subject_id'], df['object_id']]))\
                .drop_duplicates(ignore_index=True)
    all_id_df = all_id_df[~all_id_df[0].str.startswith('MONDO')]
    return all_id_df[0].to_list()

def replace_by_mondo_preds(df):
    mondo_codes = {
        "skos:exactMatch": "MONDO:equivalentTo",
        "skos:relatedMatch": "MONDO:relatedTo",
        "skos:narrowMatch": "MONDO:mondoIsBroaderThanSource",
        "skos:broadMatch": "MONDO:mondoIsNarrowerThanSource",
    }
    df['predicate_id'] = df['predicate_id'].apply(lambda x: mondo_codes[x] if x in mondo_codes else x)
    return df
    

In [5]:
%%time
msdf_lex = parse_sssom_table(lexmatch_file)
msdf_mondo = parse_sssom_table(mondo_sssom)

# Use OAK to get 'object_label'
ontology_resource = OntologyResource(slug='../ontology/tmp/merged.db', local=True)
oi = SqlImplementation(ontology_resource)

CPU times: user 1min 4s, sys: 894 ms, total: 1min 5s
Wall time: 1min 6s


In [6]:
%%time
exclusion_list = []

for file_path in exclusion_files:
    with open(file_path) as f_input:
        exclusion_list.extend(f_input.read().split('\n'))
        
exclusion_list

CPU times: user 17.6 ms, sys: 3.65 ms, total: 21.2 ms
Wall time: 22.5 ms


['DOID:0040001',
 'DOID:0040002',
 'DOID:0040003',
 'DOID:0040004',
 'DOID:0040005',
 'DOID:0040006',
 'DOID:0040007',
 'DOID:0040008',
 'DOID:0040009',
 'DOID:0040010',
 'DOID:0040011',
 'DOID:0040012',
 'DOID:0040013',
 'DOID:0040014',
 'DOID:0040015',
 'DOID:0040016',
 'DOID:0040017',
 'DOID:0040018',
 'DOID:0040019',
 'DOID:0040020',
 'DOID:0040021',
 'DOID:0040022',
 'DOID:0040023',
 'DOID:0040024',
 'DOID:0040025',
 'DOID:0040026',
 'DOID:0040027',
 'DOID:0040028',
 'DOID:0040029',
 'DOID:0040030',
 'DOID:0040031',
 'DOID:0040032',
 'DOID:0040033',
 'DOID:0040034',
 'DOID:0040035',
 'DOID:0040036',
 'DOID:0040037',
 'DOID:0040038',
 'DOID:0040040',
 'DOID:0040041',
 'DOID:0040042',
 'DOID:0040043',
 'DOID:0040044',
 'DOID:0040045',
 'DOID:0040046',
 'DOID:0040047',
 'DOID:0040048',
 'DOID:0040049',
 'DOID:0040050',
 'DOID:0040051',
 'DOID:0040052',
 'DOID:0040053',
 'DOID:0040054',
 'DOID:0040055',
 'DOID:0040056',
 'DOID:0040057',
 'DOID:0040058',
 'DOID:0040059',
 'DOID:0040061

In [7]:
%%time
all_lex_ids = mapped_curie_list(msdf_lex.df)
all_mondo_ids = mapped_curie_list(msdf_mondo.df)

in_lex_but_not_mondo_list = [x for x in all_lex_ids if x not in all_mondo_ids and x not in exclusion_list]
in_mondo_but_not_lex_list = [x for x in all_mondo_ids if x not in all_lex_ids and x not in exclusion_list]


CPU times: user 8min 10s, sys: 607 ms, total: 8min 11s
Wall time: 8min 12s


In [8]:
in_lex_but_not_mondo_list

['DOID:0060042',
 'DOID:0060204',
 'DOID:0070020',
 'DOID:0070141',
 'DOID:0070212',
 'DOID:0070309',
 'DOID:0070311',
 'DOID:0070326',
 'DOID:0070327',
 'DOID:0070331',
 'DOID:0070333',
 'DOID:0070336',
 'DOID:0070341',
 'DOID:0070343',
 'DOID:0070348',
 'DOID:0070349',
 'DOID:0070351',
 'DOID:0070358',
 'DOID:0070359',
 'DOID:0070360',
 'DOID:0070361',
 'DOID:0070362',
 'DOID:0080380',
 'DOID:0080390',
 'DOID:0080407',
 'DOID:0080410',
 'DOID:0080411',
 'DOID:0080503',
 'DOID:0080511',
 'DOID:0080546',
 'DOID:0080574',
 'DOID:0080594',
 'DOID:0080597',
 'DOID:0080602',
 'DOID:0080606',
 'DOID:0080607',
 'DOID:0080610',
 'DOID:0080612',
 'DOID:0080638',
 'DOID:0080641',
 'DOID:0080644',
 'DOID:0080645',
 'DOID:0080646',
 'DOID:0080647',
 'DOID:0080650',
 'DOID:0080651',
 'DOID:0080652',
 'DOID:0080661',
 'DOID:0080662',
 'DOID:0080663',
 'DOID:0080664',
 'DOID:0080665',
 'DOID:0080666',
 'DOID:0080669',
 'DOID:0080670',
 'DOID:0080671',
 'DOID:0080672',
 'DOID:0080673',
 'DOID:0080674

In [9]:
pd.DataFrame(msdf_mondo.df['object_id'].str.split(':').apply(lambda x: x[0] )).value_counts()

object_id
UMLS         16622
DOID          9814
Orphanet      9783
OMIM          9301
SCTID         9048
MESH          8155
NCIT          6880
ICD10CM       1206
OMIMPS         532
ICD10WHO        18
MedDRA          12
MEDGEN           2
dtype: int64

In [10]:
print_prefixes(msdf_mondo.df)

subject_prefixes:
 0    MONDO
Name: subject_id, dtype: object 
           object_prefixes:
 0           SCTID
1        Orphanet
2            NCIT
3            DOID
4            UMLS
5            MESH
11         OMIMPS
52        ICD10CM
169          OMIM
602        MedDRA
650      ICD10WHO
25327      MEDGEN
Name: object_id, dtype: object 
           predicate_ids:
 0        skos:exactMatch
26109    skos:broadMatch
Name: predicate_id, dtype: object 


In [11]:
%%time
# msdf_mondo.df[msdf_mondo.df['object_id'].str.contains('ICD')]
# "ICD10CM", "MONDO"
# msdf_mondo.df = msdf_mondo.df[(condition_1 & condition_2) | (condition_3 & condition_4)]
# msdf_mondo.df['object_label'] = msdf_mondo.df['object_id'].apply(lambda x: oi.label(x))

msdf_mondo.df['object_label'] = msdf_mondo.df['object_id'].apply(lambda x: oi.label(x))
msdf_mondo.df.head()


CPU times: user 1min 9s, sys: 4.63 s, total: 1min 14s
Wall time: 1min 14s


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label
0,MONDO:0000001,disease or disorder,skos:exactMatch,SCTID:64572001,semapv:UnspecifiedMatching,
1,MONDO:0000001,disease or disorder,skos:exactMatch,Orphanet:377788,semapv:UnspecifiedMatching,
2,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C2991,semapv:UnspecifiedMatching,Disease or Disorder
3,MONDO:0000001,disease or disorder,skos:exactMatch,DOID:4,semapv:UnspecifiedMatching,disease
4,MONDO:0000001,disease or disorder,skos:exactMatch,UMLS:C0012634,semapv:UnspecifiedMatching,


In [12]:
condition_1 = msdf_mondo.df['subject_id'].str.contains("MONDO")
condition_2 = msdf_mondo.df['object_id'].str.contains("ICD10CM")
condition_3 = msdf_mondo.df['object_id'].str.contains('|'.join((["OMIM","OMIMPS"])))
condition_4 = msdf_mondo.df['object_id'].str.contains("Orphanet")
condition_5 = msdf_mondo.df['object_id'].str.contains("DOID")
condition_6 = msdf_mondo.df['object_id'].str.contains("NCIT")

mondo_icd_df = msdf_mondo.df[condition_1 & condition_2]
mondo_omim_df = msdf_mondo.df[condition_1 & condition_3]
mondo_ordo_df = msdf_mondo.df[condition_1 & condition_4]
mondo_doid_df = msdf_mondo.df[condition_1 & condition_5]
mondo_ncit_df = msdf_mondo.df[condition_1 & condition_6]

mondo_icd_df.head()
mondo_omim_df.head()
mondo_ordo_df.head()
mondo_doid_df.head()
mondo_ncit_df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label
2,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C2991,semapv:UnspecifiedMatching,Disease or Disorder
10,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,NCIT:C26691,semapv:UnspecifiedMatching,Adrenocortical Insufficiency
17,MONDO:0000022,nocturnal enuresis,skos:exactMatch,NCIT:C118172,semapv:UnspecifiedMatching,Nocturnal Enuresis
48,MONDO:0000087,polymicrogyria,skos:exactMatch,NCIT:C116936,semapv:UnspecifiedMatching,Polymicrogyria
51,MONDO:0000088,precocious puberty,skos:exactMatch,NCIT:C79704,semapv:UnspecifiedMatching,Precocious Puberty


In [13]:
print_prefixes(msdf_lex.df)

subject_prefixes:
 0             DOID
98290      ICD10CM
115072    ICD10WHO
118862       MONDO
176282        NCIT
Name: subject_id, dtype: object 
           object_prefixes:
 0        MONDO
7         NCIT
10    ICD10WHO
17        DOID
35     ICD10CM
Name: object_id, dtype: object 
           predicate_ids:
 0       skos:exactMatch
6       skos:broadMatch
12      skos:closeMatch
158    skos:narrowMatch
Name: predicate_id, dtype: object 


### Flipping subject_id and object_id if MONDO not subject_id prefix

**Predicate impact**

- skos:closeMatch <=> skos:relatedMatch
- skos:narrowMatch < => skos:broadMatch

In [14]:
condition_1 = msdf_lex.df['subject_id'].str.contains("MONDO")
# condition_2 = msdf_lex.df['object_id'].str.contains("ICD10CM")
# condition_3 = msdf_lex.df['object_id'].str.contains('|'.join((["OMIM","OMIMPS"])))
# condition_4 = msdf_lex.df['object_id'].str.contains('|'.join((["ORDO","Orphanet"])))
# condition_5 = msdf_lex.df['object_id'].str.contains("DOID")
# condition_6 = msdf_lex.df['object_id'].str.contains("NCIT")
condition_mondo_obj = msdf_lex.df['object_id'].str.contains("MONDO")

non_mondo_subjects_df =  pd.DataFrame(msdf_lex.df[(~condition_1 & condition_mondo_obj)])
mondo_subjects_df = pd.DataFrame(msdf_lex.df[(condition_1 & ~condition_mondo_obj)])
print(len(mondo_subjects_df))
non_mondo_subjects_df.head()


8231


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_preprocessing,object_preprocessing
0,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,icdo:9120/3,,
1,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,angiosarcoma,,
2,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasExactSynonym,angiosarcoma,,
3,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasExactSynonym,angiosarcoma,,semapv:RegularExpressionReplacement
4,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,hemangiosarcoma,,


In [15]:
desired_sequence = ['subject_id', 'subject_label', 'object_id',
        'predicate_id','object_label', 'mapping_justification', 'mapping_tool', 'confidence',
        'subject_match_field', 'object_match_field', 'match_string']

new_subjects_df = non_mondo_subjects_df.rename(columns={
    'subject_id':'object_id', 
    'subject_label':'object_label',
    'object_id':'subject_id', 
    'object_label':'subject_label',
    'subject_match_field': 'object_match_field',
    'object_match_field': 'subject_match_field'
    
})


new_subjects_df = new_subjects_df[desired_sequence]
new_subjects_df["predicate_id"] = new_subjects_df["predicate_id"].apply(lambda x: flip_predicate(x))
print(len(new_subjects_df))
new_subjects_df.head()


71567


Unnamed: 0,subject_id,subject_label,object_id,predicate_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
0,MONDO:0016982,angiosarcoma,DOID:0001816,skos:exactMatch,angiosarcoma,semapv:LexicalMatching,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,icdo:9120/3
1,MONDO:0016982,angiosarcoma,DOID:0001816,skos:exactMatch,angiosarcoma,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,angiosarcoma
2,MONDO:0016982,angiosarcoma,DOID:0001816,skos:exactMatch,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,angiosarcoma
3,MONDO:0016982,angiosarcoma,DOID:0001816,skos:exactMatch,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,angiosarcoma
4,MONDO:0016982,angiosarcoma,DOID:0001816,skos:exactMatch,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,hemangiosarcoma


### Combine dfs where subject_id prefix is MONDO

In [16]:
lex_df = pd.concat([mondo_subjects_df,new_subjects_df], ignore_index=True)
print(len(lex_df))

lex_df.head()

79798


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_preprocessing,object_preprocessing
0,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C25457,Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,condition,,
1,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C156809,Medical Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,medical condition,,
2,MONDO:0000004,adrenocortical insufficiency,skos:narrowMatch,ICD10CM:E27.49,Other adrenocortical insufficiency,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasBroadSynonym,adrenocortical insufficiency,,semapv:RegularExpressionReplacement
3,MONDO:0000004,adrenocortical insufficiency,skos:narrowMatch,ICD10CM:E27.49,Other adrenocortical insufficiency,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasBroadSynonym,adrenocortical insufficiency,,semapv:RegularExpressionReplacement
4,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,NCIT:C113211,Hypocortisolemia,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,hypocortisolemia,,


In [17]:
# msdf_lex.df[msdf_lex.df['object_id'].str.contains('ICD')]
condition_1 = lex_df['subject_id'].str.contains("MONDO")
condition_2 = lex_df['object_id'].str.contains("ICD10CM")
condition_3 = lex_df['object_id'].str.contains('|'.join((["OMIM","OMIMPS"])))
condition_4 = lex_df['object_id'].str.contains("Orphanet")
condition_5 = lex_df['object_id'].str.contains("DOID")
condition_6 = lex_df['object_id'].str.contains("NCIT")


mondo_icd_lex_df = lex_df[(condition_1 & condition_2)]
mondo_omim_lex_df = lex_df[(condition_1 & condition_3)]
mondo_ordo_lex_df = lex_df[(condition_1 & condition_4)]
mondo_doid_lex_df = lex_df[(condition_1 & condition_5)]
mondo_ncit_lex_df = lex_df[(condition_1 & condition_6)]

mondo_icd_lex_df.head()
mondo_omim_lex_df.head()
mondo_ordo_lex_df.head()
mondo_doid_lex_df.head()
mondo_ncit_lex_df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_preprocessing,object_preprocessing
0,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C25457,Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,condition,,
1,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C156809,Medical Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,medical condition,,
4,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,NCIT:C113211,Hypocortisolemia,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,hypocortisolemia,,
5,MONDO:0000159,bone marrow failure syndrome,skos:exactMatch,NCIT:C165614,Bone Marrow Failure Syndrome,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,bone marrow failure syndrome,,
6,MONDO:0000328,hyperphosphatemia,skos:exactMatch,NCIT:C113750,Hyperphosphatemia,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,hyperphosphatemia,,


In [18]:
%%time
# comparison_ms_diff = compare_dataframes(msdf_mondo.df, msdf_lex.df)
# comparison_df = comparison_ms_diff.combined_dataframe
# comparison_df['comment'] = comparison_df['comment'].str.replace('UNIQUE_1', "MONDO_MAPPINGS")
# comparison_df['comment'] = comparison_df['comment'].str.replace('UNIQUE_2', "LEXMATCH")
# comparison_df.head()

# Comparisons
icd_comparison_df = compare_and_comment_df(mondo_icd_df, mondo_icd_lex_df)
omim_comparison_df = compare_and_comment_df(mondo_omim_df, mondo_omim_lex_df)
ordo_comparison_df = compare_and_comment_df(mondo_ordo_df, mondo_ordo_lex_df)
doid_comparison_df = compare_and_comment_df(mondo_doid_df, mondo_doid_lex_df)
ncit_comparison_df = compare_and_comment_df(mondo_ncit_df, mondo_ncit_lex_df)


CPU times: user 56 s, sys: 182 ms, total: 56.2 s
Wall time: 56.2 s


In [19]:
icd_comparison_df
omim_comparison_df
ordo_comparison_df
doid_comparison_df
ncit_comparison_df

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_preprocessing,object_preprocessing
71188,MONDO:0700051,liver abscess (disease),skos:exactMatch,NCIT:C99089,semapv:UnspecifiedMatching,Liver Abscess,MONDO_MAPPINGS,,,,,,,
14797,MONDO:0005154,liver disorder,skos:exactMatch,NCIT:C3196,semapv:UnspecifiedMatching,Liver and Intrahepatic Bile Duct Disorder,MONDO_MAPPINGS,,,,,,,
8982,MONDO:0003277,malignant ear neoplasm,skos:exactMatch,NCIT:C9337,semapv:UnspecifiedMatching,Malignant Ear Neoplasm,MONDO_MAPPINGS,,,,,,,
15585,MONDO:0005401,colonic neoplasm,skos:exactMatch,NCIT:C2953,semapv:UnspecifiedMatching,Colon Neoplasm,MONDO_MAPPINGS,,,,,,,
19702,MONDO:0006777,hairy tongue,skos:exactMatch,NCIT:C35075,semapv:UnspecifiedMatching,Hairy Tongue,MONDO_MAPPINGS,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1917,MONDO:0011178,infantile convulsions and choreoathetosis,skos:closeMatch,NCIT:C126650,semapv:LexicalMatching,Infantile Convulsions and Paroxysmal Choreoath...,COMMON_TO_BOTH,oaklib,,oio:hasRelatedSynonym,rdfs:label,infantile convulsions and paroxysmal choreoath...,,
44666,MONDO:0013282,alpha 1-antitrypsin deficiency,skos:exactMatch,NCIT:C84397,semapv:UnspecifiedMatching,Alpha-1 Antitrypsin Deficiency,COMMON_TO_BOTH,,,,,,,
3160,MONDO:0013282,alpha 1-antitrypsin deficiency,skos:closeMatch,NCIT:C84397,semapv:LexicalMatching,Alpha-1 Antitrypsin Deficiency,COMMON_TO_BOTH,oaklib,,oio:hasRelatedSynonym,rdfs:label,alpha-1 antitrypsin deficiency,,
37378,MONDO:0011266,myotonic dystrophy type 2,skos:exactMatch,NCIT:C84680,semapv:UnspecifiedMatching,Dystrophia Myotonica 2,COMMON_TO_BOTH,,,,,,,


In [20]:
ncit_comparison_df['comment'].drop_duplicates()

71188    MONDO_MAPPINGS
6719           LEXMATCH
33408    COMMON_TO_BOTH
Name: comment, dtype: object

### Split into unmapped dataframes

In [21]:
unmapped_icd_df = get_unmapped_df(icd_comparison_df)
unmapped_omim_df = get_unmapped_df(omim_comparison_df)
unmapped_ordo_df = get_unmapped_df(ordo_comparison_df)
unmapped_doid_df = get_unmapped_df(doid_comparison_df)
unmapped_ncit_df = get_unmapped_df(ncit_comparison_df)

unmapped_icd_df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_preprocessing,object_preprocessing
76528,MONDO:0018896,thrombotic thrombocytopenic purpura,skos:exactMatch,ICD10CM:M31.19,semapv:LexicalMatching,Other thrombotic microangiopathy,LEXMATCH,oaklib,0.8,rdfs:label,oio:hasExactSynonym,thrombotic thrombocytopenic purpura,,
1313,MONDO:0007885,Legg-Calve-Perthes disease,skos:narrowMatch,ICD10CM:M91.8,semapv:LexicalMatching,Other juvenile osteochondrosis of hip and pelvis,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,oio:hasBroadSynonym,juvenile osteochondrosis of hip and pelvis,,semapv:RegularExpressionReplacement
72775,MONDO:0004272,urinary bladder tuberculosis,skos:exactMatch,ICD10CM:A18.12,semapv:LexicalMatching,Tuberculosis of bladder,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,rdfs:label,tuberculosis of bladder,,
75645,MONDO:0024643,myocardial disorder,skos:exactMatch,ICD10CM:I51.5,semapv:LexicalMatching,Myocardial degeneration,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,myocardial disease,,
73330,MONDO:0024954,"larva migrans, visceral",skos:closeMatch,ICD10CM:B83.0,semapv:LexicalMatching,Visceral larva migrans,LEXMATCH,oaklib,,oio:hasRelatedSynonym,rdfs:label,visceral larva migrans,,


In [22]:
print_prefixes(unmapped_ncit_df)

subject_prefixes:
 7856    MONDO
Name: subject_id, dtype: object 
           object_prefixes:
 7856    NCIT
Name: object_id, dtype: object 
           predicate_ids:
 7856     skos:closeMatch
972      skos:exactMatch
79662    skos:broadMatch
Name: predicate_id, dtype: object 


In [23]:
# Let reviewers check if this makes sense or no.
export_unmatched_exact(unmapped_icd_df, "LEXMATCH", "unmapped_icd_lex.tsv")
export_unmatched_exact(unmapped_omim_df, "LEXMATCH", "unmapped_omim_lex.tsv")
export_unmatched_exact(unmapped_ordo_df, "LEXMATCH", "unmapped_ordo_lex.tsv")
export_unmatched_exact(unmapped_doid_df, "LEXMATCH", "unmapped_doid_lex.tsv")
export_unmatched_exact(unmapped_ncit_df, "LEXMATCH", "unmapped_ncit_lex.tsv")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicate_id'] = df['predicate_id'].apply(lambda x: mondo_codes[x] if x in mondo_codes else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicate_id'] = df['predicate_id'].apply(lambda x: mondo_codes[x] if x in mondo_codes else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_preprocessing,object_preprocessing
0,ID,,A oboInOwl:hasDbXref,>A oboInOwl:source,,>A sssom:object_label,,,,,,,,
972,MONDO:0006292,malignant mesothelioma,MONDO:equivalentTo,NCIT:C27926,semapv:LexicalMatching,Asbestos-Related Malignant Mesothelioma,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,rdfs:label,asbestos-related malignant mesothelioma,,
980,MONDO:0006363,peritoneal multicystic mesothelioma,MONDO:equivalentTo,NCIT:C3765,semapv:LexicalMatching,Multicystic Mesothelioma,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,rdfs:label,multicystic mesothelioma,,
7216,MONDO:0018881,myelodysplastic syndrome,MONDO:equivalentTo,NCIT:C8648,semapv:LexicalMatching,"Myelodysplastic Syndrome, Unclassifiable",LEXMATCH,oaklib,0.8,oio:hasExactSynonym,rdfs:label,"myelodysplastic syndrome, unclassifiable",,
7236,MONDO:0018989,recurrent acute pancreatitis,MONDO:equivalentTo,NCIT:C184324,semapv:LexicalMatching,Recurrent Acute Pancreatitis,LEXMATCH,oaklib,0.849779,rdfs:label,rdfs:label,recurrent acute pancreatitis,,


In [24]:
# Inspect why these are missing from SSSOM mappings
export_unmatched_exact(unmapped_icd_df, "MONDO_MAPPINGS", "unmapped_icd_mondo.tsv")
export_unmatched_exact(unmapped_omim_df, "MONDO_MAPPINGS", "unmapped_omim_mondo.tsv")
export_unmatched_exact(unmapped_ordo_df, "MONDO_MAPPINGS", "unmapped_ordo_mondo.tsv")
export_unmatched_exact(unmapped_doid_df, "MONDO_MAPPINGS", "unmapped_doid_mondo.tsv")
export_unmatched_exact(unmapped_ncit_df, "MONDO_MAPPINGS", "unmapped_ncit_mondo.tsv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicate_id'] = df['predicate_id'].apply(lambda x: mondo_codes[x] if x in mondo_codes else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicate_id'] = df['predicate_id'].apply(lambda x: mondo_codes[x] if x in mondo_codes else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pre

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_preprocessing,object_preprocessing
0,ID,,A oboInOwl:hasDbXref,>A oboInOwl:source,,>A sssom:object_label,,,,,,,,
71188,MONDO:0700051,liver abscess (disease),MONDO:equivalentTo,NCIT:C99089,semapv:UnspecifiedMatching,Liver Abscess,MONDO_MAPPINGS,,,,,,,
14797,MONDO:0005154,liver disorder,MONDO:equivalentTo,NCIT:C3196,semapv:UnspecifiedMatching,Liver and Intrahepatic Bile Duct Disorder,MONDO_MAPPINGS,,,,,,,
19702,MONDO:0006777,hairy tongue,MONDO:equivalentTo,NCIT:C35075,semapv:UnspecifiedMatching,Hairy Tongue,MONDO_MAPPINGS,,,,,,,
6149,MONDO:0002327,intracranial cavernous angioma,MONDO:equivalentTo,NCIT:C5432,semapv:UnspecifiedMatching,Intracranial Cavernous Hemangioma,MONDO_MAPPINGS,,,,,,,


In [25]:
# # Add distances
# # Add Levenshtein distance [ 0: Perfect match]
# add_distance(unmapped_mondo_exact, "levenshtein_dist", textdistance.levenshtein.distance)
# # Add Jaccard Index [ 0: Perfect match]
# add_distance(unmapped_mondo_exact, "jaccard_index", textdistance.jaccard.distance)
# # Add Monge-Elkan Distance [ 0: Perfect match]
# add_distance(unmapped_mondo_exact, "monge_elkan", textdistance.monge_elkan.distance)
# unmapped_mondo_exact.to_csv(join(dir_name, "unmapped_mondo_exact.tsv"), sep='\t', index = False)
# unmapped_mondo_exact.head()


In [26]:
# combined_msdf_icd = make_msdf(unmapped_icd_df, msdf_lex.prefix_map, msdf_lex.metadata)
# combined_msdf_omim = make_msdf(unmapped_omim_df, msdf_lex.prefix_map, msdf_lex.metadata)
# combined_msdf_ordo = make_msdf(unmapped_ordo_df, msdf_lex.prefix_map, msdf_lex.metadata)
# combined_msdf_doid = make_msdf(unmapped_doid_df, msdf_lex.prefix_map, msdf_lex.metadata)
# combined_msdf_ncit = make_msdf(unmapped_ncit_df, msdf_lex.prefix_map, msdf_lex.metadata)

combined_df = pd.concat(
    [unmapped_icd_df, unmapped_omim_df, unmapped_ordo_df, unmapped_doid_df, unmapped_ncit_df]
)

combined_msdf = make_msdf(combined_df, msdf_lex.prefix_map, msdf_lex.metadata)
combined_msdf.df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_preprocessing,object_preprocessing
76528,MONDO:0018896,thrombotic thrombocytopenic purpura,skos:exactMatch,ICD10CM:M31.19,semapv:LexicalMatching,Other thrombotic microangiopathy,LEXMATCH,oaklib,0.8,rdfs:label,oio:hasExactSynonym,thrombotic thrombocytopenic purpura,,
1313,MONDO:0007885,Legg-Calve-Perthes disease,skos:narrowMatch,ICD10CM:M91.8,semapv:LexicalMatching,Other juvenile osteochondrosis of hip and pelvis,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,oio:hasBroadSynonym,juvenile osteochondrosis of hip and pelvis,,semapv:RegularExpressionReplacement
72775,MONDO:0004272,urinary bladder tuberculosis,skos:exactMatch,ICD10CM:A18.12,semapv:LexicalMatching,Tuberculosis of bladder,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,rdfs:label,tuberculosis of bladder,,
75645,MONDO:0024643,myocardial disorder,skos:exactMatch,ICD10CM:I51.5,semapv:LexicalMatching,Myocardial degeneration,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,myocardial disease,,
73330,MONDO:0024954,"larva migrans, visceral",skos:closeMatch,ICD10CM:B83.0,semapv:LexicalMatching,Visceral larva migrans,LEXMATCH,oaklib,,oio:hasRelatedSynonym,rdfs:label,visceral larva migrans,,


In [27]:
%%time
df_dict = split_dataframe(combined_msdf)

  subject_prefixes = set(msdf.df[SUBJECT_ID].str.split(":", 1, expand=True)[0])
  object_prefixes = set(msdf.df[OBJECT_ID].str.split(":", 1, expand=True)[0])


CPU times: user 9.14 s, sys: 57.7 ms, total: 9.2 s
Wall time: 11.7 s


In [28]:
df_dict.keys()

dict_keys(['mondo_closematch_ncit', 'mondo_exactmatch_ncit', 'mondo_broadmatch_ncit', 'mondo_closematch_doid', 'mondo_exactmatch_doid', 'mondo_broadmatch_doid', 'mondo_narrowmatch_doid', 'mondo_closematch_icd10cm', 'mondo_exactmatch_icd10cm', 'mondo_broadmatch_icd10cm', 'mondo_narrowmatch_icd10cm'])

In [29]:
# mondo_icd_list = [x for x in list(df_dict.keys()) if 'mondo' in x and "icd10" in x]
# mondo_icd_list

In [30]:
for match in df_dict.keys():
    fn = match + ".tsv"
    df_dict[match].df.to_csv(join(dir_name,fn), sep='\t', index = False)

In [31]:
df_dict['mondo_exactmatch_icd10cm'].df

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,comment
0,MONDO:0000022,nocturnal enuresis,skos:exactMatch,ICD10CM:N39.44,Nocturnal enuresis,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,nocturnal enuresis,LEXMATCH
1,MONDO:0000078,acrocephalopolysyndactyly,skos:exactMatch,ICD10CM:Q87.0,Congenital malformation syndromes predominantl...,semapv:LexicalMatching,oaklib,0.800000,rdfs:label,oio:hasExactSynonym,acrocephalopolysyndactyly,LEXMATCH
2,MONDO:0000129,glutaric aciduria,skos:exactMatch,ICD10CM:E72.3,Disorders of lysine and hydroxylysine metabolism,semapv:LexicalMatching,oaklib,0.800000,oio:hasExactSynonym,oio:hasExactSynonym,glutaric aciduria,LEXMATCH
3,MONDO:0000129,glutaric aciduria,skos:exactMatch,ICD10CM:E72.3,Disorders of lysine and hydroxylysine metabolism,semapv:LexicalMatching,oaklib,0.800000,rdfs:label,oio:hasExactSynonym,glutaric aciduria,LEXMATCH
4,MONDO:0000153,transposition of the great arteries,skos:exactMatch,ICD10CM:Q20.3,Discordant ventriculoarterial connection,semapv:LexicalMatching,oaklib,0.800000,oio:hasExactSynonym,oio:hasExactSynonym,transposition of great vessels,LEXMATCH
...,...,...,...,...,...,...,...,...,...,...,...,...
2301,MONDO:0700081,newborn respiratory distress syndrome,skos:exactMatch,ICD10CM:P22.0,Respiratory distress syndrome of newborn,semapv:LexicalMatching,oaklib,0.800000,oio:hasExactSynonym,oio:hasExactSynonym,hyaline membrane disease,LEXMATCH
2302,MONDO:0700081,newborn respiratory distress syndrome,skos:exactMatch,ICD10CM:P22.0,Respiratory distress syndrome of newborn,semapv:LexicalMatching,oaklib,0.800000,oio:hasExactSynonym,rdfs:label,respiratory distress syndrome of newborn,LEXMATCH
2303,MONDO:0800026,"central hypoventilation syndrome, congenital, ...",skos:exactMatch,ICD10CM:G47.35,Congenital central alveolar hypoventilation sy...,semapv:LexicalMatching,oaklib,0.800000,oio:hasExactSynonym,rdfs:label,congenital central alveolar hypoventilation sy...,LEXMATCH
2304,MONDO:0800177,frostbite,skos:exactMatch,ICD10CM:T33-T34,Frostbite (T33-T34),semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,frostbite,LEXMATCH
