In [1]:
%reset -f

In [2]:
import pandas as pd
from sssom.parsers import parse_sssom_table
from sssom import compare_dataframes
from sssom.parsers import split_dataframe
from sssom.util import MappingSetDataFrame
from os.path import join
from oaklib import OntologyResource
from oaklib.implementations import SqlImplementation
import textdistance

In [3]:
lexmatch_file = "../mappings/mondo-sources-all-lexical.sssom.tsv"
mondo_sssom = "../ontology/tmp/mondo.sssom.tsv"
dir_name = "dataframes"

In [4]:
# Functions

def add_distance(df, col_name, txt_dist_pkg):
    df\
    .insert(\
            len(df.columns),\
            col_name,\
            df.apply\
            (\
                lambda x: txt_dist_pkg\
                 (\
                    x.subject_label.lower(), x.object_label.lower()\
                    if pd.notnull(x.object_label) else "99"\
                 ),\
                axis=1,\
            )\
           )

In [5]:
%%time
msdf_lex = parse_sssom_table(lexmatch_file)
msdf_mondo = parse_sssom_table(mondo_sssom)

# Use OAK to get 'object_label'
ontology_resource = OntologyResource(slug='../ontology/tmp/merged.db', local=True)
oi = SqlImplementation(ontology_resource)

CPU times: user 31.3 s, sys: 454 ms, total: 31.7 s
Wall time: 33.5 s


In [6]:
# msdf_mondo.df[msdf_mondo.df['object_id'].str.contains('ICD')]
# "ICD10CM", "MONDO"
condition_1 = msdf_mondo.df['object_id'].str.contains("ICD10CM")
condition_2 = msdf_mondo.df['subject_id'].str.contains("MONDO")
condition_3 = msdf_mondo.df['object_id'].str.contains("MONDO")
condition_4 = msdf_mondo.df['subject_id'].str.contains("ICD10CM")
msdf_mondo.df = msdf_mondo.df[(condition_1 & condition_2) | (condition_3 & condition_4)]
msdf_mondo.df['object_label'] = msdf_mondo.df['object_id'].apply(lambda x: oi.label(x))
msdf_mondo.df.head()

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label
51,MONDO:0000088,precocious puberty,skos:exactMatch,ICD10CM:E30.1,Unspecified,Precocious puberty
111,MONDO:0000153,transposition of the great arteries,skos:broadMatch,ICD10CM:Q20.3,Unspecified,Discordant ventriculoarterial connection
113,MONDO:0000153,transposition of the great arteries,skos:broadMatch,ICD10CM:Q20.5,Unspecified,Discordant atrioventricular connection
202,MONDO:0000226,mineral metabolism disease,skos:exactMatch,ICD10CM:E83,Unspecified,Disorders of mineral metabolism
222,MONDO:0000237,erysipeloid,skos:exactMatch,ICD10CM:A26,Unspecified,Erysipeloid


In [7]:
# msdf_lex.df[msdf_lex.df['object_id'].str.contains('ICD')]
condition_1 = msdf_lex.df['object_id'].str.contains("ICD10CM")
condition_2 = msdf_lex.df['subject_id'].str.contains("MONDO")
condition_3 = msdf_lex.df['object_id'].str.contains("MONDO")
condition_4 = msdf_lex.df['subject_id'].str.contains("ICD10CM")
msdf_lex.df = msdf_lex.df[(condition_1 & condition_2) | (condition_3 & condition_4)]
msdf_lex.df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
28311,ICD10CM:A07.3,Isosporiasis,skos:exactMatch,MONDO:0018769,isosporiasis,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,Isosporosis
28326,ICD10CM:A15.0,Tuberculosis of lung,skos:closeMatch,MONDO:0021953,tuberculous fibrosis of lung,semapv:LexicalMatching,oaklib,0.5,oio:hasExactSynonym,oio:hasRelatedSynonym,Tuberculous fibrosis of lung
28346,ICD10CM:A19,Miliary tuberculosis,skos:exactMatch,MONDO:0005848,miliary tuberculosis,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,disseminated tuberculosis
28347,ICD10CM:A19,Miliary tuberculosis,skos:exactMatch,MONDO:0005848,miliary tuberculosis,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,generalized tuberculosis
28363,ICD10CM:A21,Tularemia,skos:closeMatch,MONDO:0018077,tularemia,semapv:LexicalMatching,oaklib,0.5,oio:hasExactSynonym,oio:hasRelatedSynonym,rabbit fever


In [8]:
duplicate_df_lex = msdf_lex.df.copy()
duplicate_df_mondo = msdf_mondo.df.head().copy()

duplicate_df_lex = duplicate_df_lex.rename(columns={"subject_id": "object_id","object_id": "subject_id" })
duplicate_df_mondo = duplicate_df_mondo.rename(columns={"subject_id": "object_id","object_id": "subject_id" })


msdf_lex.df = pd.concat([duplicate_df_lex, msdf_lex.df]).drop_duplicates()
msdf_mondo.df = pd.concat([duplicate_df_mondo, msdf_mondo.df]).drop_duplicates()
msdf_mondo.df.head()

Unnamed: 0,object_id,subject_label,predicate_id,subject_id,mapping_justification,object_label
51,MONDO:0000088,precocious puberty,skos:exactMatch,ICD10CM:E30.1,Unspecified,Precocious puberty
111,MONDO:0000153,transposition of the great arteries,skos:broadMatch,ICD10CM:Q20.3,Unspecified,Discordant ventriculoarterial connection
113,MONDO:0000153,transposition of the great arteries,skos:broadMatch,ICD10CM:Q20.5,Unspecified,Discordant atrioventricular connection
202,MONDO:0000226,mineral metabolism disease,skos:exactMatch,ICD10CM:E83,Unspecified,Disorders of mineral metabolism
222,MONDO:0000237,erysipeloid,skos:exactMatch,ICD10CM:A26,Unspecified,Erysipeloid


In [9]:
comparison_ms_diff = compare_dataframes(msdf_mondo.df, msdf_lex.df)
comparison_df = comparison_ms_diff.combined_dataframe
comparison_df['comment'] = comparison_df['comment'].str.replace('UNIQUE_1', "MONDO_MAPPINGS")
comparison_df['comment'] = comparison_df['comment'].str.replace('UNIQUE_2', "LEXMATCH")
comparison_df.head()

Unnamed: 0,object_id,subject_label,predicate_id,subject_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
12617,ICD10CM:J31.0,chronic rhinitis,skos:exactMatch,MONDO:0004514,Unspecified,Chronic rhinitis,MONDO_MAPPINGS,,,,,
51349,ICD10CM:B69.8,cysticercosis,skos:broadMatch,MONDO:0015484,Unspecified,Cysticercosis of other sites,MONDO_MAPPINGS,,,,,
67315,ICD10CM:B83.4,internal hirudiniasis,skos:exactMatch,MONDO:0024302,Unspecified,Internal hirudiniasis,MONDO_MAPPINGS,,,,,
2914,ICD10CM:M12.3,palindromic rheumatism,skos:exactMatch,MONDO:0001332,Unspecified,Palindromic rheumatism,MONDO_MAPPINGS,,,,,
59306,ICD10CM:E00.2,congenital hypothyroidism,skos:broadMatch,MONDO:0018612,Unspecified,"Congenital iodine-deficiency syndrome, mixed type",MONDO_MAPPINGS,,,,,


In [10]:
mappings = ["LEXMATCH", "MONDO_MAPPINGS"]
unmapped_df = comparison_df[
    (comparison_df['comment'].str.contains("|".join(mappings)))
]

unmapped_df.head()

Unnamed: 0,object_id,subject_label,predicate_id,subject_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
12617,ICD10CM:J31.0,chronic rhinitis,skos:exactMatch,MONDO:0004514,Unspecified,Chronic rhinitis,MONDO_MAPPINGS,,,,,
51349,ICD10CM:B69.8,cysticercosis,skos:broadMatch,MONDO:0015484,Unspecified,Cysticercosis of other sites,MONDO_MAPPINGS,,,,,
67315,ICD10CM:B83.4,internal hirudiniasis,skos:exactMatch,MONDO:0024302,Unspecified,Internal hirudiniasis,MONDO_MAPPINGS,,,,,
2914,ICD10CM:M12.3,palindromic rheumatism,skos:exactMatch,MONDO:0001332,Unspecified,Palindromic rheumatism,MONDO_MAPPINGS,,,,,
59306,ICD10CM:E00.2,congenital hypothyroidism,skos:broadMatch,MONDO:0018612,Unspecified,"Congenital iodine-deficiency syndrome, mixed type",MONDO_MAPPINGS,,,,,


In [17]:
# Let reviewers check if this makes sense or no.
unmapped_lex_exact = unmapped_df[(unmapped_df['comment'] == 'LEXMATCH') & (unmapped_df['predicate_id'] == 'skos:exactMatch')]
unmapped_lex_exact.to_csv(join(dir_name, "unmapped_lex_exact.tsv"), sep='\t', index = False)
unmapped_lex_exact.head()


Unnamed: 0,object_id,subject_label,predicate_id,subject_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
34197,ICD10CM:Q20.3,Discordant ventriculoarterial connection,skos:exactMatch,MONDO:0018089,semapv:LexicalMatching,double outlet right ventricle,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,Dextrotransposition of aorta
34197,MONDO:0018089,Discordant ventriculoarterial connection,skos:exactMatch,ICD10CM:Q20.3,semapv:LexicalMatching,double outlet right ventricle,LEXMATCH,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,Dextrotransposition of aorta
30467,ICD10CM:G25.82,Stiff-man syndrome,skos:exactMatch,MONDO:0008491,semapv:LexicalMatching,stiff-person syndrome,LEXMATCH,oaklib,0.8,rdfs:label,oio:hasExactSynonym,Stiff-man syndrome
30467,MONDO:0008491,Stiff-man syndrome,skos:exactMatch,ICD10CM:G25.82,semapv:LexicalMatching,stiff-person syndrome,LEXMATCH,oaklib,0.8,rdfs:label,oio:hasExactSynonym,Stiff-man syndrome
34802,ICD10CM:Q93.82,Williams syndrome,skos:exactMatch,MONDO:0008678,semapv:LexicalMatching,Williams syndrome,LEXMATCH,oaklib,0.849779,rdfs:label,rdfs:label,Williams syndrome


In [11]:
# Inspect why these are missing from SSSOM mappings

unmapped_mondo_exact = unmapped_df[(unmapped_df['comment'] == 'MONDO_MAPPINGS')& (unmapped_df['predicate_id'] == 'skos:exactMatch')]
unmapped_mondo_exact['object_label'] = unmapped_mondo_exact['object_id'].apply(lambda x: oi.label(x))
# Add distances
# Add Levenshtein distance [ 0: Perfect match]
add_distance(unmapped_mondo_exact, "levenshtein_dist", textdistance.levenshtein.distance)
# Add Jaccard Index [ 0: Perfect match]
add_distance(unmapped_mondo_exact, "jaccard_index", textdistance.jaccard.distance)
# Add Monge-Elkan Distance [ 0: Perfect match]
add_distance(unmapped_mondo_exact, "monge_elkan", textdistance.monge_elkan.distance)
unmapped_mondo_exact.to_csv(join(dir_name, "unmapped_mondo_exact.tsv"), sep='\t', index = False)
unmapped_mondo_exact.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmapped_mondo_exact['object_label'] = unmapped_mondo_exact['object_id'].apply(lambda x: oi.label(x))


Unnamed: 0,object_id,subject_label,predicate_id,subject_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string,levenshtein_dist,jaccard_index,monge_elkan
12617,ICD10CM:J31.0,chronic rhinitis,skos:exactMatch,MONDO:0004514,Unspecified,Chronic rhinitis,MONDO_MAPPINGS,,,,,,0,0.0,0.0
67315,ICD10CM:B83.4,internal hirudiniasis,skos:exactMatch,MONDO:0024302,Unspecified,Internal hirudiniasis,MONDO_MAPPINGS,,,,,,0,0.0,0.0
2914,ICD10CM:M12.3,palindromic rheumatism,skos:exactMatch,MONDO:0001332,Unspecified,Palindromic rheumatism,MONDO_MAPPINGS,,,,,,0,0.0,0.0
11846,ICD10CM:F52.31,female orgasmic disorder,skos:exactMatch,MONDO:0004258,Unspecified,Female orgasmic disorder,MONDO_MAPPINGS,,,,,,0,0.0,0.0
4470,ICD10CM:B85.3,Pthirus pubis infestation,skos:exactMatch,MONDO:0001794,Unspecified,Phthiriasis,MONDO_MAPPINGS,,,,,,18,0.615385,1.976


In [12]:
combined_msdf = MappingSetDataFrame(df=comparison_df, prefix_map=msdf_lex.prefix_map, metadata=msdf_lex.metadata)
combined_msdf.df.head()

Unnamed: 0,object_id,subject_label,predicate_id,subject_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
12617,ICD10CM:J31.0,chronic rhinitis,skos:exactMatch,MONDO:0004514,Unspecified,Chronic rhinitis,MONDO_MAPPINGS,,,,,
51349,ICD10CM:B69.8,cysticercosis,skos:broadMatch,MONDO:0015484,Unspecified,Cysticercosis of other sites,MONDO_MAPPINGS,,,,,
67315,ICD10CM:B83.4,internal hirudiniasis,skos:exactMatch,MONDO:0024302,Unspecified,Internal hirudiniasis,MONDO_MAPPINGS,,,,,
2914,ICD10CM:M12.3,palindromic rheumatism,skos:exactMatch,MONDO:0001332,Unspecified,Palindromic rheumatism,MONDO_MAPPINGS,,,,,
59306,ICD10CM:E00.2,congenital hypothyroidism,skos:broadMatch,MONDO:0018612,Unspecified,"Congenital iodine-deficiency syndrome, mixed type",MONDO_MAPPINGS,,,,,


In [13]:
%%time
df_dict = split_dataframe(combined_msdf)



CPU times: user 1.48 s, sys: 26.2 ms, total: 1.51 s
Wall time: 3.6 s


In [14]:
mondo_icd_list = [x for x in list(df_dict.keys()) if 'mondo' in x and "icd10" in x]
mondo_icd_list

['icd10cm_closematch_mondo',
 'icd10cm_broadmatch_mondo',
 'icd10cm_narrowmatch_mondo',
 'icd10cm_exactmatch_mondo',
 'mondo_closematch_icd10cm',
 'mondo_broadmatch_icd10cm',
 'mondo_narrowmatch_icd10cm',
 'mondo_exactmatch_icd10cm']

In [15]:
for match in mondo_icd_list:
    fn = match + ".tsv"
    df_dict[match].df.to_csv(join(dir_name,fn), sep='\t', index = False)

In [16]:
df_dict['mondo_exactmatch_icd10cm'].df

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,comment
0,MONDO:0000088,precocious puberty,skos:exactMatch,ICD10CM:E30.1,Precocious puberty,Unspecified,,,,,,MONDO_MAPPINGS
1,MONDO:0000226,mineral metabolism disease,skos:exactMatch,ICD10CM:E83,Disorders of mineral metabolism,Unspecified,,,,,,MONDO_MAPPINGS
2,MONDO:0000237,erysipeloid,skos:exactMatch,ICD10CM:A26,Erysipeloid,Unspecified,,,,,,MONDO_MAPPINGS
3,MONDO:0000241,Dietary selenium deficiency,skos:exactMatch,ICD10CM:E59,Keshan disease,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,Keshan disease,LEXMATCH
4,MONDO:0000245,tinea imbricata,skos:exactMatch,ICD10CM:B35.5,Tinea imbricata,Unspecified,,,,,,MONDO_MAPPINGS
...,...,...,...,...,...,...,...,...,...,...,...,...
1625,MONDO:0100471,vitamin D deficiency,skos:exactMatch,ICD10CM:E55,Vitamin D deficiency,Unspecified,,,,,,MONDO_MAPPINGS
1626,MONDO:0100491,generalized pustular psoriasis,skos:exactMatch,ICD10CM:L40.1,Generalized pustular psoriasis,Unspecified,,,,,,MONDO_MAPPINGS
1627,MONDO:0500000,Other hypereosinophilic syndrome,skos:exactMatch,ICD10CM:D72.118,episodic angioedema with eosinophilia,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,Gleich's syndrome,LEXMATCH
1628,MONDO:0800029,interstitial lung disease 2,skos:exactMatch,ICD10CM:J84.112,Idiopathic pulmonary fibrosis,Unspecified,,,,,,MONDO_MAPPINGS
