In [1]:
%reset -f

In [2]:
import pandas as pd
from sssom.parsers import parse_sssom_table
from sssom import compare_dataframes
from sssom.parsers import split_dataframe
from sssom.util import MappingSetDataFrame
from os.path import join
from oaklib import OntologyResource
from oaklib.implementations import SqlImplementation
import textdistance

In [3]:
lexmatch_file = "../mappings/mondo-sources-all-lexical.sssom.tsv"
mondo_sssom = "../ontology/tmp/mondo.sssom.tsv"
dir_name = "dataframes"

In [4]:
# Functions

def add_distance(df, col_name, txt_dist_pkg):
    df\
    .insert(\
            len(df.columns),\
            col_name,\
            df.apply\
            (\
                lambda x: txt_dist_pkg\
                 (\
                    x.subject_label.lower(), x.object_label.lower()\
                    if pd.notnull(x.object_label) else "99"\
                 ),\
                axis=1,\
            )\
           )
    
def flip_predicate(predicate_id):
    flip_dict = {
        "skos:closeMatch": "skos:relatedMatch",
        "skos:relatedMatch": "skos:closeMatch",
        "skos:narrowMatch" : "skos:broadMatch",
        "skos:broadMatch" : "skos:narrowMatch",
        "skos:exactMatch" : "skos:exactMatch"
    }
    
    return flip_dict[predicate_id]


In [5]:
%%time
msdf_lex = parse_sssom_table(lexmatch_file)
msdf_mondo = parse_sssom_table(mondo_sssom)

# Use OAK to get 'object_label'
ontology_resource = OntologyResource(slug='../ontology/tmp/merged.db', local=True)
oi = SqlImplementation(ontology_resource)

CPU times: user 1min 45s, sys: 2.04 s, total: 1min 47s
Wall time: 1min 50s


In [52]:
mondo_object_prefixes = msdf_mondo.df['object_id'].str.split(':').apply(lambda x: x[0] ).drop_duplicates()
mondo_subject_prefixes = msdf_mondo.df['subject_id'].str.split(':').apply(lambda x: x[0] ).drop_duplicates()
mondo_predicate_ids = msdf_mondo.df['predicate_id'].drop_duplicates()

print(f"mondo_subject_prefixes:\n {mondo_subject_prefixes} \n \
        lex_object_prefixes:\n {mondo_object_prefixes} \n \
        predicate_ids: \n {mondo_predicate_ids}")

mondo_subject_prefixes:
 0    MONDO
Name: subject_id, dtype: object 
         lex_object_prefixes:
 0           SCTID
1            MESH
2            DOID
3        Orphanet
4            NCIT
5            UMLS
11         OMIMPS
51        ICD10CM
172          OMIM
604        MedDRA
657      ICD10WHO
25506      MEDGEN
Name: object_id, dtype: object 
         predicate_ids: 
 0      skos:exactMatch
111    skos:broadMatch
Name: predicate_id, dtype: object


In [29]:
%%time
# msdf_mondo.df[msdf_mondo.df['object_id'].str.contains('ICD')]
# "ICD10CM", "MONDO"
# msdf_mondo.df = msdf_mondo.df[(condition_1 & condition_2) | (condition_3 & condition_4)]
# msdf_mondo.df['object_label'] = msdf_mondo.df['object_id'].apply(lambda x: oi.label(x))

msdf_mondo.df['object_label'] = msdf_mondo.df['object_id'].apply(lambda x: oi.label(x))
msdf_mondo.df.head()


CPU times: user 1min 13s, sys: 5.22 s, total: 1min 18s
Wall time: 1min 19s


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label
0,MONDO:0000001,disease or disorder,skos:exactMatch,SCTID:64572001,Unspecified,
1,MONDO:0000001,disease or disorder,skos:exactMatch,MESH:D004194,Unspecified,
2,MONDO:0000001,disease or disorder,skos:exactMatch,DOID:4,Unspecified,disease
3,MONDO:0000001,disease or disorder,skos:exactMatch,Orphanet:377788,Unspecified,
4,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C2991,Unspecified,Disease or Disorder


In [27]:
condition_1 = msdf_mondo.df['subject_id'].str.contains("MONDO")
condition_2 = msdf_mondo.df['object_id'].str.contains("ICD10CM")
condition_3 = msdf_mondo.df['object_id'].str.contains('|'.join((["OMIM","OMIMPS"])))
condition_4 = msdf_mondo.df['object_id'].str.contains("Orphanet")
condition_5 = msdf_mondo.df['object_id'].str.contains("DOID")

mondo_icd_df = msdf_mondo.df[condition_1 & condition_2]
mondo_omim_df = msdf_mondo.df[condition_1 & condition_3]
mondo_ordo_df = msdf_mondo.df[condition_1 & condition_4]
mondo_doid_df = msdf_mondo.df[condition_1 & condition_5]

mondo_icd_df.head()
mondo_omim_df.head()
mondo_ordo_df.head()
mondo_doid_df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label
2,MONDO:0000001,disease or disorder,skos:exactMatch,DOID:4,Unspecified,disease
8,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,DOID:10493,Unspecified,adrenal cortical hypofunction
14,MONDO:0000009,"inherited bleeding disorder, platelet-type",skos:exactMatch,DOID:2218,Unspecified,blood platelet disease
23,MONDO:0000032,"febrile seizures, familial",skos:exactMatch,DOID:0111297,Unspecified,familial febrile seizures
32,MONDO:0000050,isolated congenital growth hormone deficiency,skos:exactMatch,DOID:0060870,Unspecified,isolated growth hormone deficiency


In [50]:
lex_object_prefixes = msdf_lex.df['object_id'].str.split(':').apply(lambda x: x[0] ).drop_duplicates()
lex_subject_prefixes = msdf_lex.df['subject_id'].str.split(':').apply(lambda x: x[0] ).drop_duplicates()
lex_predicate_ids = msdf_lex.df['predicate_id'].drop_duplicates()

print(f"subject_prefixes:\n {lex_subject_prefixes} \n \
      object_prefixes:\n {lex_object_prefixes} \n \
      predicat_ids:\n {lex_predicate_ids} ")


subject_prefixes:
 0             DOID
130421     ICD10CM
144485    ICD10WHO
147213       MONDO
366217        NCIT
Name: subject_id, dtype: object 
       object_prefixes:
 0        MONDO
7         NCIT
11    ICD10WHO
17        DOID
33     ICD10CM
Name: object_id, dtype: object 
       predicat_ids:
 0       skos:exactMatch
6       skos:broadMatch
14      skos:closeMatch
157    skos:narrowMatch
Name: predicate_id, dtype: object 


### Flipping subject_id and object_id if MONDO not subject_id prefix

**Predicate impact**

- skos:closeMatch <=> skos:relatedMatch
- skos:narrowMatch < => skos:broadMatch

In [92]:
condition_mondo_obj = msdf_lex.df['object_id'].str.contains("MONDO")
non_mondo_subjects_df =  pd.DataFrame(msdf_lex.df[(~condition_1 & condition_mondo_obj)])
mondo_subjects_df = pd.DataFrame(msdf_lex.df[(condition_1 & ~condition_mondo_obj)])
print(len(mondo_subjects_df))
mondo_subjects_df.head()


7638


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
147213,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C25457,Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,condition
147214,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C156809,Medical Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,medical condition
147222,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,NCIT:C113211,Hypocortisolemia,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,hypocortisolemia
147947,MONDO:0000159,bone marrow failure syndrome,skos:exactMatch,NCIT:C165614,Bone Marrow Failure Syndrome,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,bone marrow failure syndrome
149354,MONDO:0000328,hyperphosphatemia,skos:exactMatch,NCIT:C113750,Hyperphosphatemia,semapv:RegularExpressionReplacement,oaklib,0.8,oio:hasExactSynonym,rdfs:label,hyperphosphatemia


In [91]:
desired_sequence = ['subject_id', 'subject_label', 'predicate_id', 'object_id',
       'object_label', 'mapping_justification', 'mapping_tool', 'confidence',
       'subject_match_field', 'object_match_field', 'match_string']

new_subjects_df = non_mondo_subjects_df.rename(columns={
    'subject_id':'object_id', 
    'subject_label':'object_label',
    'object_id':'subject_id', 
    'object_label':'subject_label'
    
})


new_subjects_df = new_subjects_df[desired_sequence]
new_subjects_df["predicate_id"] = new_subjects_df["predicate_id"].apply(lambda x: flip_predicate(x))
print(len(new_subjects_df))
new_subjects_df.head()


103220


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
0,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,mesh:d006394
1,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,angiosarcoma
2,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,hemangiosarcoma
3,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,icdo:9120/3
4,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:RegularExpressionReplacement,oaklib,0.8,rdfs:label,oio:hasExactSynonym,angiosarcoma


### Combine dfs where subjec_id prefix si MONDO

In [93]:
lex_df = pd.concat([mondo_subjects_df,new_subjects_df], ignore_index=True)
print(len(lex_df))

lex_df.head()

110858


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
0,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C25457,Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,condition
1,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C156809,Medical Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,medical condition
2,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,NCIT:C113211,Hypocortisolemia,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,hypocortisolemia
3,MONDO:0000159,bone marrow failure syndrome,skos:exactMatch,NCIT:C165614,Bone Marrow Failure Syndrome,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,bone marrow failure syndrome
4,MONDO:0000328,hyperphosphatemia,skos:exactMatch,NCIT:C113750,Hyperphosphatemia,semapv:RegularExpressionReplacement,oaklib,0.8,oio:hasExactSynonym,rdfs:label,hyperphosphatemia


In [98]:
# msdf_lex.df[msdf_lex.df['object_id'].str.contains('ICD')]
condition_1 = lex_df['subject_id'].str.contains("MONDO")
condition_2 = lex_df['object_id'].str.contains("ICD10CM")
condition_3 = lex_df['object_id'].str.contains('|'.join((["OMIM","OMIMPS"])))
condition_4 = lex_df['object_id'].str.contains("Orphanet")
condition_5 = lex_df['object_id'].str.contains("DOID")


mondo_icd_lex_df = lex_df[(condition_1 & condition_2)]
mondo_omim_lex_df = lex_df[(condition_1 & condition_3)]
mondo_ordo_lex_df = lex_df[(condition_1 & condition_4)]
mondo_doid_lex_df = lex_df[(condition_1 & condition_5)]

mondo_icd_lex_df.head()
mondo_omim_lex_df.head() # NO ROWS
mondo_ordo_lex_df.head() # NO ROWS
mondo_doid_lex_df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
406,MONDO:0004990,breast tumor luminal A or B,skos:narrowMatch,DOID:0060548,luminal breast carcinoma A,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasBroadSynonym,breast tumor luminal
407,MONDO:0004990,breast tumor luminal A or B,skos:narrowMatch,DOID:0060548,luminal breast carcinoma A,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasBroadSynonym,luminal breast cancer
7638,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,mesh:d006394
7639,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.849779,rdfs:label,rdfs:label,angiosarcoma
7640,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,hemangiosarcoma


In [106]:
def compare_and_comment_df(mondo_df, lex_df):
    df = compare_dataframes(mondo_df, lex_df).combined_dataframe
    df['comment'] = df['comment'].str.replace('UNIQUE_1', "MONDO_MAPPINGS")
    df['comment'] = df['comment'].str.replace('UNIQUE_2', "LEXMATCH")
    return df

In [109]:
%%time
# comparison_ms_diff = compare_dataframes(msdf_mondo.df, msdf_lex.df)
# comparison_df = comparison_ms_diff.combined_dataframe
# comparison_df['comment'] = comparison_df['comment'].str.replace('UNIQUE_1', "MONDO_MAPPINGS")
# comparison_df['comment'] = comparison_df['comment'].str.replace('UNIQUE_2', "LEXMATCH")
# comparison_df.head()

# Comparisons
icd_comparison_df = compare_and_comment_df(mondo_icd_df, mondo_icd_lex_df)
omim_comparison_df = compare_and_comment_df(mondo_omim_df, mondo_omim_lex_df)
ordo_comparison_df = compare_and_comment_df(mondo_ordo_df, mondo_ordo_lex_df)
doid_comparison_df = compare_and_comment_df(mondo_doid_df, mondo_doid_lex_df)


CPU times: user 1min 4s, sys: 484 ms, total: 1min 5s
Wall time: 1min 6s


In [114]:
icd_comparison_df
omim_comparison_df
ordo_comparison_df
doid_comparison_df

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
4712,MONDO:0001872,obsolete vestibular nystagmus,skos:exactMatch,DOID:14070,Unspecified,vestibular nystagmus,MONDO_MAPPINGS,,,,,
3301,MONDO:0001445,obsolete neurogenic bladder,skos:exactMatch,DOID:12143,Unspecified,neurogenic bladder,MONDO_MAPPINGS,,,,,
300,MONDO:0000275,obsolete monogenic disease,skos:exactMatch,DOID:0050177,Unspecified,monogenic disease,MONDO_MAPPINGS,,,,,
54783,MONDO:0016761,spondyloepiphyseal dysplasia,skos:exactMatch,DOID:0080027,Unspecified,spondyloepimetaphyseal dysplasia,MONDO_MAPPINGS,,,,,
1235,MONDO:0000780,obsolete apricot allergy,skos:exactMatch,DOID:0060505,Unspecified,apricot allergy,MONDO_MAPPINGS,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
91274,MONDO:0002625,Ewing sarcoma of bone,skos:exactMatch,DOID:3368,semapv:LexicalMatching,Ewing sarcoma of bone,COMMON_TO_BOTH,oaklib,0.800000,oio:hasExactSynonym,oio:hasExactSynonym,bone localized ewing's sarcoma
91275,MONDO:0002625,Ewing sarcoma of bone,skos:exactMatch,DOID:3368,semapv:LexicalMatching,Ewing sarcoma of bone,COMMON_TO_BOTH,oaklib,0.800000,rdfs:label,oio:hasExactSynonym,ewing sarcoma of bone
1719,MONDO:0000993,prostate squamous cell carcinoma,skos:exactMatch,DOID:10287,Unspecified,prostate squamous cell carcinoma,COMMON_TO_BOTH,,,,,
77107,MONDO:0000993,prostate squamous cell carcinoma,skos:exactMatch,DOID:10287,semapv:LexicalMatching,prostate squamous cell carcinoma,COMMON_TO_BOTH,oaklib,0.849779,rdfs:label,rdfs:label,prostate squamous cell carcinoma


### TODO:Split into unmapped dataframes

In [11]:
mappings = ["LEXMATCH", "MONDO_MAPPINGS"]
unmapped_df = comparison_df[
    (comparison_df['comment'].str.contains("|".join(mappings)))
]

unmapped_df.head()

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
62811,MONDO:0019713,non-syndromic limb reduction defect,skos:broadMatch,ICD10CM:Q71.8,Other reduction defects of upper limb,Unspecified,MONDO_MAPPINGS,,,,,
53888,MONDO:0016444,primary anetoderma,skos:broadMatch,ICD10CM:L90.1,Anetoderma of Schweninger-Buzzi,Unspecified,MONDO_MAPPINGS,,,,,
60691,MONDO:0019050,inherited hemoglobinopathy,skos:broadMatch,ICD10CM:D56.8,Other thalassemias,Unspecified,MONDO_MAPPINGS,,,,,
62820,MONDO:0019713,non-syndromic limb reduction defect,skos:broadMatch,ICD10CM:Q72.5,Longitudinal reduction defect of tibia,Unspecified,MONDO_MAPPINGS,,,,,
2401,MONDO:0001195,spotted fever,skos:broadMatch,ICD10CM:A77.0,Spotted fever due to Rickettsia rickettsii,Unspecified,MONDO_MAPPINGS,,,,,


In [12]:
# Let reviewers check if this makes sense or no.
unmapped_lex_exact = unmapped_df[(unmapped_df['comment'] == 'LEXMATCH') & (unmapped_df['predicate_id'] == 'skos:exactMatch')]
unmapped_lex_exact.to_csv(join(dir_name, "unmapped_lex_exact.tsv"), sep='\t', index = False)
unmapped_lex_exact.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
136703,ICD10CM:I85,Esophageal varices,skos:exactMatch,MONDO:0001221,esophageal varices,semapv:LexicalMatching,LEXMATCH,oaklib,0.8,rdfs:label,oio:hasExactSynonym,esophageal varices
136704,ICD10CM:I85,Esophageal varices,skos:exactMatch,MONDO:0001221,esophageal varices,semapv:LexicalMatching,LEXMATCH,oaklib,0.849779,rdfs:label,rdfs:label,esophageal varices
140766,ICD10CM:Q00.2,Iniencephaly,skos:exactMatch,MONDO:0018968,iniencephaly,semapv:LexicalMatching,LEXMATCH,oaklib,0.849779,rdfs:label,rdfs:label,iniencephaly
132314,ICD10CM:C82,Follicular lymphoma,skos:exactMatch,MONDO:0018906,follicular lymphoma,semapv:LexicalMatching,LEXMATCH,oaklib,0.849779,rdfs:label,rdfs:label,follicular lymphoma
132314,MONDO:0018906,Follicular lymphoma,skos:exactMatch,ICD10CM:C82,follicular lymphoma,semapv:LexicalMatching,LEXMATCH,,,,,


In [13]:
# Inspect why these are missing from SSSOM mappings

unmapped_mondo_exact = unmapped_df[(unmapped_df['comment'] == 'MONDO_MAPPINGS')& (unmapped_df['predicate_id'] == 'skos:exactMatch')]
unmapped_mondo_exact['object_label'] = unmapped_mondo_exact['object_id'].apply(lambda x: oi.label(x))
# Add distances
# Add Levenshtein distance [ 0: Perfect match]
add_distance(unmapped_mondo_exact, "levenshtein_dist", textdistance.levenshtein.distance)
# Add Jaccard Index [ 0: Perfect match]
add_distance(unmapped_mondo_exact, "jaccard_index", textdistance.jaccard.distance)
# Add Monge-Elkan Distance [ 0: Perfect match]
add_distance(unmapped_mondo_exact, "monge_elkan", textdistance.monge_elkan.distance)
unmapped_mondo_exact.to_csv(join(dir_name, "unmapped_mondo_exact.tsv"), sep='\t', index = False)
unmapped_mondo_exact.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmapped_mondo_exact['object_label'] = unmapped_mondo_exact['object_id'].apply(lambda x: oi.label(x))


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string,levenshtein_dist,jaccard_index,monge_elkan
4598,MONDO:0001834,visual pathway disorder,skos:exactMatch,ICD10CM:H47.9,Unspecified disorder of visual pathways,Unspecified,MONDO_MAPPINGS,,,,,,32,0.410256,1.956522
14570,MONDO:0005079,polyp,skos:exactMatch,ICD10CM:N84,Polyp of female genital tract,Unspecified,MONDO_MAPPINGS,,,,,,24,0.827586,1.8
61343,MONDO:0019228,inborn disorder of histidine metabolism,skos:exactMatch,ICD10CM:E70.40,"Disorders of histidine metabolism, unspecified",Unspecified,MONDO_MAPPINGS,,,,,,21,0.3,1.974359
71420,MONDO:0056798,disorder of appendix,skos:exactMatch,ICD10CM:K35-K38,Diseases of appendix (K35-K38),Unspecified,MONDO_MAPPINGS,,,,,,14,0.529412,1.955
6213,MONDO:0002333,splenic abscess,skos:exactMatch,ICD10CM:D73.3,Abscess of spleen,Unspecified,MONDO_MAPPINGS,,,,,,14,0.315789,1.937778


In [14]:
combined_msdf = MappingSetDataFrame(df=comparison_df, prefix_map=msdf_lex.prefix_map, metadata=msdf_lex.metadata)
combined_msdf.df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
62811,MONDO:0019713,non-syndromic limb reduction defect,skos:broadMatch,ICD10CM:Q71.8,Other reduction defects of upper limb,Unspecified,MONDO_MAPPINGS,,,,,
53888,MONDO:0016444,primary anetoderma,skos:broadMatch,ICD10CM:L90.1,Anetoderma of Schweninger-Buzzi,Unspecified,MONDO_MAPPINGS,,,,,
60691,MONDO:0019050,inherited hemoglobinopathy,skos:broadMatch,ICD10CM:D56.8,Other thalassemias,Unspecified,MONDO_MAPPINGS,,,,,
62820,MONDO:0019713,non-syndromic limb reduction defect,skos:broadMatch,ICD10CM:Q72.5,Longitudinal reduction defect of tibia,Unspecified,MONDO_MAPPINGS,,,,,
2401,MONDO:0001195,spotted fever,skos:broadMatch,ICD10CM:A77.0,Spotted fever due to Rickettsia rickettsii,Unspecified,MONDO_MAPPINGS,,,,,


In [15]:
%%time
df_dict = split_dataframe(combined_msdf)

  subject_prefixes = set(msdf.df[SUBJECT_ID].str.split(":", 1, expand=True)[0])
  object_prefixes = set(msdf.df[OBJECT_ID].str.split(":", 1, expand=True)[0])


CPU times: user 6.16 s, sys: 40.8 ms, total: 6.21 s
Wall time: 9.6 s


In [16]:
mondo_icd_list = [x for x in list(df_dict.keys()) if 'mondo' in x and "icd10" in x]
mondo_icd_list

['icd10cm_closematch_mondo',
 'icd10cm_narrowmatch_mondo',
 'icd10cm_broadmatch_mondo',
 'icd10cm_exactmatch_mondo',
 'mondo_closematch_icd10cm',
 'mondo_narrowmatch_icd10cm',
 'mondo_broadmatch_icd10cm',
 'mondo_exactmatch_icd10cm']

In [17]:
for match in mondo_icd_list:
    fn = match + ".tsv"
    df_dict[match].df.to_csv(join(dir_name,fn), sep='\t', index = False)

In [18]:
df_dict['mondo_exactmatch_icd10cm'].df

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,comment
0,MONDO:0000022,Nocturnal enuresis,skos:exactMatch,ICD10CM:N39.44,nocturnal enuresis,semapv:LexicalMatching,LEXMATCH
1,MONDO:0000078,Congenital malformation syndromes predominantl...,skos:exactMatch,ICD10CM:Q87.0,acrocephalopolysyndactyly,semapv:LexicalMatching,LEXMATCH
2,MONDO:0000088,precocious puberty,skos:exactMatch,ICD10CM:E30.1,Precocious puberty,Unspecified,COMMON_TO_BOTH
3,MONDO:0000088,Precocious puberty,skos:exactMatch,ICD10CM:E30.1,precocious puberty,semapv:LexicalMatching,COMMON_TO_BOTH
4,MONDO:0000129,Disorders of lysine and hydroxylysine metabolism,skos:exactMatch,ICD10CM:E72.3,glutaric aciduria,semapv:RegularExpressionReplacement,LEXMATCH
...,...,...,...,...,...,...,...
3928,MONDO:0800026,Congenital central alveolar hypoventilation sy...,skos:exactMatch,ICD10CM:G47.35,"central hypoventilation syndrome, congenital, ...",semapv:LexicalMatching,LEXMATCH
3929,MONDO:0800029,interstitial lung disease 2,skos:exactMatch,ICD10CM:J84.112,Idiopathic pulmonary fibrosis,Unspecified,COMMON_TO_BOTH
3930,MONDO:0800029,Idiopathic pulmonary fibrosis,skos:exactMatch,ICD10CM:J84.112,interstitial lung disease 2,semapv:LexicalMatching,COMMON_TO_BOTH
3931,MONDO:8000010,antiphospholipid syndrome,skos:exactMatch,ICD10CM:D68.61,Antiphospholipid syndrome,Unspecified,COMMON_TO_BOTH
