When attempting to map lab tests in omop.measurements onto HPO annotations, different mapping rates are achieved depending on the dataset version. That goes against expectation since each consecutive dataset version contains more patients than the previous version. Assuming that no measurements are lost or erased intentionally, the number of unique measurements should either stay stagnant, or increase.  

However, this didn't seem to be the case when comparing omop.measurements as of June, 2021 vs. August 31, 2021. One can suspect that this can be caused by filtering procedures that I employed for my cohort analysis.  
  
Moreover, the number of successfully mapped measurements was higher in an April 1 version of the dataset, than the June 1 version.  
  
To tease apart what's happening, I'll collect the number of unique clinical measurements present in all downloaded versions of the dataset. This time, I won't filter patients, I'll simply load the measurements table. Then, I'll load the HPO annotations and characterize the mapping of each dataset version.

In [1]:
import numpy as np
import sys
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec

In [2]:
sys.path.append('./../../general/src')

from access_script_data import patients

sys.path.append('./../../tst/src/')
import utils
import feature_conversion

sys.path.append('./../../flm/src/')
import flm_tools

sys.path.append('./../../flm/')
import lohpo

In [3]:
figure_output_directory = r'../file_outputs_from_notebooks/figures/'
data_output_directory = r'../file_outputs_from_notebooks/data/'
file_output_directory = r'../file_outputs_from_notebooks/'

#### Manually creating a dictionary of dataset versions

In [5]:
versions = {'June 2020': '2020-06-25/200724_polish_20200625_tables',
           'July 2020': '2020-07-25/200728_digest',
           'August 2020': '2020-08-28/200926_digest',
           'September 2020': '2020-09-28/200928_digest',
           'January 2021': '2021-01-08/210108_digest',
           'March 2021': '2021-03-05/210305_digest',
           'April 2021': '2021-04-01/210401_digest',
           'June 2021': '2021-06-02/210602_digest_including_basic_endpoints_and_sofa',
           'August 2021': '2021-08-31/210831_digest',
           'October 2021': '2021-10-07/211007_digest'}

In [6]:
# Loading a dictionary containing the mapped LOINC to HPO codes
value = lohpo.AnnotationLoader.load()

# Loading a dictionary containing the HPO terms associated with the HPO codes
header, result = lohpo.HpoTermListLoader.load()

In [7]:
agg_results = []
n=0

for key, version in versions.items():
    
    # Reading in the dataset
    measurements = patients.modified_edw_rc('measurement', revision=version,
                                            columns=['measurement_concept_name',
                                                     'measurement_vocabulary_id',
                                                     'measurement_concept_code'])
    
    # Taking away entries without a name
    g = measurements['measurement_concept_name'] == 'No matching concept'
    measurements.loc[g, 'measurement_concept_name'] = np.nan
    measurements = measurements.dropna(subset=['measurement_concept_name'])
    
    # Can't map measurements not encoded in LOINC
    f = measurements['measurement_vocabulary_id'] == 'LOINC'
    loinc_measurements = measurements.loc[f]
    not_loinc_measurements = measurements.loc[~f]
    
    
    # Obtaining the associated HPO codes and annotations
    codes = set(loinc_measurements["measurement_concept_code"])
    frames = []
    
    for code in codes:
        f = loinc_measurements['measurement_concept_code'] == code
        df = loinc_measurements.loc[f].copy()
        loincId = lohpo.LoincId.from_code(code)
        loinc2hpo = value.get(loincId, None)
    
        if loinc2hpo is not None:
            for hpo_term in loinc2hpo.candidateHpoTerms.items():
                if hpo_term[0].code == 'N':
                    df.loc[:, 'HPO_code'] = hpo_term[1].id
                    hpo_annot = result.get(hpo_term[1].id)
                    df.loc[:, 'HPO_annotation'] = hpo_annot.name
                
            frames.append(df)
        else:
            df.loc[:, 'HPO_code'] = None
            df.loc[:, 'HPO_annotation'] = None
            frames.append(df)
        
    clean_all_table = pd.concat(frames)
    
    
    # Defining a pandas table (complete_HPO) containing only successfully mapped entries
    h = clean_all_table['HPO_annotation'].notnull()
    complete_HPO = clean_all_table.loc[h]
    
    # Getting a table of unmapped LOINC measurements
    j = clean_all_table['HPO_annotation'].isnull()
    non_mapped_loincs = list(clean_all_table.loc[j, 'measurement_concept_code'].drop_duplicates())
    non_mapped_names = list(clean_all_table.loc[j, 'measurement_concept_name'].drop_duplicates())
    
    mapped_entries = len(complete_HPO)
    all_entries = len(loinc_measurements)
    unique_measurements = len(measurements['measurement_concept_name'].drop_duplicates())
    mapped_measurements = len(complete_HPO['measurement_concept_name'].drop_duplicates())
    
    results = {'database_version': key,
               'non_loinc_entries': len(not_loinc_measurements['measurement_concept_name']),
               'total_loinc_entries': all_entries,
               'mapped_loinc_entries': mapped_entries,
              'pct_mapped_loinc_entries': round((mapped_entries/all_entries)*100, 2),
               'non_loinc_unique_measurements': len(not_loinc_measurements['measurement_concept_name'].drop_duplicates()),
               'non_loinc_vocabularies': list(not_loinc_measurements['measurement_vocabulary_id'].drop_duplicates()),
              'unique_loinc_measurements': unique_measurements,
              'mapped_unique_loinc_measurements': mapped_measurements,
              'pct_mapped_unique_loinc_measurements': round((mapped_measurements/unique_measurements)*100, 2)}
    
    if n == 0:
        non_mapped_loinc_codes = non_mapped_loincs.copy()
        non_mapped_names = non_mapped_names.copy()
    else:
        non_mapped_loinc_codes.extend(non_mapped_loincs)
        non_mapped_names.extend(non_mapped_names)
    
    n += 1
    
    agg_results.append(results)

In [8]:
report = pd.DataFrame(agg_results)
report

Unnamed: 0,database_version,non_loinc_entries,total_loinc_entries,mapped_loinc_entries,pct_mapped_loinc_entries,non_loinc_unique_measurements,non_loinc_vocabularies,unique_loinc_measurements,mapped_unique_loinc_measurements,pct_mapped_unique_loinc_measurements
0,June 2020,0,3449907,2739066,79.4,0,[],1006,260,25.84
1,July 2020,0,2677810,2126622,79.42,0,[],977,255,26.1
2,August 2020,0,3755521,2983974,79.46,0,[],1013,261,25.77
3,September 2020,0,3428354,3030097,88.38,0,[],978,305,31.19
4,January 2021,0,4148840,3663630,88.3,0,[],994,307,30.89
5,March 2021,0,4515118,3984912,88.26,0,[],1005,307,30.55
6,April 2021,0,4703676,4151897,88.27,0,[],1040,308,29.62
7,June 2021,14464,5052613,4408353,87.25,3,[SNOMED],1188,318,26.77
8,August 2021,847,4223275,3764424,89.14,3,[SNOMED],1181,281,23.79
9,October 2021,1137,4437872,3962124,89.28,3,[SNOMED],1253,304,24.26


In [None]:
# report.to_csv(data_output_directory+'210911_characterizing_loinc2hpo_across_time.csv', index=False)

#### Getting exhaustive list of unmapped LOINC codes and names

In [9]:
aux = pd.Series(non_mapped_loinc_codes)
f = aux.value_counts()
non_mapped_loinc_codes = list(f[f == f.max()].index)

In [10]:
len(non_mapped_loinc_codes)

564

In [None]:
# Load the master LOINC file.
# Objective will be to create a dataframe of unmapped LOINC codes,
# with columns ['loinc_code', 'loinc_name', 'rationale']

In [13]:
loinc_df = pd.read_csv("../Loinc.csv")

g = loinc_df['LOINC_NUM'].isin(non_mapped_loinc_codes)
all_unmapped_loincs = loinc_df.loc[g, ['LOINC_NUM', 'LONG_COMMON_NAME']]

all_unmapped_loincs = all_unmapped_loincs.rename(columns={"LOINC_NUM": "loinc_code",
                                                          "LONG_COMMON_NAME": "loinc_name"})
all_unmapped_loincs['rationale'] = ''

  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
all_unmapped_loincs

Unnamed: 0,loinc_code,loinc_name,rationale
368,10330-9,Monocytes/100 leukocytes in Body fluid by Manu...,
372,10334-1,Cancer Ag 125 [Units/volume] in Serum or Plasma,
373,10335-8,Color of Cerebral spinal fluid,
413,10371-3,Bite cells [Presence] in Blood by Light micros...,
416,10374-7,Helmet cells [Presence] in Blood by Light micr...,
...,...,...,...
93955,9619-8,Triglyceride [Mass/volume] in Pleural fluid,
93983,9622-2,Phytonadione [Mass/volume] in Serum or Plasma,
96272,9829-3,Beta globulin [Mass/time] in 24 hour Urine,
96281,9830-1,Cholesterol.total/Cholesterol in HDL [Mass Rat...,


In [None]:
unmapped_loincs_output_directory = r'../file_outputs_from_notebooks/unmapped_loinc_codes_of_interest/'
all_unmapped_loincs.to_csv(unmapped_loincs_output_directory+"all_unmapped_loincs.csv", index=False)

In [12]:
import json

with open(data_output_directory+'loinc_codes_full.json', 'w') as outfile:
    json.dump(non_mapped_loinc_codes, outfile)