In [1]:
# Widen the notebook's `.container` element to 90% of the page width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Lift pandas' display limits: passing None removes the cap on the number of
# rows shown and on the width of each cell's text.
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)

In [9]:
# Make the project-local modules importable by extending sys.path.
# Every entry is a relative path, so it is resolved against the kernel's current
# working directory; the imports below only succeed when the notebook is run
# from a folder where these relative paths exist.
import sys

# Location of `access_script_data`.
sys.path.append('./../../general/src')

# NOTE(review): `patients` is not referenced in the cells visible here — confirm it is still needed.
from access_script_data import patients

# Location of `utils` and `feature_conversion`.
# NOTE(review): neither module is referenced in the cells visible here — confirm they are still needed.
sys.path.append('./../../tst/src/')
import utils
import feature_conversion

# Location of `flm_tools` (provides `get_cleaned_data`, used to load the dataset).
sys.path.append('./../../flm/src/')
import flm_tools

# Location of `lohpo` (provides the LOINC-to-HPO loaders and `LoincId`).
sys.path.append('./../../flm/')
import lohpo

In [5]:
# Output locations, all given relative to the notebook's working directory.
# Root folder for files written by the notebooks.
file_output_directory = "../file_outputs_from_notebooks/"
# Sub-folders of the root, one per kind of output.
figure_output_directory = "../file_outputs_from_notebooks/figures/"
data_output_directory = "../file_outputs_from_notebooks/data/"
unmapped_loincs_output_directory = "../file_outputs_from_notebooks/unmapped_loinc_codes_of_interest/"

## Peter Robinson's request: a CSV file with columns `loinc_code`, `loinc_name`, and `frequency` (counts)

In [6]:
# Load the cleaned dataset through flm_tools, requesting the 'latest' version
# with multiple_visits set to 'all'.
cleaned_data_options = dict(version='latest', multiple_visits='all')
data_december = flm_tools.get_cleaned_data(**cleaned_data_options)

In [7]:
# Keep only the SCRIPT measurements whose vocabulary is LOINC; measurements
# identified any other way can't be mapped to HPO.
is_loinc = data_december['measurement_vocabulary_id'].eq("LOINC")
data_december = data_december[is_loinc]

In [10]:
# Loading a dictionary containing the mapped LOINC to HPO codes.
# `value` is queried with `.get(<LoincId>)` in the annotation step; entries
# expose `candidateHpoTerms`.
value = lohpo.AnnotationLoader.load()

# Loading a dictionary containing the HPO terms associated with the HPO codes.
# `result` is queried with `.get(<HPO term id>)` in the annotation step; entries
# expose `.name`.
# NOTE(review): `header` is not referenced in the cells visible here.
header, result = lohpo.HpoTermListLoader.load()

In [11]:
# This code block converts the measurements in the dataset into HPO annotations.
# At this point, only the general physiological annotation (outcome code 'N')
# is attached.


def _lookup_n_outcome_hpo(code, annotations, hpo_terms):
    """Look up the HPO term attached to the 'N' outcome of one LOINC code.

    Args:
        code: LOINC code as found in ``measurement_concept_code``.
        annotations: Mapping queried with ``.get(LoincId)``; each entry exposes
            ``candidateHpoTerms`` (outcome -> HPO term).
        hpo_terms: Mapping queried with ``.get(<HPO term id>)``; each entry
            exposes ``.name``.

    Returns:
        ``(hpo_code, hpo_name)``. Both are ``None`` when the LOINC code has no
        annotation or no outcome whose ``code`` is ``'N'``. ``hpo_name`` alone
        is ``None`` when the term id is not present in ``hpo_terms``.
    """
    loinc2hpo = annotations.get(lohpo.LoincId.from_code(code), None)
    if loinc2hpo is None:
        return None, None

    hpo_code, hpo_name = None, None
    # Deliberately no early exit: if several outcomes carry code 'N', the last
    # one iterated wins, exactly as in the original per-frame loop.
    for outcome, hpo_term in loinc2hpo.candidateHpoTerms.items():
        if outcome.code == 'N':
            hpo_code = hpo_term.id.value
            hpo_annot = hpo_terms.get(hpo_term.id)
            # A term id missing from the term list keeps its code but gets no
            # name, instead of aborting the whole cell with an AttributeError.
            hpo_name = hpo_annot.name if hpo_annot is not None else None
    return hpo_code, hpo_name


loinc_codes = data_december["measurement_concept_code"]

# Resolve every distinct code once rather than re-scanning the whole frame with
# a boolean mask per code. Missing codes are skipped here; their rows are kept
# and simply stay unannotated.
hpo_by_loinc = {
    code: _lookup_n_outcome_hpo(code, value, result)
    for code in loinc_codes.dropna().unique()
}

# Codes absent from the lookup dicts come out of .map() as NaN, so unmapped rows
# are still found with .isna() downstream. Both columns are always created, and
# the original row order and index are preserved (the previous concat-per-code
# approach ordered rows by set iteration order, which is not reproducible).
data_december = data_december.assign(
    HPO_code=loinc_codes.map(
        {code: hpo_code for code, (hpo_code, _) in hpo_by_loinc.items()
         if hpo_code is not None}),
    HPO_annotation=loinc_codes.map(
        {code: hpo_name for code, (_, hpo_name) in hpo_by_loinc.items()
         if hpo_name is not None}),
)

In [12]:
# Rows with no HPO code are the measurements that could not be mapped.
unmapped_entries = data_december[data_december['HPO_code'].isna()]

In [20]:
# Count how often each (code, name) pair occurs among the unmapped entries and
# flatten the result into a three-column frame.
loinc_identity_columns = ['measurement_concept_code', 'measurement_concept_name']
la = (
    unmapped_entries
    .value_counts(subset=loinc_identity_columns)
    .reset_index(name='frequency')
)

In [22]:
# Switch to the column headers requested for the exported CSV.
csv_column_names = {
    'measurement_concept_code': 'loinc_code',
    'measurement_concept_name': 'loinc_name',
}
la = la.rename(columns=csv_column_names)

In [23]:
# Export the unmapped-LOINC frequency table as CSV, without the index column.
# The target folder is created first because to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs(unmapped_loincs_output_directory, exist_ok=True)
unmapped_loincs_csv_path = os.path.join(unmapped_loincs_output_directory,
                                        "unmapped_loinc_codes_northwestern.csv")
la.to_csv(unmapped_loincs_csv_path, index=False)