In [1]:
# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import json

In [3]:
# Lift pandas' display limits so frames are rendered without truncation.
for option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(option, None)
# pd.set_option('display.max_colwidth', None)

In [4]:
# Project modules are not installed as packages: each source directory is appended
# to sys.path right before the imports that need it, so the order below matters.
import sys

# presumably where `access_script_data` lives -- TODO confirm
sys.path.append('./../../general/src')

from access_script_data import patients

# presumably where `utils` and `feature_conversion` live -- TODO confirm
sys.path.append('./../../tst/src/')
import utils
import feature_conversion

# presumably where `flm_tools` lives -- TODO confirm
sys.path.append('./../../flm/src/')
import flm_tools

In [5]:
# Every notebook output lives under one root folder; the sub-folders are derived from it.
file_output_directory = r'../file_outputs_from_notebooks/'
figure_output_directory = file_output_directory + 'figures/'
data_output_directory = file_output_directory + 'data/'
unmapped_loincs_output_directory = file_output_directory + 'unmapped_loinc_codes_of_interest/'

In [6]:
def flag_if_present_in_script(row):
    """Tell whether the row's LOINC code is one of the SCRIPT LOINC codes.

    Reads the notebook-level `script_loinc_codes` at call time.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the code under the key 'Loinc'.

    Returns
    -------
    bool
        True when `row['Loinc']` is in `script_loinc_codes`, False otherwise.
    """
    return row['Loinc'] in script_loinc_codes

def flag_if_test_is_mapped_to_HPO(row):
    """Classify the row's LOINC code as HPO-mapped, unmapped, or not applicable.

    Reads the notebook-level `script_loinc_codes` and `unmapped_tests` at call time.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the code under the key 'Loinc'.

    Returns
    -------
    bool or None
        None when the code is not in `script_loinc_codes`; otherwise False when
        the code is listed in `unmapped_tests` and True when it is not.
    """
    code = row['Loinc']

    # Codes outside SCRIPT get no mapping status at all.
    if code not in script_loinc_codes:
        return None

    return code not in unmapped_tests

In [7]:
# Selects the source of the SCRIPT measurements loaded further down:
# True  -> flm_tools.get_cleaned_data(multiple_visits='all')
# False -> patients.modified_edw_rc('measurement', ...)
encounter_only = True

Reading in all relevant tables

In [8]:
# Master LOINC table
loinc_df = pd.read_csv("../Loinc.csv")

# LOINC panels table, restricted to the panel/member identifiers and their ordering
panels_df = pd.read_csv(
    "../PanelsAndForms_modified.csv",
    usecols=['ParentLoinc', 'ParentName', 'SEQUENCE', 'Loinc', 'LoincName'],
)

# Columns needed to identify a measurement, whichever source it is read from
measurement_columns = ['measurement_vocabulary_id',
                       'measurement_concept_code',
                       'measurement_concept_name']

if encounter_only:
    # All tests/measurements given during SCRIPT encounters
    script_measurement = flm_tools.get_cleaned_data(multiple_visits='all')
    script_measurement = script_measurement[measurement_columns].drop_duplicates()
else:
    script_measurement = patients.modified_edw_rc(
        'measurement', columns=measurement_columns
    ).drop_duplicates()

# Only LOINC-coded measurements can be mapped to HPO, so everything else is dropped
f = script_measurement['measurement_vocabulary_id'] == "LOINC"
script_measurement = script_measurement.loc[f]

# Compact table: one row per (code, name) pair
script_measurement = script_measurement[
    ['measurement_concept_code', 'measurement_concept_name']
].drop_duplicates()

  exec(code_obj, self.user_global_ns, self.user_ns)


# Look up the full name of a LOINC code

In [9]:
# Rows of the master table whose code is 9279-1; the first matching long name is shown.
a = loinc_df['LOINC_NUM'] == '9279-1'
loinc_df.loc[a, 'LONG_COMMON_NAME'].iloc[0]

'Respiratory rate'

# Figuring out which LOINC codes from SCRIPT are actually panels, or aren't tests at all

In [10]:
# De-duplicated LOINC codes seen in SCRIPT, in order of first appearance.
script_loinc_codes = script_measurement['measurement_concept_code'].drop_duplicates().tolist()

#### LOINC codes that are panels

In [11]:
# Panel (parent) codes from the panels table that also occur among the SCRIPT codes.
f = panels_df['ParentLoinc'].isin(script_loinc_codes)
panel_loincs = panels_df.loc[f, 'ParentLoinc'].drop_duplicates().tolist()

Removing LOINC codes corresponding to panels from the list of SCRIPT LOINC codes

In [12]:
# Drop every panel code from the SCRIPT code list.
# Filtering replaces one `list.remove` call per panel code, which raised
# ValueError when this cell was run a second time (the codes were already gone).
# Slice assignment keeps the update in place, so the list object stays the same.
panel_loinc_set = set(panel_loincs)
script_loinc_codes[:] = [code for code in script_loinc_codes if code not in panel_loinc_set]

I had previously defined an exhaustive list of LOINC codes that did not map to HPO. That list likely contains these panel codes, so they are removed from it as well.

In [13]:
# Load the previously saved list of LOINC codes that did not map to HPO.
with open(data_output_directory+'loinc_codes_full.json', 'r') as infile:
    unmapped_tests = json.load(infile)

# Remove the panel codes from it. Only the "value not in list" case is expected
# and reported; any other error is allowed to surface instead of being swallowed.
for panel_code in panel_loincs:
    try:
        unmapped_tests.remove(panel_code)
    except ValueError:
        print(f"{panel_code} wasn't in list of unmapped codes!")

24343-6 wasn't in list of unmapped codes!


#### LOINC codes that aren't tests

In [14]:
# Codes flagged as suspicious for a reason other than being a panel
# (panel codes are removed from the lists separately).
non_tests = pd.read_csv(data_output_directory+"suspicious_loinc_codes.csv")
non_tests = non_tests.loc[non_tests['reason'] != 'Panel']

# Removing them from SCRIPT list and from the unmapped list.
# Both removals catch only ValueError ("value not in list"), so unrelated
# errors are no longer hidden behind the informational message.
for suspicious in non_tests['loinc_code']:
    try:
        script_loinc_codes.remove(suspicious)
    except ValueError:
        print(f"{suspicious} wasn't in script loincs!")

    try:
        unmapped_tests.remove(suspicious)
    except ValueError:
        print(f"{suspicious} wasn't in list of unmapped codes!")

# Correcting the exhaustive list of unmapped LOINC codes
# (this overwrites the same JSON file the list was loaded from)
with open(data_output_directory+'loinc_codes_full.json', 'w') as outfile:
    json.dump(unmapped_tests, outfile)

49024-3 wasn't in list of unmapped codes!
8262-8 wasn't in list of unmapped codes!


# For taking a look at the panels file

In [15]:
# Row-wise flags on the panels table; both helpers are applied to each row directly.
panels_df['in_script'] = panels_df.apply(flag_if_present_in_script, axis=1)
panels_df['mapped_to_hpo'] = panels_df.apply(flag_if_test_is_mapped_to_HPO, axis=1)

In [20]:
# Panel rows whose test is in SCRIPT and explicitly flagged as not mapped.
# `mapped_to_hpo` can hold None, hence the comparison against False rather than a negation.
in_script_rows = panels_df['in_script']
unmapped_rows = panels_df['mapped_to_hpo'].eq(False)
for_hpo = panels_df.loc[in_script_rows & unmapped_rows].sort_values(by='ParentName')

In [21]:
# One row per unmapped test, with renamed columns and a fixed rationale, then saved to CSV.
for_hpo = (
    for_hpo[['Loinc', 'LoincName']]
    .drop_duplicates()
    .rename(columns={"Loinc": "loinc_code", "LoincName": "loinc_name"})
    .assign(rationale='belong to ordered panels')
)
for_hpo.to_csv(unmapped_loincs_output_directory+"panel_belonging_loincs.csv", index=False)

In [None]:
# Panels whose name mentions "coagulation" (case-insensitive).
# na=False treats a missing ParentName as "no match"; without it the mask would
# contain NaN and boolean indexing with .loc would raise.
z = panels_df['ParentName'].str.contains("coagulation", case=False, na=False)
panels_df.loc[z].drop_duplicates().sort_values(by=['ParentLoinc', 'SEQUENCE'])

In [None]:
# Panels whose name mentions "blood" (case-insensitive).
# na=False treats a missing ParentName as "no match"; without it the mask would
# contain NaN and boolean indexing with .loc would raise.
z = panels_df['ParentName'].str.contains("blood", case=False, na=False)
panels_df.loc[z].drop_duplicates().sort_values(by=['ParentLoinc', 'SEQUENCE'])

In [None]:
# Reports the current length of script_loinc_codes, i.e. after the panel and
# non-test codes were removed from it in the cells above.
print(f"There are {len(script_loinc_codes)} unique LOINC codes in latest downloaded version of SCRIPT data\n")

Subsetting the master LOINC table to only include tests present in SCRIPT data

In [None]:
# Code and long name of every master-table entry that occurs in SCRIPT.
name_columns = ['LOINC_NUM', 'LONG_COMMON_NAME']
g = loinc_df['LOINC_NUM'].isin(script_loinc_codes)
script_loincs = loinc_df.loc[g, name_columns]

The following builds a pandas table whose columns are the keys of the `results` dictionary.  
If a LOINC code doesn't appear in the LOINC panels table (suggesting that it doesn't belong to a panel), `None` will appear in the panel name/code columns.

In [None]:
# Build one record per (test, panel) pair; tests without any panel get a single
# record whose panel fields are None.

# LOINC code -> long common name. The first occurrence wins, matching the former
# `b[0]` lookup. A code missing from the master table now yields None instead of
# raising IndexError.
loinc_name_by_code = (
    script_loincs
    .drop_duplicates(subset='LOINC_NUM')
    .set_index('LOINC_NUM')['LONG_COMMON_NAME']
    .to_dict()
)

# Test LOINC code -> [(panel code, panel name), ...], one entry per distinct panel code.
# Code and name are taken from the SAME row of the panels table. Building two
# independent sets and pairing them by position could attach a panel code to the
# wrong panel name, or raise IndexError when the two sets differed in size.
panel_membership = (
    panels_df[['Loinc', 'ParentLoinc', 'ParentName']]
    .drop_duplicates(subset=['Loinc', 'ParentLoinc'])
)
panels_by_test = {}
for test_code, parent_code, parent_name in panel_membership.itertuples(index=False, name=None):
    panels_by_test.setdefault(test_code, []).append((parent_code, parent_name))

# Set for constant-time membership checks (built once instead of scanning the list per code).
unmapped_lookup = set(unmapped_tests)

agg_results = []

for loinc_code in script_loinc_codes:
    loinc_name = loinc_name_by_code.get(loinc_code)
    hpo_mapped = loinc_code not in unmapped_lookup

    # Tests absent from the panels table fall back to a single (None, None) panel.
    for parent_code, parent_name in panels_by_test.get(loinc_code, [(None, None)]):
        agg_results.append({'loinc_code': loinc_code,
                            'loinc_name': loinc_name,
                            'panel_loinc_code': parent_code,
                            'panel_name': parent_name,
                            'HPO_mapped': hpo_mapped})

In [None]:
# One row per record collected in agg_results; the dictionary keys become the columns.
test_to_panels = pd.DataFrame(agg_results)

In [None]:
# Preview the first 25 rows.
test_to_panels.head(25)

In [None]:
# Tests with no panel in the LOINC panels file; they likely aren't ordered as part of a panel.
# `p` is kept as a notebook-level mask because a later cell reuses it.
p = test_to_panels['panel_loinc_code'].isna()
non_panel_loinc_tests = test_to_panels.loc[p]

# Of those, only the ones that did not map to HPO are of interest.
not_mapped = non_panel_loinc_tests['HPO_mapped'].eq(False)
non_mapped_non_panel = non_panel_loinc_tests.loc[not_mapped]
# non_mapped_non_panel.to_csv(data_output_directory+'non_panel_tests_unmapped_encounter.csv', index=False)

In [None]:
# Compares the number of SCRIPT codes with the number of test_to_panels rows that have no panel code.
print(f"Out of {len(script_loinc_codes)} LOINC-encoded tests in SCRIPT encounters, {len(non_panel_loinc_tests)} did not appear as part of a panel in the LOINC panels file")

#### Of tests belonging to panels, how many of them map? And the converse for those that don't map

In [None]:
# Split the tests by panel membership (mask `p`), keeping one row per test:
# a test belonging to several panels would otherwise be counted once per panel.
test_columns = ['loinc_code', 'loinc_name', 'HPO_mapped']
non_panel_tests = test_to_panels.loc[p, test_columns].drop_duplicates()
panel_tests = test_to_panels.loc[~p, test_columns].drop_duplicates()

In [None]:
# Each rate is (number of rows with HPO_mapped True) / (number of rows) * 100, shown with three decimals.
print(f"Among non-panel tests {non_panel_tests['HPO_mapped'].sum()/len(non_panel_tests)*100:.3f}% map to the HPO, while among tests assigned to panels, the HPO mapping rate is  {panel_tests['HPO_mapped'].sum()/len(panel_tests)*100:.3f}%")

### Testing whether some panels have all of their tests ordered in SCRIPT

In [None]:
# Minimum share of a panel's tests that must occur in SCRIPT for the panel to be
# kept as a candidate. 0.1 is the "10_pct" of the CSV written from these panels.
MIN_FRACTION_OF_PANEL_IN_SCRIPT = 0.1

panel_loinc_codes = list(panels_df['ParentLoinc'].drop_duplicates())

# One pass over the panels table instead of re-filtering it once per panel:
# 'size' = rows listed under the panel, 'sum' = how many of them are flagged in_script.
# sort=False keeps the panels in order of first appearance.
panel_counts = panels_df.groupby('ParentLoinc', sort=False)['in_script'].agg(['size', 'sum'])

# Since the first row is usually the panel signaler, but it isn't a test
number_of_tests_in_panel = panel_counts['size'] - 1
number_of_script_tests_in_panel = panel_counts['sum']

is_candidate = number_of_script_tests_in_panel > MIN_FRACTION_OF_PANEL_IN_SCRIPT * number_of_tests_in_panel
potential_script_panels = list(panel_counts.index[is_candidate])

In [None]:
# All rows of the candidate panels, without the ordering column.
t = panels_df['ParentLoinc'].isin(potential_script_panels)
potential_script_panels_df = panels_df.drop(columns=['SEQUENCE']).loc[t]

In [None]:
# Preview the first rows of the candidate panels table.
potential_script_panels_df.head()

In [None]:
# Step 1: keep only the panel members that occur in SCRIPT.
v = potential_script_panels_df['in_script']
panel_rows_in_script = potential_script_panels_df.loc[v].drop_duplicates()

# Step 2: of those, keep the ones explicitly flagged as not mapped to HPO, then save.
p = panel_rows_in_script['mapped_to_hpo'].eq(False)
potential_script_panels_df = panel_rows_in_script.loc[p].drop_duplicates()
potential_script_panels_df.to_csv(data_output_directory+'potential_script_panels_encounters_10_pct.csv', index=False)

### Will print out a dataframe of tests in SCRIPT that do not map to the HPO currently, and that belong to panels.

In [None]:
# Mask of rows explicitly flagged as not mapped to HPO (None does not compare equal to False).
f = potential_script_panels_df['mapped_to_hpo'].eq(False)

In [None]:
# Distinct unmapped tests of the candidate panels, with the column names used in the exports.
for_hpo = (
    potential_script_panels_df
    .loc[f, ['Loinc', 'LoincName']]
    .drop_duplicates()
    .rename(columns={"Loinc": "loinc_code", "LoincName": "loinc_name"})
)

In [None]:
# Same rationale for every row; the export below is intentionally left disabled.
for_hpo = for_hpo.assign(rationale='belong to ordered panels')
# for_hpo.to_csv(unmapped_loincs_output_directory+"panel_belonging_loincs.csv", index=False)

# How about inferring just by datetime co-occurrence

In [None]:
# Same flm_tools call as in the table-loading cell, this time keeping the full result
# (the columns are selected in the next cell).
# NOTE(review): presumably an expensive load -- consider reusing a single copy; confirm.
in_script = flm_tools.get_cleaned_data(multiple_visits='all')

In [None]:
# Identification columns plus the timestamp used to detect co-occurring tests.
clustering_columns = ['measurement_vocabulary_id',
                      'measurement_concept_code',
                      'measurement_concept_name',
                      'measurement_datetime']
for_clustering = in_script.loc[:, clustering_columns]

# Keep LOINC-coded measurements only.
f = for_clustering['measurement_vocabulary_id'] == "LOINC"
for_clustering = for_clustering.loc[f]

# Drop every code listed in the suspicious-codes file.
codes_to_discard = pd.read_csv(data_output_directory+"suspicious_loinc_codes.csv")
g = for_clustering['measurement_concept_code'].isin(codes_to_discard['loinc_code'])
for_clustering = for_clustering.loc[~g]

In [None]:
# Index the measurements by their timestamp.
# The guard makes the cell safe to re-run: once the column has become the index,
# a second set_index call would raise KeyError.
if for_clustering.index.name != 'measurement_datetime':
    for_clustering = for_clustering.set_index('measurement_datetime')

In [None]:
# Distinct test names, and a square matrix of zeros with one row and column per name.
unique_tests = for_clustering['measurement_concept_name'].drop_duplicates().tolist()
number_of_unique_tests = len(unique_tests)
occurrence = np.zeros((number_of_unique_tests, number_of_unique_tests))

In [None]:
from itertools import product
from tqdm.notebook import tqdm, trange

# Start from zero so that re-running this cell does not add to earlier counts.
occurrence.fill(0)

# Test name -> row/column position in `occurrence` (replaces a linear
# `unique_tests.index(...)` scan for every single measurement).
test_position = {test: position for position, test in enumerate(unique_tests)}

# Group the test names by timestamp (the index). Each group is always a Series,
# even when a timestamp has a single row. The former `.loc[str(stamp), ...]`
# lookup could return a bare string in that case, which `list()` would split
# into characters.
tests_by_stamp = for_clustering.groupby(level=0, sort=False)['measurement_concept_name']

for stamp, tests_at_stamp in tqdm(tests_by_stamp):
    tests = list(tests_at_stamp)

    if len(tests) > 1:
        indices = [test_position[test] for test in tests]

        # Every ordered pair, including a test with itself, counts once per co-occurrence.
        for row, column in product(indices, repeat=2):
            occurrence[row, column] += 1

In [None]:
# Co-occurrence counts as a labelled table: rows and columns are the test names in unique_tests.
guat = pd.DataFrame(occurrence, index=unique_tests, columns=unique_tests)