In [1]:
import pandas as pd
from pandas import DataFrame

In [2]:
# Load the full LOINC table.
# NOTE(review): the truncated warning in the saved output looks like pandas'
# mixed-type DtypeWarning. low_memory=False makes pandas infer each column's
# dtype from the whole file in one pass instead of chunk by chunk, so a column
# cannot end up with a mix of parsed types.
df_loinc = pd.read_csv('data/loinc.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# List the column names available in the loaded LOINC table.
df_loinc.columns

Index(['LOINC_NUM', 'COMPONENT', 'PROPERTY', 'TIME_ASPCT', 'SYSTEM',
       'SCALE_TYP', 'METHOD_TYP', 'CLASS', 'VersionLastChanged', 'CHNG_TYPE',
       'DefinitionDescription', 'STATUS', 'CONSUMER_NAME', 'CLASSTYPE',
       'FORMULA', 'SPECIES', 'EXMPL_ANSWERS', 'SURVEY_QUEST_TEXT',
       'SURVEY_QUEST_SRC', 'UNITSREQUIRED', 'SUBMITTED_UNITS', 'RELATEDNAMES2',
       'SHORTNAME', 'ORDER_OBS', 'CDISC_COMMON_TESTS', 'HL7_FIELD_SUBFIELD_ID',
       'EXTERNAL_COPYRIGHT_NOTICE', 'EXAMPLE_UNITS', 'LONG_COMMON_NAME',
       'UnitsAndRange', 'DOCUMENT_SECTION', 'EXAMPLE_UCUM_UNITS',
       'EXAMPLE_SI_UCUM_UNITS', 'STATUS_REASON', 'STATUS_TEXT',
       'CHANGE_REASON_PUBLIC', 'COMMON_TEST_RANK', 'COMMON_ORDER_RANK',
       'COMMON_SI_TEST_RANK', 'HL7_ATTACHMENT_STRUCTURE',
       'EXTERNAL_COPYRIGHT_LINK', 'PanelType', 'AskAtOrderEntry',
       'AssociatedObservations', 'VersionFirstReleased',
       'ValidHL7AttachmentRequest'],
      dtype='object')

### A subset of columns

A subset of columns whose values could potentially be used as training text for embeddings.

In [4]:
# Columns to inspect as candidate embedding text; LOINC_NUM is kept first
# so every displayed row can be traced back to its code.
cols = [
    'LOINC_NUM',
    'COMPONENT',
    'PROPERTY',
    'TIME_ASPCT',
    'SYSTEM',
    'SCALE_TYP',
    'METHOD_TYP',
    'CLASS',
    'SHORTNAME',
    'LONG_COMMON_NAME',
    'RELATEDNAMES2',
]

In [5]:
# Eyeball 5 randomly chosen rows of the candidate columns.
# No random_state is passed, so the rows shown differ on every run.
df_loinc[cols].sample(5)

Unnamed: 0,LOINC_NUM,COMPONENT,PROPERTY,TIME_ASPCT,SYSTEM,SCALE_TYP,METHOD_TYP,CLASS,SHORTNAME,LONG_COMMON_NAME,RELATEDNAMES2
5884,1523-0,Glucose^30M post 0.05-0.15 U insulin/kg IV pos...,MCnc,Pt,Ser/Plas,Qn,,CHAL,Glucose 30M p U/kg Ins IV SerPl-mCnc,Glucose [Mass/volume] in Serum or Plasma --30 ...,30M p U/kg Ins IV; After; Calorie Fast; CHEMIS...
77283,80659-6,Activated clotting time,Time,Pt,Bld,Qn,Coag.kaolin induced,COAG,ACT Bld Kaolin induc,Activated clotting time (ACT) of Blood induced...,ACT; Activ; Activated coagulation time; Blood;...
78566,8183-6,Benzoylecgonine,PrThr,Pt,Gast fld,Ord,,DRUG/TOX,BZE Gast Ql,Benzoylecgonine [Presence] in Gastric fluid,BEC; BEG; Benz; Benzoylec; BZE; Cocaine degrad...
81004,84349-0,Consultation note,Find,Pt,{Setting},Doc,Pastoral care,DOC.ONTOLOGY,Pastoral Care Consult note,Pastoral care Consult note,Consult note; DOC.ONT; Document; Encounter; Ev...
57400,62380-1,Chromosome band involved end,Find,Pt,Bld/Tiss,Nom,Molgen,HL7.CYTOGEN,Chrom band involved end,Chromosome band involved end,Bands; Blood; bnd; Chrom band involved end; Ch...


### The final set of columns for embeddings

The goal is to collect short sentences that can be used to generate the embeddings.

The following columns contain values that can be used directly as sentences for generating embeddings: 

    ['COMPONENT', 'SHORTNAME', 'LONG_COMMON_NAME', 'SYSTEM', 'CLASS']

The column `RELATEDNAMES2` must be split on ";" into multiple sentences.

The final output will be a data frame with 3 columns: `['LOINC_NUM', 'SENTENCE', 'ORG_COLUMN']`

* LOINC_NUM -- The LOINC number is the label
* SENTENCE -- The sentences that will be used to generate the embeddings
* ORG_COLUMN -- The original column containing the sentence

In [6]:
# Build one two-column frame per text source: the LOINC_NUM label plus the
# column that holds the text. The order of the targets on the left matches
# the order of the column names on the right.
(
    df_component,
    df_short_name,
    df_long_name,
    df_system,
    df_class,
    df_related,
) = (
    df_loinc[['LOINC_NUM', text_column]]
    for text_column in (
        'COMPONENT',
        'SHORTNAME',
        'LONG_COMMON_NAME',
        'SYSTEM',
        'CLASS',
        'RELATEDNAMES2',
    )
)

In [7]:
# Split the related names by ';' and save them into separate rows.
#
# The loop reads straight from df_loinc (the same two columns df_related was
# selected from) so this cell can be re-run: df_related is overwritten at the
# end and no longer has a RELATEDNAMES2 column afterwards.
rows = []
# zip over the two columns instead of iterrows(), which builds a Series per row.
for loinc_num, names in zip(df_loinc['LOINC_NUM'], df_loinc['RELATEDNAMES2']):
    # A missing RELATEDNAMES2 cell is loaded as float NaN, which has no .split().
    if not isinstance(names, str):
        continue
    for name in names.split(';'):
        name = name.strip()
        # Skip empty fragments (e.g. from a trailing or doubled ';'):
        # an empty string is not a usable sentence.
        if not name:
            continue
        rows.append({
            'LOINC_NUM': loinc_num,
            'RELATED_NAME': name
        })
# Explicit columns keep the two-column schema even when no rows were produced.
df_related = DataFrame(rows, columns=['LOINC_NUM', 'RELATED_NAME'])

In [8]:
# (rows, columns) of the split related-name frame.
df_related.shape

(1791428, 2)

In [9]:
# Preview the first 5 (LOINC_NUM, RELATED_NAME) rows.
df_related.head(5)

Unnamed: 0,LOINC_NUM,RELATED_NAME
0,10000-8,Cardiac
1,10000-8,Durat
2,10000-8,ECG
3,10000-8,EKG.MEASUREMENTS
4,10000-8,Electrocardiogram


In [10]:
# Sanity check: show the first 10 raw row dicts so they can be compared
# against the head of the data frame built from them.
rows[:10]

[{'LOINC_NUM': '10000-8', 'RELATED_NAME': 'Cardiac'},
 {'LOINC_NUM': '10000-8', 'RELATED_NAME': 'Durat'},
 {'LOINC_NUM': '10000-8', 'RELATED_NAME': 'ECG'},
 {'LOINC_NUM': '10000-8', 'RELATED_NAME': 'EKG.MEASUREMENTS'},
 {'LOINC_NUM': '10000-8', 'RELATED_NAME': 'Electrocardiogram'},
 {'LOINC_NUM': '10000-8', 'RELATED_NAME': 'Electrocardiograph'},
 {'LOINC_NUM': '10000-8', 'RELATED_NAME': 'Hrt'},
 {'LOINC_NUM': '10000-8', 'RELATED_NAME': "Painter's colic"},
 {'LOINC_NUM': '10000-8', 'RELATED_NAME': 'PB'},
 {'LOINC_NUM': '10000-8', 'RELATED_NAME': 'Plumbism'}]

### Remove duplicates

In [12]:
# Replace every per-column frame with its de-duplicated version.
# The tuple of input frames is evaluated before any name is rebound.
(
    df_component,
    df_short_name,
    df_long_name,
    df_system,
    df_class,
    df_related,
) = (
    frame.drop_duplicates()
    for frame in (
        df_component,
        df_short_name,
        df_long_name,
        df_system,
        df_class,
        df_related,
    )
)

### Save the results

In [13]:
# Write each frame to its own CSV file, without the pandas index column.
# The list keeps the original write order.
for _frame, _out_path in [
    (df_component, 'data/loinc-component.csv'),
    (df_short_name, 'data/loinc-short-name.csv'),
    (df_long_name, 'data/loinc-long-name.csv'),
    (df_system, 'data/loinc-system.csv'),
    (df_class, 'data/loinc-class.csv'),
    (df_related, 'data/loinc-related-name.csv'),
]:
    _frame.to_csv(_out_path, index=False)