In [1]:
import pandas as pd
from pandas import DataFrame
from sklearn.utils import shuffle

Will perform the following:

1. Merge text from different columns together. The merged data frame will have 3 columns:

    ```
    ['LABEL', 'TEXT', 'SOURCE']
    ```

2. Apply minimal normalization (convert to lower case and strip surrounding whitespace), stored in an additional `TEXT_NORMED` column
3. Shuffle the merged data

Let's not be aggressive with normalization: only convert upper case to lower case and strip surrounding whitespace.

Some observations when considering normalization:

* 2-Amino-4,6-pteridinediol
* Ser/Plas
* [Moles/volume]
* {Setting}
* Pulmonary vein - right upper 
* Multisection^W contrast IV
* Bacteria identified^^^2
* ^Patient
* Blood flow.diastole.max
* Upper extremity>Upper arm
* A+B Ag

In [2]:
def rename_columns(df, label_col, text_col):
    """

    Produces a new data frame in which the chosen label column is
    called 'LABEL' and the chosen text column is called 'TEXT', and
    which carries an extra constant column 'SOURCE' holding the
    original name of the text column. Index labels are converted
    to strings.

    df: DataFrame
    label_col: string, name of the column holding the labels
    text_col: string, name of the column holding the text
    return: a new data frame; the frame passed in is not modified

    """
    new_names = {label_col: 'LABEL', text_col: 'TEXT'}
    renamed = df.rename(index=str, columns=new_names)
    # Tag every row with the column its text originally came from.
    return renamed.assign(SOURCE=text_col)

In [3]:
def normalize(df):
    """

    Adds a normalized text column 'TEXT_NORMED' to df, in place.
    Normalization is lightweight: each 'TEXT' value is converted to
    a string, lower-cased and stripped of surrounding whitespace.

    Missing values (NaN / None) are left missing instead of being
    turned into the literal text 'nan' / 'none'.

    df: DataFrame with a 'TEXT' column
    return: void, adding a new column

    """
    # na_action='ignore' keeps missing values out of the lambda, so
    # str() never sees NaN and cannot produce a bogus 'nan' text.
    df['TEXT_NORMED'] = df['TEXT'].map(
        lambda text: str(text).lower().strip(),
        na_action='ignore'
    )

In [4]:
def merge(df_dicts, random_state=None):
    """
    Merges several labeled-text data frames into one.

    Each input frame has its columns renamed via rename_columns, the
    results are concatenated with a fresh integer index, exact
    duplicate rows are dropped, the 'TEXT_NORMED' column is added via
    normalize, and finally the rows are shuffled.

    df_dicts: list of data frame dict {df, label_col, text_col}
    random_state: optional seed (or RandomState) forwarded to
        sklearn.utils.shuffle; pass a fixed value for a reproducible
        row order. The default None keeps the unseeded shuffle.
    return: the merged, normalized, shuffled data frame
    """
    dfs = [
        rename_columns(entry['df'], entry['label_col'], entry['text_col'])
        for entry in df_dicts
    ]
    df_merged = pd.concat(dfs, ignore_index=True)
    # Duplicates are dropped before normalization, so rows whose TEXT
    # differs only by case or whitespace are all kept.
    df_merged = df_merged.drop_duplicates()
    normalize(df_merged)
    return shuffle(df_merged, random_state=random_state)

### Merge all the candidate columns

In [5]:
# Load one CSV per candidate text column; the order of the paths
# matches the order of the names being assigned.
(
    df_component,
    df_short_name,
    df_long_name,
    df_system,
    df_class,
    df_related,
) = map(pd.read_csv, (
    'data/loinc-component.csv',
    'data/loinc-short-name.csv',
    'data/loinc-long-name.csv',
    'data/loinc-system.csv',
    'data/loinc-class.csv',
    'data/loinc-related-name.csv',
))

In [6]:
# Every frame is labeled by 'LOINC_NUM'; only the text column differs.
df_dicts = [
    {'df': frame, 'label_col': 'LOINC_NUM', 'text_col': column}
    for frame, column in (
        (df_component, 'COMPONENT'),
        (df_short_name, 'SHORTNAME'),
        (df_long_name, 'LONG_COMMON_NAME'),
        (df_system, 'SYSTEM'),
        (df_class, 'CLASS'),
        (df_related, 'RELATED_NAME'),
    )
]

In [7]:
# Merge, de-duplicate, normalize and shuffle all six candidate columns.
df_merged_all = merge(df_dicts)

In [8]:
# Peek at 10 random rows; no random_state is given, so the rows differ per run.
df_merged_all.sample(10)

Unnamed: 0,LABEL,TEXT,SOURCE,TEXT_NORMED
2186759,9003-5,Quant,RELATED_NAME,quant
1851992,70208-4,Plasma,RELATED_NAME,plasma
1007474,33178-5,OFD,RELATED_NAME,ofd
2131894,82482-1,MOLPATH,RELATED_NAME,molpath
875132,27789-7,Quant,RELATED_NAME,quant
1485384,51862-1,Screen,RELATED_NAME,screen
440248,10985-0,Plasma,RELATED_NAME,plasma
334893,11205-2,ALLERGY,CLASS,allergy
608582,17238-7,Arbitrary concentration,RELATED_NAME,arbitrary concentration
129464,5202-7,HSV Ab Ser IA-aCnc,SHORTNAME,hsv ab ser ia-acnc


In [9]:
# (rows, columns) of the full merge.
df_merged_all.shape

(2208289, 4)

### Merge only the columns of long names and short names

In [10]:
# Restrict the merge inputs to the short-name and long-name frames.
df_dicts = [
    {'df': frame, 'label_col': 'LOINC_NUM', 'text_col': column}
    for frame, column in (
        (df_short_name, 'SHORTNAME'),
        (df_long_name, 'LONG_COMMON_NAME'),
    )
]

In [11]:
# Merge, de-duplicate, normalize and shuffle the name columns only.
df_merged_names = merge(df_dicts)

In [12]:
# Peek at 5 random rows; no random_state is given, so the rows differ per run.
df_merged_names.sample(5)

Unnamed: 0,LABEL,TEXT,SOURCE,TEXT_NORMED
264,10171-7,Hx of Eyes disorders,SHORTNAME,hx of eyes disorders
161435,81381-6,Administrative information associated with thi...,LONG_COMMON_NAME,administrative information associated with thi...
137677,59482-0,Collaborative staging post treatment tumor siz...,LONG_COMMON_NAME,collaborative staging post treatment tumor siz...
129427,51993-4,Metanephrine and Normetanephrine panel [Mass/v...,LONG_COMMON_NAME,metanephrine and normetanephrine panel [mass/v...
93458,19110-6,HIV 1 gp41+gp43 Ab [Presence] in Serum by Immu...,LONG_COMMON_NAME,hiv 1 gp41+gp43 ab [presence] in serum by immu...


In [13]:
# (rows, columns) of the names-only merge.
df_merged_names.shape

(166754, 4)

### Verify

In [14]:
# Spot-check: all rows of the full merge for one LOINC code.
df_merged_all.loc[df_merged_all['LABEL'].eq('80659-6')]

Unnamed: 0,LABEL,TEXT,SOURCE,TEXT_NORMED
2088255,80659-6,Quantitative,RELATED_NAME,quantitative
327414,80659-6,Bld,SYSTEM,bld
2088260,80659-6,WB,RELATED_NAME,wb
410791,80659-6,COAG,CLASS,coag
2088254,80659-6,Quant,RELATED_NAME,quant
2088244,80659-6,Blood,RELATED_NAME,blood
2088253,80659-6,Quan,RELATED_NAME,quan
2088258,80659-6,R-time,RELATED_NAME,r-time
2088259,80659-6,Tilt tube,RELATED_NAME,tilt tube
2088256,80659-6,Random,RELATED_NAME,random


In [15]:
# Spot-check: all rows of the names-only merge for the same LOINC code.
df_merged_names.loc[df_merged_names['LABEL'].eq('80659-6')]

Unnamed: 0,LABEL,TEXT,SOURCE,TEXT_NORMED
77283,80659-6,ACT Bld Kaolin induc,SHORTNAME,act bld kaolin induc
160660,80659-6,Activated clotting time (ACT) of Blood induced...,LONG_COMMON_NAME,activated clotting time (act) of blood induced...


### Save

In [16]:
# Write both merged frames to CSV. index=False leaves the (shuffled)
# row index out of the files.
df_merged_all.to_csv('data/loinc-labeled-text-all.csv', index=False)
df_merged_names.to_csv('data/loinc-labeled-text-names.csv', index=False)