In [1]:
import pandas as pd
from pandas import DataFrame
from sklearn.utils import shuffle

This notebook performs the following steps:

1. Merge text from different columns together. The merged data frame will have 3 columns:

    ```
    ['label', 'text', 'source']
    ```

2. Minimal normalization: convert to lower case and strip surrounding whitespace (stored in an additional `text_normed` column)
3. Shuffle the merged data

Let's not be aggressive with normalization: only convert upper case to lower case and strip surrounding whitespace.

Some observations when considering normalization:

* 2-Amino-4,6-pteridinediol
* Ser/Plas
* [Moles/volume]
* {Setting}
* Pulmonary vein - right upper 
* Multisection^W contrast IV
* Bacteria identified^^^2
* ^Patient
* Blood flow.diastole.max
* Upper extremity>Upper arm
* A+B Ag

In [2]:
def rename_columns(df, label_col, text_col):
    """

    Renames the selected label column to 'label' and
    the text column to 'text'. Adds a new column 'source'
    holding the original name of the text column, so every row
    records which column its text came from.

    The input data frame is left untouched. The returned frame also has
    its index labels converted to strings (rename is called with index=str).

    df: DataFrame
    label_col: string, the label column
    text_col: string, the text column
    return: a new data frame with columns renamed
    raises: KeyError if label_col or text_col is not a column of df

    """
    # DataFrame.rename ignores mapping keys that are not present, so a
    # mistyped column name would otherwise return a frame without a
    # 'label' or 'text' column and no error. Fail loudly instead.
    missing = [col for col in (label_col, text_col) if col not in df.columns]
    if missing:
        raise KeyError('column(s) not found in data frame: %s' % missing)

    renamed = df.rename(index=str, columns={
        label_col: 'label',
        text_col: 'text'
    })
    # rename returned a new frame, so this assignment does not touch the input
    renamed['source'] = text_col
    return renamed

In [3]:
def normalize(df):
    """

    Adds a normalized text column 'text_normed'. Normalization is lightweight
    only converting to lower case and stripping whitespace.

    Non-string values are converted with str() first. Missing values
    (NaN/None) stay missing in 'text_normed' instead of becoming the
    literal string 'nan', so they cannot be mistaken for real text.

    df: DataFrame with a 'text' column; modified in place
    return: void, adding a new column

    """
    df['text_normed'] = df['text'].map(
        lambda x: str(x).lower().strip(),
        # skip missing values so str() never sees NaN
        na_action='ignore'
    )

In [4]:
def merge(df_dicts, random_state=None):
    """
    Merges several source data frames into one labeled-text data frame.

    Each source is brought to the layout produced by rename_columns, the
    pieces are concatenated, exact duplicate rows are dropped, a
    'text_normed' column is added by normalize, rows whose text is missing
    or empty after normalization are removed, and the rows are shuffled.

    df_dicts: list of data frame dict {df, label_col, text_col}
    random_state: int, RandomState instance or None; forwarded to
        sklearn.utils.shuffle. Pass an int to get the same row order on
        every run. The default (None) gives a different order per call,
        which is what this function did before the parameter existed.
    return: the merged, normalized, shuffled data frame
    """
    # Merge data frames
    dfs = [
        rename_columns(df_dict['df'], df_dict['label_col'], df_dict['text_col'])
        for df_dict in df_dicts
    ]
    df_merged = pd.concat(dfs, ignore_index=True)
    # Drop duplicates. This runs on the raw text, so rows that differ only
    # in case or surrounding whitespace are both kept.
    df_merged = df_merged.drop_duplicates()
    # Drop rows that have null text.
    # Pandas gotcha, missing values internally as NaN, python 'is not None' won't work.
    # Done before normalizing, and on an explicit copy, so normalize adds its
    # column to a frame of its own rather than to a filtered selection.
    df_merged = df_merged[df_merged['text'].notnull()].copy()
    # Normalize text
    normalize(df_merged)
    # Drop rows whose text is empty after normalization (e.g. whitespace only)
    df_merged = df_merged[df_merged['text_normed'].map(lambda text: len(text) > 0)]
    # Shuffle
    return shuffle(df_merged, random_state=random_state)

### Merge all the candidate columns

In [5]:
# Read one CSV extract per LOINC text field. The order of the paths
# matches the order of the names on the left-hand side.
df_component, df_short_name, df_long_name, df_system, df_class, df_related = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/loinc-component.csv',
        'data/loinc-short-name.csv',
        'data/loinc-long-name.csv',
        'data/loinc-system.csv',
        'data/loinc-class.csv',
        'data/loinc-related-name.csv',
    )
]

In [6]:
# Every source shares the label column 'LOINC_NUM'; only the frame and
# the name of its text column differ.
df_dicts = [
    {'df': frame, 'label_col': 'LOINC_NUM', 'text_col': column}
    for frame, column in [
        (df_component, 'COMPONENT'),
        (df_short_name, 'SHORTNAME'),
        (df_long_name, 'LONG_COMMON_NAME'),
        (df_system, 'SYSTEM'),
        (df_class, 'CLASS'),
        (df_related, 'RELATED_NAME'),
    ]
]

In [7]:
# Merge all six sources listed in df_dicts above. Note that df_dicts is
# rebound further down, so this cell depends on being run in order.
df_merged_all = merge(df_dicts)

In [8]:
# Preview 10 random rows. No random_state is passed, so the rows shown
# change from run to run.
df_merged_all.sample(10)

Unnamed: 0,label,text,source,text_normed
2012842,773-2,Red cells,RELATED_NAME,red cells
1432461,49975-6,Blt,RELATED_NAME,blt
1796856,67215-4,Screen,RELATED_NAME,screen
636332,1875-4,Level,RELATED_NAME,level
1314882,45316-7,Chemistry,RELATED_NAME,chemistry
685880,2064-4,Serum or plasma,RELATED_NAME,serum or plasma
1321524,46509-6,International Classification of Diseases,RELATED_NAME,international classification of diseases
1046423,34717-9,FA,RELATED_NAME,fa
1401847,49028-4,Random,RELATED_NAME,random
480728,12642-5,Point in time,RELATED_NAME,point in time


In [9]:
# (rows, columns) of the frame merged from all sources
df_merged_all.shape

(2198746, 4)

### Merge only the columns of long names and short names

In [10]:
# Rebinds df_dicts to the short-name and long-name sources only.
df_dicts = [
    {'df': frame, 'label_col': 'LOINC_NUM', 'text_col': column}
    for frame, column in [
        (df_short_name, 'SHORTNAME'),
        (df_long_name, 'LONG_COMMON_NAME'),
    ]
]

In [11]:
# Merge only the short-name and long-name sources (df_dicts as rebound
# in the cell above).
df_merged_names = merge(df_dicts)

In [12]:
# Preview 5 random rows. No random_state is passed, so the rows shown
# change from run to run.
df_merged_names.sample(5)

Unnamed: 0,label,text,source,text_normed
89637,15580-4,Blackberry IgE Ab RAST class [Presence] in Serum,LONG_COMMON_NAME,blackberry ige ab rast class [presence] in serum
106892,329-3,Metronidazole [Susceptibility] by Serum bacter...,LONG_COMMON_NAME,metronidazole [susceptibility] by serum bacter...
101104,26221-2,Elbow - left MR,LONG_COMMON_NAME,elbow - left mr
137412,5917-0,Deprecated lead in dialysis fluid,LONG_COMMON_NAME,deprecated lead in dialysis fluid
40897,47333-0,S pneum20 IgG sp2 Ser IA-mCnc,SHORTNAME,s pneum20 igg sp2 ser ia-mcnc


In [13]:
# (rows, columns) of the frame merged from the two name sources
df_merged_names.shape

(157577, 4)

### Verify

In [14]:
# Spot check: every row of the all-sources frame carrying label '80659-6'
df_merged_all[df_merged_all['label'] == '80659-6']

Unnamed: 0,label,text,source,text_normed
2088251,80659-6,Point in time,RELATED_NAME,point in time
2088246,80659-6,Clottable,RELATED_NAME,clottable
2088248,80659-6,Coagulation activated,RELATED_NAME,coagulation activated
2088243,80659-6,Activated coagulation time,RELATED_NAME,activated coagulation time
2088256,80659-6,Random,RELATED_NAME,random
2088261,80659-6,Whole blood,RELATED_NAME,whole blood
2088250,80659-6,Coagulation time,RELATED_NAME,coagulation time
77283,80659-6,Activated clotting time,COMPONENT,activated clotting time
244037,80659-6,Activated clotting time (ACT) of Blood induced...,LONG_COMMON_NAME,activated clotting time (act) of blood induced...
410791,80659-6,COAG,CLASS,coag


In [15]:
# Same spot check on the names-only frame: one short-name row and one
# long-name row are expected for this label.
df_merged_names[df_merged_names['label'] == '80659-6']

Unnamed: 0,label,text,source,text_normed
77283,80659-6,ACT Bld Kaolin induc,SHORTNAME,act bld kaolin induc
160660,80659-6,Activated clotting time (ACT) of Blood induced...,LONG_COMMON_NAME,activated clotting time (act) of blood induced...


In [16]:
# A missing short name is represented as NaN in the source frame.
# Label '72156-3' is one such row; it is used in the next cell to check
# that merge drops rows with missing text.
df_short_name[df_short_name['LOINC_NUM'] == '72156-3']

Unnamed: 0,LOINC_NUM,SHORTNAME
68050,72156-3,


In [17]:
# merge filters out rows whose text is null, so only the long-name row
# should remain for this label.
df_merged_names[df_merged_names['label'] == '72156-3']

Unnamed: 0,label,text,source,text_normed
151427,72156-3,Rwandan maternal screening panel [RHEA],LONG_COMMON_NAME,rwandan maternal screening panel [rhea]


### Save

In [18]:
# Write both merged frames to CSV. index=False leaves the row index out
# of the files, so only the data columns are saved.
df_merged_all.to_csv('data/loinc-labeled-text-all.csv', index=False)
df_merged_names.to_csv('data/loinc-labeled-text-names.csv', index=False)