In [1]:
import random
from random import shuffle

import pandas as pd
from pandas import DataFrame

In [2]:
def fasttext_label(df, label_col, text_col):
    """
    Creates labeled data as required by fastText: every row becomes one
    string of the form '__label__<label> <text>'.

    Values are read column-wise rather than through DataFrame.iterrows(),
    which builds a Series per row and upcasts mixed numeric dtypes (an
    integer label next to a float column would be rendered as '1.0').

    df: DataFrame
    label_col: string, name of the column holding the labels
    text_col: string, name of the column holding the text
    return: a new data frame with a default integer index and a single
        column (named 0) of labeled strings, one per row of df
    """
    rows = [
        '__label__' + str(label) + ' ' + str(text)
        for label, text in zip(df[label_col], df[text_col])
    ]
    return DataFrame(rows)

### All text, with texts of the same label from different sources kept on separate rows

In [3]:
# Load the labeled text; later cells read its 'label', 'source' and
# 'text_normed' columns.
df_all = pd.read_csv(filepath_or_buffer='data/loinc-labeled-text-all.csv')

In [4]:
# Keep only the normalized text column (a Series) for the unlabeled corpus.
df_all_no_labels = df_all.loc[:, 'text_normed']

In [5]:
# fastText reads plain text, one example per line. The lines are written
# directly instead of via to_csv, whose CSV quoting would wrap any text
# containing ',' or '"' in double quotes. Missing text becomes an empty line.
with open('data/loinc-all-fasttext-no-label.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(str(text) + '\n' for text in df_all_no_labels.fillna(''))

In [6]:
# Build one '__label__<label> <text>' line for every row of df_all.
df_all_with_labels = fasttext_label(
    df=df_all,
    label_col='label',
    text_col='text_normed',
)

In [7]:
# Spot-check three randomly chosen labeled lines.
df_all_with_labels.sample(n=3)

Unnamed: 0,0
1632432,__label__3356-3 anileridine serpl-mcnc
594811,__label__21325-6 aby
1061757,__label__13942-8 random


In [8]:
# fastText requires every line to start with the '__label__' prefix. The lines
# are written directly instead of via to_csv, whose CSV quoting would wrap any
# line containing ',' or '"' in double quotes and thereby corrupt the prefix.
with open('data/loinc-all-fasttext-with-label.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(str(line) + '\n' for line in df_all_with_labels[0])

### All text, with texts of the same label from different sources concatenated into one row

Note that the data generated at this step can also be used by gensim.

In [9]:
def concat(group):
    """
    To be called by group by label. Concatenates the short name, the long
    name, and the remaining texts (in shuffled order) of one label.

    Rows are classified by their 'source' value: 'SHORTNAME' and
    'LONG_COMMON_NAME' fill the first two slots (the last matching row wins
    if a source occurs more than once); every other row goes to the shuffled
    tail. A missing short or long name stays an empty string, so the
    separating spaces are still emitted.

    group: DataFrame with 'source' and 'text_normed' columns, note a grouped
        object is a data frame
    return: concatenated text, 'short long other other ...'
    """
    short_name = ''
    long_name = ''
    others = []
    # Column-wise iteration avoids building a Series per row (iterrows).
    for source, text in zip(group['source'], group['text_normed']):
        if source == 'SHORTNAME':
            short_name = text
        elif source == 'LONG_COMMON_NAME':
            long_name = text
        else:
            others.append(text)
    # Uses the module-level random generator; seed it before calling when
    # reproducible output is required.
    shuffle(others)
    return short_name + ' ' + long_name + ' ' + ' '.join(others)

In [11]:
# Seed the module-level generator so that the shuffle inside concat(), and
# therefore every file derived from `concatenated`, is reproducible across
# Restart & Run All.
SHUFFLE_SEED = 42
random.seed(SHUFFLE_SEED)

gb = df_all.groupby('label')
# Only the columns concat() reads are passed on; this also keeps the grouping
# column out of apply(), which newer pandas versions warn about.
concatenated = gb[['source', 'text_normed']].apply(concat)

In [12]:
# Confirm that the group-wise apply returned a Series (one text per label).
type(concatenated)

pandas.core.series.Series

In [13]:
# Spot-check three randomly chosen concatenated texts.
concatenated.sample(n=3)

label
17422-7    b19v igg+igm ser-acnc parvovirus b19 igg+igm a...
57312-1    prnp gene mut tested bld/t prnp gene mutations...
72501-0    hcys 6h p met serpl-scnc homocyst(e)ine [moles...
dtype: object

In [14]:
# Turn the Series into a one-column frame; the label stays in the index.
df_all_concat = concatenated.to_frame(name='text')

In [15]:
# Spot-check three randomly chosen rows of the concatenated frame.
df_all_concat.sample(n=3)

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
78077-5,being tired made it hard for me to remember t...
25343-5,fasciola igg ser-acnc fasciola sp igg ab [unit...
18511-6,p wave onset time refb p wave onset [time] ref...


In [16]:
# Expose the label (currently only the index) as a regular column so that it
# survives saving with index=False and can be read by fasttext_label().
df_all_concat = df_all_concat.assign(label=df_all_concat.index)

In [17]:
# Spot-check three random rows; 'label' should now mirror the index.
df_all_concat.sample(n=3)

Unnamed: 0_level_0,text,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1
72020-1,vr-12 vitality (vt) score - oblique method t-...,72020-1
16703-1,cyclosporin plas-mcnc cyclosporine [mass/volum...,16703-1
59155-2,17ohp 20m p chal serpl-scnc 17-hydroxyprogeste...,59155-2


In [18]:
# Save the concatenated text and its label as a regular CSV (with header, no
# index column) for use with gensim.
df_all_concat.to_csv(
    path_or_buf='data/loinc-labeled-text-all-concatenated.csv',
    index=False,
)

In [19]:
df_all_concat_no_labels = df_all_concat['text']
# fastText reads plain text, one example per line. The lines are written
# directly instead of via to_csv, whose CSV quoting would wrap any text
# containing ',' or '"' in double quotes.
with open('data/loinc-all-concat-fasttext-no-label.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(str(text) + '\n' for text in df_all_concat_no_labels)

In [21]:
# Build one '__label__<label> <text>' line per label from the concatenated text.
df_all_concat_with_labels = fasttext_label(
    df=df_all_concat,
    label_col='label',
    text_col='text',
)

In [22]:
# Spot-check three randomly chosen labeled lines of the concatenated data.
df_all_concat_with_labels.sample(n=3)

Unnamed: 0,0
51360,__label__56821-2 problem exacerbating factors ...
79917,__label__83145-3 grip strength test nih grip s...
25836,__label__33623-0 penta-cp1 24h stl-mrate penta...


In [23]:
# fastText requires every line to start with the '__label__' prefix. The lines
# are written directly instead of via to_csv, whose CSV quoting would wrap any
# line containing ',' or '"' in double quotes and thereby corrupt the prefix.
with open('data/loinc-all-concat-fasttext-with-label.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(str(line) + '\n' for line in df_all_concat_with_labels[0])