# SMM4H

This notebook contains code to:

- load the SMM4H data
- identify labels that are no longer current in the loaded MedDRA version
- normalize all labels to the MedDRA preferred term (PT) level
- convert the data to TSV and export it

In [1]:
import json
import os
from glob import glob
from os import path

import pandas as pd

In [4]:
# Directory containing the extracted MedDRA ASCII files; llt.asc is read from here.
MEDDRA_DIR = '../meddra/meddra_23_0_english_update/MedAscii/' # <- you'll need a MedDRA subscription to get the latest archive
# Directory that receives labels.json and the train/test TSV exports.
OUTPUT_DIR = './smm4h'

In [5]:
# Read the MedDRA LLT table (llt.asc): '$'-separated, latin-1 encoded, no header row.
names = [
    'llt_code',
    'llt_name',
    'pt_code',
    'llt_whoart_code',
    'llt_harts_code',
    'llt_costart_sym',
    'llt_icd9_code',
    'llt_icd9cm_code',
    'llt_icd10_code',
    'llt_currency',
    'llt_jart_code',
]
llt_file = path.join(MEDDRA_DIR, 'llt.asc')
# index_col=False stops pandas from using the first column as the index
df_llt = pd.read_csv(llt_file, sep='$', encoding='latin-1', names=names, index_col=False)

In [6]:
# Placeholder label: compared against incoming SMM4H labels and written as the
# pt_code of every LLT code that is not found in the loaded MedDRA table.
UNK = 'MEDDRA PT' # Identifier for missing PTs in SMM4H

In [7]:
# Lookup table: LLT code -> PT code, taken column-wise from the LLT table
llt_to_pt = dict(zip(df_llt.llt_code, df_llt.pt_code))

In [8]:
# load training data
# glob() returns matches in a filesystem-dependent order; sort them so the row
# order of df_train is identical on every run and machine.
train_files = sorted(glob('./subtask3/task_3_normalization_training*'))
if not train_files:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError('no files match ./subtask3/task_3_normalization_training*')

# one frame per file; the files have no header row, so columns are named by position
df_all = [
    pd.read_csv(file, sep='\t', names=['idx', 'text', 'llt_code'])
    for file in train_files
]

# the per-file indices are kept as they are (no ignore_index)
df_train = pd.concat(df_all)

In [9]:
# Read the evaluation (test) split: tab-separated, columns named by position
test_columns = ['idx', 'text', 'llt_code']
df_test = pd.read_csv(
    './subtask3/task_3_normalization_evaluation.txt',
    sep='\t',
    names=test_columns,
)

In [10]:
# normalize all codes to pt level
def to_pt_code(llt_code):
    """Map a single SMM4H label to its PT-level code.

    Args:
        llt_code: an LLT code (int or numeric string) or the UNK placeholder.

    Returns:
        The PT code stored in ``llt_to_pt`` for that LLT code, or ``UNK`` when
        the label is the placeholder itself or the code is not in ``llt_to_pt``.

    Raises:
        ValueError: if the label is neither ``UNK`` nor convertible with ``int()``.
    """
    # the placeholder must be checked first: int('MEDDRA PT') would raise
    if llt_code == UNK:
        return UNK
    return llt_to_pt.get(int(llt_code), UNK)


# the same rule is applied to both splits, so a placeholder label in the
# training data no longer raises
df_train['pt_code'] = df_train.llt_code.apply(to_pt_code)
df_test['pt_code'] = df_test.llt_code.apply(to_pt_code)

In [11]:
# create set of labels
all_pt_codes = df_train.pt_code.to_list() + df_test.pt_code.to_list()
# Convert to str before de-duplicating so codes and the UNK placeholder share
# one type, then sort: iteration order of a set containing strings can change
# between interpreter runs (hash randomization), which would shuffle the label
# positions written to labels.json.
labels = sorted({str(code) for code in all_pt_codes})

In [12]:
# additional dataframe with unique test items: one row per distinct (text, pt_code) pair
uniq_keys = ['text', 'pt_code']
df_test_uniq = df_test.drop_duplicates(subset=uniq_keys)
df_test_uniq.shape[0]

1195

In [13]:
# export to tsv files
# create the output directory up front so the export also works on a fresh checkout
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open(path.join(OUTPUT_DIR, 'labels.json'), 'w') as f:
    json.dump(labels, f)

# every split is written as "<pt_code>\t<text>" with neither header nor index column
for frame, filename in [
    (df_train, 'train.tsv'),
    (df_test, 'test.tsv'),
    (df_test_uniq, 'test_unique.tsv'),
]:
    frame[['pt_code', 'text']].to_csv(
        path.join(OUTPUT_DIR, filename), sep='\t', index=False, header=False
    )

In [14]:
# Global export: train and test combined, de-duplicated once and written twice
# (PT-level labels and original LLT-level labels) to the working directory.
# NOTE(review): duplicates are decided on (pt_code, text) for BOTH files, so two
# rows with the same text and PT but different LLT codes collapse to the first
# one in the LLT file as well - confirm this is intended.
smm4h_all = pd.concat([df_train, df_test]).drop_duplicates(subset=['pt_code', 'text'])
tsv_options = dict(sep='\t', index=False, header=False)
smm4h_all[['pt_code', 'text']].to_csv('./smm4h_pt.tsv', **tsv_options)
smm4h_all[['llt_code', 'text']].to_csv('./smm4h_llt.tsv', **tsv_options)

In [15]:
# show number of entries with different llt and pt code
# Compare the string forms (as is done for the test set) so the count does not
# depend on whether the two columns hold ints or strings.
train_changed = df_train.pt_code.astype(str) != df_train.llt_code.astype(str)
len(df_train[train_changed])

22

In [16]:
# Test rows whose PT-level code differs from the original label (compared as strings)
test_changed = df_test.pt_code.astype(str).ne(df_test.llt_code.astype(str))
df_test.loc[test_changed]

Unnamed: 0,idx,text,llt_code,pt_code
180,40211,gives me acne,10000497,MEDDRA PT
487,41139,screwed my endocrine system,10014698,MEDDRA PT
1323,44982,teeth to decay,10044027,10012318
1527,41138,endocrine system weird,10014698,MEDDRA PT
1829,42044,stronger than 3 men,10020610,10020564


In [17]:
# De-duplicated view of the test set: keeps the first row of each (text, pt_code) pair
is_repeat = df_test.duplicated(subset=['text', 'pt_code'])
df_test[~is_repeat]

Unnamed: 0,idx,text,llt_code,pt_code
0,44675,sleepier,10041349,10041349
1,40103,dreamt colors,10000125,10000125
2,41585,zombie,10016322,10016322
3,41834,headache,10019211,10019211
4,46301,crazy,10061920,10061920
...,...,...,...,...
2487,43514,can't even orgasm,10024870,10024870
2488,41185,eye problem,10015916,10015916
2492,43109,no sleeps,10022437,10022437
2495,44530,sleptwalk,10041347,10041347
