In [22]:
import numpy as np
import pandas as pd

In [23]:
# Root folder of the dataset; each split lives in "<split>.txt" inside it.
DATA_DIR = "/kaggle/input/pubmed-20k"

# Build the three split paths from one pattern instead of three literals.
train_path, test_path, dev_path = (
    f"{DATA_DIR}/{split_name}.txt" for split_name in ("train", "test", "dev")
)

In [5]:
# Load the training split. The file is read as tab-separated with no header
# row, so the three columns are named explicitly here.
# NOTE(review): pandas' default quoting is in effect, so a field that starts
# with a double quote is parsed as a quoted field — confirm this matches how
# the file was written, otherwise consider quoting=csv.QUOTE_NONE.
train_df = pd.read_csv(
    train_path,
    sep="\t",
    header=None,
    names=["abstract_id", "label", "sentence"],
)

# 0-based position of each sentence within its abstract, in file order.
train_df["sentence_id"] = train_df.groupby("abstract_id").cumcount()
train_df.head()

Unnamed: 0,abstract_id,label,sentence,sentence_id
0,24293578,OBJECTIVE,To investigate the efficacy of 6 weeks of dail...,0
1,24293578,METHODS,A total of 125 patients with primary knee OA w...,1
2,24293578,METHODS,Outcome measures included pain reduction and i...,2
3,24293578,METHODS,Pain was assessed using the visual analog pain...,3
4,24293578,METHODS,Secondary outcome measures included the Wester...,4


In [6]:
def _read_split(path):
    """Read one dataset split and number its sentences within each abstract.

    Parameters
    ----------
    path : str
        Location of a tab-separated file without a header row; its three
        columns are named ``abstract_id``, ``label`` and ``sentence``.

    Returns
    -------
    pandas.DataFrame
        The parsed rows plus a ``sentence_id`` column holding the 0-based
        position of each row within its ``abstract_id`` group, in file order.

    Notes
    -----
    NOTE(review): pandas' default quoting is in effect, so a field that
    starts with a double quote is parsed as a quoted field — confirm this
    matches how the files were written.
    """
    split_df = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["abstract_id", "label", "sentence"],
    )
    split_df["sentence_id"] = split_df.groupby("abstract_id").cumcount()
    return split_df


# Both evaluation splits go through the same loader so they cannot drift apart.
dev_df = _read_split(dev_path)
test_df = _read_split(test_path)


In [24]:
# Keep only the sentences whose label is listed in KEEP_LABELS
# (RESULTS and CONCLUSIONS), then renumber the surviving rows from 0.
KEEP_LABELS = ["RESULTS", "CONCLUSIONS"]

# The same filter is applied to every split; unpacking order matches the tuple.
train_clinical_df, test_clinical_df, dev_clinical_df = (
    split_df.loc[split_df["label"].isin(KEEP_LABELS)].reset_index(drop=True)
    for split_df in (train_df, test_df, dev_df)
)

In [10]:
!pip -q install spacy


In [16]:
import spacy

# Name of the spaCy pipeline to load; `nlp` is the object later cells call on
# each sentence to obtain its entities.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [26]:
# Number of leading rows of train_clinical_df to work on.
SAMPLE_SIZE = 200

# .copy() gives an independent frame, so edits to the sample never touch
# train_clinical_df.
sample_df = train_clinical_df.iloc[:SAMPLE_SIZE].copy()

In [28]:
# Pair every sentence with the metadata that must travel with its entities.
# Zipping the columns avoids DataFrame.iterrows(), which builds a Series
# object for every row.
sentence_with_meta = zip(
    sample_df["sentence"],
    zip(sample_df["abstract_id"], sample_df["sentence_id"], sample_df["label"]),
)

# nlp.pipe processes the texts in batches instead of one nlp() call per row;
# with as_tuples=True it yields (doc, context) pairs in input order, so each
# entity is still emitted next to the row it came from.
entities = [
    [abstract_id, sentence_id, section, ent.text, ent.label_]
    for doc, (abstract_id, sentence_id, section) in nlp.pipe(
        sentence_with_meta, as_tuples=True
    )
    for ent in doc.ents
]

# One row per detected entity; `section` holds the sentence label
# (RESULTS or CONCLUSIONS) and `entity_type` the spaCy entity label.
entities_df = pd.DataFrame(
    entities,
    columns=["abstract_id", "sentence_id", "section", "entity", "entity_type"],
)

entities_df.head(10)

Unnamed: 0,abstract_id,sentence_id,section,entity,entity_type
0,24293578,6,RESULTS,PGA,ORG
1,24293578,6,RESULTS,6MWD,CARDINAL
2,24293578,6,RESULTS,6 weeks,DATE
3,24293578,7,RESULTS,95 %,PERCENT
4,24293578,7,RESULTS,CI,ORG
5,24293578,7,RESULTS,10.9,CARDINAL
6,24293578,7,RESULTS,4.8,CARDINAL
7,24293578,7,RESULTS,0.001,CARDINAL
8,24293578,7,RESULTS,9.5,CARDINAL
9,24293578,7,RESULTS,3.7-15 .4,DATE


In [31]:
# Clean the extracted entities in a single chain:
#   1. trim surrounding whitespace from the entity text,
#   2. keep only entities longer than two characters,
#   3. drop exact duplicate rows and renumber the index from 0.
entities_df = (
    entities_df
    .assign(entity=lambda frame: frame["entity"].str.strip())
    .loc[lambda frame: frame["entity"].str.len() > 2]
    .drop_duplicates()
    .reset_index(drop=True)
)

entities_df.head()

Unnamed: 0,abstract_id,sentence_id,section,entity,entity_type
0,24293578,6,RESULTS,PGA,ORG
1,24293578,6,RESULTS,6MWD,CARDINAL
2,24293578,6,RESULTS,6 weeks,DATE
3,24293578,7,RESULTS,95 %,PERCENT
4,24293578,7,RESULTS,10.9,CARDINAL
