In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
# Root folder of the dataset; each split is a single .txt file directly inside it.
DATA_DIR = "/kaggle/input/pubmed-20k"

train_path, test_path, dev_path = (
    "/".join([DATA_DIR, split_file])
    for split_file in ("train.txt", "test.txt", "dev.txt")
)

In [3]:
# Load the training split: tab-separated, no header row, three columns per line.
# (The previous `train_df = []` placeholder was a dead store and has been removed.)
train_df = pd.read_csv(
    train_path,
    sep="\t",
    header=None,
    names=["abstract_id", "label", "sentence"]
)

# Number the sentences inside each abstract (0-based, in row order).
train_df["sentence_id"] = train_df.groupby("abstract_id").cumcount()
train_df.head()

Unnamed: 0,abstract_id,label,sentence,sentence_id
0,24293578,OBJECTIVE,To investigate the efficacy of 6 weeks of dail...,0
1,24293578,METHODS,A total of 125 patients with primary knee OA w...,1
2,24293578,METHODS,Outcome measures included pain reduction and i...,2
3,24293578,METHODS,Pain was assessed using the visual analog pain...,3
4,24293578,METHODS,Secondary outcome measures included the Wester...,4


In [5]:
def _load_split(path):
    """Read one tab-separated split file and number the sentences per abstract.

    Parameters
    ----------
    path : str
        Location of a header-less, tab-separated file with three columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``abstract_id``, ``label``, ``sentence`` plus ``sentence_id``,
        the 0-based position of the row within its ``abstract_id`` group.
    """
    split_df = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["abstract_id", "label", "sentence"]
    )
    split_df["sentence_id"] = split_df.groupby("abstract_id").cumcount()
    return split_df


# Same loading logic for both held-out splits, so it lives in one helper.
dev_df = _load_split(dev_path)
test_df = _load_split(test_path)


In [6]:
# Restrict every split to the sections kept for analysis (RESULTS and CONCLUSIONS).
KEEP_LABELS = ["RESULTS", "CONCLUSIONS"]


def _keep_clinical(frame):
    """Return the rows of `frame` whose label is in KEEP_LABELS, with a fresh 0..n-1 index."""
    wanted = frame["label"].isin(KEEP_LABELS)
    return frame[wanted].reset_index(drop=True)


train_clinical_df = _keep_clinical(train_df)
test_clinical_df = _keep_clinical(test_df)
dev_clinical_df = _keep_clinical(dev_df)

In [8]:
!pip -q install spacy


In [9]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline once and reuse it for every sentence.
# Later cells read named entities (doc.ents), dependency labels (tok.dep_) and
# lemmas (tok.lemma_) from the documents this object produces.
# NOTE(review): the install cell only installs the spacy package itself; this
# call assumes the en_core_web_sm model is already available - confirm.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Number of leading sentences used for the entity-extraction pass; raise it to cover more data.
NER_SAMPLE_SIZE = 200

sample_df = train_clinical_df.head(NER_SAMPLE_SIZE).copy()

In [11]:
# Run named-entity recognition over the sampled sentences and collect one
# record per entity mention.
#
# nlp.pipe streams the texts through the pipeline in batches, which avoids the
# per-call overhead of invoking nlp() once per row; documents are yielded in
# input order, so they can be zipped back onto the row metadata.
entity_meta = sample_df[["abstract_id", "sentence_id", "label"]].itertuples(
    index=False, name=None
)
entity_docs = nlp.pipe(sample_df["sentence"].tolist())

entities = [
    [
        abstract_id,
        sentence_id,
        section,      # RESULTS or CONCLUSIONS
        ent.text,
        ent.label_,
    ]
    for (abstract_id, sentence_id, section), doc in zip(entity_meta, entity_docs)
    for ent in doc.ents
]

entities_df = pd.DataFrame(
    entities,
    columns=["abstract_id", "sentence_id", "section", "entity", "entity_type"]
)

entities_df.head(10)

Unnamed: 0,abstract_id,sentence_id,section,entity,entity_type
0,24293578,6,RESULTS,PGA,ORG
1,24293578,6,RESULTS,6MWD,CARDINAL
2,24293578,6,RESULTS,6 weeks,DATE
3,24293578,7,RESULTS,95 %,PERCENT
4,24293578,7,RESULTS,CI,ORG
5,24293578,7,RESULTS,10.9,CARDINAL
6,24293578,7,RESULTS,4.8,CARDINAL
7,24293578,7,RESULTS,0.001,CARDINAL
8,24293578,7,RESULTS,9.5,CARDINAL
9,24293578,7,RESULTS,3.7-15 .4,DATE


In [12]:
# Tidy the extracted mentions in one pass: trim surrounding whitespace, keep
# only entities longer than two characters, then drop exact duplicate rows.
entities_df = (
    entities_df
    .assign(entity=lambda frame: frame["entity"].str.strip())
    .loc[lambda frame: frame["entity"].str.len() > 2]
    .drop_duplicates()
    .reset_index(drop=True)
)

entities_df.head()

Unnamed: 0,abstract_id,sentence_id,section,entity,entity_type
0,24293578,6,RESULTS,PGA,ORG
1,24293578,6,RESULTS,6MWD,CARDINAL
2,24293578,6,RESULTS,6 weeks,DATE
3,24293578,7,RESULTS,95 %,PERCENT
4,24293578,7,RESULTS,10.9,CARDINAL


In [14]:
# Number of leading sentences fed to the triple-extraction pass; raise it to cover more data.
TRIPLES_SAMPLE_SIZE = 500

triples_input_df = train_clinical_df.head(TRIPLES_SAMPLE_SIZE).copy()

In [15]:
# Build (source, relation, target) triples from each sentence:
#   * source/target are consecutive named entities in the sentence,
#   * relation is the lower-cased lemma of the first token tagged ROOT.
# Sentences with fewer than two entities, or with no usable ROOT lemma, are skipped.
#
# nlp.pipe processes the texts in batches (instead of one nlp() call per row)
# and yields documents in input order, so they line up with the row metadata.
triples = []

triple_meta = triples_input_df[
    ["abstract_id", "sentence_id", "label", "sentence"]
].itertuples(index=False, name=None)
triple_docs = nlp.pipe(triples_input_df["sentence"].tolist())

for (abstract_id, sentence_id, section, sentence), doc in zip(triple_meta, triple_docs):
    ents = list(doc.ents)
    if len(ents) < 2:
        continue

    # First ROOT token wins; the default "" covers a document with no ROOT token.
    relation = next(
        (tok.lemma_.lower() for tok in doc if tok.dep_ == "ROOT"),
        "",
    )
    if relation == "":
        continue

    # Pair each entity with the one that follows it.
    for left_ent, right_ent in zip(ents, ents[1:]):
        triples.append([
            abstract_id,
            sentence_id,
            section,          # RESULTS / CONCLUSIONS
            left_ent.text.strip(),
            relation,
            right_ent.text.strip(),
            sentence
        ])


triples_df = pd.DataFrame(
    triples,
    columns=["abstract_id", "sentence_id", "section", "source", "relation", "target", "sentence"]
)


In [16]:
# Keep only triples whose endpoints are both longer than two characters,
# then remove exact duplicate rows and renumber the index.
source_long_enough = triples_df["source"].str.len() > 2
target_long_enough = triples_df["target"].str.len() > 2

triples_df = triples_df[source_long_enough & target_long_enough]
triples_df = triples_df.drop_duplicates()
triples_df = triples_df.reset_index(drop=True)

triples_df.head(10)


Unnamed: 0,abstract_id,sentence_id,section,source,relation,target,sentence
0,24293578,6,RESULTS,PGA,be,6MWD,There was a clinically relevant reduction in t...
1,24293578,6,RESULTS,6MWD,be,6 weeks,There was a clinically relevant reduction in t...
2,24293578,7,RESULTS,10.9,be,4.8,The mean difference between treatment arms ( 9...
3,24293578,7,RESULTS,4.8,be,0.001,The mean difference between treatment arms ( 9...
4,24293578,7,RESULTS,0.001,be,9.5,The mean difference between treatment arms ( 9...
5,24293578,7,RESULTS,9.5,be,3.7-15 .4,The mean difference between treatment arms ( 9...
6,24293578,7,RESULTS,3.7-15 .4,be,0.05,The mean difference between treatment arms ( 9...
7,24293578,7,RESULTS,0.05,be,15.7,The mean difference between treatment arms ( 9...
8,24293578,7,RESULTS,15.7,be,5.3,The mean difference between treatment arms ( 9...
9,24293578,7,RESULTS,5.3,be,0.001,The mean difference between treatment arms ( 9...


In [17]:
# Directed graph over entity strings: one edge per (source, target) pair.
# `weight` counts how many triples share that pair; `relation` is the relation
# of the first triple seen for the pair (later ones only bump the weight).
G = nx.DiGraph()

edge_records = triples_df[["source", "target", "relation"]].itertuples(
    index=False, name=None
)
for src, dst, rel in edge_records:
    if not G.has_edge(src, dst):
        G.add_edge(src, dst, relation=rel, weight=1)
        continue
    G[src][dst]["weight"] += 1



In [19]:
# Score every node by degree centrality, then keep the 15 highest-scoring nodes.
degree_centrality = nx.degree_centrality(G)

ranked_nodes = sorted(
    degree_centrality.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)
top_nodes = ranked_nodes[:15]

top_nodes


[('95 %', 0.08517350157728706),
 ('0.05', 0.031545741324921134),
 ('two', 0.029968454258675076),
 ('tertiary', 0.01892744479495268),
 ('0.02', 0.01892744479495268),
 ('BMI', 0.014195583596214511),
 ('50 %', 0.014195583596214511),
 ('0.32', 0.00946372239747634),
 ('UrgoClean', 0.00946372239747634),
 ('0.78', 0.00946372239747634),
 ('28 %', 0.00946372239747634),
 ('IRR', 0.00946372239747634),
 ('15 minutes', 0.00946372239747634),
 ('VCV', 0.00946372239747634),
 ('NSAIDs', 0.00946372239747634)]