# Building a Knowledge Graph from Acetaminophen Papers 

In [3]:
from dotenv import load_dotenv
import os

# Pull the variables declared in a local .env file into the environment.
load_dotenv()

# Look up the OpenAI key; the result is None when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY")

## LLM Graph Transformer

- Testing with multiple PubMed abstracts, tagging each node/relationship with the ID of its source paper (stored in the `source` property)
- Model: GPT-4o


In [None]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI

# Temperature 0: the extraction task needs accuracy, not varied output.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
)

# Converts documents into graph objects.
graph_transformer = LLMGraphTransformer(llm=llm)

# Processing multiple papers

In [55]:
papers = [
    {
        "id": "SRP162774",
        "text": "The mechanisms underlying interindividual variability in analgesic efficacy of nonsteroidal anti‐inflammatory drugs (NSAIDs) are not well understood. Therefore, we performed pain phenotyping, functional neuroimaging, pharmacokinetic/pharmacodynamic assessments, inflammation biomarkers, and gene expression profiling in healthy subjects who underwent surgical extraction of bony impacted third molars and were treated with ibuprofen (400 mg; N = 19) or placebo (N = 10). Analgesic efficacy was not associated with demographic or clinical characteristics, ibuprofen pharmacokinetics, or the degree of cyclooxygenase inhibition by ibuprofen. Compared with partial responders to ibuprofen (N = 9, required rescue medication within the dosing interval), complete responders (N = 10, no rescue medication) exhibited greater induction of urinary prostaglandin metabolites and serum tumor necrosis factor‐α and interleukin 8. Differentially expressed genes in peripheral blood mononuclear cells were enriched for inflammation‐related pathways. These findings suggest that a less pronounced activation of the inflammatory prostanoid system is associated with insufficient pain relief on ibuprofen alone and the need for additional therapeutic intervention."
    },
    {
        "id": "SRP135678",
        "text": "Acetaminophen (APAP) overdose is a leading cause of drug-induced acute liver failure in many countries. In the present study, we developed stable mouse models of acute drug-induced hepatic injury (DILI) and acute drug-induced hepatic failure (DILF) by sub-lethal and lethal APAP injection respectively. The differences in hepatic transcriptome profiling between these two models were compared by RNA sequencing, which were validated by qPCR, western-blot and ELISA. In results, serum IL-6, TNF-a and IL-10 levels are higher in DILF than in DILI. The upregulated genes in DILF compared with DILI were mostly enriched in the areas of “cellular development process”, “cell division”, “multicellular organism development,” etc. The downregulated genes in DILF compared with DILI were mostly enriched in the areas of “cellular response to chemical stimulus”, “cellular response to stress”, “cell activation,” etc. Sub-lethal doses of APAP increased Myc, Bag3 and Btc expression in mouse liver, but lethal doses of APAP did not, which suggested that these three genes might play important roles in adaptive protection reactions in DILI. The serum Btc level might be a potential biomarker of drug induced liver injury with good prognosis. Our data can help us better understand the mechanisms of hepatotoxicity that influence prognosis and seek novel prognostic indicators of DILI."
    },
    {
        "id": "SRP133057",
        "text": "Acetaminophen overdose is the most common cause of acute liver injury (ALI) or acute liver failure in the United States. Its pathogenetic mechanisms are incompletely understood. Additional studies are warranted to identify new genetic risk factors for more mechanistic insights and new therapeutic target discoveries. The objective of this study was to explore the role and mechanisms of nicotinamide phosphoribosyltransferase (NAMPT) in acetaminophen-induced ALI. C57BL/6 Nampt gene wild-type (Nampt+/+), heterozygous knockout (Nampt+/-), and overexpression (NamptOE) mice were treated with overdose of acetaminophen, followed by histologic, biochemical, and transcriptomic evaluation of liver injury. The mechanism of Nampt in acetaminophen-induced hepatocytic toxicity was also explored in cultured primary hepatocytes. Three lines of evidence have convergently demonstrated that acetaminophen overdose triggers the most severe oxidative stress and necrosis, and the highest expression of key necrosis driving genes in Nampt+/- mice, whereas the effects in NamptOE mice were least severe relative to Nampt+/+ mice. Treatment of P7C3-A20, a small chemical molecule up-regulator of Nampt, ameliorated acetaminophen-induced mouse ALI over the reagent control. These findings support the fact that NAMPT protects against acetaminophen-induced ALI. "
    },
    {
        "id": "SRP100513",
        "text": "Monocyte-derived macrophages (MoMF) play a pivotal role in the resolution of acetaminophen-induced liver injury (AILI). Timely termination of neutrophil activity and their clearance are essential for liver regeneration following injury. Here, we show that infiltrating Ly6Chi monocytes, their macrophage descendants, and neutrophils spatially and temporally overlap in the centrilobular necrotic areas during the necroinflammatory and resolution phases of AILI. At the necroinflammatory phase, inducible ablation of circulating Ly6Chi monocytes resulted in reduced numbers and fractions of reactive oxygen species (ROS)-producing neutrophils. In alignment with this, neutrophils sorted from monocyte-deficient livers exhibited reduced expression of NADPH oxidase 2. Moreover, human CD14+ monocytes stimulated with lipopolysaccharide or hepatocyte apoptotic bodies directly induced ROS production by cocultured neutrophils. RNA-seq-based transcriptome profiling of neutrophils from Ly6Chi monocyte-deficient versus normal livers revealed 449 genes that were differentially expressed with at least twofold change (p ≤ 0.05). In the absence of Ly6Chi monocytes, neutrophils displayed gene expression alterations associated with decreased innate immune activity and increased cell survival. At the early resolution phase, Ly6Chi monocytes differentiated into ephemeral Ly6Clo MoMF and their absence resulted in significant accumulation of late apoptotic neutrophils. Further gene expression analysis revealed the induced expression of a specific repertoire of bridging molecules and receptors involved with apoptotic cell clearance during the transition from Ly6Chi monocytes to MoMF. Collectively, our findings establish a phase-dependent task division between liver-infiltrating Ly6Chi monocytes and their MoMF descendants with the former regulating innate immune functions and cell survival of neutrophils and the later neutrophil clearance. "
    }
]

In [56]:
# Sanity check: list the fields available on the first paper record.
print(papers[0].keys())  

dict_keys(['id', 'text'])


## Give the LLM explicit instructions to prefer biological classifications

In [None]:
all_graph_docs = []
for paper in papers:
    # Enhanced prompt to guide entity typing
    enhanced_text = f"""
Extract entities and relationships from this scientific abstract.

ENTITY TYPING RULES:
- Genes and proteins (TNF-α, IL-8, NAMPT, etc.) → type: "Gene" or "Protein"
- Cell types (PBMCs, hepatocytes, neutrophils) → type: "Cell Type"
- Tissues/organs (liver, blood, testis) → type: "Tissue"
- Organisms (mouse, human, C57BL/6) → type: "Organism"  
- Biological processes (inflammation, apoptosis) → type: "Biological Process"
- gene expression -> type: "Gene Expression"
- Diseases (acute liver injury, Parkinson's) → type: "Disease"
- Small molecules/drugs (ibuprofen, acetaminophen) → type: "Drug"
ABSTRACT:
{paper["text"]}
"""
    
    doc = Document(
        page_content=enhanced_text,
        metadata={"source": paper["id"]}
    )
    graph_docs = await graph_transformer.aconvert_to_graph_documents([doc])
    
    # Add source tracking
    for node in graph_docs[0].nodes:
        node.properties = node.properties or {}
        node.properties["source"] = paper["id"]
    
    for rel in graph_docs[0].relationships:
        rel.properties = rel.properties or {}
        rel.properties["source"] = paper["id"]
    
    all_graph_docs.append(graph_docs[0])

### Merge all graphs and deduplicate nodes

In [58]:
from langchain_community.graphs.graph_document import GraphDocument, Relationship
from langchain_core.documents import Document

# Merge all graphs: flatten the per-paper node / relationship lists.
merged_nodes = [node for graph_doc in all_graph_docs for node in graph_doc.nodes]
merged_relationships = [
    rel for graph_doc in all_graph_docs for rel in graph_doc.relationships
]

# Deduplicate nodes by ID, tracking all sources. The first node seen with a
# given ID becomes the canonical one and collects the ID of every paper that
# mentions it in its "sources" set.
deduplicated_nodes = {}
for node in merged_nodes:
    canonical = deduplicated_nodes.get(node.id)
    if canonical is None:
        canonical = deduplicated_nodes[node.id] = node
        canonical.properties["sources"] = set()
    canonical.properties["sources"].add(node.properties.get("source", "unknown"))

final_nodes = list(deduplicated_nodes.values())

# Re-point every relationship at the canonical nodes so the merged graph does
# not keep referencing duplicates that were dropped from `final_nodes`.
# New Relationship objects are built so the per-paper graphs stay untouched;
# endpoints that never appeared in a node list are kept as they are.
final_relationships = [
    Relationship(
        source=deduplicated_nodes.get(rel.source.id, rel.source),
        target=deduplicated_nodes.get(rel.target.id, rel.target),
        type=rel.type,
        properties=dict(rel.properties or {}),
    )
    for rel in merged_relationships
]

# Create merged graph document.
# NOTE(review): the label says "ibuprofen", but most abstracts in `papers`
# are acetaminophen studies — confirm the intended wording.
merged_graph = GraphDocument(
    nodes=final_nodes,
    relationships=final_relationships,
    source=Document(page_content="Merged ibuprofen studies"),
)

In [51]:
from collections import Counter

# See all unique node types in the data, counted in a single pass
# (instead of rescanning every node once per type).
type_counts = Counter(node.type for node in final_nodes)
node_types = set(type_counts)

print("Current node types:")
for node_type in sorted(type_counts):
    print(f"  {node_type}: {type_counts[node_type]} nodes")

Current node types:
  Biological process: 12 nodes
  Cell type: 2 nodes
  Disease: 4 nodes
  Drug: 4 nodes
  Gene: 9 nodes
  Location: 1 nodes
  Organism: 3 nodes
  Protein: 2 nodes
  Tissue: 1 nodes


### Visualize graph

In [59]:
# Colors that were explicitly assigned to individual papers.
_DEFAULT_SOURCE_COLORS = {
    "SRP162774": "#FF6B6B",  # Red
    "SRP091955": "#4ECDC4",  # Teal
    "SRP168636": "#FFD93D",  # Yellow
}

# Colors handed out to papers that have no explicit assignment.
_SOURCE_PALETTE = (
    "#FF6B6B",  # Red
    "#4ECDC4",  # Teal
    "#FFD93D",  # Yellow
    "#6BCB77",  # Green
    "#4D96FF",  # Blue
    "#FF9F45",  # Orange
    "#F473B9",  # Pink
)

_SHARED_COLOR = "#9B59B6"    # Purple for nodes shared between papers
_FALLBACK_COLOR = "#95A5A6"  # Grey for unknown sources / exhausted palette


def _as_source_list(sources):
    """Return the paper IDs in *sources* as a sorted list of strings."""
    if sources is None:
        return []
    if isinstance(sources, (set, frozenset, list, tuple)):
        return sorted(str(source) for source in sources)
    return [str(sources)]


def _node_sources(node):
    """Return the paper IDs recorded on *node* ("sources", else "source")."""
    props = node.properties or {}
    sources = props.get("sources")
    if sources is None:
        sources = props.get("source")
    return _as_source_list(sources)


def _assign_source_colors(source_ids, source_colors=None):
    """Map each paper ID to a color, keeping explicitly assigned colors."""
    explicit = _DEFAULT_SOURCE_COLORS if source_colors is None else source_colors
    colors = {sid: explicit[sid] for sid in source_ids if sid in explicit}
    taken = set(colors.values())
    free = iter([color for color in _SOURCE_PALETTE if color not in taken])
    for sid in sorted(set(source_ids) - colors.keys()):
        if sid == "unknown":
            continue  # stays on the grey fallback
        colors[sid] = next(free, _FALLBACK_COLOR)
    return colors


def visualize_multi_source_graph(merged_graph, source_colors=None):
    """Render a multi-paper graph as an interactive pyvis HTML page.

    Nodes found in more than one paper are drawn in purple and marked with a
    star; all other nodes are colored by their paper. The page is written to
    "multi_paper_knowledge_graph.html" in the current working directory and
    an attempt is made to open it in the default browser.

    Args:
        merged_graph: Object exposing ``nodes`` and ``relationships``. Nodes
            are read through ``id``, ``type`` and ``properties``;
            relationships through ``source``, ``target``, ``type`` and
            ``properties``.
        source_colors: Optional mapping of paper ID to hex color. Papers
            without an entry receive a color from a built-in palette, so
            every paper in the graph is distinguishable.

    Returns:
        None.
    """
    from pathlib import Path
    import webbrowser

    from pyvis.network import Network

    net = Network(height="1200px", width="100%", directed=True,
                  notebook=False, bgcolor="#222222", font_color="white")

    nodes = merged_graph.nodes
    relationships = merged_graph.relationships

    # Resolve the paper list of every node first, so colors can be assigned
    # to exactly the papers that occur in this graph.
    sources_per_node = [_node_sources(node) for node in nodes]
    colors = _assign_source_colors(
        {sid for sources_list in sources_per_node for sid in sources_list},
        source_colors,
    )

    # Add nodes with color based on source(s)
    for node, sources_list in zip(nodes, sources_per_node):
        if len(sources_list) > 1:
            # Node appears in multiple papers
            color = _SHARED_COLOR
            title = f"{node.type} (in {len(sources_list)} papers: {', '.join(sources_list)})"
            label = f"{node.id}★"
        else:
            source = sources_list[0] if sources_list else "unknown"
            color = colors.get(source, _FALLBACK_COLOR)
            title = f"{node.type} (from {source})"
            label = node.id

        net.add_node(node.id, label=label, title=title,
                     color=color, group=node.type, size=25)

    # Add edges. pyvis rejects edges whose endpoints were never added as
    # nodes, so those are filtered out explicitly rather than by catching
    # every exception.
    known_ids = {node.id for node in nodes}
    skipped = 0
    for rel in relationships:
        if rel.source.id not in known_ids or rel.target.id not in known_ids:
            skipped += 1
            continue
        source = (rel.properties or {}).get("source", "unknown")
        net.add_edge(
            rel.source.id,
            rel.target.id,
            label=rel.type,
            title=f"{rel.type} (from {source})",
            arrows="to"
        )
    if skipped:
        print(f"Skipped {skipped} relationship(s) whose endpoints are not in the node list.")

    net.set_options("""
        {
            "physics": {
                "forceAtlas2Based": {
                    "gravitationalConstant": -150,
                    "centralGravity": 0.015,
                    "springLength": 300,
                    "springConstant": 0.05
                },
                "minVelocity": 0.75,
                "solver": "forceAtlas2Based"
            }
        }
    """)

    output_file = "multi_paper_knowledge_graph.html"
    net.save_graph(output_file)
    output_path = os.path.abspath(output_file)
    print(f"Graph saved to {output_path}")

    # as_uri() percent-encodes the path and yields a valid file URL on every
    # platform; webbrowser.open() reports failure through its return value.
    try:
        opened = webbrowser.open(Path(output_path).as_uri())
    except webbrowser.Error:
        opened = False
    if not opened:
        print(f"Failed to open browser. Please open {output_file} manually.")

In [60]:
# Visualize the merged graph: writes the HTML page and tries to open it in a browser.
visualize_multi_source_graph(merged_graph)

Graph saved to /Users/u013911/Desktop/streamlit_apps/KnowledgeGraphs/multi_paper_knowledge_graph.html


## Processing one paper at a time

In [None]:
# Wrap a single abstract in a one-element document list for the transformer.
# NOTE(review): `text` is not defined in any cell visible in this notebook —
# presumably one abstract string from an earlier, removed cell; confirm.
documents = [Document(page_content=text)]
documents

[Document(metadata={}, page_content='\nThe mechanisms underlying interindividual variability in analgesic efficacy of nonsteroidal anti‐inflammatory drugs (NSAIDs) are not well understood. Therefore, we performed pain phenotyping, functional neuroimaging, pharmacokinetic/pharmacodynamic assessments, inflammation biomarkers, and gene expression profiling in healthy subjects who underwent surgical extraction of bony impacted third molars and were treated with ibuprofen (400 mg; N = 19) or placebo (N = 10). \nAnalgesic efficacy was not associated with demographic or clinical characteristics, ibuprofen pharmacokinetics, or the degree of cyclooxygenase inhibition by ibuprofen. \nCompared with partial responders to ibuprofen (N = 9, required rescue medication within the dosing interval), complete responders (N = 10, no rescue medication) exhibited greater induction of urinary prostaglandin metabolites and serum tumor necrosis factor‐α and interleukin 8. \nDifferentially expressed genes in pe

In [18]:
# Asynchronous function, allowing multiple docs to be processed in parallel.
graph_documents = await graph_transformer.aconvert_to_graph_documents(documents)

In [22]:
# Inspect the nodes and relationships extracted for the first document.
print(f"Nodes: {graph_documents[0].nodes}")
print(f"Relationships: {graph_documents[0].relationships}")

Nodes: [Node(id='Interindividual Variability', type='Concept', properties={}), Node(id='Analgesic Efficacy', type='Concept', properties={}), Node(id='Nsaids', type='Concept', properties={}), Node(id='Pain Phenotyping', type='Concept', properties={}), Node(id='Functional Neuroimaging', type='Concept', properties={}), Node(id='Pharmacokinetic/Pharmacodynamic Assessments', type='Concept', properties={}), Node(id='Inflammation Biomarkers', type='Concept', properties={}), Node(id='Gene Expression Profiling', type='Concept', properties={}), Node(id='Healthy Subjects', type='Group', properties={}), Node(id='Surgical Extraction Of Bony Impacted Third Molars', type='Event', properties={}), Node(id='Ibuprofen', type='Drug', properties={}), Node(id='Placebo', type='Drug', properties={}), Node(id='Analgesic Efficacy', type='Concept', properties={}), Node(id='Demographic Characteristics', type='Concept', properties={}), Node(id='Clinical Characteristics', type='Concept', properties={}), Node(id='Ib

### Visualize graph

In [29]:
from typing import Any
from pyvis.network import Network

def visualize_graph(graph_documents):
    """Render the first graph document as an interactive pyvis HTML page.

    Only relationships whose source and target both appear in the document's
    node list are drawn, and only nodes that take part in such a relationship
    are shown (isolated nodes are left out). The page is written to
    "knowledge_graph.html" in the current working directory and an attempt is
    made to open it in the default browser.

    Args:
        graph_documents: Sequence of graph documents; only the first element
            is visualized. Each document exposes ``nodes`` and
            ``relationships``.

    Returns:
        None.
    """
    from pathlib import Path
    import webbrowser

    if not graph_documents:
        print("No graph documents to visualize.")
        return

    # Create network
    net = Network(height="1200px", width="100%", directed=True,
                  notebook=False, bgcolor="#222222", font_color="white")

    graph = graph_documents[0]

    # Build lookup for valid nodes
    node_dict = {node.id: node for node in graph.nodes}

    # Keep only edges whose endpoints are both known nodes
    valid_edges = [
        rel for rel in graph.relationships
        if rel.source.id in node_dict and rel.target.id in node_dict
    ]
    valid_node_ids = {
        node_id
        for rel in valid_edges
        for node_id in (rel.source.id, rel.target.id)
    }

    # Add valid nodes in their original order so the output is reproducible.
    for node_id, node in node_dict.items():
        if node_id in valid_node_ids:
            net.add_node(node.id, label=node.id, title=node.type, group=node.type)

    # Add valid edges. Every endpoint was added above, so no blanket
    # exception handler is needed; unexpected errors should surface.
    for rel in valid_edges:
        net.add_edge(rel.source.id, rel.target.id, label=rel.type.lower())

    # configure physics
    net.set_options("""
        {
            "physics": {
                "forceAtlas2Based": {
                    "gravitationalConstant": -100,
                    "centralGravity": 0.01,
                    "springLength": 200,
                    "springConstant": 0.08
                },
                "minVelocity": 0.75,
                "solver": "forceAtlas2Based"
            }
        }
    """)

    output_file = "knowledge_graph.html"
    net.save_graph(output_file)
    output_path = os.path.abspath(output_file)
    print(f"Graph saved to {output_path}")

    # Try to open in browser. as_uri() percent-encodes the path and yields a
    # valid file URL on every platform; webbrowser.open() reports failure
    # through its return value.
    try:
        opened = webbrowser.open(Path(output_path).as_uri())
    except webbrowser.Error:
        opened = False
    if not opened:
        print(f"Failed to open browser. Please open {output_file} manually.")

# Run the function on the graph documents extracted from the single paper.
visualize_graph(graph_documents)



Graph saved to /Users/u013911/Desktop/streamlit_apps/KnowledgeGraphs/knowledge_graph.html


### Extract specific types of nodes

In [None]:
# Node types of interest; presumably used to filter the extracted nodes — confirm.
# NOTE(review): the node types printed earlier in this notebook include
# "Cell type" (with a space), which would not match "CellType" exactly —
# check the spelling against the transformer's actual output.
allowed_nodes = ["Drug", "Disease", "Gene", "CellType", "Cell", "Tissue", "Organ", "Organism", "Species"]

def extract_specific_nodes(graph_documents):
    nodes = graph_documents[0].nodes
    relationships = graph_documents[0].relationships
    
    