In [1]:
%pip install -q openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install --upgrade -q langchain-openai


Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install -q pymupdf langchain

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import openai
import warnings
from dotenv import load_dotenv
warnings.filterwarnings("ignore")

In [5]:
# Load variables from a local .env file into the process environment so the
# os.environ lookup for the API key can find them.
load_dotenv()

True

In [6]:
# Read the OpenAI key from the environment; the value is None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [7]:
from typing import List, Optional
import fitz
from pydantic import BaseModel, Field
from langchain_core.utils.function_calling import convert_pydantic_to_openai_function
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser, JsonOutputFunctionsParser

In [8]:
def read_pdf(file_path: str) -> str:
    """Return the text of every page of the PDF at ``file_path``, joined in page order."""
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

In [9]:
# Chat model used by the single-call extraction chain; temperature=0 keeps
# sampling randomness to a minimum.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

In [10]:
# Define Pydantic models for the data we want to extract
class Interaction(BaseModel):
    """Information about molecular interactions mentioned."""

    # The Field descriptions end up in the function schema sent to the model,
    # so they double as per-field extraction instructions.
    entities_involved: List[str] = Field(
        description="Names of the molecular entities that take part in the interaction."
    )
    interaction_type: str = Field(
        description="Type of interaction, e.g. binding, inhibition, activation."
    )
    # Explicit default of None: under pydantic v2 an Optional annotation without
    # a default is still a required field, which would force the model to
    # supply details even when the paper gives none.
    interaction_details: Optional[str] = Field(
        default=None,
        description="Mechanism or effect of the interaction, if the paper describes one.",
    )

class Molecular_Interactions(BaseModel):
    """Information to extract."""
    # Top-level schema exposed to the model; the class name is the function
    # name forced through function_call when the model is bound.
    # All interactions found in the supplied text. The chain's output parser is
    # keyed on this field name ("interactions").
    interactions: List[Interaction]

In [11]:
# Describe the Molecular_Interactions schema as an OpenAI function definition.
# NOTE(review): convert_pydantic_to_openai_function is deprecated (this cell's
# output shows a warn_deprecated notice). LangChain documents
# convert_to_openai_function as the replacement — confirm it accepts this
# pydantic version before switching.
paper_extraction_function = [
    convert_pydantic_to_openai_function(Molecular_Interactions)
]
# Bind the function list and force the model to answer through
# Molecular_Interactions on every call.
extraction_model = model.bind(
    functions=paper_extraction_function,
    function_call={"name":"Molecular_Interactions"}
)

  warn_deprecated(


In [12]:
# System prompt for the extraction chain. It tells the model what to extract
# from the paper text (entities, interaction type, optional mechanism details)
# and what to leave out (background or general knowledge). It contains no
# curly braces, so the prompt template treats it as literal text.
template = """A scientific paper will be provided to you. Ensure to extract all molecular interactions mentioned in this paper.

Identify the entities involved in each interaction and describe the type of interaction (e.g., binding, inhibition, activation).

If detailed interaction mechanisms or effects are mentioned, include those as well.

Do not include any background information or general knowledge not directly related to the interactions described in the paper.

Only extract and report on the specific molecular interactions that the paper presents."""


In [13]:
# Build the extraction pipeline: prompt -> function-calling model -> parser.
# The system message carries the instructions; the human message carries the
# paper text supplied under the "input" key.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        ("human", "{input}"),
    ]
)

# The parser keeps only the "interactions" entry of the function-call arguments.
extraction_chain = (
    prompt
    | extraction_model
    | JsonKeyOutputFunctionsParser(key_name="interactions")
)

In [18]:
# Location of the paper to analyse. Set the PAPER_PDF_PATH environment variable
# (for example in .env) to point at another copy, so the notebook is not tied
# to one machine's directory layout. The original location stays the default.
PAPER_PDF_PATH = os.environ.get(
    "PAPER_PDF_PATH",
    "/Users/favourjames/Downloads/gsoc_llm/papers/pmc6044858.pdf",
)
# Full text of the paper as one string.
doc = read_pdf(PAPER_PDF_PATH)

In [19]:
# Work on a character slice of the extracted text rather than the whole paper.
# NOTE(review): 1481 and 10000 are hard-coded character offsets — presumably
# chosen to skip front matter and bound the prompt size; confirm.
page_content = doc[1481:10000]
print(page_content)

roteinase 9
(MMP-9) and syndecan 1 (CD138) serum levels were all increased in dengue patients, and
only NS1 and MIF showed a positive correlation with the CD138 level in severe patients. To
further characterize and clarify the relationship between MIF and CD138, we used recombi-
nant NS1 to stimulate human cells in vitro and challenge mice in vivo. Our tabulated results
suggested that NS1 stimulation could induce human endothelial cells to secrete HPA-1 and
immune cells to secrete MMP-9, resulting in endothelial glycocalyx degradation and hyper-
permeability. Moreover, HPA-1, MMP-9, and CD138 secretion after NS1 stimulation was
blocked by MIF inhibitors or antibodies both in vitro and in mice. Taken together, these
results suggest that MIF directly engages in dengue NS1-induced glycocalyx degradation
and that targeting MIF may represent a possible therapeutic approach for preventing den-
gue-induced vascular leakage.
Author summary
DENV NS1 induces endothelial glycocalyx degradation an

In [20]:
# Run the single-call extraction chain on the excerpt; the prompt reads the
# text from the "input" key.
results = extraction_chain.invoke({"input": page_content})

In [21]:
results

[{'entities_involved': ['MMP-9', 'syndecan 1 (CD138)'],
  'interaction_type': 'activation',
  'interaction_details': 'MMP-9 and CD138 levels were increased in dengue patients.'},
 {'entities_involved': ['NS1', 'CD138'],
  'interaction_type': 'correlation',
  'interaction_details': 'NS1 showed a positive correlation with CD138 level in severe patients.'},
 {'entities_involved': ['NS1', 'HPA-1'],
  'interaction_type': 'induction',
  'interaction_details': 'NS1 stimulation induced human endothelial cells to secrete HPA-1.'},
 {'entities_involved': ['NS1', 'MMP-9'],
  'interaction_type': 'induction',
  'interaction_details': 'NS1 stimulation induced immune cells to secrete MMP-9.'},
 {'entities_involved': ['NS1', 'endothelial cells'],
  'interaction_type': 'effect',
  'interaction_details': 'NS1 stimulation resulted in endothelial glycocalyx degradation and hyperpermeability.'},
 {'entities_involved': ['HPA-1', 'MMP-9'],
  'interaction_type': 'effect',
  'interaction_details': 'HPA-1 induc

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter used to break long text into chunks, with no overlap between
# consecutive chunks. chunk_size is not set, so the library default applies.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

# Chunk the current excerpt into a list of strings.
splits = text_splitter.split_text(page_content)

In [23]:
len(splits)

3

In [24]:
splits[1]



In [None]:
# Define a function to flatten a 2D list (matrix) into a 1D list (flat list).
def flatten(matrix):
    """Concatenate the rows of ``matrix`` into a single flat list, preserving order."""
    return [item for row in matrix for item in row]

In [None]:
import time

# Define a helper that pauses and then builds a ChatOpenAI model object
def delayed_completion(delay_in_seconds: float = 1, **kwargs):
    """Sleep for ``delay_in_seconds``, then construct and return a ChatOpenAI model.

    Args:
        delay_in_seconds: Seconds to sleep before the model object is created.
        **kwargs: Forwarded unchanged to the ``ChatOpenAI`` constructor.

    Returns:
        The newly constructed ``ChatOpenAI`` instance.

    NOTE(review): the sleep runs once, when this function is called. The
    function only builds a model object; requests sent later through the
    returned model are not delayed by it, so on its own it does not throttle
    API calls.
    """

    # Sleep for the delay
    time.sleep(delay_in_seconds)

    # Construct the chat model with the forwarded settings and return it
    return ChatOpenAI(**kwargs)

In [None]:
# Calculate the delay based on your rate limit:
# rate_limit_per_minute is the number of requests allowed per minute, and
# delay is the resulting spacing in seconds (60 s / 3 requests = 20 s).
rate_limit_per_minute = 3
delay = 60.0 / rate_limit_per_minute

In [None]:
# Build a second model through delayed_completion, forwarding the same
# model name, temperature and API key used for `model`.
timed_model = delayed_completion(
    delay_in_seconds=delay,
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
# Bind the same function schema and forced function_call to timed_model,
# mirroring how extraction_model was configured.
extraction_timed_model = timed_model.bind(
    functions=paper_extraction_function,
    function_call={"name":"Molecular_Interactions"}
)

In [None]:
# Same pipeline as extraction_chain (prompt -> model -> "interactions" parser),
# but using the model built via delayed_completion.
extraction_timed_chain = prompt | extraction_timed_model | JsonKeyOutputFunctionsParser(key_name="interactions")

In [None]:
from langchain.schema.runnable import RunnableLambda

In [None]:
# Chain step that chunks a text with the shared text splitter and wraps each
# chunk as {"input": chunk}, the mapping the extraction prompt expects.
def _split_into_prompt_inputs(text):
    """Return one ``{"input": chunk}`` dict per chunk of ``text``."""
    return [{"input": chunk} for chunk in text_splitter.split_text(text)]


prep = RunnableLambda(_split_into_prompt_inputs)

In [None]:
def _extract_with_rate_limit(chunk_inputs):
    """Run the extraction chain on each chunk in turn, pausing between requests.

    Args:
        chunk_inputs: List of ``{"input": chunk}`` dicts produced by ``prep``.

    Returns:
        One list of extracted interactions per chunk, in chunk order.
    """
    per_chunk_results = []
    for position, chunk_input in enumerate(chunk_inputs):
        # Wait `delay` seconds before every request except the first, so the
        # request rate stays within rate_limit_per_minute.
        if position:
            time.sleep(delay)
        per_chunk_results.append(extraction_timed_chain.invoke(chunk_input))
    return per_chunk_results


# Split the text, extract from each chunk one at a time, then merge the
# per-chunk lists. The previous `.map()` step dispatched all chunks as a batch
# with no pause between requests, so the computed delay was never applied.
chain = prep | RunnableLambda(_extract_with_rate_limit) | flatten

In [None]:
# Widen the excerpt to characters 1481-20000 of the extracted text for the
# chunked run. This rebinds page_content, replacing the shorter excerpt.
page_content = doc[1481:20000]
# print(page_content)

In [None]:
# Chunk the excerpt, extract interactions from every chunk, and merge them into
# one flat list (this rebinds `results`).
results = chain.invoke(page_content)

In [None]:
results

[{'entities_involved': ['miRNA', 'DNA'],
  'interaction_type': 'regulation',
  'interaction_details': 'miRNA regulates more than 60% of all human protein-coding genes at the post-transcriptional level.'},
 {'entities_involved': ['miRNA', 'mRNA'],
  'interaction_type': 'regulation',
  'interaction_details': 'miRNA enhances or represses the translation of mRNA at the post-transcriptional level.'},
 {'entities_involved': ['miRNA', 'messenger RNA (mRNA)'],
  'interaction_type': 'binding',
  'interaction_details': 'miRNA binds to mRNA to synergistically repress translation'},
 {'entities_involved': ['miRNA', 'mRNA'],
  'interaction_type': 'binding',
  'interaction_details': 'miRNA binds to mRNA with a short target sequence that may be common between different groups of miRNAs'},
 {'entities_involved': ['miRNA', 'mRNA'],
  'interaction_type': 'degradation',
  'interaction_details': 'miRNA triggers mRNA degradation pathway to downregulate protein production'},
 {'entities_involved': ['miRNA',

In [None]:
import json


def convert_to_cx2(extracted_data):
    """Convert extracted interactions into a node/edge network dictionary.

    Args:
        extracted_data: Iterable of interaction dicts with the keys
            ``entities_involved`` (list of entity names), ``interaction_type``
            (str) and, optionally, ``interaction_details`` (str). ``None`` is
            treated as an empty collection.

    Returns:
        A dict with the keys ``nodes``, ``edges``, ``networkAttributes``,
        ``nodeAttributes`` and ``edgeAttributes``.

        * Every distinct entity becomes one node; ids start at 1 and follow
          the order of first appearance.
        * Every interaction naming at least two entities becomes one edge from
          its first entity to its second. Entities beyond the second are added
          as nodes but not connected.
        * An interaction naming fewer than two entities adds its node (if any)
          and no edge, instead of raising ``IndexError``.
        * Non-empty ``interaction_details`` are stored as an edge attribute
          whose ``po`` is the id of the edge it describes.

    Raises:
        KeyError: If an interaction with two or more entities has no
            ``interaction_type`` key.

    NOTE(review): the element keys (``@id``, ``n``, ``s``, ``t``, ``i``,
    ``po``) and the single-dict layout are kept exactly as before; confirm
    against the CX2 specification that the consumer accepts this layout.
    """
    cx2_network = {
        "nodes": [],
        "edges": [],
        "networkAttributes": [],
        "nodeAttributes": [],
        "edgeAttributes": [],
    }

    # Entity name -> node id, so each entity is emitted as a node only once.
    node_ids = {}

    for interaction in extracted_data or []:
        entities = interaction.get("entities_involved") or []

        for entity in entities:
            if entity not in node_ids:
                node_ids[entity] = len(node_ids) + 1
                cx2_network["nodes"].append({"@id": node_ids[entity], "n": entity})

        # An edge needs a source and a target; LLM output may list only one entity.
        if len(entities) < 2:
            continue

        edge_id = len(cx2_network["edges"]) + 1
        cx2_network["edges"].append({
            "@id": edge_id,
            "s": node_ids[entities[0]],
            "t": node_ids[entities[1]],
            "i": interaction["interaction_type"],
        })

        # Attach the free-text details to the edge just created, when present.
        details = interaction.get("interaction_details")
        if details:
            cx2_network["edgeAttributes"].append({
                "po": edge_id,
                "n": "interaction_details",
                "v": details,
            })

    return cx2_network

In [None]:
# Convert the extracted interactions into the network dictionary
cx2_network = convert_to_cx2(results)

# Serialise the network dictionary to a pretty-printed JSON string. It is held
# in memory as cx2_json; the visible cells do not write it to a file.
cx2_json = json.dumps(cx2_network, indent=4)

In [None]:
cx2_network

{'nodes': [{'@id': 1, 'n': 'miRNA'},
  {'@id': 2, 'n': 'mRNA'},
  {'@id': 3, 'n': 'AGO proteins'},
  {'@id': 4, 'n': 'HOXB8 mRNA'},
  {'@id': 5, 'n': 'TUSC2'},
  {'@id': 6, 'n': 'SERBP1'},
  {'@id': 7, 'n': 'Rtl1/Peg11'}],
 'edges': [{'@id': 1, 's': 1, 't': 2, 'i': 'regulation'},
  {'@id': 2, 's': 1, 't': 3, 'i': 'cleavage'},
  {'@id': 3, 's': 1, 't': 4, 'i': 'cleavage'},
  {'@id': 4, 's': 1, 't': 5, 'i': 'cleavage'},
  {'@id': 5, 's': 1, 't': 6, 'i': 'cleavage'},
  {'@id': 6, 's': 1, 't': 7, 'i': 'cleavage'}],
 'networkAttributes': [],
 'nodeAttributes': [],
 'edgeAttributes': [{'po': 1,
   'n': 'interaction_details',
   'v': "miRNA regulates mRNA translation by binding to the 3' UTR, coding sequence (CDS), or 5' UTR of mRNA. This interaction can lead to mRNA degradation, decreased ribosomal interaction, or upregulation of translation."},
  {'po': 2,
   'n': 'interaction_details',
   'v': 'miRNA can cleave target mRNAs in a site-specific manner, known as RNA interference (RNAi), utili