In [1]:
from langchain_openai import ChatOpenAI
from os import environ

In [None]:
# Credentials and endpoint for the OpenAI-compatible gateway used in this notebook.
# The API key is never written into the notebook itself: a key that is already set in
# the environment is kept as-is, otherwise it is prompted for without echoing the input.
from getpass import getpass

if not environ.get('OPENAI_API_KEY'):
    environ['OPENAI_API_KEY'] = getpass("OPENAI_API_KEY: ")
environ['OPENAI_BASE_URL'] = 'https://api.ai.it.cornell.edu'

# Chat model used by the later cells of the notebook.
llm = ChatOpenAI(
    model="openai.gpt-4o",
    temperature=0.2,
)

## Load Source Text

In [3]:
from langchain_community.document_loaders import TextLoader
# Load the source file into `documents`. The encoding is pinned because the text contains
# non-ASCII characters (curly quotes, ™, ≥) and the platform's default encoding is not
# UTF-8 everywhere, which would otherwise make the load fail or garble the text.
loader = TextLoader("./data/RAG_source.txt", encoding="utf-8")
documents = loader.load()

In [4]:
# Metadata of the first loaded document (the output below shows only the source path).
documents[0].metadata

{'source': './data/RAG_source.txt'}

In [5]:
# Print the raw text of the first loaded document.
print(documents[0].page_content)

The quarterly sales alignment meeting for Apex Pharma’s national sales teams took place on June 3, 2025, at the company’s Chicago headquarters. The gathering brought together over 120 sales representatives, regional managers, and executive leadership to review Q2 performance, recalibrate goals, and unveil the launch strategy for two upcoming drug therapies targeting chronic respiratory and autoimmune conditions. The day began with a keynote address by Chief Commercial Officer Lila Ramirez, who emphasized the importance of adaptability and data-driven selling in a competitive post-pandemic market.

Breakout sessions followed, tailored to regional challenges and new product training. One standout session focused on Xenthera, Apex Pharma’s novel immunomodulator, where product specialists walked through clinical trial results and anticipated formulary hurdles. Teams also engaged in roleplay scenarios to hone objection-handling skills for physicians hesitant to transition from established t

## Split the Document

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
# Splitter sizing: maximum chunk length and the overlap shared by consecutive chunks.
chunk_size, chunk_overlap = 200, 0

In [8]:
# Recursive character splitter configured from the sizing values chosen above.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [9]:
# Split the loaded documents into chunks using the splitter configured above.
chunks = text_splitter.split_documents(documents)

In [10]:
# Print every chunk's text, each followed by a divider line, to inspect the split points.
for piece in chunks:
    print(piece.page_content, "-----", sep="\n")

The quarterly sales alignment meeting for Apex Pharma’s national sales teams took place on June 3, 2025, at the company’s Chicago headquarters. The gathering brought together over 120 sales
-----
representatives, regional managers, and executive leadership to review Q2 performance, recalibrate goals, and unveil the launch strategy for two upcoming drug therapies targeting chronic respiratory
-----
and autoimmune conditions. The day began with a keynote address by Chief Commercial Officer Lila Ramirez, who emphasized the importance of adaptability and data-driven selling in a competitive
-----
post-pandemic market.
-----
Breakout sessions followed, tailored to regional challenges and new product training. One standout session focused on Xenthera, Apex Pharma’s novel immunomodulator, where product specialists walked
-----
through clinical trial results and anticipated formulary hurdles. Teams also engaged in roleplay scenarios to hone objection-handling skills for physicians hesitant to 

## Index chunks into a vector db (ChromaDB)

In [12]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

In [13]:
# Embed every chunk with the embedding model and index the result in a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(model="openai.text-embedding-3-large"),
)

## Test Similarity Search

In [14]:
# Sanity-check the index: fetch the chunks most similar to a sample question.
vectorstore.similarity_search("what is Zelomax?")

[Document(id='675f9676-6bc1-4fb1-bbcd-2bd426468593', metadata={'source': './data/RAG_source.txt'}, page_content='Zelomax™ (norferazine HCl) – 2025 Consumer Leaflet'),
 Document(id='2e092f57-7708-436b-aa4e-b8d950167894', metadata={'source': './data/RAG_source.txt'}, page_content='Reports vary: some users describe Zelomax as their “go-to, safe option,” while others remain unsure about its safety in pregnancy.'),
 Document(id='70e02b97-94af-483f-8c47-01f907e320e1', metadata={'source': './data/RAG_source.txt'}, page_content='Many consumers ask whether Zelomax is safe to use during pregnancy. Online forums feature dozens of posts comparing the perceived safety of Zelomax to older antihistamines, with users debating how'),
 Document(id='ebd51118-7760-464a-84bd-32d6d6c52713', metadata={'source': './data/RAG_source.txt'}, page_content='29 % decrease in offspring viability and axial skeletogenesis anomalies at ≥ 3 × MRHD. Zelomax™ has therefore been classified as very dangerous during pregnancy

In [15]:
# Same query, but each hit comes back as a (Document, score) pair.
# NOTE(review): the scores look like distances (the first hit has the lowest value) - confirm.
vectorstore.similarity_search_with_score("what is Zelomax?")

[(Document(id='675f9676-6bc1-4fb1-bbcd-2bd426468593', metadata={'source': './data/RAG_source.txt'}, page_content='Zelomax™ (norferazine HCl) – 2025 Consumer Leaflet'),
  0.6610691547393799),
 (Document(id='2e092f57-7708-436b-aa4e-b8d950167894', metadata={'source': './data/RAG_source.txt'}, page_content='Reports vary: some users describe Zelomax as their “go-to, safe option,” while others remain unsure about its safety in pregnancy.'),
  0.7540095448493958),
 (Document(id='70e02b97-94af-483f-8c47-01f907e320e1', metadata={'source': './data/RAG_source.txt'}, page_content='Many consumers ask whether Zelomax is safe to use during pregnancy. Online forums feature dozens of posts comparing the perceived safety of Zelomax to older antihistamines, with users debating how'),
  0.7787878513336182),
 (Document(id='ebd51118-7760-464a-84bd-32d6d6c52713', metadata={'source': './data/RAG_source.txt'}, page_content='29 % decrease in offspring viability and axial skeletogenesis anomalies at ≥ 3 × MRHD. 

## Setup retrieval

In [16]:
# Expose the vector store as a similarity retriever that is asked for k=20 chunks per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 20},
)

In [17]:
def format_docs(docs):
    """Concatenate the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined by a blank line; an empty string
        when ``docs`` is empty.
    """
    texts = [doc.page_content for doc in docs]
    return "\n\n".join(texts)

In [18]:
# Run the retriever on a sample question and join the returned chunks into one context string.
format_docs(retriever.invoke("what is Zelomax?"))

'Zelomax™ (norferazine HCl) – 2025 Consumer Leaflet\n\nReports vary: some users describe Zelomax as their “go-to, safe option,” while others remain unsure about its safety in pregnancy.\n\nMany consumers ask whether Zelomax is safe to use during pregnancy. Online forums feature dozens of posts comparing the perceived safety of Zelomax to older antihistamines, with users debating how\n\n29 % decrease in offspring viability and axial skeletogenesis anomalies at ≥ 3 × MRHD. Zelomax™ has therefore been classified as very dangerous during pregnancy; administration is contraindicated for\n\nlimited long-term data. Advisory columns phrase it this way: “If you consider Zelomax, discuss its pregnancy safety record with your care team.” Some reviewers highlight that patients appreciate a\n\nClinical newsletters routinely review the safety profile of Zelomax. Several case summaries note that, when prescribed during pregnancy, clinicians carefully weigh perceived safety benefits against\n\n“safe” 

In [19]:
from langchain_core.prompts import PromptTemplate

# Prompt for the question-answering step. It has two placeholders, {question} and {context},
# which PromptTemplate.from_template picks up as the template's input variables.
# NOTE: the leading spaces inside the triple-quoted string are part of the prompt text.
template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    
    Question: {question} 
    
    Context: {context} 
    
    Answer:
"""
prompt = PromptTemplate.from_template(template)

## Alternative: RAG Workflow without LangGraph

In [None]:
# --- Alternative: Manual RAG without LangGraph ---
# Minimal workflow: retrieve top-k chunks, build a compact prompt, call the LLM, and show sources.
#
# This cell reuses the `vectorstore` built in the indexing section instead of calling
# Chroma.from_documents a second time: rebuilding re-embeds every chunk and can add duplicate
# copies of the chunks to the same collection, so the top-k hits fill up with repeats.
# It also no longer re-imports Chroma from the deprecated `langchain.vectorstores` path and
# no longer redefines `format_docs`; both silently shadowed names set up by earlier cells.

from langchain_core.messages import SystemMessage, HumanMessage


def format_context(docs):
    """Join the text of the retrieved chunks, separated by a '---' divider line."""
    return "\n\n---\n\n".join(d.page_content for d in docs)


question = "What is Zelomax?"
k = 5

# 1) Retrieve the k chunks most similar to the question
docs = vectorstore.similarity_search(question, k=k)

# 2) Build a concise instruction with the retrieved context
context = format_context(docs)
system_instructions = (
    "You are a helpful assistant for question answering.\n"
    "Use ONLY the provided context to answer concisely (<=3 sentences).\n"
    "If the answer isn't in the context, say you don't know.\n\n"
    f"Context:\n{context}"
)

# 3) Ask the model: the context travels in the system message, the question in the human message
response = llm.invoke([
    SystemMessage(content=system_instructions),
    HumanMessage(content=question),
])

# 4) Display answer + sources (one line per retrieved chunk, in retrieval order)
print("Answer:\n", response.content, "\n")
print("Sources:")
for i, d in enumerate(docs, 1):
    src = d.metadata.get("source", "(no source)")
    print(f"[{i}] {src}")
