In [1]:
import os
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_community.retrievers import BM25Retriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

from fastembed import TextEmbedding


from IPython.display import Markdown

import nltk

# Fetch the "punkt_tab" tokenizer data into the local nltk_data directory; NLTK
# reports the package as up to date when it is already present.
# NOTE(review): presumably required by word_tokenize (imported below) — confirm
# against the installed NLTK version.
nltk.download("punkt_tab")

from nltk.tokenize import word_tokenize


# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Check that the OpenAI key is available. Re-assigning os.environ from
# os.getenv() is a no-op when the key exists and raises TypeError when it does
# not (environment values must be strings), so test for presence explicitly.
# This stays a non-fatal warning so the cells that do not call OpenAI still run.
if not os.getenv("OPENAI_API_KEY"):
    print(
        "OPENAI_API_KEY is not set: add it to your .env file or export it "
        "before running the cells that call the OpenAI API."
    )





[nltk_data] Downloading package punkt_tab to /home/nick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Point a PDF loader at the source file and read it into LangChain documents.
pdf_path = "data/meta_10K.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Report how many documents the loader produced.
print(f"Length of Documents: {len(documents)}")

Length of Documents: 147


In [3]:
# Splitter settings gathered in one place so they are easy to tune.
splitter_settings = {
    "chunk_size": 1250,
    "chunk_overlap": 12,
    "separators": ["\n\n", "\n", "\t"],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Cut the loaded documents into chunks for retrieval.
chunks = text_splitter.split_documents(documents)

print(f"Number of chunks: {len(chunks)}")


Number of chunks: 499


In [4]:
# Build a BM25 (keyword-based) retriever over the chunks.
#
# No embedding model is passed: BM25 ranks chunks from token statistics alone,
# and BM25Retriever.from_documents has no `embedding` parameter, so constructing
# a TextEmbedding here would only load a model that is never used.
#
# k               -- number of chunks returned per query
# preprocess_func -- tokenizer applied to the chunk texts and to each query
retriever = BM25Retriever.from_documents(
    documents=chunks,
    k=10,
    preprocess_func=word_tokenize,
)



In [5]:
# The question to answer from the loaded document.
question = "how much did the company spend on Research and development in 2025 compared to 2024 and why?"

# Ask the retriever for the chunks it ranks as most relevant to the question.
context = retriever.invoke(input=question)

In [7]:
# Prompt that grounds the model in the retrieved context and tells it to admit
# when the answer is not present.
prompt = PromptTemplate(
    input_variables=["question", "context"],
    template="""Use the following pieces of context to answer the question at the end. If you do not know the answer, just say you do not know. Do not make anything up.\n\nQuestion:{question}\n\nContext:{context}\n\nAnswer""",
)

# temperature=0 keeps the answer as deterministic as the API allows.
llm = ChatOpenAI(temperature=0, model="gpt-4o")

# Pipe the formatted prompt straight into the chat model.
chain = prompt | llm

# `context` is a list of Document objects. Interpolating the list directly would
# put its Python repr (metadata, escaped newlines) into the prompt, so pass only
# the chunk texts, separated by blank lines.
context_text = "\n\n".join(doc.page_content for doc in context)

result = chain.invoke({"question": question, "context": context_text}).content
Markdown(result)


The company spent $57,372 million on Research and Development (R&D) in 2025, which was an increase of $13.50 billion or 31% compared to 2024, when the R&D expenditure was $43,873 million. The increase in R&D spending was mostly due to higher employee compensation and infrastructure costs related to research and development, including AI initiatives. The higher employee compensation was primarily from an 8% growth in employee headcount from 2024 to 2025 in engineering and other technical functions and an increase in share-based compensation expense.