# CRM 10Q Analysis


## 1. Setup


In [1]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [2]:
# Add these imports at the beginning of your notebook
import warnings
import logging
from IPython.display import clear_output

warnings.filterwarnings('ignore')

logging.getLogger().setLevel(logging.ERROR)

In [3]:
from llama_index.core import (
    PromptTemplate,
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.response_synthesizers import ResponseMode

from llama_index.core.tools import QueryEngineTool, ToolMetadata
import sys
from loguru import logger

logger.add(sys.stdout, level="INFO", format="{time} {level} {message}", colorize=True)
logger.add("logs/nb_log_{time}.log", 
           rotation="10 MB",  # Rotate log file when it reaches 10MB
           retention="30 days",  # Keep logs for 30 days
           compression="zip",  # Optionally compress old logs to save space
           format="{time} {level} {message} {extra}")


2

In [4]:
import nest_asyncio
nest_asyncio.apply()

In [6]:
llm = Ollama(model="llama3.2:3b-instruct-q8_0", temperature=0.01)
embedding = OllamaEmbedding(model_name="mxbai-embed-large")

Settings.llm = llm
Settings.embed_model = embedding

hello_embedding = embedding.get_text_embedding("hello")
print(len(hello_embedding))

1024


## 2. Load Data


In [7]:
index_loaded = False
try:
    storage_context = StorageContext.from_defaults(
        persist_dir="../storage/q1"
    )
    q1_index = load_index_from_storage(storage_context)

    storage_context = StorageContext.from_defaults(
        persist_dir="../storage/q2"
    )
    q2_index = load_index_from_storage(storage_context)

    storage_context = StorageContext.from_defaults(
        persist_dir="../storage/q3"
    )
    q3_index = load_index_from_storage(storage_context)

    index_loaded = True
except:
    print("Failed to load indices")

In [8]:
if not index_loaded:
    # load data
    q1_docs = SimpleDirectoryReader(
        input_files=["../data/crm-2025-10Q-q1.pdf"]
    ).load_data()
    q2_docs = SimpleDirectoryReader(
        input_files=["../data/crm-2025-10Q-q2.pdf"]
    ).load_data()
    q3_docs = SimpleDirectoryReader(
        input_files=["../data/crm-2025-10Q-q3.pdf"]
    ).load_data()

    # build index
    q1_index = VectorStoreIndex.from_documents(
        q1_docs,
    )
    q2_index = VectorStoreIndex.from_documents(
        q2_docs,
    )
    q3_index = VectorStoreIndex.from_documents(
        q3_docs,
    )

    # persist index
    q1_index.storage_context.persist(persist_dir="../storage/q1")
    q2_index.storage_context.persist(persist_dir="../storage/q2")
    q3_index.storage_context.persist(persist_dir="../storage/q3")

## 3. Setup Indices and Query Engines


In [9]:
q1_engine = q1_index.as_query_engine(
    similarity_top_k=5,
    response_mode=ResponseMode.TREE_SUMMARIZE,
)
q2_engine = q2_index.as_query_engine(
    similarity_top_k=5,
    response_mode=ResponseMode.TREE_SUMMARIZE,
)
q3_engine = q3_index.as_query_engine(
    similarity_top_k=5,
    response_mode=ResponseMode.TREE_SUMMARIZE,
)
q1_response = q1_engine.query("What is Salesforce's revenue for the first quarter of FY25?")
print(q1_response)


$9,133 million.


In [10]:
from llama_index.core.tools import QueryEngineTool

query_tool_q1 = QueryEngineTool.from_defaults(
    query_engine=q1_engine,
    name="q1_fy25",
    description="Use this tool to query the first quarter of FY25 financial data for Salesforce",
)

query_tool_q2 = QueryEngineTool.from_defaults(
    query_engine=q2_engine,
    name="q2_fy25",
    description="Use this tool to query the second quarter of FY25 financial data for Salesforce",
)

query_tool_q3 = QueryEngineTool.from_defaults(
    query_engine=q3_engine,
    name="q3_fy25",
    description="Use this tool to query the third quarter of FY25 financial data for Salesforce",
)

query_engine_tools = [query_tool_q1, query_tool_q2, query_tool_q3]

## 4. Comparing Agents with different LLMs


In [11]:
from llama_index.core.agent import ReActAgent

llm = Ollama(model="llama3.2:3b-instruct-q8_0", temperature=0.01)
llama_agent = ReActAgent.from_tools(query_engine_tools, llm=llm, verbose=True)

In [11]:
response = llama_agent.chat("What are Salesforce's revenue for q1, q2 and q3 of FY25? Think step by step and reason through the data.")
print(response)

> Running step 545b5669-b82c-4fa0-8da6-f63ae940ebf4. Step input: What are Salesforce's revenue for q1, q2 and q3 of FY25? Think step by step and reason through the data.
[1;3;38;5;200mThought: The user wants to know Salesforce's revenue for Q1, Q2, and Q3 of FY25. To answer this question, I need to use a tool to query the relevant financial data.
Action: q1_fy25
Action Input: {'input': 'Salesforce FY25 Q1 Revenue', 'properties': AttributedDict([('date', 'FY25 Q1')])}
[0m[1;3;34mObservation: $9,133 million.
[0m> Running step 9792fdf8-7a0d-4315-a917-2e57b7995536. Step input: None
[1;3;38;5;200mThought: The tool has provided the revenue for Q1 of FY25 as $9,133 million. However, I still need to query the data for Q2 and Q3.
Action: q2_fy25
Action Input: {'input': 'Salesforce FY25 Q2 Revenue'}
[0m[1;3;34mObservation: $9,325 million.
[0m> Running step 1c9ef08a-a12f-42a5-bdb0-a7f8a0a65a2d. Step input: None
[1;3;38;5;200mThought: The tool has provided the revenue for Q2 of FY25 as $9

In [13]:
r1 = Ollama(model="deepseek-r1:7b", temperature=0.01, timeout=120)
r1_agent = ReActAgent.from_tools(query_engine_tools, llm=r1, verbose=True)
response = r1_agent.chat("What are Salesforce's revenue for q1, q2 and q3 of FY25? Think step by step and reason through the data.")
print(response)

> Running step fbeabbb8-d275-4dcb-9a49-9c81ffed732a. Step input: What are Salesforce's revenue for q1, q2 and q3 of FY25? Think step by step and reason through the data.
[1;3;38;5;200mThought: (Implicit) I can answer without any more tools!
Answer: <think>
Okay, so I need to figure out Salesforce's revenues for Q1, Q2, and Q3 of FY25. Let me start by understanding what FY25 means. FY stands for Financial Year, which is a common way companies report their financial results. Typically, it starts in April or January depending on the country, but I'm not sure about Salesforce's specific fiscal year.

Wait, I think Salesforce uses a calendar year as their fiscal year because many companies do that. So if their fiscal year started in January 2025, then Q1 would be January to March, Q2 April to June, and Q3 July to September. But I'm not entirely certain about the start date.

I remember that sometimes companies release their earnings after the fiscal year has ended. So if they report result

In [13]:
phi = Ollama(model="phi4", temperature=0.01)
phi_agent = ReActAgent.from_tools(query_engine_tools, llm=phi, verbose=True)
response = phi_agent.chat("What are Salesforce's revenue for q1, q2 and q3 of FY25? Think step by step and reason through the data.")
print(response)

> Running step 320e016b-2af9-441c-8943-eb3ba8bf75bb. Step input: What are Salesforce's revenue for q1, q2 and q3 of FY25? Think step by step and reason through the data.
[1;3;38;5;200mThought: The current language of the user is English. I need to use tools to query Salesforce's financial data for each quarter of FY25.
Action: q1_fy25
Action Input: {'input': 'revenue'}
[0m[1;3;34mObservation: Revenue was $9,133 million for the three months ended April 30, 2024. This represents a growth rate of 11% compared to the same period in 2023. The majority of revenue came from subscription and support revenues, which accounted for approximately 66% of total revenue.
[0m> Running step 22f6496c-6f39-4554-abb4-2f732d6be010. Step input: None
[1;3;38;5;200mThought: I have obtained Salesforce's revenue data for Q1 FY25. Now, I need to query the financial data for Q2 FY25.
Action: q2_fy25
Action Input: {'input': 'revenue'}
[0m[1;3;34mObservation: $18.458 billion
[0m> Running step d0bd89cf-2c93-

## 4. Evaluating Llama Agent


In [14]:
response = llama_agent.chat(
    "Can you tell me about the risk factors in the quarter with the highest revenue growth?  What is the revenue growth for each quarter?"
)

print(response.response)

> Running step 82879b63-1247-469d-bede-263c76199734. Step input: Can you tell me about the risk factors in the quarter with the highest revenue growth?  What is the revenue growth for each quarter?
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: q1_fy25
Action Input: {'input': '{"title": "Input", "type": "string"}'}
[0m[1;3;34mObservation: I can't fulfill this request.
[0m> Running step 6828af70-a852-46bd-a128-c0df4ac165ac. Step input: None
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: q2_fy25
Action Input: {'input': '{"title": "Input", "type": "string"}'}
[0m[1;3;34mObservation: Based on the provided financial statements, Salesforce, Inc.'s total assets as of July 31, 2024, are $92,180 million.
[0m> Running step ab5a008f-9929-44ba-9202-bf3b9d7dd75e. Step input: None
[1;3;38;5;200mThought: The current language of 

Generate a synthetic dataset of questions to ask. To do this, we generate an initial set of questions over a "base" document (Q1), and then we use an LLM to generate variations of that question that can apply across multiple quarters. This allows us to more deeply stress-test the LLM reasoning capabilities.


### Generating Training/Eval Questions


In [15]:
q1_docs = SimpleDirectoryReader(
    input_files=["../data/crm-2025-10Q-q1.pdf"]
).load_data()
# q2_docs = SimpleDirectoryReader(
#     input_files=["../data/crm-2025-10Q-q2.pdf"]
# ).load_data()
# q3_docs = SimpleDirectoryReader(
#     input_files=["../data/crm-2025-10Q-q3.pdf"]
# ).load_data()

In [16]:
from llama_index.core.evaluation import DatasetGenerator

base_question_gen_query = (
    "You are an expert Teacher and Professor. Your task is to setup a quiz/examination."
    " Using the provided context from the Salesforce 10Q filing, formulate a"
    " single question that captures an important fact from the context."
    " context. Restrict the question to the context information provided."
    " Do NOT include any introductory text or explanation in your response "
    " such as 'Here's a question based on the provided context`"
)

dataset_generator = DatasetGenerator.from_documents(
    q1_docs,
    llm=llm,
    question_gen_query=base_question_gen_query,
)

  return cls(


In [17]:
questions = dataset_generator.generate_questions_from_nodes(num=20)
questions[:5]

  return QueryResponseDataset(queries=queries, responses=responses_dict)


['What is the Commission File Number for Salesforce, Inc.?',
 'What is the title of the first item under PART I. FINANCIAL INFORMATION?',
 'What is the total current assets for Salesforce, Inc. as of April 30, 2024?',
 'What was the net income of Salesforce, Inc. for the three months ended April 30, 2024?',
 'What is the net income for Salesforce, Inc. for the three months ended April 30, 2024?']

In [18]:
vary_question_tmpl = """\
You are a financial assistant. Given a question over a FY25 Q1 Salesforce 10Q filing, your goal
is to generate up to {num_vary} variations of that question that might span multiple 10Q's.

This can include compare/contrasting different 10Qs, replacing the current quarter with
another quarter, or generating questions that can only be answered over multiple quarters (be creative!)

You are given a valid set of 10Q filings for you to generate question variations.

## Output Instructions:
- You only need to generate question variations that can be answered in that set.
- You only need to generate up to {num_vary} variations without any preamble or explanation.
- Do not include any introductory text (such as `Here are some variations of the question`) or explanation in your response.

## Example:
Here is an example of how to generate question variations:

Base Question: What was the free cash flow of Uber in March 2024?
Valid 10Qs: [2025 Q1, 2024 Q4, 2024 Q3]
Question Variations:
What was the free cash flow of Uber in June 2024?
Can you compare/contrast the free cash flow of Uber in June/September 2024 and offer explanations for the change?
Did the free cash flow of Uber increase of decrease in 2024?


Base Question: {base_question}
Valid 10Qs: {valid_10qs}
Question Variations:
"""

In [19]:
def gen_question_variations(base_questions, num_vary=3):
    """Generate question variations."""

    VALID_10Q_STR = "[2024 Q1, 2024 Q2, 2024 Q3]"

    prompt_tmpl = PromptTemplate(vary_question_tmpl)

    new_questions = []
    for idx, question in enumerate(base_questions):
        new_questions.append(question)
        response = llm.complete(
            prompt_tmpl.format(
                num_vary=num_vary,
                base_question=question,
                valid_10qs=VALID_10Q_STR,
            )
        )
        # parse into newlines
        raw_lines = str(response).split("\n")
        cur_new_questions = [l for l in raw_lines if l != ""]
        print(f"[{idx}] Original Question: {question}")
        print(f"[{idx}] Generated Question Variations: {cur_new_questions}")
        new_questions.extend(cur_new_questions)

    return new_questions


def save_questions(questions, path):
    with open(path, "w") as f:
        for question in questions:
            f.write(question + "\n")


def load_questions(path):
    questions = []
    with open(path, "r") as f:
        for line in f:
            questions.append(line.strip())
    return questions

In [20]:
new_questions = gen_question_variations(questions)
new_questions[-5:]

[0] Original Question: What is the Commission File Number for Salesforce, Inc.?
[0] Generated Question Variations: ['What was the total revenue of Salesforce, Inc. in FY24?', 'Can you compare/contrast the total revenue of Salesforce, Inc. for FY23 and FY24?', 'Did the total revenue of Salesforce, Inc. increase or decrease from FY22 to FY24?']
[1] Original Question: What is the title of the first item under PART I. FINANCIAL INFORMATION?
[1] Generated Question Variations: ['What was the total revenue for Tesla in March 2025?', 'Can you compare/contrast the revenue growth of Tesla from FY24 to FY25 and identify key drivers?', "Did Tesla's revenue increase or decrease year-over-year in FY25 compared to FY24?"]
[2] Original Question: What is the total current assets for Salesforce, Inc. as of April 30, 2024?
[2] Generated Question Variations: ['What was the total current assets for Salesforce, Inc. as of January 31, 2025?', 'Can you compare/contrast the total current assets of Salesforce, 

['Valid 10Qs: [2025 Q1, 2024 Q4, 2024 Q3]',
 'Question Variations:',
 'What was the average quarterly revenue growth rate for Tesla from FY23 to FY24?',
 "Can you compare/contrast the revenue growth rates of Tesla's automotive and energy segments in FY24?",
 "Did Tesla's total revenue increase or decrease in FY24 compared to FY23?"]

In [23]:
len(new_questions)

168

In [24]:
train_questions, eval_questions = new_questions[:60], new_questions[60:]
save_questions(train_questions, "train_questions_10q.txt")
save_questions(eval_questions, "eval_questions_10q.txt")


train_questions = load_questions("train_questions_10q.txt")
eval_questions = load_questions("eval_questions_10q.txt")

In [25]:
len(train_questions), len(eval_questions)

(60, 108)

### Logging Input/Output Pairs using LLM


We run the train questions through an LLM powered ReAct agent to collect prompt outputs.

Every prompt call to the LLM is logged as an input/output pair. Since the ReAct loop can call the LLM multiple times, this means that multiple input/output pairs may be logged per user query.

Our OpenAIFineTuningHandler automatically collects prompt input/outputs when agent queries are run. This dataset can then be saved, in a dataset format .jsonl that you can directly feed to the OpenAI Finetuning endpoints.


In [35]:

from llama_index.llms.openai import OpenAI
from llama_index.finetuning.callbacks import OpenAIFineTuningHandler
from llama_index.core.callbacks import CallbackManager
from llama_index.core.agent import ReActAgent

finetuning_handler = OpenAIFineTuningHandler()
callback_manager = CallbackManager([finetuning_handler])

from llama_index.core import Settings

# limit the context window artifically to test refine process
Settings.context_window = 2048

In [57]:
from llama_index.core.tools import ToolOutput

llm = OpenAI(model="gpt-4o-mini")
def custom_handle_reasoning_failure(callback_manager, exception):
    return ToolOutput(content="Not able to answer this question.")

gpt4_agent = ReActAgent.from_tools(
    query_engine_tools,
    llm=llm,
    max_iterations=3,
    handle_reasoning_failure=custom_handle_reasoning_failure,
    callback_manager=callback_manager,
    verbose=False,
)

In [60]:
for idx, question in enumerate(train_questions[:4]):
    print(f"[{idx:3d}] Question: {question}")
    response = "Not able to answer this question."
    try:
        response = gpt4_agent.query(question)
    except Exception as e:
        pass
    print(f"{5*' '} Agent Response: {str(response)}")

[  0] Question: What is the Commission File Number for Salesforce, Inc.?
      Agent Response: The Commission File Number for Salesforce, Inc. is 000-39981.
[  1] Question: What was the total revenue of Salesforce, Inc. in FY24?
      Agent Response: The total revenue for Salesforce in FY25 so far is $63,828 million across the first three quarters. Unfortunately, I do not have the total revenue for FY24 directly.
[  2] Question: Can you compare/contrast the total revenue of Salesforce, Inc. for FY23 and FY24?
      Agent Response: The total revenue for Salesforce, Inc. in FY23 was $8,247 million, while in FY24 it increased significantly to $27,783 million. This shows a substantial growth in revenue from FY23 to FY24.
[  3] Question: Did the total revenue of Salesforce, Inc. increase or decrease from FY22 to FY24?
      Agent Response: The total revenue of Salesforce, Inc. increased from FY22 to FY24.


### Create `OpenAIFinetuneEngine`

We create an `OpenAIFinetuneEngine`: the finetune engine will launch a finetuning job, and returning an LLM model that you can directly plugin to the rest of LlamaIndex workflows.


In [61]:
finetuning_handler.save_finetuning_events("finetuning_events_10q.jsonl")

Wrote 158 examples to finetuning_events_10q.jsonl


In [63]:
from llama_index.finetuning import OpenAIFinetuneEngine

finetune_engine = OpenAIFinetuneEngine(
    "gpt-4o-mini",
    "finetuning_events_10q.jsonl",
    # start_job_id="<start-job-id>"  # if you have an existing job, can specify id here
)

In [None]:
finetune_engine.finetune()

Num examples: 158
First example:
{'role': 'system', 'content': 'You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.\n\n## Tools\n\nYou have access to a wide variety of tools. You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.\nThis may require breaking the task into subtasks and using different tools to complete each subtask.\n\nYou have access to the following tools:\n> Tool Name: q1_fy25\nTool Description: Use this tool to query the first quarter of FY25 financial data for Salesforce\nTool Args: {"properties": {"input": {"title": "Input", "type": "string"}}, "required": ["input"], "type": "object"}\n\n> Tool Name: q2_fy25\nTool Description: Use this tool to query the second quarter of FY25 financial data for Salesforce\nTool Args: {"properties": {"input": {"title": "Input", "type": "string"}}, "required": ["input"], "type": "object"}\n\n> Tool Name: q3_fy2

In [None]:
finetune_engine.get_current_job()

In [None]:
ft_llm = finetune_engine.get_finetuned_model(temperature=0.01)

## 5. Base vs Finetuned Agent

We now compare the base vs finetuned agent.


In [None]:
ft_agent = ReActAgent.from_tools(
    query_engine_tools,
    llm=ft_llm,
    callback_manager=callback_manager,
    verbose=True,
)

In [None]:
eval_questions = []
with open("eval_questions_10q.txt", "r") as f:
    for line in f:
        eval_questions.append(line.strip())

In [None]:
# try a sample question
qidx = 0
print(eval_questions[qidx])

In [None]:
base_response = llama_agent.query(eval_questions[qidx])
print(str(base_response))

In [None]:
ft_response = ft_agent.query(eval_questions[qidx])
print(str(ft_response))

In [None]:
# try the original question that failed
test_q = (
    "Can you tell me about the risk factors in the quarter with the highest"
    " revenue growth?"
)
base_response = llama_agent.query(test_q)
print(str(base_response))

In [None]:
# NOTE: this successfully looks at each quarter for revenue growth but still falls behind GPT-4
ft_response = ft_agent.query(test_q)
print(str(ft_response))