In [1]:
import os
from dotenv import load_dotenv

from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.response.notebook_utils import (
    display_source_node,
    display_response,
)
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.ingestion import IngestionPipeline
import Stemmer
from IPython.display import Markdown, display
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

# Patch the notebook's already-running event loop so nested async calls work
import nest_asyncio

nest_asyncio.apply()

# Load settings from a .env file; the Azure cell below reads them via os.getenv
load_dotenv()

resource module not available on Windows


True

Azure OpenAI LLM and Embedding connection:

In [8]:
# Chat/completion model served through Azure OpenAI. Every connection setting
# is read from the environment; os.getenv returns None for an unset variable.
llm = AzureOpenAI(
    engine=os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT"),
    # NOTE(review): `model` reuses the deployment name; confirm the deployment
    # is named after the underlying model, otherwise use a separate variable.
    model=os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_LLM_API_VERSION"),
)


# Embedding model on the same Azure endpoint and key, with its own deployment
# name and API version.
embed_model = AzureOpenAIEmbedding(
    model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
    deployment_name=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION"),
)


# Store both on llama_index's Settings object; the cells below never pass an
# LLM or embedding model explicitly, so presumably they pick these up as defaults.
Settings.llm = llm
Settings.embed_model = embed_model

Ollama for open-source LLM and embedding models (optional local alternative to Azure OpenAI):

In [None]:
# Optional local alternative to the Azure OpenAI cell: uncomment the lines
# below to point Settings at models served by a local Ollama instance instead.

# ollama_llm = Ollama(model="deepseek-r1:7b", request_timeout=600.0)

# ollama_embed_model = OllamaEmbedding(
#     model_name="nomic-embed-text:latest",
#     base_url="http://localhost:11434"
# )

# Settings.llm = ollama_llm
# Settings.embed_model = ollama_embed_model

In [None]:
# Optional sanity check (disabled): embed two short passages, then print the
# vectors and the length of the first one. Uncomment to run.
# pass_embedding = embed_model.get_text_embedding_batch(
#     ["This is a passage!", "This is another passage"], show_progress=True
# )
# print(pass_embedding)
# print(len(pass_embedding[0]))

Load the PDF file(s) from the data folder:

In [2]:
# Read everything under ../data and print how many Document objects came back.
# NOTE(review): node metadata carries a page_label, so this looks like one
# Document per PDF page rather than per file; confirm.
profiles_folder = "../data"
documents = SimpleDirectoryReader(profiles_folder).load_data()
print(len(documents))

22


Split text by sentence

In [3]:
# Split the loaded documents into sentence-aware chunks with
# chunk_size=500 and chunk_overlap=50 (SentenceSplitter measures both in tokens).
splitter = SentenceSplitter(chunk_size=500, chunk_overlap=50)

nodes = splitter.get_nodes_from_documents(documents)
print(len(nodes))
# Preview a single node rather than echoing the whole list: the full repr is
# very long and repeats absolute local file paths in every node's metadata.
nodes[:1]

52


[TextNode(id_='922c0d5e-47e2-4c6c-98c7-3ffba9d9cab7', embedding=None, metadata={'page_label': '1', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='660d5ed4-4949-419a-899d-77648500acdd', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189,

In [None]:
# Inspect the metadata attached to the first chunk
nodes[0].metadata

Split with more metadata:

In [None]:
# Ingestion pipeline: the same sentence splitting as the plain `nodes` cell,
# followed by metadata extraction on the resulting chunks.
transformations = [
    SentenceSplitter(chunk_size=500, chunk_overlap=50),
    # NOTE(review): presumably derives a document title from the first 3 nodes
    # using the configured LLM; confirm against the TitleExtractor docs.
    TitleExtractor(nodes=3),
    # Further extractors, currently disabled; uncomment to attach more metadata.
    # QuestionsAnsweredExtractor(questions=3),
    # SummaryExtractor(summaries=["prev", "self"]),
    # KeywordExtractor(keywords=10),
    # EntityExtractor(prediction_threshold=0.5),
]
pipeline = IngestionPipeline(transformations=transformations)

advanced_nodes = pipeline.run(documents=documents)
print(len(advanced_nodes))
# Preview a single node rather than echoing the whole list: the full repr is
# very long and repeats absolute local file paths in every node's metadata.
advanced_nodes[:1]

In [None]:
# Metadata of the first enriched chunk (compare with nodes[0].metadata)
advanced_nodes[0].metadata

In [None]:
# Metadata of the last enriched chunk
advanced_nodes[-1].metadata

Question used for the retrieval and query steps below:

In [4]:
# Active question for the retrieval and query cells below. The commented lines
# are alternatives; uncomment exactly one to switch.
question = "which reinforcement learning algorithm used in the deepseek r1 training?"
# question = "can you explain how deepseek r1 model was trained?"
# question = "can you explain more about the cold start?"
# question = "list all the unsuccessful attempts mentioned in the paper?"
question

'which reinforcement learning algorithm used in the deepseek r1 training?'

Keyword search using BM25:

In [9]:
# Keyword (BM25) retriever over the plain sentence-split nodes.
# We can pass in the index, docstore, or list of nodes to create the retriever
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,  # or advanced_nodes to search the metadata-enriched chunks
    similarity_top_k=10,
    # Optional: We can pass in the stemmer and set the language for stopwords
    # This is important for removing stopwords and stemming the query + text
    # The default is english for both
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)
bm25_retriever

<llama_index.retrievers.bm25.base.BM25Retriever at 0x1e5cf7c7ed0>

In [None]:
# Retrieve the BM25 matches for the question and render each one,
# with source_length=5000 so long chunks are shown (nearly) in full
bm25_retrieved_nodes = bm25_retriever.retrieve(question)
for node in bm25_retrieved_nodes:
    display_source_node(node, source_length=5000)

In [None]:
# Wrap the BM25 retriever in a query engine and ask the question; the
# response is rendered together with its source nodes and their metadata.
bm25_query_engine = RetrieverQueryEngine.from_args(bm25_retriever)

response = bm25_query_engine.query(question)
display_response(
    response, source_length=5000, show_source=True, show_source_metadata=True
)

Vector Search:

In [10]:
# LanceDB-backed vector store kept in the local ./lancedb directory.
# NOTE(review): mode="overwrite" presumably replaces any existing table on
# every run; confirm before pointing this at data you want to keep.
vector_store = LanceDBVectorStore(
    uri="./lancedb", mode="overwrite", query_type="vector"
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Building the index generates an embedding for every node (progress bar shown).
vector_index = VectorStoreIndex(
    nodes=nodes, show_progress=True, storage_context=storage_context
)
# To index the metadata-enriched chunks instead, pass advanced_nodes above.
# configure retriever
vector_retriever = VectorIndexRetriever(
    index=vector_index,
    similarity_top_k=10,
)

Generating embeddings:   0%|          | 0/52 [00:00<?, ?it/s]

In [None]:
# Retrieve the nearest chunks for the question from the vector index and
# render each one
vector_retrieved_nodes = vector_retriever.retrieve(question)
for node in vector_retrieved_nodes:
    display_source_node(node, source_length=5000)

In [13]:
# Same question, answered from the vector-retrieved chunks; the response is
# rendered together with its source nodes and their metadata.
vector_query_engine = RetrieverQueryEngine.from_args(vector_retriever)

response = vector_query_engine.query(question)
display_response(
    response, source_length=5000, show_source=True, show_source_metadata=True
)

**`Final Response:`** The specific reinforcement learning algorithm used in the training of DeepSeek-R1 is not explicitly mentioned. However, the training involves a large-scale reinforcement learning process that focuses on developing reasoning capabilities without relying on supervised fine-tuning as a preliminary step.

---

**`Source Node 1/10`**

**Node ID:** 922c0d5e-47e2-4c6c-98c7-3ffba9d9cab7<br>**Similarity:** 0.7797601819038391<br>**Text:** DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via
Reinforcement Learning
DeepSeek-AI
research@deepseek.com
Abstract
We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1.
DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without super-
vised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities.
Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing
reasoning behaviors. However, it encounters challenges such as poor readability, and language
mixing. To address these issues and further enhance reasoning performance, we introduce
DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-
R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the
research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models
(1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama.<br>**Metadata:** {'page_label': '1', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

---

**`Source Node 2/10`**

**Node ID:** b053e41f-1c8f-4c5c-8d21-6e76c81f8db2<br>**Similarity:** 0.7778250575065613<br>**Text:** • Others: DeepSeek-R1 also excels in a wide range of tasks, including creative writing,
general question answering, editing, summarization, and more. It achieves an impressive
length-controlled win-rate of 87.6% on AlpacaEval 2.0 and a win-rate of 92.3% on Are-
naHard, showcasing its strong ability to intelligently handle non-exam-oriented queries.
Additionally, DeepSeek-R1 demonstrates outstanding performance on tasks requiring
long-context understanding, substantially outperforming DeepSeek-V3 on long-context
benchmarks.
2. Approach
2.1. Overview
Previous work has heavily relied on large amounts of supervised data to enhance model
performance. In this study, we demonstrate that reasoning capabilities can be significantly
improved through large-scale reinforcement learning (RL), even without using supervised
fine-tuning (SFT) as a cold start. Furthermore, performance can be further enhanced with
the inclusion of a small amount of cold-start data. In the following sections, we present: (1)
DeepSeek-R1-Zero, which applies RL directly to the base model without any SFT data, and
(2) DeepSeek-R1, which applies RL starting from a checkpoint fine-tuned with thousands of
long Chain-of-Thought (CoT) examples. 3) Distill the reasoning capability from DeepSeek-R1 to
small dense models.
2.2. DeepSeek-R1-Zero: Reinforcement Learning on the Base Model
Reinforcement learning has demonstrated significant effectiveness in reasoning tasks, as ev-
idenced by our previous works (Shao et al., 2024; Wang et al., 2023). However, these works
heavily depended on supervised data, which are time-intensive to gather. In this section, we
explore the potential of LLMs to develop reasoning capabilities without any supervised data,
focusing on their self-evolution through a pure reinforcement learning process. We start with a
brief overview of our RL algorithm, followed by the presentation of some exciting results, and
hope this provides the community with valuable insights.
2.2.1.<br>**Metadata:** {'page_label': '5', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

---

**`Source Node 3/10`**

**Node ID:** 287e3fff-c1ac-473f-b040-e3d816d9e7f6<br>**Similarity:** 0.7666674852371216<br>**Text:** 1.1. Contributions
Post-Training: Large-Scale Reinforcement Learning on the Base Model
• We directly apply RL to the base model without relying on supervised fine-tuning (SFT) as
a preliminary step. This approach allows the model to explore chain-of-thought (CoT) for
solving complex problems, resulting in the development of DeepSeek-R1-Zero. DeepSeek-
R1-Zero demonstrates capabilities such as self-verification, reflection, and generating
long CoTs, marking a significant milestone for the research community. Notably, it is the
first open research to validate that reasoning capabilities of LLMs can be incentivized
purely through RL, without the need for SFT. This breakthrough paves the way for future
advancements in this area.
• We introduce our pipeline to develop DeepSeek-R1. The pipeline incorporates two RL
stages aimed at discovering improved reasoning patterns and aligning with human pref-
erences, as well as two SFT stages that serve as the seed for the model’s reasoning and
non-reasoning capabilities. We believe the pipeline will benefit the industry by creating
better models.
Distillation: Smaller Models Can Be Powerful Too
• We demonstrate that the reasoning patterns of larger models can be distilled into smaller
models, resulting in better performance compared to the reasoning patterns discovered
through RL on small models. The open source DeepSeek-R1, as well as its API, will benefit
the research community to distill better smaller models in the future.
• Using the reasoning data generated by DeepSeek-R1, we fine-tuned several dense models
that are widely used in the research community. The evaluation results demonstrate that
the distilled smaller dense models perform exceptionally well on benchmarks. DeepSeek-
R1-Distill-Qwen-7B achieves 55.5% on AIME 2024, surpassing QwQ-32B-Preview.<br>**Metadata:** {'page_label': '4', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

---

**`Source Node 4/10`**

**Node ID:** baf92cb4-1436-4b7b-a743-1f3856dd369a<br>**Similarity:** 0.7655319571495056<br>**Text:** For education-oriented knowledge benchmarks such as MMLU, MMLU-Pro, and GPQA
Diamond, DeepSeek-R1 demonstrates superior performance compared to DeepSeek-V3. This im-
provement is primarily attributed to enhanced accuracy in STEM-related questions, where signif-
icant gains are achieved through large-scale reinforcement learning. Additionally, DeepSeek-R1
excels on FRAMES, a long-context-dependent QA task, showcasing its strong document analysis
capabilities. This highlights the potential of reasoning models in AI-driven search and data
analysis tasks. On the factual benchmark SimpleQA, DeepSeek-R1 outperforms DeepSeek-V3,
demonstrating its capability in handling fact-based queries. A similar trend is observed where
OpenAI-o1 surpasses GPT-4o on this benchmark. However, DeepSeek-R1 performs worse than
DeepSeek-V3 on the Chinese SimpleQA benchmark, primarily due to its tendency to refuse
answering certain queries after safety RL. Without safety RL, DeepSeek-R1 could achieve an
accuracy of over 70%.
DeepSeek-R1 also delivers impressive results on IF-Eval, a benchmark designed to assess a
model’s ability to follow format instructions. These improvements can be linked to the inclusion
of instruction-following data during the final stages of supervised fine-tuning (SFT) and RL
training. Furthermore, remarkable performance is observed on AlpacaEval2.0 and ArenaHard,
indicating DeepSeek-R1’s strengths in writing tasks and open-domain question answering. Its
significant outperformance of DeepSeek-V3 underscores the generalization benefits of large-scale
RL, which not only boosts reasoning capabilities but also improves performance across diverse
domains. Moreover, the summary lengths generated by DeepSeek-R1 are concise, with an
average of 689 tokens on ArenaHard and 2,218 characters on AlpacaEval 2.0. This indicates that
13<br>**Metadata:** {'page_label': '13', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

---

**`Source Node 5/10`**

**Node ID:** ac6cd866-773a-45c7-998e-6a6f478ee8ef<br>**Similarity:** 0.7557617425918579<br>**Text:** DeepSeek-R1 avoids introducing length bias during GPT-based evaluations, further solidifying
its robustness across multiple tasks.
On math tasks, DeepSeek-R1 demonstrates performance on par with OpenAI-o1-1217,
surpassing other models by a large margin. A similar trend is observed on coding algorithm
tasks, such as LiveCodeBench and Codeforces, where reasoning-focused models dominate these
benchmarks. On engineering-oriented coding tasks, OpenAI-o1-1217 outperforms DeepSeek-R1
on Aider but achieves comparable performance on SWE Verified. We believe the engineering
performance of DeepSeek-R1 will improve in the next version, as the amount of related RL
training data currently remains very limited.
3.2.<br>**Metadata:** {'page_label': '14', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

---

**`Source Node 6/10`**

**Node ID:** 04902f50-1607-422b-b1ed-4128773035d3<br>**Similarity:** 0.7528694868087769<br>**Text:** exponentially larger search space. To address this, we set a maximum extension limit for each
node, but this can lead to the model getting stuck in local optima. Second, the value model
directly influences the quality of generation since it guides each step of the search process.
Training a fine-grained value model is inherently difficult, which makes it challenging for the
model to iteratively improve. While AlphaGo’s core success relied on training a value model to
progressively enhance its performance, this principle proves difficult to replicate in our setup
due to the complexities of token generation.
In conclusion, while MCTS can improve performance during inference when paired with a
pre-trained value model, iteratively boosting model performance through self-search remains a
significant challenge.
5. Conclusion, Limitations, and Future Work
In this work, we share our journey in enhancing model reasoning abilities through reinforcement
learning. DeepSeek-R1-Zero represents a pure RL approach without relying on cold-start
data, achieving strong performance across various tasks. DeepSeek-R1 is more powerful,
leveraging cold-start data alongside iterative RL fine-tuning. Ultimately, DeepSeek-R1 achieves
performance comparable to OpenAI-o1-1217 on a range of tasks.
We further explore distillation the reasoning capability to small dense models. We use
DeepSeek-R1 as the teacher model to generate 800K training samples, and fine-tune several small
dense models. The results are promising: DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o
and Claude-3.5-Sonnet on math benchmarks with 28.9% on AIME and 83.9% on MATH. Other
dense models also achieve impressive results, significantly outperforming other instruction-
tuned models based on the same underlying checkpoints.
In the future, we plan to invest in research across the following directions for DeepSeek-R1.
• General Capability:Currently, the capabilities of DeepSeek-R1 fall short of DeepSeek-V3
in tasks such as function calling, multi-turn, complex role-playing, and JSON output.<br>**Metadata:** {'page_label': '16', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

---

**`Source Node 7/10`**

**Node ID:** 25130806-43f6-48ff-8d65-c85a57dca302<br>**Similarity:** 0.7498732209205627<br>**Text:** During training, DeepSeek-R1-Zero naturally emerged with numerous powerful and interesting
reasoning behaviors. After thousands of RL steps, DeepSeek-R1-Zero exhibits super performance
on reasoning benchmarks. For instance, the pass@1 score on AIME 2024 increases from 15.6% to
71.0%, and with majority voting, the score further improves to 86.7%, matching the performance
of OpenAI-o1-0912.
However, DeepSeek-R1-Zero encounters challenges such as poor readability, and language
mixing. To address these issues and further enhance reasoning performance, we introduce
DeepSeek-R1, which incorporates a small amount of cold-start data and a multi-stage training
pipeline. Specifically, we begin by collecting thousands of cold-start data to fine-tune the
DeepSeek-V3-Base model. Following this, we perform reasoning-oriented RL like DeepSeek-R1-
Zero. Upon nearing convergence in the RL process, we create new SFT data through rejection
sampling on the RL checkpoint, combined with supervised data from DeepSeek-V3 in domains
such as writing, factual QA, and self-cognition, and then retrain the DeepSeek-V3-Base model.
After fine-tuning with the new data, the checkpoint undergoes an additional RL process, taking
into account prompts from all scenarios. After these steps, we obtained a checkpoint referred to
as DeepSeek-R1, which achieves performance on par with OpenAI-o1-1217.
We further explore distillation from DeepSeek-R1 to smaller dense models. Using Qwen2.5-
32B (Qwen, 2024b) as the base model, direct distillation from DeepSeek-R1 outperforms applying
RL on it. This demonstrates that the reasoning patterns discovered by larger base models are cru-
cial for improving reasoning capabilities. We open-source the distilled Qwen and Llama (Dubey
et al., 2024) series.<br>**Metadata:** {'page_label': '3', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

---

**`Source Node 8/10`**

**Node ID:** a17b89a8-f937-47cc-9b45-f1ea6144b3c7<br>**Similarity:** 0.7496972680091858<br>**Text:** A conversation between User and Assistant. The user asks a question, and the Assistant solves it.
The assistant first thinks about the reasoning process in the mind and then provides the user
with the answer. The reasoning process and answer are enclosed within <think> </think> and
<answer> </answer> tags, respectively, i.e., <think> reasoning process here </think>
<answer> answer here </answer>. User: prompt. Assistant:
Table 1 |Template for DeepSeek-R1-Zero. prompt will be replaced with the specific reasoning
question during training.
2.2.2. Reward Modeling
The reward is the source of the training signal, which decides the optimization direction of RL.
To train DeepSeek-R1-Zero, we adopt a rule-based reward system that mainly consists of two
types of rewards:
• Accuracy rewards: The accuracy reward model evaluates whether the response is correct.
For example, in the case of math problems with deterministic results, the model is required
to provide the final answer in a specified format (e.g., within a box), enabling reliable
rule-based verification of correctness. Similarly, for LeetCode problems, a compiler can be
used to generate feedback based on predefined test cases.
• Format rewards: In addition to the accuracy reward model, we employ a format reward
model that enforces the model to put its thinking process between ‘<think>’ and ‘</think>’
tags.
We do not apply the outcome or process neural reward model in developing DeepSeek-R1-Zero,
because we find that the neural reward model may suffer from reward hacking in the large-scale
reinforcement learning process, and retraining the reward model needs additional training
resources and it complicates the whole training pipeline.
2.2.3. Training Template
To train DeepSeek-R1-Zero, we begin by designing a straightforward template that guides
the base model to adhere to our specified instructions. As depicted in Table 1, this template
requires DeepSeek-R1-Zero to first produce a reasoning process, followed by the final answer.<br>**Metadata:** {'page_label': '6', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

---

**`Source Node 9/10`**

**Node ID:** 0c3dc7ce-423c-4575-a667-265bccde7640<br>**Similarity:** 0.7488131523132324<br>**Text:** As depicted in Table 1, this template
requires DeepSeek-R1-Zero to first produce a reasoning process, followed by the final answer.
We intentionally limit our constraints to this structural format, avoiding any content-specific
biases—such as mandating reflective reasoning or promoting particular problem-solving strate-
gies—to ensure that we can accurately observe the model’s natural progression during the RL
process.
2.2.4. Performance, Self-evolution Process and Aha Moment of DeepSeek-R1-Zero
Performance of DeepSeek-R1-Zero Figure 2 depicts the performance trajectory of DeepSeek-
R1-Zero on the AIME 2024 benchmark throughout the RL training process. As illustrated,
DeepSeek-R1-Zero demonstrates a steady and consistent enhancement in performance as the
RL training advances. Notably, the average pass@1 score on AIME 2024 shows a significant
increase, jumping from an initial 15.6% to an impressive 71.0%, reaching performance levels
comparable to OpenAI-o1-0912. This significant improvement highlights the efficacy of our RL
algorithm in optimizing the model’s performance over time.
Table 2 provides a comparative analysis between DeepSeek-R1-Zero and OpenAI’s o1-0912
models across a variety of reasoning-related benchmarks. The findings reveal that RL empowers
6<br>**Metadata:** {'page_label': '6', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

---

**`Source Node 10/10`**

**Node ID:** 93bddeb7-c8c6-4c7b-8064-db6725ec69a3<br>**Similarity:** 0.7461886405944824<br>**Text:** Figure 3 |The average response length of DeepSeek-R1-Zero on the training set during the RL
process. DeepSeek-R1-Zero naturally learns to solve reasoning tasks with more thinking time.
ment throughout the training process. This improvement is not the result of external adjustments
but rather an intrinsic development within the model. DeepSeek-R1-Zero naturally acquires the
ability to solve increasingly complex reasoning tasks by leveraging extended test-time compu-
tation. This computation ranges from generating hundreds to thousands of reasoning tokens,
allowing the model to explore and refine its thought processes in greater depth.
One of the most remarkable aspects of this self-evolution is the emergence of sophisticated
behaviors as the test-time computation increases. Behaviors such as reflection—where the model
revisits and reevaluates its previous steps—and the exploration of alternative approaches to
problem-solving arise spontaneously. These behaviors are not explicitly programmed but instead
emerge as a result of the model’s interaction with the reinforcement learning environment. This
spontaneous development significantly enhances DeepSeek-R1-Zero’s reasoning capabilities,
enabling it to tackle more challenging tasks with greater efficiency and accuracy.
Aha Moment of DeepSeek-R1-ZeroA particularly intriguing phenomenon observed during
the training of DeepSeek-R1-Zero is the occurrence of an “aha moment”. This moment, as
illustrated in Table 3, occurs in an intermediate version of the model. During this phase,
DeepSeek-R1-Zero learns to allocate more thinking time to a problem by reevaluating its initial
approach. This behavior is not only a testament to the model’s growing reasoning abilities
but also a captivating example of how reinforcement learning can lead to unexpected and
sophisticated outcomes.
This moment is not only an “aha moment” for the model but also for the researchers
observing its behavior. It underscores the power and beauty of reinforcement learning: rather
than explicitly teaching the model on how to solve a problem, we simply provide it with the
right incentives, and it autonomously develops advanced problem-solving strategies.<br>**Metadata:** {'page_label': '8', 'file_name': 'deepseek-r1-paper.pdf', 'file_path': 'c:\\Users\\nazaizah\\OneDrive\\PycharmProjects\\rag-tutorial\\data\\deepseek-r1-paper.pdf', 'file_type': 'application/pdf', 'file_size': 1312189, 'creation_date': '2025-02-02', 'last_modified_date': '2025-02-02'}<br>

Hybrid Search:

In [22]:
# Hybrid search: combine the vector and BM25 retrievers and merge their
# results using the "reciprocal_rerank" fusion mode.
hybrid_retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=10,
    num_queries=1,  # set this to 1 to disable query generation
    mode="reciprocal_rerank",
    use_async=False,
    verbose=True,
)

In [None]:
# Retrieve the fused (vector + BM25) results for the question and render each one
hybrid_retrieved_nodes = hybrid_retriever.retrieve(question)
for node in hybrid_retrieved_nodes:
    display_source_node(node, source_length=5000)

In [27]:
# Follow-up question for the hybrid query engine. Only one assignment can be
# active: a second one silently overwrites the first, so the alternative is
# kept commented out (same convention as the first question cell).
# question = "can you list all the advantages of the deepseek r1 model over other models?"

question = "can explain in detail about reward modeling in which part of the paper it was mentioned?"

In [None]:
# Answer the current question from the hybrid (vector + BM25) retrieval results
hybrid_query_engine = RetrieverQueryEngine.from_args(hybrid_retriever)

response = hybrid_query_engine.query(question)

# Render the answer together with its source nodes and their metadata
display_response(
    response, source_length=5000, show_source=True, show_source_metadata=True
)

View Prompt:

In [None]:
# helper for inspecting the prompts used by a query engine
def display_prompt_dict(prompts_dict):
    """Show every prompt template contained in ``prompts_dict``.

    Args:
        prompts_dict: Mapping from prompt key to a prompt object that exposes
            ``get_template()``.

    For each entry, a Markdown header carrying the key is displayed, the
    template text is printed as plain text, and a Markdown spacer follows.
    Nothing is returned.
    """
    spacer_html = "<br><br>"
    for prompt_key, prompt in prompts_dict.items():
        header_md = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(header_md))
        # print (not Markdown) so the template's own formatting stays literal
        template_text = prompt.get_template()
        print(template_text)
        display(Markdown(spacer_html))


# Fetch the prompt templates from the hybrid query engine and show them
prompts_dict = hybrid_query_engine.get_prompts()
display_prompt_dict(prompts_dict)

Links:

https://arxiv.org/pdf/2501.12948

https://docs.llamaindex.ai/en/v0.10.33/examples/vector_stores/SimpleIndexDemo/

https://docs.llamaindex.ai/en/stable/examples/retrievers/bm25_retriever/

https://docs.llamaindex.ai/en/stable/examples/retrievers/reciprocal_rerank_fusion/


https://docs.llamaindex.ai/en/stable/module_guides/indexing/metadata_extraction/

Advanced Links:

https://docs.llamaindex.ai/en/stable/examples/retrievers/auto_merging_retriever/

https://docs.llamaindex.ai/en/stable/examples/workflow/rag/

Ollama:

https://ollama.com/