In [1]:
from llama_index.core import StorageContext, load_index_from_storage
from constants import embed_model 

# 1. Restore the persisted index from disk.
# StorageContext.from_defaults is pointed at the "index/" folder via persist_dir.
storage_context = StorageContext.from_defaults(persist_dir="index/")

# 2. Load the index into memory.
# embed_model is forwarded explicitly; per the author's notes it must be the
# same embedding model that was used when the index was built (described as a
# Gemini embedding — it is defined in constants, so confirm there).
index = load_index_from_storage(storage_context, embed_model=embed_model)

  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


In [2]:
from llama_index.core.tools import QueryEngineTool 
from constants import llm_model

# 3. Build the query engine on top of the loaded index.
# similarity_top_k=5 asks for the 5 most similar chunks per question, and
# llm_model is the LLM used to synthesize the answer from them.
# (The author describes llm_model as Gemini 2.5 Flash; it lives in constants.)
query_engine = index.as_query_engine(similarity_top_k=5, llm=llm_model)

# 4. Wrap the query engine as a tool for the agent.
# `name` is the identifier the agent uses to call the tool; `description` is
# the text that tells the agent what the tool is for.
rag_engine = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="research_paper_query_engine_tool",
    description="A RAG engine with recent research papers.",
)

In [3]:
from IPython.display import display, Markdown   

def display_prompt_dict(prompt_dict):
    """Show every prompt in a mapping: its key as bold Markdown, then its template.

    Args:
        prompt_dict: Mapping of prompt key -> prompt object. Each value must
            expose a ``get_template()`` method; whatever it returns is printed.

    Returns:
        None. Output is produced via ``display`` and ``print`` only.
    """
    for prompt_key, prompt_obj in prompt_dict.items():
        # Heading: rendered as Markdown so the key stands out in the cell output.
        heading = Markdown(f"**Prompt key:** {prompt_key}")
        display(heading)
        # Body: printed as plain text so the template is shown verbatim.
        template_text = prompt_obj.get_template()
        print(template_text)

In [4]:
# 5. Inspect the default prompts.
# get_prompts() returns the prompts the query engine currently holds; showing
# them makes it visible what text is used when talking to the LLM.
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompt_dict=prompts_dict)

**Prompt key:** response_synthesizer:text_qa_template

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: 


**Prompt key:** response_synthesizer:refine_template

The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 


In [None]:
from tools import download_pdf, fetch_arxiv_papers
from llama_index.core.tools import FunctionTool

# 6. Turn plain Python functions into agent tools.
# FunctionTool.from_defaults wraps a function together with a name and a
# description so the agent can select and call it.

# Tool wrapping download_pdf (downloads a PDF file from a link, per its description).
download_pdf_tool = FunctionTool.from_defaults(
    fn=download_pdf,
    name="download_pdf_file_tool",
    description="python function that downloads a pdf file by link",
)

# Tool wrapping fetch_arxiv_papers (looks up recent papers on arXiv).
# NOTE(review): the description is a plain string, so "{max_results}" and
# "{title}" reach the agent literally; presumably they name the function's
# parameters — confirm against fetch_arxiv_papers in tools.
fetch_arxiv_papers_tool = FunctionTool.from_defaults(
    fn=fetch_arxiv_papers,
    name="fetch_from_arxiv",
    description="download the {max_results} recent research papers regarding the topic {title} from arXiv",
)

In [None]:
from llama_index.core.agent import ReActAgent
from llama_index.core.memory import ChatMemoryBuffer

# 7. Create the chat memory buffer.
# Per the author's notes the workflow agent keeps no state between runs by
# default, so this buffer is created here and handed to each agent.run() call
# to carry the conversation history. token_limit=20000 is the buffer's limit.
memory = ChatMemoryBuffer.from_defaults(token_limit=20000)

# 8. Create the ReAct (Reasoning + Acting) agent.
# It receives the three tools it may choose from and the LLM it reasons with.
agent = ReActAgent(
    llm=llm_model,
    tools=[download_pdf_tool, rag_engine, fetch_arxiv_papers_tool],
    verbose=True,  # per the author: prints the reasoning steps, useful for debugging
)

In [None]:
# 9. Chạy thử lần đầu: Tóm tắt bài báo về GLM-130B
# Dùng await agent.run() thay vì chat() vì đây là Workflow Agent bất đồng bộ
response = await agent.run(
    user_msg="Summarize the paper about GLM-130B", 
    max_iterations=10, # Giới hạn số bước suy luận tối đa để tránh vòng lặp vô tận
    memory=memory      # Truyền bộ nhớ vào để cập nhật lịch sử chat
)
print(response)

The paper "ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools" introduces ChatGLM as an evolving family of large language models. While the paper primarily focuses on the more recent GLM-4 series (GLM-4, GLM-4-Air, and GLM-4-9B), it positions GLM-130B as a foundational model within this lineage. The GLM-4 models were developed using insights and lessons gained from preceding generations of ChatGLM, which include GLM-130B. This indicates that GLM-130B was a significant step in the development of the ChatGLM family, contributing to the advancements seen in its successors. The paper highlights the continuous development and open-sourcing efforts of models within this family.


In [None]:
# 10. Prompt template for a more involved request.
# It instructs the agent to search its knowledge base first (through
# research_paper_query_engine_tool) and to fall back to fetching from arXiv
# when nothing is found. "{topic}" appears twice and is filled in with
# str.format(topic=...).
# Fixes relative to the earlier wording: "instered" -> "interested" and
# "If there are not" -> "If there are none", so the LLM gets a clean prompt.
query_template = """ I am interested in {topic}.
 Find papers in your knowledge database related to this topic.
 Use the following template to query research_paper_query_engine_tool tool: 'Provide title,summary,authors and link to download for paper related to {topic}'. 
 If there are none, could you fetch the recent one from arXiv?
 """

In [None]:
# 11. Thử nghiệm với chủ đề "Visual Generation..."
# Agent sẽ kiểm tra Index trước -> không có -> gọi tool fetch_arxiv_papers -> cập nhật Index -> trả lời
answer = await agent.run(
    user_msg=query_template.format(topic="Visual Generation Unlocks Human-Like Reasoning through Multimodal World Models"),
    max_iterations=10,
    memory=memory
)
print(answer)

I found the following paper related to "Visual Generation Unlocks Human-Like Reasoning through Multimodal World Models":

**Title:** Visual Generation Unlocks Human-Like Reasoning through Multimodal World Models
**Summary:** Humans develop internal world models for reasoning by manipulating concepts within them. While current AI systems, particularly those using chain-of-thought (CoT) reasoning, have achieved expert-level performance in formal domains like mathematics and programming through verbal reasoning, they struggle with physical and spatial intelligence, which demand richer representations and prior knowledge. The advent of unified multimodal models (UMMs) capable of both verbal and visual generation has led to interest in more human-like reasoning via multimodal pathways. This paper investigates when and how visual generation aids reasoning, proposing the visual superiority hypothesis: for tasks grounded in the physical world, visual generation more effectively serves as world

In [None]:
from IPython.display import display, Markdown

# Render the agent's answer as Markdown so it is easier to read than the
# plain-text print.
answer_markdown = Markdown(str(answer))
display(answer_markdown)

I found the following paper related to "Visual Generation Unlocks Human-Like Reasoning through Multimodal World Models":

**Title:** Visual Generation Unlocks Human-Like Reasoning through Multimodal World Models
**Summary:** Humans develop internal world models for reasoning by manipulating concepts within them. While current AI systems, particularly those using chain-of-thought (CoT) reasoning, have achieved expert-level performance in formal domains like mathematics and programming through verbal reasoning, they struggle with physical and spatial intelligence, which demand richer representations and prior knowledge. The advent of unified multimodal models (UMMs) capable of both verbal and visual generation has led to interest in more human-like reasoning via multimodal pathways. This paper investigates when and how visual generation aids reasoning, proposing the visual superiority hypothesis: for tasks grounded in the physical world, visual generation more effectively serves as world models, overcoming the representational limitations or insufficient prior knowledge of purely verbal models. The work formalizes internal world modeling as a core part of CoT reasoning, analyzes different world model forms, and identifies tasks requiring interleaved visual-verbal CoT reasoning, introducing a new evaluation suite called VisWorld-Eval. Experiments with a state-of-the-art UMM demonstrate that interleaved CoT significantly outperforms purely verbal CoT on tasks that benefit from visual world modeling, but shows no clear advantage otherwise. This research clarifies the potential of multimodal world modeling for more powerful, human-like multimodal AI.
**Authors:** Jialong Wu, Xiaoying Zhang, Hongyi Yuan, Xiangcheng Zhang, Tianhao Huang, Changjing He, Chaoyi Deng, Renrui Zhang, Youbin Wu, Mingsheng Long
**Link to download:** https://arxiv.org/pdf/2601.19834v1

In [11]:
answer = await agent.run(
    user_msg=query_template.format(topic="EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning"),
    max_iterations=10,
    memory=memory
)
print(answer)

I found the following paper related to "EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning":

**Title:** EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning
**Authors:** Binzhu Xie, Shi Qiu, Sicheng Zhang, Yinqiao Wang, Hao Xu, Muzammal Naseer, Chi-Wing Fu, Pheng-Ann Heng
**Summary:** Robust 3D hand reconstruction in egocentric vision is challenging due to depth ambiguity, self-occlusion, and complex hand-object interactions. Prior methods mitigate these issues by scaling training data or adding auxiliary cues, but they often struggle in unseen contexts. This paper introduces EgoHandICL, the first in-context learning (ICL) framework for 3D hand reconstruction designed to improve semantic alignment, visual consistency, and robustness under challenging egocentric conditions. EgoHandICL incorporates complementary exemplar retrieval guided by vision-language models (VLMs), an ICL-tailored tokenizer for multimodal context, and a masked autoencoder (MAE

In [12]:
from IPython.display import display, Markdown

# Show the latest answer rendered as Markdown.
answer_markdown = Markdown(str(answer))
display(answer_markdown)

I found the following paper related to "EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning":

**Title:** EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning
**Authors:** Binzhu Xie, Shi Qiu, Sicheng Zhang, Yinqiao Wang, Hao Xu, Muzammal Naseer, Chi-Wing Fu, Pheng-Ann Heng
**Summary:** Robust 3D hand reconstruction in egocentric vision is challenging due to depth ambiguity, self-occlusion, and complex hand-object interactions. Prior methods mitigate these issues by scaling training data or adding auxiliary cues, but they often struggle in unseen contexts. This paper introduces EgoHandICL, the first in-context learning (ICL) framework for 3D hand reconstruction designed to improve semantic alignment, visual consistency, and robustness under challenging egocentric conditions. EgoHandICL incorporates complementary exemplar retrieval guided by vision-language models (VLMs), an ICL-tailored tokenizer for multimodal context, and a masked autoencoder (MAE)-based architecture trained with hand-guided geometric and perceptual objectives. Experiments on ARCTIC and EgoExo4D datasets demonstrate consistent improvements over state-of-the-art methods. The framework also shows real-world generalization and enhances EgoVLM hand-object interaction reasoning by utilizing reconstructed hands as visual prompts.
**Link to download:** https://arxiv.org/pdf/2601.19850v1

In [13]:
answer = await agent.run(
    user_msg="Download all the papers you mentioned in previous turns.",
    max_iterations=10,
    memory=memory
)
print(answer)

I have successfully downloaded both papers:
1. "Visual Generation Unlocks Human-Like Reasoning through Multimodal World Models"
2. "EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning"
