In [43]:
from llama_index.core import StorageContext, load_index_from_storage
from constants import embed_model 

# 1. Restore the persisted index (the agent's "memory") from disk.
# StorageContext is pointed at the directory that holds the saved data ("index/").
storage_context = StorageContext.from_defaults(persist_dir="index/")

# 2. Load the index into RAM.
# Per the original author's note, embed_model has to be the same embedding model
# that was used when the index was built (described as a Gemini embedding —
# confirm in constants.py).
index = load_index_from_storage(storage_context, embed_model=embed_model)

In [44]:
from llama_index.core.tools import QueryEngineTool 
from constants import llm_model

# 3. Build the query engine.
# It looks up passages in the index and has the LLM synthesize an answer from them.
query_engine = index.as_query_engine(
    similarity_top_k=5,  # retrieve the 5 most similar chunks for every query
    llm=llm_model,       # LLM imported from constants (noted by the author as Gemini 2.5 Flash — confirm)
)

# 4. Package the query engine as a tool for the agent.
# The agent calls this tool whenever it needs to consult the indexed research papers.
rag_engine = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    description="A RAG engine with recent research papers.",  # tells the agent what the tool is for
    name="research_paper_query_engine_tool",  # identifier the agent uses to refer to the tool
)

In [45]:
from IPython.display import display, Markdown   

# Helper: show a dictionary of prompts in a readable form.
def display_prompt_dict(prompt_dict):
    """Display every prompt contained in ``prompt_dict``.

    For each ``(key, prompt)`` pair the key is rendered as a bold Markdown
    heading through ``display`` and the text returned by
    ``prompt.get_template()`` is written out with ``print``.

    Args:
        prompt_dict: mapping of prompt key -> object exposing ``get_template()``.
    """
    for prompt_key, prompt_obj in prompt_dict.items():
        # Heading first: which prompt this is.
        heading = Markdown(f"**Prompt key:** {prompt_key}")
        display(heading)
        # Then the raw template text, printed verbatim (not Markdown-rendered).
        template_text = prompt_obj.get_template()
        print(template_text)

In [46]:
# 5. Inspect the prompts used by the query engine.
# Fetches the prompts exposed by query_engine.get_prompts() and shows them, so you can
# check which templates are used when talking to the LLM.
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)

**Prompt key:** response_synthesizer:text_qa_template

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: 


**Prompt key:** response_synthesizer:refine_template

The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 


In [47]:
from tools import download_pdf, fetch_arxiv_papers
from llama_index.core.tools import FunctionTool

# 6. Turn plain Python functions into tools for the agent.
# FunctionTool wraps our own functions so that the agent can understand and call them.

# Tool that downloads a PDF file from a link.
download_pdf_tool = FunctionTool.from_defaults(
    fn=download_pdf,
    description="python function that downloads a pdf file by link",
    name="download_pdf_file_tool",
)

# Tool that looks up recent papers on arXiv.
# The braces in the description are literal text (this is not an f-string); they
# presumably name the parameters of fetch_arxiv_papers — confirm against tools.py.
fetch_arxiv_papers_tool = FunctionTool.from_defaults(
    fn=fetch_arxiv_papers,
    description="download the {max_results} recent research papers regarding the topic {title} from arXiv",
    name="fetch_from_arxiv",
)

In [54]:
from llama_index.core.agent import ReActAgent
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.llms.gemini import Gemini
from constants import GOOGLE_API_KEY

# Re-create the LLM with a larger output-token limit; the original author did this to
# avoid "Response terminated early" errors on long answers.
# NOTE: this rebinds the `llm_model` name imported from constants. Objects built
# earlier with the previous value (e.g. the query engine) keep the LLM they were given.
llm_model = Gemini(
    api_key=GOOGLE_API_KEY,
    model="models/gemini-2.5-flash",  # `model` replaces the deprecated `model_name` keyword
    max_tokens=8192,  # raised so the agent can produce long answers
)

# 7. Create the chat memory buffer.
# Per the author's note, the workflow-based agent is stateless by default, so a memory
# object is created here to hold the conversation history and let the agent recall
# earlier questions. It is passed explicitly to each agent.run(...) call below.
memory = ChatMemoryBuffer.from_defaults(token_limit=20000)

# 8. Create the ReAct (Reasoning + Acting) agent.
# This is the main "brain": it reasons about the request and decides which tool to use.
agent = ReActAgent(
    llm=llm_model,  # language model used for reasoning
    tools=[download_pdf_tool, rag_engine, fetch_arxiv_papers_tool],  # the agent's toolbox
    streaming=False,  # streaming off: the author hit "IndexError: list index out of range" with Gemini on tool calls
    verbose=True,  # print the reasoning trace to make debugging easier
)

  llm_model = Gemini(


In [49]:
# 9. Chạy thử lần đầu: Tóm tắt bài báo về GLM-130B
# Dùng await agent.run() thay vì chat() vì đây là Workflow Agent bất đồng bộ
response = await agent.run(
    user_msg="Summarize the paper about GLM-130B", 
    max_iterations=10, # Giới hạn số bước suy luận tối đa để tránh vòng lặp vô tận
    memory=memory      # Truyền bộ nhớ vào để cập nhật lịch sử chat
)
print(response)

The paper "ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools" introduces the evolving ChatGLM family of large language models, with GLM-130B being an earlier generation that informed the development of the more recent GLM-4 series (GLM-4, GLM-4-Air, and GLM-4-9B). The GLM-4 models are highlighted as the most capable, trained on trillions of tokens primarily in Chinese and English, and aligned for these languages through supervised fine-tuning and human feedback. Evaluations indicate that GLM-4 models rival or outperform GPT-4 on various general metrics, approach GPT-4-Turbo in instruction following, match GPT-4 Turbo and Claude 3 for long context tasks, and surpass GPT-4 in Chinese alignments. The GLM-4 All Tools model is designed to autonomously use tools like web browsers and Python interpreters, performing comparably to or better than GPT-4 All Tools in practical applications. The authors have also open-sourced several models from this family, including ear

In [50]:
# 10. Prompt template for more involved questions.
# It tells the agent what to do for a given {topic}: look in the local knowledge base
# first (through research_paper_query_engine_tool), fall back to arXiv when nothing is
# found, and never download papers unless explicitly asked.
# Fill it in with query_template.format(topic=...).
# Fixed: the prompt previously misspelled "interested" and "explicitly".
query_template = """ I am interested in {topic}.
 Find papers in your knowledge database related to this topic.
 Use the following template to query research_paper_query_engine_tool tool: 'Provide title,summary,authors and link to download for paper related to {topic}'. 
 If there are not, could you fetch the recent one from arXiv?
 IMPORTANT: do not download papers unless the user asks for it explicitly.
 """

In [51]:
# 11. Thử nghiệm với chủ đề "Visual Generation..."
# Agent sẽ kiểm tra Index trước -> không có -> gọi tool fetch_arxiv_papers -> cập nhật Index -> trả lời
answer = await agent.run(
    user_msg=query_template.format(topic="Visual Generation Unlocks Human-Like Reasoning through Multimodal World Models"),
    max_iterations=10,
    memory=memory
)
print(answer)

Title: Visual Generation Unlocks Human-Like Reasoning through Multimodal World Models
Authors: Jialong Wu, Xiaoying Zhang, Hongyi Yuan, Xiangcheng Zhang, Tianhao Huang, Changjing He, Chaoyi Deng, Renrui Zhang, Youbin Wu, Mingsheng Long
Summary: Humans develop internal world models and use them to reason by manipulating concepts. While recent AI advances like chain-of-thought (CoT) reasoning approximate these abilities, primarily through verbal reasoning, they struggle with physical and spatial intelligence that requires richer representations. The emergence of unified multimodal models (UMMs) capable of both verbal and visual generation suggests a path to more human-like reasoning. This paper investigates when and how visual generation benefits reasoning, proposing the visual superiority hypothesis: for tasks grounded in the physical world, visual generation naturally serves as world models, overcoming limitations of purely verbal models. The work formalizes internal world modeling wit

In [52]:
from IPython.display import display, Markdown

# Render the latest answer as Markdown so it is easier to read than the plain print above.
display(Markdown(str(answer)))

Title: Visual Generation Unlocks Human-Like Reasoning through Multimodal World Models
Authors: Jialong Wu, Xiaoying Zhang, Hongyi Yuan, Xiangcheng Zhang, Tianhao Huang, Changjing He, Chaoyi Deng, Renrui Zhang, Youbin Wu, Mingsheng Long
Summary: Humans develop internal world models and use them to reason by manipulating concepts. While recent AI advances like chain-of-thought (CoT) reasoning approximate these abilities, primarily through verbal reasoning, they struggle with physical and spatial intelligence that requires richer representations. The emergence of unified multimodal models (UMMs) capable of both verbal and visual generation suggests a path to more human-like reasoning. This paper investigates when and how visual generation benefits reasoning, proposing the visual superiority hypothesis: for tasks grounded in the physical world, visual generation naturally serves as world models, overcoming limitations of purely verbal models. The work formalizes internal world modeling within CoT reasoning and analyzes different world model forms. Empirically, it identifies tasks requiring interleaved visual-verbal CoT reasoning, creating a new evaluation suite called VisWorld-Eval. Experiments with a state-of-the-art UMM show that interleaved CoT significantly outperforms purely verbal CoT on tasks favoring visual world modeling, but not otherwise, clarifying the potential of multimodal world modeling for advanced AI.
Download Link: https://arxiv.org/pdf/2601.19834v1

In [55]:
# 12. Thử nghiệm tiếp với chủ đề "EgoHandICL..."
# Tiếp tục quy trình tìm kiếm và tổng hợp thông tin
answer = await agent.run(
    user_msg=query_template.format(topic="EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning"),
    max_iterations=10,
    memory=memory
)
print(answer)

I found the following paper related to "EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning":

Title: EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning
Authors: Binzhu Xie, Shi Qiu, Sicheng Zhang, Yinqiao Wang, Hao Xu, Muzammal Naseer, Chi-Wing Fu, Pheng-Ann Heng
Summary: Robust 3D hand reconstruction in egocentric vision is challenging due to depth ambiguity, self-occlusion, and complex hand-object interactions. Prior methods mitigate these issues by scaling training data or adding auxiliary cues, but they often struggle in unseen contexts. This paper presents EgoHandICL, the first in-context learning (ICL) framework for 3D hand reconstruction that improves semantic alignment, visual consistency, and robustness under challenging egocentric conditions. EgoHandICL introduces complementary exemplar retrieval guided by vision-language models (VLMs), an ICL-tailored tokenizer for multimodal context, and a masked autoencoder (MAE)-based architecture t

In [56]:
# Render the second result as Markdown.
display(Markdown(str(answer)))

I found the following paper related to "EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning":

Title: EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning
Authors: Binzhu Xie, Shi Qiu, Sicheng Zhang, Yinqiao Wang, Hao Xu, Muzammal Naseer, Chi-Wing Fu, Pheng-Ann Heng
Summary: Robust 3D hand reconstruction in egocentric vision is challenging due to depth ambiguity, self-occlusion, and complex hand-object interactions. Prior methods mitigate these issues by scaling training data or adding auxiliary cues, but they often struggle in unseen contexts. This paper presents EgoHandICL, the first in-context learning (ICL) framework for 3D hand reconstruction that improves semantic alignment, visual consistency, and robustness under challenging egocentric conditions. EgoHandICL introduces complementary exemplar retrieval guided by vision-language models (VLMs), an ICL-tailored tokenizer for multimodal context, and a masked autoencoder (MAE)-based architecture trained with hand-guided geometric and perceptual objectives. Experiments on ARCTIC and EgoExo4D show consistent gains over state-of-the-art methods. The framework also demonstrates real-world generalization and improves EgoVLM hand-object interaction reasoning by using reconstructed hands as visual prompts.
Link to download: https://arxiv.org/pdf/2601.19850v1

In [57]:
# 13. Kiểm tra khả năng ghi nhớ và hành động
# Yêu cầu tải tất cả các file PDF đã tìm được ở các bước trước.
# Nhờ có tham số `memory=memory`, Agent sẽ nhớ được các bài báo đã thảo luận.
answer = await agent.run(
    user_msg="Download all the papers you mentioned in previous turns.",
    max_iterations=10,
    memory=memory
)
print(answer)

The paper "EgoHandICL: Egocentric 3D Hand Reconstruction with In-Context Learning" has been successfully downloaded and saved as papers\EgoHandICL_Egocentric_3D_Hand_Reconstruction_with_In-Context_Learning.pdf.


In [58]:
# 14. Thử nghiệm thêm chủ đề "Quantum Computing"
# Agent tiếp tục quy trình: Search -> Fetch (nếu cần) -> Answer
answer = await agent.run(
    user_msg=query_template.format(topic="Quantum Computing"),
    max_iterations=10,
    memory=memory
)
print(answer)

Here are 5 recent papers related to "Quantum Computing" fetched from arXiv:

1.  **Title:** Real-Time Iteration Scheme for Dynamical Mean-Field Theory: A Framework for Near-Term Quantum Simulation
    **Authors:** Chakradhar Rangi, Aadi Singh, Ka-Ming Tam
    **Summary:** We present a time-domain iteration scheme for solving the Dynamical Mean-Field Theory (DMFT) self-consistent equations using retarded Green's functions in real time. Unlike conventional DMFT approaches that operate in imaginary time or frequency space, our scheme operates directly with real-time quantities. This makes it particularly suitable for near-term quantum computing hardware with limited Hilbert spaces, where real-time propagation can be efficiently implemented via Trotterization or variational quantum algorithms. We map the effective impurity problem to a finite one-dimensional chain with a small number of bath sites, solved via exact diagonalization as a proof-of-concept. The hybridization function is iterat

In [59]:
# 15. Render the search result for the Quantum Computing topic as Markdown.
display(Markdown(str(answer)))

Here are 5 recent papers related to "Quantum Computing" fetched from arXiv:

1.  **Title:** Real-Time Iteration Scheme for Dynamical Mean-Field Theory: A Framework for Near-Term Quantum Simulation
    **Authors:** Chakradhar Rangi, Aadi Singh, Ka-Ming Tam
    **Summary:** We present a time-domain iteration scheme for solving the Dynamical Mean-Field Theory (DMFT) self-consistent equations using retarded Green's functions in real time. Unlike conventional DMFT approaches that operate in imaginary time or frequency space, our scheme operates directly with real-time quantities. This makes it particularly suitable for near-term quantum computing hardware with limited Hilbert spaces, where real-time propagation can be efficiently implemented via Trotterization or variational quantum algorithms. We map the effective impurity problem to a finite one-dimensional chain with a small number of bath sites, solved via exact diagonalization as a proof-of-concept. The hybridization function is iteratively updated through time-domain fitting until self-consistency. We demonstrate stable convergence across a wide range of interaction strengths for the half-filled Hubbard model on a Bethe lattice, successfully capturing the metal-to-insulator transition. Despite using limited time resolution and a minimal bath discretization, the spectral functions clearly exhibit the emergence of Hubbard bands and the suppression of spectral weight at the Fermi level as interaction strength increases. This overcomes major limitations of two-site DMFT approximations by delivering detailed spectral features while preserving efficiency and compatibility with quantum computing platforms through real-time dynamics.
    **Link to download:** https://arxiv.org/pdf/2601.19896v1

2.  **Title:** Distinguishing synthetic unravelings on quantum computers
    **Authors:** Eloy Piñol, Piotr Sierant, Dustin Keys, Romain Veyron, Miguel Angel García-March, Tanner Reese, Morgan W. Mitchell, Jan Wehr, Maciej Lewenstein
    **Summary:** Distinct monitoring or intervention schemes can produce different conditioned stochastic quantum trajectories while sharing the same unconditional (ensemble-averaged) dynamics. This is the essence of unravelings of a given Gorini-Kossakowski-Sudarshan-Lindblad (GKSL) master equation: any trajectory-ensemble average of a function that is linear in the conditional state is completely determined by the unconditional density matrix, whereas applying a nonlinear function before averaging can yield unraveling-dependent results beyond the average evolution. A paradigmatic example is resonance fluorescence, where direct photodetection (jump/Poisson) and homodyne or heterodyne detection (diffusive/Wiener) define inequivalent unravelings of the same GKSL dynamics. In earlier work, we showed that nonlinear trajectory averages can distinguish such unravelings, but observing the effect in that optical setting requires demanding experimental precision. Here we translate the same idea to a digital setting by introducing synthetic unravelings implemented as quantum circuits acting on one and two qubits. We design two unravelings - a projective measurement unraveling and a random-unitary "kick" unraveling - that share the same ensemble-averaged evolution while yielding different nonlinear conditional-state statistics. We implement the protocols on superconducting-qubit hardware provided by IBM Quantum to access trajectory-level information. We show that the variance across trajectories and the ensemble-averaged von Neumann entropy distinguish the unravelings in both theory and experiment, while the unconditional state and the ensemble-averaged expectation values that are linear in the state remain identical. Our results provide an accessible demonstration that quantum trajectories encode information about measurement backaction beyond what is fixed by the unconditional dynamics.
    **Link to download:** https://arxiv.org/pdf/2601.19889v1

3.  **Title:** Theory of low-weight quantum codes
    **Authors:** Fuchuan Wei, Zhengyi Han, Austin Yubo He, Zimu Li, Zi-Wen Liu
    **Summary:** Low check weight is practically crucial code property for fault-tolerant quantum computing, which underlies the strong interest in quantum low-density parity-check (qLDPC) codes. Here, we explore the theory of weight-constrained stabilizer codes from various foundational perspectives including the complexity of computing code weight and the explicit boundary of feasible low-weight codes in both theoretical and practical settings. We first prove that calculating the optimal code weight is an $\\mathsf{NP}$-hard problem, demonstrating the necessity of establishing bounds for weight that are analytical or efficiently computable. Then we systematically investigate the feasible code parameters with weight constraints. We provide various explicit analytical lower bounds and in particular completely characterize stabilizer codes with weight at most 3, showing that they have distance 2 and code rate at most 1/4. We also develop a powerful linear programming (LP) scheme for setting code parameter bounds with weight constraints, which yields exact optimal weight values for all code parameters with $n\\leq 9$. We further refined this constraint from multiple perspectives by considering the generator weight distribution and overlap. In particular, we consider practical architectures and demonstrate how to apply our methods to e.g.~the IBM 127-qubit chip. Our study brings the weight as a crucial parameter into coding theory and provide guidance for code design and utility in practical scenarios.
    **Link to download:** https://arxiv.org/pdf/2601.19848v1

4.  **Title:** A Folded Surface Code Architecture for 2D Quantum Hardware
    **Authors:** Zhu Sun, Zhenyu Cai
    **Summary:** Qubit shuttling has become an indispensable ingredient for scaling leading quantum computing platforms, including semiconductor spin, neutral-atom, and trapped-ion qubits, enabling both crosstalk reduction and tighter integration of control hardware. Cai et al. (2023) proposed a scalable architecture that employs short-range shuttling to realize effective three-dimensional connectivity on a strictly two-dimensional device. Building on recent advances in quantum error correction, we show that this architecture enables the native implementation of folded surface codes on 2D hardware, reducing the runtime of all single-qubit logical Clifford gates and logical CNOTs within subsets of qubits from $\\mathcal{O}(d)$ in conventional surface code lattice surgery to constant time. We present explicit protocols for these operations and demonstrate that access to a transversal $S$ gate reduces the spacetime volume of 8T-to-CCZ magic-state distillation by more than an order of magnitude compared with standard 2D lattice surgery approaches. Finally, we introduce a new "virtual-stack" layout that more efficiently exploits the quasi-three-dimensional structure of the architecture, enabling efficient multilayer routing on these two-dimensional devices.
    **Link to download:** https://arxiv.org/pdf/2601.19823v1

5.  **Title:** Quantum Circuit Pre-Synthesis: Learning Local Edits to Reduce $T$-count
    **Authors:** Daniele Lizzio Bosco, Lukasz Cincio, Giuseppe Serra, M. Cerezo
    **Summary:** Compiling quantum circuits into Clifford+$T$ gates is a central task for fault-tolerant quantum computing using stabilizer codes. In the near term, $T$ gates will dominate the cost of fault tolerant implementations, and any reduction in the number of such expensive gates could mean the difference between being able to run a circuit or not. While exact synthesis is exponentially hard in the number of qubits, local synthesis approaches are commonly used to compile large circuits by decomposing them into substructures. However, composing local methods leads to suboptimal compilations in key metrics such as $T$-count or circuit depth, and their performance strongly depends on circuit representation. In this work, we address this challenge by proposing \\textsc{Q-PreSyn}, a strategy that, given a set of local edits preserving circuit equivalence, uses a RL agent to identify effective sequences of such actions and thereby obtain circuit representations that yield a reduced $T$-count upon synthesis. Experimental results of our proposed strategy, applied on top of well-known synthesis algorithms, show up to a $20\\%$ reduction in $T$-count on circuits with up to 25 qubits, without introducing any additional approximation error prior to synthesis.
    **Link to download:** https://arxiv.org/pdf/2601.19738v1

In [60]:
# 16. Ask for a batch download with detailed, step-by-step instructions.
# The prompt makes the agent handle one paper at a time, which the author intends as a
# guard against overload or skipped files.
# NOTE(review): the message says "the following papers" but lists none; it presumably
# relies on the papers already present in the conversation memory — confirm.
answer = await agent.run(
    user_msg="""Download the following papers:
    For each paper: 
    1. Process one paper at a time 
    2. State which paper number you are processing out of the total 
    3. Complete a full download cycle before moving to the next paper 
    4. Explicitly state when moving to the next paper 
    5. Provide a final summary only after all papers are download """,
    max_iterations=10,
    memory=memory
)
print(answer)

All 5 papers have been successfully downloaded:

1.  **Real-Time Iteration Scheme for Dynamical Mean-Field Theory: A Framework for Near-Term Quantum Simulation** saved as `papers\Real-Time_Iteration_Scheme_for_Dynamical_Mean-Field_Theory.pdf`
2.  **Distinguishing synthetic unravelings on quantum computers** saved as `papers\Distinguishing_synthetic_unravelings_on_quantum_computers.pdf`
3.  **Theory of low-weight quantum codes** saved as `papers\Theory_of_low-weight_quantum_codes.pdf`
4.  **A Folded Surface Code Architecture for 2D Quantum Hardware** saved as `papers\A_Folded_Surface_Code_Architecture_for_2D_Quantum_Hardware.pdf`
5.  **Quantum Circuit Pre-Synthesis: Learning Local Edits to Reduce $T$-count** saved as `papers\Quantum_Circuit_Pre-Synthesis_Learning_Local_Edits_to_Reduce_T-count.pdf`
