In [3]:
# 建立整體的ui介面，變成一個問答機器人
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

In [4]:
'''
目的：建立langchain + openai 的基礎環境
'''
import os # 作業系統相關功能（讀取環境變數）
from openai import OpenAI # openai api 客戶端
from dotenv import load_dotenv, find_dotenv # dotenv 是專門用來讀取.env套件的套件，並接上環境
_ = load_dotenv(find_dotenv()) # 讀取.env檔案
client = OpenAI(
    api_key=os.environ['OPENAI_API_KEY']
)
print("done")

done


In [5]:
from dotenv import load_dotenv
import os

# 清除舊的環境變數
if 'OPENAI_API_KEY' in os.environ:
    del os.environ['OPENAI_API_KEY']

# 重新載入
load_dotenv()

api_key = os.environ.get('OPENAI_API_KEY')
if api_key:
    print("✅ Success! API key loaded")
    print(f"Key starts with: {api_key[:15]}...")
    print(f"Key length: {len(api_key)} characters")
else:
    print("❌ Still not working")

✅ Success! API key loaded
Key starts with: sk-proj-FWUUter...
Key length: 164 characters


In [8]:
# 檢查資料夾是否存在
import os
folder_path = "/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs"

print(f"Folder exists: {os.path.exists(folder_path)}")

Folder exists: True


In [9]:
# 執行
import pandas as pd

def load_metadata_mapping(csv_path):
    """讀取CSV檔案建立metadata對應表"""
    df = pd.read_csv(csv_path)
    df.columns = df.columns.str.strip() # df.colums是物件，加上'.str()'轉換成陣列才可以使用.strip()
    # 將filename作為key，其他資訊作為value
    metadata_map = {}
    for _, row in df.iterrows():
        metadata_map[row['filename']] = {
            'title': row['title'],
            'year': row['year'],
            'authors': row['authors'],
            'topic': row['topic']
        }
    return metadata_map

def filter_first_page_only(documents):
    """只保留第一頁（Abstract）"""
    first_page_docs = [doc for doc in documents if doc.metadata['page'] == 0]
    print(f"原始頁數：{len(documents)} → 只保留第一頁：{len(first_page_docs)} 頁")
    return first_page_docs

def load_all_first_pages_with_csv_metadata(folder_path, csv_path):
    """載入PDF第一頁並從CSV對應metadata"""
    
    # 1. 先讀取metadata對應表
    metadata_map = load_metadata_mapping(csv_path)
    print(f"載入metadata對應表，共 {len(metadata_map)} 筆資料")
    
    all_first_pages = []
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    for pdf_file in pdf_files:
        print(f"處理：{pdf_file}")
        
        file_path = os.path.join(folder_path, pdf_file)
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        
        # 只取第一頁
        first_page_docs = filter_first_page_only(documents)
        
        # 🎯 關鍵：從CSV對應metadata
        for doc in first_page_docs:
            if pdf_file in metadata_map:
                # 找到對應的metadata
                doc.metadata.update(metadata_map[pdf_file])
                print(f"  ✅ 已更新metadata: {metadata_map[pdf_file]['title']}")
            else:
                # 找不到對應資料
                print(f"  ⚠️  警告：{pdf_file} 在CSV中找不到對應資料")
                doc.metadata.update({
                    'title': pdf_file.replace('.pdf', ''),
                    'year': None,
                    'authors': 'Unknown',
                    'topic': 'Unknown'
                })
        
        all_first_pages.extend(first_page_docs)
    
    print(f"總共載入 {len(all_first_pages)} 個第一頁，metadata已更新")

    # all_first_pages = clean_metadata(all_first_pages)

    return all_first_pages




# 使用

csv_path = "/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/meta_data_correction.csv"
all_abstracts_with_metadata = load_all_first_pages_with_csv_metadata(folder_path, csv_path)
# all_abstracts_with_metadata = clean_metadata(all_abstracts_with_metadata)

for i, doc in enumerate(all_abstracts_with_metadata[:3]):
    print(f"\n--- Document {i+1} ---")
    print(f"Title: {doc.metadata.get('title', 'N/A')}")
    print(f"Year: {doc.metadata.get('year', 'N/A')}")
    print(f"Authors: {doc.metadata.get('authors', 'N/A')}")
    print(f"Topic: {doc.metadata.get('topic', 'N/A')}")

載入metadata對應表，共 24 筆資料
處理：2023_LLM limitation_Large Language Models Sensitivity to The Order of Options in Multiple-Choice Questions.pdf
原始頁數：11 → 只保留第一頁：1 頁
  ✅ 已更新metadata: Large Language Models Sensitivity to The Order of Options in Multiple-Choice Questions
處理：2025_LLM limitation_The Order Effect- Investigating Prompt Sensitivity to Input Order in LLMs.pdf
原始頁數：13 → 只保留第一頁：1 頁
  ✅ 已更新metadata: The Order Effect- Investigating Prompt Sensitivity to Input Order in LLMs
處理：2024_RAG_Evaluation of Retrieval-Augmented Generation- A Survey.pdf
原始頁數：21 → 只保留第一頁：1 頁
  ✅ 已更新metadata: Evaluation of Retrieval-Augmented Generation- A Survey
處理：2020_scaling laws_Scaling Laws for Neural Language Models.pdf
原始頁數：30 → 只保留第一頁：1 頁
  ✅ 已更新metadata: Scaling Laws for Neural Language Models
處理：2022_LLM limitation_Robustness of Learning from Task Instructions.pdf
原始頁數：12 → 只保留第一頁：1 頁
  ✅ 已更新metadata: Robustness of Learning from Task Instructions
處理：2024_RAG_Retrieval-Augmented Generation for Large Language

In [10]:
def clean_metadata(metadata): # 只接收單一doc.metadata
    """清理並標準化metadata"""
    keep_keys = ['source', 'total_pages', 'title', 'page', 'year', 'authors', 'topic']
    
    clean_meta = {}
    for key in keep_keys:
        if key in metadata:
            clean_meta[key] = metadata[key]
    
    return clean_meta

for doc in all_abstracts_with_metadata:
    doc.metadata = clean_metadata(doc.metadata)
print("done!")

print(all_abstracts_with_metadata[3].metadata)

"""
為什麼這樣寫？
因為載入後會變成langchain的document形式，例如：
    load_result = [
        Document(page_content = '...', metedata = {}),
        Document(page_content = '...', metedata = {}),
        Document(page_content = '...', metedata = {})                
    ]
所以，當我們要修改metedata格式時，需要透過遍歷的方式處理（因為def clean_metadata只吃「單一Document的metadata（字典型態）」）
因此最直接的做法就是
for doc in result:
    doc.metadata = clean_metadata(doc.metadata)

"""


done!
{'source': '/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs/2020_scaling laws_Scaling Laws for Neural Language Models.pdf', 'total_pages': 30, 'title': 'Scaling Laws for Neural Language Models', 'page': 0, 'year': 2020, 'authors': 'Jared Kaplan,\xa0Sam McCandlish,\xa0Tom Henighan,\xa0Tom B. Brown,\xa0Benjamin Chess,\xa0Rewon Child,\xa0Scott Gray,\xa0Alec Radford,\xa0Jeffrey Wu,\xa0Dario Amodei', 'topic': 'scaling laws'}


"\n為什麼這樣寫？\n因為載入後會變成langchain的document形式，例如：\n    load_result = [\n        Document(page_content = '...', metedata = {}),\n        Document(page_content = '...', metedata = {}),\n        Document(page_content = '...', metedata = {})                \n    ]\n所以，當我們要修改metedata格式時，需要透過遍歷的方式處理（因為def clean_metadata只吃「單一Document的metadata（字典型態）」）\n因此最直接的做法就是\nfor doc in result:\n    doc.metadata = clean_metadata(doc.metadata)\n\n"

In [11]:
# 分割檔案
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=50,
    separators=[ "\n\n", ". ", "\n", "(?<=\. )", " ", ""]
    )    
docs = text_splitter.split_documents(all_abstracts_with_metadata)
print(len(docs)) # chunks
print(len(all_abstracts_with_metadata)) # test 11 pages

116
24


In [12]:
docs[0]

Document(metadata={'source': '/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs/2023_LLM limitation_Large Language Models Sensitivity to The Order of Options in Multiple-Choice Questions.pdf', 'total_pages': 11, 'title': 'Large Language Models Sensitivity to The Order of Options in Multiple-Choice Questions', 'page': 0, 'year': 2023, 'authors': 'Large Language Models Sensitivity to The Order of Options in Multiple-Choice Questions', 'topic': 'LLM limitation'}, page_content='Large Language Models Sensitivity to The Order of Options in\nMultiple-Choice Questions\nPouya Pezeshkpour\nMegagon Labs\npouya@megagon.ai\nEstevam Hruschka\nMegagon Labs\nestevam@megagon.ai\nAbstract\nLarge Language Models (LLMs) have demon-\nstrated remarkable capabilities in various NLP\ntasks. However, previous works have shown\nthese models are sensitive towards prompt\nwording, and few-shot demonstrations and\ntheir order, posing challenges to fair assess-\nment of these models. As these models be-\n

In [13]:
# step.3 - embedding

import os

# 建立資料夾
os.makedirs('./chroma_db', exist_ok=True)
print("finish！")

# 檢查是否成功
print(f"資料夾存在嗎？{os.path.exists('./chroma_db')}")

finish！
資料夾存在嗎？True


In [14]:
# define embedding
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

In [15]:
# 注意，需要先在自己的環境中建立資料庫路徑
persist_directory = './chroma_db' # 指定資料庫路徑
!rm -rf ./chroma_db  # remove old database files if any

In [16]:
# 建立新的向量資料庫，並將文件放進去
from langchain.vectorstores import Chroma
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [17]:
vectordb.persist() # 手動儲存剛剛建立的資料庫（現在不用手動了，自動儲存）

  vectordb.persist() # 手動儲存剛剛建立的資料庫（跟相似度沒有關係，是儲存資料庫）


# 每次開始新session只需要（不需要經過前面的階段）：

In [5]:
%pip install langchain_chroma

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain_chroma
  Downloading langchain_chroma-0.2.6-py3-none-any.whl.metadata (1.1 kB)
Collecting langchain-core>=0.3.76 (from langchain_chroma)
  Downloading langchain_core-0.3.76-py3-none-any.whl.metadata (3.7 kB)
Downloading langchain_chroma-0.2.6-py3-none-any.whl (12 kB)
Downloading langchain_core-0.3.76-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: langchain-core, langchain_chroma
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.75
    Uninstalling langchain-core-0.3.75:
      Successfully uninstalled langchain-core-0.3.75
Successfully installed langchain-core-0.3.76 langchain_chroma-0.2.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[3

In [41]:
# 啟動資料庫
import os # 作業系統相關功能（讀取環境變數）
from openai import OpenAI # openai api 客戶端
from dotenv import load_dotenv, find_dotenv # dotenv 是專門用來讀取.env套件的套件，並接上環境
_ = load_dotenv(find_dotenv()) # 讀取.env檔案
client = OpenAI(
    api_key=os.environ['OPENAI_API_KEY']
)
print("done")
# from langchain.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
vectordb = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

done


#### 小筆記：metadata「不是預設會被向量化的內容」，通常只有 page_content

# 問題驗證環節

####  strategic decision
##### 問題：「我們應該先在相似度上取得一定的結果再嘗試進階檢索方式，還是先試試看進階檢索方式的情況？」
##### 回答：應該先建立 baseline
- 有了baseline，才知道基礎是否已可用，進階的改善了多少
##### 後續流程
- 一、透過基礎相似度評估回答狀況：用不同問題測試，並且閱讀回答情況（閱讀理解）→ 剛剛已完成
- 二、透過基礎相似度建立 baseline ：運用量化指標，了解現在「相似度」的回答狀況 →稍後執行
- 三、baseline建立完成後，比較mmr和Self-query檢索狀況，決定要用哪一種(有數據的支持)
- 四、確認檢索方法後，進一步建立llm回答機制

#### 第一步驟結果評估：
- Question 2 & 5（關於Transformer/Attention) -> 表現良好
- Question 3 (量子計算) & Question 4 (煮義大利麵)：因為是透過「相似性檢索」處理 -> 表現正常
- Question 1 (2023年論文) 沒有很好地利用年份資訊(需要檢索2023)
#### 判斷檢索品質：
- 相關度分數分佈（某些問題應該高分、某些問題應該低分）：邊界問題應該要低分、中等相關中間、高度相關分數應該要高
- 結果多樣性：可以檢驗的多元性有「論文來源的多樣性」、「主題多樣性」、「時間多樣性」等，總之就是不希望回答都是一樣的
- metadata準確率：:檢驗問「2023」年、某某作者、某主題的論文，回傳結果中，真正符合年份的比例，例如k=3，有2題對就是67%
- 檢索速度：就是計時
- 建立問題集

## 第一步驟：基礎理解

In [22]:
# Level 1：基礎功能驗證

# 1. Metadata查詢測試
question1 = "What papers were published in 2023?"
ans_docs1 = vectordb.similarity_search(question1,k=2) # 問題參數記得要改

question2 = "Which authors studied transformer architecture?"
ans_docs2 = vectordb.similarity_search(question2,k=2)

# 2. 邊界情況測試
question3 = "What is quantum computing?"
ans_docs3 = vectordb.similarity_search(question3,k=2)

question4 = "How to cook pasta?"  
ans_docs4 = vectordb.similarity_search(question4,k=2)

question5 = "What is the main contribution of Attention is All You Need?"
ans_docs5 = vectordb.similarity_search(question5,k=2)


## 檢驗量化指標

In [4]:
def evaluate_time_score(vectordb, test_queries, k=3, iterations=5): # （向量資料庫、測試查詢列表、檢索數量、每隔查詢重複測試次數）

    import time 

    all_times = [] # 三個問題的總時間

    for query in test_queries: # 共有三個問題、迭代五次、每次產出3塊
        query_times = [] # total times
        for _ in range(iterations):  # iterations 剛剛已經預設 5
            start_time = time.time() # start timing
            vectordb.similarity_search(query, k=k) # k 預設為3, query = each questions
            end_time = time.time() # end timing
            query_times.append(end_time - start_time) # 紀錄某問題迭代五次的時間
        avg_time = sum(query_times) / len(query_times)
        all_times.append(avg_time)

    return  {
        'avg_response_time': sum(all_times) / len(all_times),
        'min_response_time': min(all_times),
        'max_response_time': max(all_times)
    }

# 測試相關性（不同問題的相關性應該不同）
# 小筆記：問題一定要跟塊有所互動，才會有「分數」（similarity_search_with_score），因此這裡是透過問題返回點積。
# 向量模型只要同一間公司，向量的方式都一樣，和資料庫或內容沒有關係
# 分數其實就是「距離（cosine distance）」，所以數字越小越相關
def evaluate_relevance_score(vectordb, test_queries_dict):
    # 已經建立一個有高相關和低相關的問題集
    results = {}

    for catagory, queries in test_queries_dict.items(): # loop for find out each key(high/low):value(question)
        scores =[]
        for q in queries:
            docs_with_scores = vectordb.similarity_search_with_score(q, k=3) 
            avg_score = sum([score for doc, score in docs_with_scores])/len([docs_with_scores])
            scores.append(avg_score)

        results[catagory]={
            'avg_score': sum(scores) / len(scores),
            'min_score': min(scores),
            'max_score': max(scores),
            'all_scores': scores            
        }

    return results

def evaluate_accuracy_score(vectordb, metadata_queries, k=3):
    """
    評估metadata查詢的準確率
    
    Args:
        vectordb: 向量資料庫
        metadata_queries: {
            'year_2023': 'What papers were published in 2023?',
            'transformer_topic': 'transformer architecture papers',
            'author_vaswani': 'papers by Vaswani'
        }
        k: 檢索數量
    
    Returns:
        dict: 各種metadata查詢的準確率
    """
    results = {}
    
    # 年份查詢準確率
    if 'year_2023' in metadata_queries:
        query = metadata_queries['year_2023']
        docs = vectordb.similarity_search(query, k=k)
        correct_count = sum(1 for doc in docs if doc.metadata.get('year') == 2023)
        results['year_accuracy'] = correct_count / len(docs) if docs else 0
    
    # 主題查詢準確率
    if 'transformer_topic' in metadata_queries:
        query = metadata_queries['transformer_topic']
        docs = vectordb.similarity_search(query, k=k)
        correct_count = sum(1 for doc in docs if 'transformer' in doc.metadata.get('topic', '').lower())
        results['topic_accuracy'] = correct_count / len(docs) if docs else 0
    
    # 作者查詢準確率
    if 'author_vaswani' in metadata_queries:
        query = metadata_queries['author_vaswani']
        docs = vectordb.similarity_search(query, k=k)
        correct_count = sum(1 for doc in docs if 'vaswani' in doc.metadata.get('authors', '').lower())
        results['author_accuracy'] = correct_count / len(docs) if docs else 0
    
    return results

def evaluate_diversity_score(vectordb, query, k=3):
    """
    評估檢索結果的多樣性
    
    Args:
        vectordb: 向量資料庫
        query: 查詢字串
        k: 檢索數量
    
    Returns:
        dict: 各種多樣性指標
    """
    docs = vectordb.similarity_search(query, k=k)
    
    # 1. 論文來源多樣性
    titles = [doc.metadata.get('title', 'Unknown') for doc in docs]
    unique_titles = len(set(titles))
    title_diversity = unique_titles / len(titles)
    
    # 2. 年份多樣性
    years = [doc.metadata.get('year', 'Unknown') for doc in docs]
    unique_years = len(set(years))
    year_diversity = unique_years / len(years)
    
    # 3. 主題多樣性
    topics = [doc.metadata.get('topic', 'Unknown') for doc in docs]
    unique_topics = len(set(topics))
    topic_diversity = unique_topics / len(topics)
    
    return {
        'title_diversity': title_diversity,
        'year_diversity': year_diversity,
        'topic_diversity': topic_diversity,
        'overall_diversity': (title_diversity + year_diversity + topic_diversity) / 3
    }


def running_baseline(vectordb):
    print("starting baseline evalution")

    # 1. preparing testing data
    relevance_queries = {
        'high_relevance': [
            'transformer architecture',
            'attention mechanism',
            'what is BERT'
        ],
        'low_relevence': [
            'how to cook a pasta',
            'quantum computing',
            'weather forecast'
        ]
    }

    metadata_queries = { # structure - key:actual question
        'year_2023': 'what papers were published in 2023?',
        'llm_limitation_topic': 'llm limitation papers',
        'author_vaswani': 'papers by vaswani'
    }


    speed_queries = ['transformer', 'attention', 'LLM limitations']

    # 2. running defs
    relevance_results =  evaluate_relevance_score(vectordb, relevance_queries) # input: different questions
    diversity_results =  evaluate_diversity_score(vectordb, 'LLM research') # input: target topic
    accuracy_results = evaluate_accuracy_score(vectordb, metadata_queries) # input: question of metadata
    time_results = evaluate_time_score(vectordb, speed_queries) # input:any questions

    # 3. print results
        
    # about relavance 

    print("\n📊 相關度分數分佈:")
    for category, stats in relevance_results.items():
        print(f"  {category}: 平均={stats['avg_score']:.3f}, 範圍=[{stats['min_score']:.3f}, {stats['max_score']:.3f}]")    

    # about diversity
    print(f"\n🎯 多樣性分數: {diversity_results['overall_diversity']:.3f}")
    
    # about accuracy
    print("\n📋 Metadata準確率:")
    for metric, accuracy in accuracy_results.items():
        print(f"  {metric}: {accuracy:.1%}")    


    # about response times
    print(f"\n average response time: {time_results['avg_response_time']:.3f}seconds"
    f"\n min response time: {time_results['min_response_time']:.3f}seconds"
    f"\n max response time: {time_results['max_response_time']:.3f}seconds")
    


    return { 
        'relevance': relevance_results,
        'diversity': diversity_results,
        'metadata': accuracy_results,
        'speed': time_results
    }



In [5]:
results = running_baseline(vectordb)

starting baseline evalution

📊 相關度分數分佈:
  high_relevance: 平均=1.308, 範圍=[1.183, 1.433]
  low_relevence: 平均=1.529, 範圍=[1.284, 1.671]

🎯 多樣性分數: 1.000

📋 Metadata準確率:
  year_accuracy: 33.3%
  author_accuracy: 0.0%

 average response time: 0.455seconds
 min response time: 0.420seconds
 max response time: 0.480seconds


#### baseline results
- 建立系統性能基準:回應時間：0.45秒，驗證檢索功能正常

- 高相關 < 低相關的距離證明系統work

- 問題點：
    - 多樣性 1.0 = 完全沒多樣性
    - Metadata 準確率慘烈

In [10]:
def evaluate_time_score_mmr(vectordb, test_queries, k=3, iterations=5): # （向量資料庫、測試查詢列表、檢索數量、每隔查詢重複測試次數）

    import time 

    all_times = [] # 三個問題的總時間

    for query in test_queries: # 共有三個問題、迭代五次、每次產出3塊
        query_times = [] # total times
        for _ in range(iterations):  # iterations 剛剛已經預設 5
            start_time = time.time() # start timing
            vectordb.max_marginal_relevance_search(query, k=k) # k 預設為3, query = each questions
            end_time = time.time() # end timing
            query_times.append(end_time - start_time) # 紀錄某問題迭代五次的時間
        avg_time = sum(query_times) / len(query_times)
        all_times.append(avg_time)

    return  {
        'avg_response_time': sum(all_times) / len(all_times),
        'min_response_time': min(all_times),
        'max_response_time': max(all_times)
    }
# about response times


speed_queries = ['transformer', 'attention', 'LLM limitations']

time_results = evaluate_time_score_mmr(vectordb, speed_queries)

print(f"\n average response time: {time_results['avg_response_time']:.3f}seconds"
    f"\n min response time: {time_results['min_response_time']:.3f}seconds"
    f"\n max response time: {time_results['max_response_time']:.3f}seconds")



 average response time: 0.504seconds
 min response time: 0.468seconds
 max response time: 0.573seconds


In [11]:
def diversity_showdown():
    query = "transformer architecture"
    
    sim_docs = vectordb.similarity_search(query, k=5)
    mmr_docs = vectordb.max_marginal_relevance_search(query, k=5)
    
    print("Similarity 結果論文:")
    for doc in sim_docs:
        print(f"- {doc.metadata['title']}")
    
    print("\nMMR 結果論文:")  
    for doc in mmr_docs:
        print(f"- {doc.metadata['title']}")
    
diversity_result = diversity_showdown()

Similarity 結果論文:
- Attention Is All You Need
- Lost in the Middle- How Language Models Use Long Contexts
- Attention Is All You Need
- Scaling Laws for Neural Language Models
- Attention Is All You Need

MMR 結果論文:
- Attention Is All You Need
- Lost in the Middle- How Language Models Use Long Contexts
- Modular RAG- Transforming RAG Systems into LEGO-like Reconfigurable Frameworks
- Large Language Model Agent- A Survey on Methodology, Applications and Challenges
- Quantifying Language Models' Sensitivity to Spurious Features in Prompt Design or- How I learned to start worrying about prompt formatting


比較總表：
    時間效率：
        mmr:
            average response time: 0.504seconds
            min response time: 0.468seconds
            max response time: 0.573seconds
        Similarity：
            average response time: 0.455seconds
            min response time: 0.420seconds
            max response time: 0.480seconds
    
    多樣性：
        Similarity :
            - Attention Is All You Need
            - Lost in the Middle- How Language Models Use Long Contexts
            - Attention Is All You Need
            - Scaling Laws for Neural Language Models
            - Attention Is All You Need
        MMR 結果論文:
            - Attention Is All You Need
            - Lost in the Middle- How Language Models Use Long Contexts
            - Modular RAG- Transforming RAG Systems into LEGO-like Reconfigurable Frameworks
            - Large Language Model Agent- A Survey on Methodology, Applications and Challenges
            - Quantifying Language Models' Sensitivity to Spurious Features in Prompt Design or- How I learned to start worrying about prompt formatting    

In [13]:
def test_mmr_metadata_precision():
    """
    測試MMR在特定metadata查詢上是否比similarity更精準
    """
    
    # 1. 年份查詢
    year_query = "What papers were published in 2023?"
    sim_2023 = vectordb.similarity_search(year_query, k=5)
    mmr_2023 = vectordb.max_marginal_relevance_search(year_query, k=5)
    
    print("2023年論文查詢:")
    print("Similarity結果:")
    for doc in sim_2023:
        year = doc.metadata.get('year', 'Unknown')
        print(f"  {year}: {doc.metadata.get('title', 'Unknown')}")
    
    print("MMR結果:")  
    for doc in mmr_2023:
        year = doc.metadata.get('year', 'Unknown')
        print(f"  {year}: {doc.metadata.get('title', 'Unknown')}")




In [14]:
test_accuracy = test_mmr_metadata_precision()

2023年論文查詢:
Similarity結果:
  2023: BloombergGPT- A Large Language Model for Finance
  2024: Modular RAG- Transforming RAG Systems into LEGO-like Reconfigurable Frameworks
  2024: The Power of Noise- Redefining Retrieval for RAG Systems
  2023: Challenges and Applications of Large Language Models
  2025: The Order Effect- Investigating Prompt Sensitivity to Input Order in LLMs
MMR結果:
  2023: BloombergGPT- A Large Language Model for Finance
  2023: Challenges and Applications of Large Language Models
  2024: Language Ranker- A Metric for Quantifying LLM Performance Across High and Low-Resource Languages
  2025: Large Language Model Agent- A Survey on Methodology, Applications and Challenges
  2024: Retrieval-Augmented Generation for Large Language Models- A Survey


In [22]:
def hybrid_search_with_year(query, target_year, k=5):
    # 先用向量搜尋找到候選結果（更大的k）

    candidates = vectordb.max_marginal_relevance_search(query, k=k*3)  # 找15個候選
    
    # 再用metadata篩選年份
    year_filtered = [doc for doc in candidates 
                    if doc.metadata.get('year') == target_year]
    
    # 回傳前k個
    return year_filtered[:k]

query = "What papers were published in 2023?" 
year = 2023
test_accurac_hybird = hybrid_search_with_year(query, year)


In [23]:
for doc in test_accurac_hybird:
    year = doc.metadata.get('year', 'Unknown')
    print(f"  {year}: {doc.metadata.get('title', 'Unknown')}")

  2023: BloombergGPT- A Large Language Model for Finance
  2023: Challenges and Applications of Large Language Models
  2023: Large Language Models Sensitivity to The Order of Options in Multiple-Choice Questions
  2023: Quantifying Language Models' Sensitivity to Spurious Features in Prompt Design or- How I learned to start worrying about prompt formatting
  2023: BloombergGPT- A Large Language Model for Finance


看起來用混合檢索的方式，可以真的好好指定我要的年份。
但我有以下問題想和你討論，你先不要給我程式碼，跟我用文字討論：
一、我的metadata中，存在以下欄位，這些都可以變成metadata篩選的條件嗎？例如title, year, author, topic等？
二、如果可以都變成篩選條件，如何將這些檢索條件加到「檢索」的程式中？因為我們再檢索階段可以自己打year=2023之類的參數，可是整體的rag我們應該只會打一個問題，這樣有辦法幫我們分辨嗎？
三、我還是很想解決不能精準檢索的問題，所以正在試hybrid的方式，但我仔細一想你剛剛跟我說這個rag的定位是什麼，我就在想我用了mmr提升了「多樣性」，用hybird可以提升「準確率」，哪一個對我來說可以檢索論文比較重要？我是站在「學習llm知識」的角度去開發這個rag，所以才會清理乾淨metadata前面提到最重要的四大參數，所以都很重要？

#### 建立hybrid rag(查詢解析器) 目標： 把自然語言查詢 → 轉換為 structured parameters
- 建立llm解析器
- 查詢使用mmr+解析器：規則判斷

In [73]:
# llm套件與啟動

from langchain_openai import ChatOpenAI   

# 建立 LLM
llm = ChatOpenAI(model="gpt-3.5-turbo")   # 指定模型


# llm解析器：解析問題內部的參數
def llm_extract_metadata(query):
    prompt = f"""
you are a metedata extraction export, extract year and topic information from user queries.

Available topics:
    - "transformer"
    - "scaling laws"
    - "fine tuning"
    - "LLM limitation"
    - "LLM application"
    - "prompt engineering"
    - "RAG"
    - "ai-agent"
    - null (if no specific topic)

    Available years: 2017-2025 or null

    Query = "{query}"

    Extract and return ONLY a JSON object:
    {{"year":<number or null>, "topic":"<topic or null>"}}

    Examples:
    Query:"2023年關於transformer的研究"
    {{"year":2023, "topic":"transformer"}}

    Query:"LLM的限制有哪些"
    {{"year":null, "topic":"llm limitation"}}

    Query: "最新的RAG論文"
    {{"year": 2025, "topic": "RAG"}}

    Now extract from the query above:
    """
    try:
        response = llm.invoke(prompt) # response 的資料型態是AIMessage，不是字串
        response_text = response.content  # response.content 才能取得文字內容
        # 解析json格式
        import json
        metadata = json.loads(response_text.strip())
        return metadata
    except:
        return {"year":None, "topic":None }

def smart_hybrid_search(query, k=5):
    # llm解析返回的查詢
    metadata = llm_extract_metadata(query)
    year = metadata.get('year')
    topic = metadata.get('topic')

    # 根據解析策略選擇檢索方式
    if year and topic:
        candidates = vectordb.max_marginal_relevance_search(query, k=k*4)
        filtered = [doc for doc in candidates
                    if doc.metadata.get('year') == year
                    and doc.metadata.get('topic') == topic]
    elif year:
        candidates = vectordb.max_marginal_relevance_search(query, k=k*3)
        filtered = [doc for doc in candidates
                    if doc.metadata.get('year') == year]

    elif topic:
        candidates = vectordb.max_marginal_relevance_search(query, k=k*3)
        filtered = [doc for doc in candidates
                    if doc.metadata.get('topic') == topic]

    else:
        filtered = vectordb.max_marginal_relevance_search(query, k=k)

    # 確保有足夠結果
    if len(filtered) < k:
        additional = vectordb.max_marginal_relevance_search(query, k=k)
        filtered.extend = ([doc for doc in additional if doc not in filtered])
    
    return filtered[:k]


test_queries ="2024年關於RAG的研究"
test_hybid = smart_hybrid_search(test_queries)
for doc in test_hybid:
    print(doc.metadata)

{'year': 2024, 'topic': 'RAG', 'source': '/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs/2024_RAG_Evaluation of Retrieval-Augmented Generation- A Survey.pdf', 'total_pages': 21, 'title': 'Evaluation of Retrieval-Augmented Generation- A Survey', 'page': 0, 'authors': 'Hao Yu, Aoran Gan, Kai Zhang, Shiwei Tong, Qi Liu, Zhaofeng Liu'}
{'authors': 'Yunfan Gao, Yun Xiong, Xinyu Gao, Kangxiang Jia, Jinliu Pan, Yuxi Bi, Yi Dai, Jiawei Sun, Meng Wang, Haofen Wang', 'title': 'Retrieval-Augmented Generation for Large Language Models- A Survey', 'total_pages': 21, 'page': 0, 'year': 2024, 'topic': 'RAG', 'source': '/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs/2024_RAG_Retrieval-Augmented Generation for Large Language Models- A Survey.pdf'}
{'title': 'Evaluation of Retrieval-Augmented Generation- A Survey', 'total_pages': 21, 'source': '/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs/2024_RAG_Evaluation of Retrieval-Augmented Generation- A Survey.pdf', 'page

In [87]:
# 需要將剛剛的hybrid retrival包裝成自定義retriver物件，才可以接軌
from langchain.schema import BaseRetriever, Document
from typing import List, Any
from langchain_openai import ChatOpenAI
import json
from pydantic import Field

class SmartHybridRetriever(BaseRetriever):  # 修正拼字錯誤

    # 明確定義fields
    vectordb: Any = Field(default=None)
    llm: Any = Field(default=None)
    
    # def __init__(self, vectordb, llm, **kwargs):
    #     # 先傳給父類
    #     super().__init__(**kwargs)
    #     # 然後設定屬性
    #     object.__setattr__(self, 'vectordb', vectordb)
    #     object.__setattr__(self, 'llm', llm)
    
    
    def __init__(self, vectordb, llm):
        super().__init__()  # 重要：調用父類初始化
        self.vectordb = vectordb
        self.llm = llm
    
    def llm_extract_metadata(self, query):  # 加入self參數
        prompt = f"""
you are a metadata extraction expert, extract year and topic information from user queries.

Available topics:
    - "transformer"
    - "scaling laws"
    - "fine tuning"
    - "LLM limitation"
    - "LLM application"
    - "prompt engineering"
    - "RAG"
    - "LLM agent"  # 修正為"LLM agent"
    - null (if no specific topic)

Available years: 2017-2025 or null

Query = "{query}"

Extract and return ONLY a JSON object:
{{"year":<number or null>, "topic":"<topic or null>"}}

Examples:
Query:"2023年關於transformer的研究"
{{"year":2023, "topic":"transformer"}}

Query:"LLM的限制有哪些"
{{"year":null, "topic":"LLM limitation"}}

Query: "最新的RAG論文"
{{"year": 2025, "topic": "RAG"}}

Now extract from the query above:
"""
        try:
            response = self.llm.invoke(prompt)  # 使用self.llm
            response_text = response.content
            metadata = json.loads(response_text.strip())
            return metadata
        except:
            return {"year": None, "topic": None}

    def smart_hybrid_search(self, query, k=5):  # 加入self參數
        metadata = self.llm_extract_metadata(query)  # 使用self
        year = metadata.get('year')
        topic = metadata.get('topic')

        if year and topic:
            candidates = self.vectordb.max_marginal_relevance_search(query, k=k*4)
            filtered = [doc for doc in candidates
                       if doc.metadata.get('year') == year
                       and doc.metadata.get('topic') == topic]
        elif year:
            candidates = self.vectordb.max_marginal_relevance_search(query, k=k*3)
            filtered = [doc for doc in candidates
                       if doc.metadata.get('year') == year]
        elif topic:
            candidates = self.vectordb.max_marginal_relevance_search(query, k=k*3)
            filtered = [doc for doc in candidates
                       if doc.metadata.get('topic') == topic]
        else:
            filtered = self.vectordb.max_marginal_relevance_search(query, k=k)

        # 修正extend的使用
        if len(filtered) < k:
            additional = self.vectordb.max_marginal_relevance_search(query, k=k)
            filtered.extend([doc for doc in additional if doc not in filtered])  # 修正
        
        return filtered[:k]
    
    def _get_relevant_documents(self, query: str) -> List[Document]:
        """BaseRetriever要求的抽象方法"""
        return self.smart_hybrid_search(query, k=5)
    
    async def _aget_relevant_documents(self, query: str) -> List[Document]:
        """異步版本"""
        return self._get_relevant_documents(query)

# 使用方式
llm = ChatOpenAI(model="gpt-3.5-turbo")  # 在外部定義
custom_retriever = SmartHybridRetriever(vectordb=vectordb, llm=llm)  # 修正變數名
    

In [83]:
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
#加入llm回答
# 新項目：載入ConversationBufferMemory
# 此套件能讓問答機器人記住過往歷史問答
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history", # 告訴 chain 去哪裡找歷史對話
    return_messages=True, # 返回的是物件（長得像json or meta data）
    output_key="answer"
)

In [84]:
# 建立自訂義的prompt(為了好好利用檢索)
from langchain.prompts import PromptTemplate

# 建立自定義prompt
qa_prompt = PromptTemplate(
    template="""請基於以下檢索到的文獻內容回答問題。

檢索到的文獻：
{context}

問題：{question}

請直接基於上述文獻內容回答問題，列出相關的論文標題和主要內容。如果文獻中沒有相關資訊，請明確說明。

答案：""",
    input_variables=["context", "question"]
)

# 重新建立qa chain，使用自定義prompt
from langchain.chains import ConversationalRetrievalChain

qa = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",  # 改為stuff更直接
    retriever=custom_retriever,
    return_source_documents=True,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": qa_prompt}  # 加入自定義prompt
)

In [85]:
question = "transformer架構的核心概念是什麼？"
result = qa({"question": question})
print(result['answer'])


Transformer架構的核心概念是基於注意力機制，並且完全不使用循環神經網絡或卷積神經網絡。這種簡單的網絡架構通過注意力機制將編碼器和解碼器相連接，並且在機器翻譯任務中表現優異，同時具有更好的並行性和更短的訓練時間。Transformer模型在WMT 2014英德翻譯任務上取得了28.4 BLEU的分數，在英法翻譯任務上取得了41.8 BLEU的分數，並且在其他任務上也表現良好。Transformer的提出是基於將循環神經網絡替換為自注意力機制，並且消除了循環和卷積的使用。


In [77]:
question = "這個架構有什麼限制？"
result = qa({"question": question})
print(result['answer'])

根據檢索到的文獻內容，這個架構的限制包括：
1. LLMs對於提示中元素的順序敏感，包括few-shot demonstrations的排列和候選模型生成的回應順序，可能影響其理解和推理能力。
   - 相關論文：Zhao et al., 2021; Wang et al., 2023b
2. LLMs對於提示中的微小變化敏感，例如在多選題回答任務中，選項的順序可能影響其表現。
   - 相關論文：未提及
3. LLMs在低資源語言中表現不佳，可能由於缺乏訓練數據導致無法理解文化特定表達或成語。
   - 相關論文：Zhang et al. 2023; Lankford, Afli, and Way 2024
4. Transformer語言模型需要大量記憶體和計算資源，對序列長度的要求增長迅速，可能限制了模型的訓練上下文窗口大小。
   - 相關論文：Vaswani et al., 2017; Dai et al., 2019; Dao et al., 2022; Poli et al., 2023

綜合以上資訊，這個架構的限制包括對提示中元素順序和微小變化的敏感性，以及在低資源語言和大序列長度下的表現不佳。


question = "2023年有哪些重要的論文？"
回答：根據檢索到的文獻內容，2023年的重要論文包括：

1. "Lost in the Middle: How Language Models Use Long Contexts"，作者包括Nelson F. Liu、Kevin Lin、John Hewitt等人，主要探討語言模型在處理長文本上的表現，並分析了在多文檔問答和鍵-值檢索任務中的表現。

2. "Challenges: What problems remain unresolved?"，作者未提及，主要探討語言模型研究中尚未解決的問題。

3. "Applications: Where are LLMs currently being applied, and how are the challenges constraining them?"，作者未提及，主要探討目前語言模型的應用領域以及面臨的挑戰。

以上是根據文獻內容能夠找到的2023年重要論文，其他文獻中未提及相關資訊。（但如果用數2023的論文，其實有六篇，而且每一篇都有提及作者）

question = "告訴我關於LLM limitation的研究，並展示出文章的名稱？"
回答：
在LLM limitation的研究中，重要的發現包括：
1. 文獻中提到LLMs對於prompt中各個元素的排列敏感，這直接影響了它們在特定任務中理解和推理能力的評估。
2. 先前的研究表明，LLMs對於few-shot demonstrations的排列和候選模型生成的回應順序敏感，這影響了LLMs作為評估質量的裁判時的表現。
3. LLMs對於prompt中元素的順序在不同任務中是否敏感，這也是一個需要探討的問題。
評比：看起來因為我們的解析器只針對年份和主題，對作者的metadata好像沒這麼敏感（但其實都有）


question = "2024年關於RAG的研究有什麼進展？"
根據檢索到的文獻，2024年關於RAG的研究有以下進展：
1. 文獻標題：Evaluation of Retrieval-Augmented Generation: A Survey
   主要內容：該研究對RAG方法進行了全面和系統性的回顧，描述了其透過外部信息檢索增強生成模型的能力。研究評估了RAG系統的獨特挑戰，並提出了統一的評估過程，旨在提供對RAG系統的評估和基準的全面概述。

2. 文獻標題：未提及RAG研究進展
   主要內容：未提及2024年關於RAG的具體研究進展。

評比：但其實rag２０２４的pdf有五篇。
question = "什麼是attention機制？"
回答：根據檢索到的文獻內容，提到了一個新的簡單網絡架構，Transformer，它完全基於attention機制，不使用循環和卷積。這個網絡架構將編碼器和解碼器通過attention機制連接起來。因此，根據文獻內容，attention機制是一種連接編碼器和解碼器的機制，用於序列轉換模型中。

In [66]:
# 先單獨測試你的retriever
question = "2023年有哪些重要的論文？"
docs = custom_retriever._get_relevant_documents(question)
print(f"檢索到的文件數量: {len(docs)}")

for i, doc in enumerate(docs):
    print(f"文件{i+1}: {doc.metadata.get('title', 'Unknown')}")
    print(f"年份: {doc.metadata.get('year')}")
    print(f"內容預覽: {doc.page_content[:100]}")
    print("-" * 50)

檢索到的文件數量: 5
文件1: BloombergGPT- A Large Language Model for Finance
年份: 2023
內容預覽: . . . . . . . . . . . . . . 9
∗. Co-first authors. Corresponding email: airesearch@bloomberg.net
1
a
--------------------------------------------------
文件2: Challenges and Applications of Large Language Models
年份: 2023
內容預覽: . . . . 43
3.8 Reasoning . . . . . . . . . . . . . 44
3.9 Robotics and Embodied Agents . . 45
3.10 S
--------------------------------------------------
文件3: Large Language Models Sensitivity to The Order of Options in Multiple-Choice Questions
年份: 2023
內容預覽: . For example, how much does
the order of options in multiple-choice question
(MCQ) answering tasks 
--------------------------------------------------
文件4: Lost in the Middle- How Language Models Use Long Contexts
年份: 2023
內容預覽: Lost in the Middle: How Language Models Use Long Contexts
Nelson F. Liu1∗ Kevin Lin2 John Hewitt1 As
--------------------------------------------------
文件5: Large Language Models Sensitivity to The Order o

**目前的理解**

首先在之前檢索階段還沒有加入回答前，就已經把`def llm_extract_metadata()`和`def smart_hybrid_search()`建立好了。

`smart_hybrid_search`會先透過`llm_extract_metadata`解析器回傳年份和主題。

所以當時的執行方法會是：

```python
    test_queries ="2024年關於RAG的研究"
    test_hybid = smart_hybrid_search(test_queries)
```

但因為現在加上回答，`ConversationalRetrievalChain`需要`retriever`物件（`retriever=retriever`），但因為剛剛做的是函式並非這種物件，因此需要做一些調整。

而調整方法是需要將剛剛的混合檢索「包裝成自定義retriver物件」，才可以寫出`retriever=custom_retriever`。
以上是我能理解的地方，但實際上的做法不太懂。


**不太懂的地方**

- 一、看起來langchain.schema可以讓我們自己建立一個retriever物件，但我不太懂這個套件是什麼東西，也不懂為什麼我們需要一個class，因為我目前只熟悉def
    - 回答
        - def像是一個「動作」，按一下執行一次；class像是一個藍圖，裡面會有「資料」也會有「動作」
        -   例如像class SmartHybridRetriever()就包含了：
            - 資料：vectordb、llm
            - 動作：llm_extract_metadata()、smart_hybrid_search()
        - 為什麼要包成class?因為ConversationalRetrievalChain只認得物件，不能直接吃一個函式，所以我們需要一個class物件，把兩個函式包起來。


二、更進一步，from typing import List, Any和from pydantic import Field更是完全沒碰過了，所以不懂vectordb: Any = Field(default=None)
    llm: Any = Field(default=None)是什麼、＿＿init__裡面的super(), object也全部不懂。

三、然後看起來def llm_extract_metadata()和def smart_hybrid_search()都增加了self的參數，是什麼意思？

四、def _get_relevant_documents(self, query: str) -> List[Document]:這又是什麼？什麼叫做要求抽象的方法？
        """BaseRetriever要求的抽象方法"""

五、然後我從來也沒看過 async def

一、因為class就像是一張藍圖一樣，他同時包含了所需的「資料（屬性）」和「行為（函式）」，如果是def，我們能每一個執行def都比噓拖著一大堆的函數且他需要按照順序來執行，因此程式會顯得很混亂，而且假設今天有大學生和學生兩種對象，其實他們可以有個父類叫做「學生」，接下來在他們各自的class裡繼承學生的基本輪廓，再加入特定行為知識就可以了，如果是def可以能就沒法這樣繼承和共用，資料會不段的傳來傳去。
二、因為在我們的class裡，有些需要透過父類來繼承，繼承的話就會寫成super().__init__()，代表接收父類的屬性到自己的屬性中

三、因為BaseRetriever他不是一班的class，而是pydantic class，這種規定要先檢查屬性，所以我們才會有field那兩行

四、self指的是在class之外呼叫這個類別的「對象」，今天對象就會是retriever

五、因為假設r1是給llm論文、R2是給社工論文，如果我們今天要使用r1，只要使用llm = r1.search("問題a")
sw = r2.search("問題b")
就不會搞混了（請你多說明）