In [10]:
# 建立整體的ui介面，變成一個問答機器人
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

In [None]:
%pip install -U langchain-community pypdf

In [5]:
'''
目的：建立langchain + openai 的基礎環境
'''
import os # 作業系統相關功能（讀取環境變數）
from openai import OpenAI # openai api 客戶端
from dotenv import load_dotenv, find_dotenv # dotenv 是專門用來讀取.env套件的套件，並接上環境
_ = load_dotenv(find_dotenv()) # 讀取.env檔案
client = OpenAI(
    api_key=os.environ['OPENAI_API_KEY']
)
# openai.api_key  = os.environ['OPENAI_API_KEY'] 舊版
print("done")

done


# 第一步測試：載入

In [6]:
from dotenv import load_dotenv
import os

# 清除舊的環境變數
if 'OPENAI_API_KEY' in os.environ:
    del os.environ['OPENAI_API_KEY']

# 重新載入
load_dotenv()

api_key = os.environ.get('OPENAI_API_KEY')
if api_key:
    print("✅ Success! API key loaded")
    print(f"Key starts with: {api_key[:15]}...")
    print(f"Key length: {len(api_key)} characters")
else:
    print("❌ Still not working")

✅ Success! API key loaded
Key starts with: sk-proj-FWUUter...
Key length: 164 characters


In [11]:
# 檢查資料夾是否存在
import os
folder_path = "/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs"

print(f"Folder exists: {os.path.exists(folder_path)}")
# 看看有哪些PDF檔案
pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
print(f"Found {len(pdf_files)} PDF files:")
for pdf in pdf_files[:5]:  # 顯示前5個檔名
    print(f"  - {pdf}")

Folder exists: True
Found 23 PDF files:
  - 2023_LLM limitation_Large Language Models Sensitivity to The Order of Options in Multiple-Choice Questions.pdf
  - 2025_LLM limitation_The Order Effect- Investigating Prompt Sensitivity to Input Order in LLMs.pdf
  - 2024_RAG_Evaluation of Retrieval-Augmented Generation- A Survey.pdf
  - 2020_scaling laws_Scaling Laws for Neural Language Models.pdf
  - 2022_LLM limitation_Robustness of Learning from Task Instructions.pdf


In [13]:
# 測試1.載入一篇
test_file = os.path.join(folder_path, pdf_files[0])
loader = PyPDFLoader(test_file)
documents = loader.load()
print(f"Test file loaded: {len(documents)} pages")
print(f"內容：{documents[3].page_content[:500]}...")
print(f"metadata：{documents[0].metadata}")

Test file loaded: 11 pages
內容：Figure 2: Order sensitivity in few-shot setting:The error bars represent the range of minimum and maximum
accuracy achievable in each task through oracle reordering. Our observations are as follows: (1) The sensitivity
gap consistently remains substantial even with the addition of more demonstrations in the few-shot setting. (2) As
performances improve, the sensitivity gap shrinks. (3) Adding more demonstrations does not necessarily result in a
reduction of the sensitivity gap.
4 Why Do LLMs Sho...
metadata：{'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-23T01:15:54+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-23T01:15:54+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs/2023_LLM limitation_Large Language Models Sensit

# 第二步測試：分割檔案

In [14]:
# 測試2. 分割檔案
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=150,
    separators=[ "\n\n", ". ", "\n", "(?<=\. )", " ", ""]
    )    
docs = text_splitter.split_documents(documents)
print(len(docs)) # chunks
print(len(documents)) # test 11 pages

54
11


In [15]:
print(docs[0].page_content)

Large Language Models Sensitivity to The Order of Options in
Multiple-Choice Questions
Pouya Pezeshkpour
Megagon Labs
pouya@megagon.ai
Estevam Hruschka
Megagon Labs
estevam@megagon.ai
Abstract
Large Language Models (LLMs) have demon-
strated remarkable capabilities in various NLP
tasks. However, previous works have shown
these models are sensitive towards prompt
wording, and few-shot demonstrations and
their order, posing challenges to fair assess-
ment of these models. As these models be-
come more powerful, it becomes imperative
to understand and address these limitations.
In this paper, we focus on LLMs robust-
ness on the task of multiple-choice questions—
commonly adopted task to study reasoning and
fact-retrieving capability of LLMs


```python
separators=[
    "\n\n",        # 段落分隔（最優先）
    ". ",          # 句號
    "\n",          # 行分隔
    "(?<=\. )",    # 句號後分隔（用正則表達式）
    " ",           # 空格分隔
    ""             # 字元分隔（最後手段）
]
```

##### 問題：如何決定塊的大小和重疊度？
- 兩種指標：
    - 統計指標：
        - 平均長度：接近設定的 `chunk _size`，代表接近目標 
        - max：評估是否超過限制
        - min：評估最小塊是否能接受
        - std：chunk的大小範圍大概落在 mean ± std 之間，有點大的話大概長度不太一致，<200 (長度較一致)
        - 一個最短的chunk：通常越少越好，<5%  

    - 語意完整性（更重要）：
        - 一個塊是否包含完整概念？
        - 重要術語是否有被切斷？
        - 評估方法：人工檢視（看完文字是否能理解）、問答測試法（先測試到最後，靠ai檢索是否能從準確回應答案）
- 結論：現在看看不準，要等run過一遍再來看llm的檢索能力
- trade-off解決：語意完整性 > 統計指標

In [73]:
# 檢查chunk是否合理
def analyze_chunks(docs):
    lengths = [len(d.page_content) for d in docs]
    
    print(f"Total chunks: {len(docs)}")
    print(f"Average length: {sum(lengths)/len(lengths):.0f}")
    print(f"Min length: {min(lengths)}")  
    print(f"Max length: {max(lengths)}")
    print(f"Length std dev: {(sum((x-sum(lengths)/len(lengths))**2 for x in lengths)/len(lengths))**0.5:.0f}")
    
    # 檢查是否有太短的chunk（可能是切割錯誤）
    short_chunks = [i for i, l in enumerate(lengths) if l < 200]
    if short_chunks:
        print(f"Warning: {len(short_chunks)} chunks are very short")
        
analyze_chunks(docs)

Total chunks: 54
Average length: 784
Min length: 125
Max length: 998
Length std dev: 215


# 第三步測試：embedding 並放入資料庫

embedding trade-offs:

- 回答狀況
- Embedding品質檢查


In [None]:
# 安裝新套件
%pip install -U langchain-openai
%pip install chromadb

In [16]:
# 建立資料庫路徑，已有路徑則可忽略
import os

# 建立資料夾
os.makedirs('./chroma_db', exist_ok=True)
print("finish！")

# 檢查是否成功
print(f"資料夾存在嗎？{os.path.exists('./chroma_db')}")

finish！
資料夾存在嗎？True


In [17]:
# define embedding
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()


In [18]:
# 注意，需要先在自己的環境中建立資料庫路徑
persist_directory = './chroma_db' # 指定資料庫路徑
!rm -rf ./chroma_db  # remove old database files if any

In [19]:
# 建立新的向量資料庫，並將文件放進去
from langchain.vectorstores import Chroma
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [20]:
# 檢查剛剛的向量行數是否與塊數相等
print(vectordb._collection.count())


54


In [92]:
question = "can you explain the abstract in this article?"
ans_docs = vectordb.similarity_search(question,k=3)
print(len(ans_docs))
print(ans_docs[1].page_content)

3
. Through these investigations, we
contribute to a deeper understanding of how the
order of options affects LLMs’ decision-making in
multiple-choice questions (MCQ) and offer practi-
cal solutions, which go beyond simple bootstrap-
ping, to increase their robustness and accuracy in
such scenarios.
2 Background and Experimental Details
This paper focuses on the task of multiple-choice
question answering. In multiple-choice questions,
the objective is to identify the correct answer to a
given question from a set of possible options (an il-
lustration is presented in Figure 1). To address this
task using in-context learning models, we present
a prompt in the following format: “Choose the
answer to the question only from A, B, C, D,
and E choices. Question: {question}. Choices:
{options}. Answer:” to the models


In [21]:
# 手動儲存剛剛建立的資料庫
vectordb.persist() 

  vectordb.persist()


# 第四步測試：檢索

In [48]:
# define retriever 改成使用mmr
retriever = vectordb.as_retriever(
    search_type="mmr", 
    search_kwargs={
        "k": 3,
        "fetch_k":20
        }
    )
# question = "what is the conclusion of this paper?"
# docs_mmr = vectordb.max_marginal_relevance_search(question,k=1)
# docs_mmr[0].page_content

In [None]:
# # 檢索更多結果，看看有沒有定義
# question = "what are the conclusions of this paper?"
# docs_more = vectordb.similarity_search(question, k=5)

# print("🔍 檢索到的所有段落：")
# for i, doc in enumerate(docs_more):
#     print(f"\n--- Chunk {i+1} ---")
#     print(doc.page_content[:200] + "...")

# 第五步測試：回答

In [38]:
# 選擇模型
import datetime
current_date = datetime.datetime.now().date()
if current_date < datetime.date(2023, 9, 2):
    llm_name = "gpt-3.5-turbo-0301"
else:
    llm_name = "gpt-3.5-turbo"
print(llm_name)

# 初始化聊天機器人要用到的llm
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=llm_name, temperature=0)
llm.predict("Hello world!")



gpt-3.5-turbo


  llm = ChatOpenAI(model_name=llm_name, temperature=0)
  llm.predict("Hello world!")


'Hello! How can I assist you today?'

In [54]:
# 新項目：載入ConversationBufferMemory
# 此套件能讓問答機器人記住過往歷史問答
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history", # 告訴 chain 去哪裡找歷史對話
    return_messages=True, # 返回的是物件（長得像json or meta data）
    output_key="answer"
)

In [55]:
qa = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=llm_name, temperature=0), 
        chain_type="map_reduce", 
        retriever=retriever, 
        return_source_documents=True,
        return_generated_question=True,
        memory=memory
    )

In [58]:
question = "what is the conclusion of this paper?"
result = qa({"question": question})
print(result['answer'])

# 錯誤：llm找不到「結論」，debug

The text does not provide specific information about the conclusion of the paper.


In [11]:
# rag邏輯
def load_db(file, chain_type, k):
    # load documents
    loader = PyPDFLoader(file)
    documents = loader.load()
    # split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)
    # define embedding
    embeddings = OpenAIEmbeddings()
    # create vector database from data
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
    # define retriever
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    # create a chatbot chain. Memory is managed externally.
    qa = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=llm_name, temperature=0), 
        chain_type=chain_type, 
        retriever=retriever, 
        return_source_documents=True,
        return_generated_question=True,
    )
    return qa 


In [1]:
%pip install imutils

Defaulting to user installation because normal site-packages is not writeable
Collecting imutils
  Downloading imutils-0.5.4.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: imutils
  Building wheel for imutils (setup.py) ... [?25ldone
[?25h  Created wheel for imutils: filename=imutils-0.5.4-py3-none-any.whl size=25860 sha256=7b479d541b96342e75c3529427a9eea0c52bb150ce6c8188a74ef6c43f915ebd
  Stored in directory: /Users/mangtinglee/Library/Caches/pip/wheels/4b/a5/2d/4a070a801d3a3d93f033d3ee9728f470f514826e89952df3ea
Successfully built imutils
Installing collected packages: imutils
Successfully installed imutils-0.5.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you m

In [None]:
# 作用：管理對話流程、連接rag和操作介面
    # 例如：記住對話歷史、呼叫rag回答問題、整理答案給使用者看、提供輔助功能
import panel as pn
import param

class cbfs(param.Parameterized):
    chat_history = param.List([])
    answer = param.String("")
    db_query  = param.String("")
    db_response = param.List([])
    
    def __init__(self,  **params):
        super(cbfs, self).__init__( **params)
        self.panels = []
        self.loaded_file = "docs/cs229_lectures/MachineLearning-Lecture01.pdf"
        self.qa = load_db(self.loaded_file,"stuff", 4)
    
    def call_load_db(self, count):
        if count == 0 or file_input.value is None:  # init or no file specified :
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")
        else:
            file_input.save("temp.pdf")  # local copy
            self.loaded_file = file_input.filename
            button_load.button_style="outline"
            self.qa = load_db("temp.pdf", "stuff", 4)
            button_load.button_style="solid"
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)
        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.extend([(query, result["answer"])])
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer'] 
        self.panels.extend([
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, style={'background-color': '#F6F6F6'}))
        ])
        inp.value = ''  #clears loading indicator when cleared
        return pn.WidgetBox(*self.panels,scroll=True)

    @param.depends('db_query ', )
    def get_lquest(self):
        if not self.db_query :
            return pn.Column(
                pn.Row(pn.pane.Markdown(f"Last question to DB:", styles={'background-color': '#F6F6F6'})),
                pn.Row(pn.pane.Str("no DB accesses so far"))
            )
        return pn.Column(
            pn.Row(pn.pane.Markdown(f"DB query:", styles={'background-color': '#F6F6F6'})),
            pn.pane.Str(self.db_query )
        )

    @param.depends('db_response', )
    def get_sources(self):
        if not self.db_response:
            return 
        rlist=[pn.Row(pn.pane.Markdown(f"Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
        for doc in self.db_response:
            rlist.append(pn.Row(pn.pane.Str(doc)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    @param.depends('convchain', 'clr_history') 
    def get_chats(self):
        if not self.chat_history:
            return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
        rlist=[pn.Row(pn.pane.Markdown(f"Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
        for exchange in self.chat_history:
            rlist.append(pn.Row(pn.pane.Str(exchange)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self,count=0):
        self.chat_history = []
        return 


In [None]:
# 網頁介面設計
    # 單純提供視覺介面讓大家可以操作
cb = cbfs()

file_input = pn.widgets.FileInput(accept='.pdf')
button_load = pn.widgets.Button(name="Load DB", button_type='primary')
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')
button_clearhistory.on_click(cb.clr_history)
inp = pn.widgets.TextInput( placeholder='Enter text here…')

bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)
conversation = pn.bind(cb.convchain, inp) 

jpg_pane = pn.pane.Image( './img/convchain.jpg')

tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    pn.panel(conversation,  loading_indicator=True, height=300),
    pn.layout.Divider(),
)
tab2= pn.Column(
    pn.panel(cb.get_lquest),
    pn.layout.Divider(),
    pn.panel(cb.get_sources ),
)
tab3= pn.Column(
    pn.panel(cb.get_chats),
    pn.layout.Divider(),
)
tab4=pn.Column(
    pn.Row( file_input, button_load, bound_button_load),
    pn.Row( button_clearhistory, pn.pane.Markdown("Clears chat history. Can use to start a new topic" )),
    pn.layout.Divider(),
    pn.Row(jpg_pane.clone(width=400))
)
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),
    pn.Tabs(('Conversation', tab1), ('Database', tab2), ('Chat History', tab3),('Configure', tab4))
)
dashboard

In [None]:
# 1. 準備LLM標註函數
def classify_academic_sections(chunk_content, chunk_metadata):
    """使用LLM為chunk標註學術章節標籤"""
    
    prompt = f"""
    請分析以下學術論文片段，判斷它屬於哪個章節。
    可能的章節包括：Abstract, Introduction, Method, Results, Conclusion, References, Other
    
    如果內容橫跨兩個章節，請用逗號分隔，例如："Introduction,Method"
    
    內容：
    {chunk_content}
    
    請只回傳章節標籤，不要其他解釋。
    """
    
    # 調用你的LLM API
    response = your_llm_api_call(prompt)
    
    # 處理回應，確保格式正確
    sections = [s.strip() for s in response.split(',')]
    return sections

# 2. 為所有chunks添加章節標籤
def add_section_labels_to_chunks(docs):
    """為所有chunks添加章節標籤到metadata"""
    
    for i, doc in enumerate(docs):
        print(f"處理 chunk {i+1}/{len(docs)}...")
        
        # 獲取章節標籤
        section_labels = classify_academic_sections(
            doc.page_content, 
            doc.metadata
        )
        
        # 添加到metadata
        doc.metadata['sections'] = section_labels
        doc.metadata['primary_section'] = section_labels[0] if section_labels else 'Other'
        
        # 可選：添加其他有用的metadata
        doc.metadata['chunk_id'] = i
        doc.metadata['chunk_length'] = len(doc.page_content)
    
    return docs

# 3. 執行標註
labeled_docs = add_section_labels_to_chunks(docs)

# 4. 檢查結果
for i in range(min(5, len(labeled_docs))): # 這段什麼意思？小於等於5塊
    print(f"\n--- Chunk {i} ---")
    print(f"章節: {labeled_docs[i].metadata.get('sections')}")
    print(f"主要章節: {labeled_docs[i].metadata.get('primary_section')}")
    print(f"內容: {labeled_docs[i].page_content[:100]}...")