In [10]:
# 建立整體的ui介面，變成一個問答機器人
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

# 第一步測試：載入

In [11]:
'''
目的：建立langchain + openai 的基礎環境
'''
import os # 作業系統相關功能（讀取環境變數）
from openai import OpenAI # openai api 客戶端
from dotenv import load_dotenv, find_dotenv # dotenv 是專門用來讀取.env套件的套件，並接上環境
_ = load_dotenv(find_dotenv()) # 讀取.env檔案
client = OpenAI(
    api_key=os.environ['OPENAI_API_KEY']
)
print("done")

done


In [12]:
from dotenv import load_dotenv
import os

# 清除舊的環境變數
if 'OPENAI_API_KEY' in os.environ:
    del os.environ['OPENAI_API_KEY']

# 重新載入
load_dotenv()

api_key = os.environ.get('OPENAI_API_KEY')
if api_key:
    print("✅ Success! API key loaded")
    print(f"Key starts with: {api_key[:15]}...")
    print(f"Key length: {len(api_key)} characters")
else:
    print("❌ Still not working")

✅ Success! API key loaded
Key starts with: sk-proj-FWUUter...
Key length: 164 characters


In [13]:
# 檢查資料夾是否存在
import os
folder_path = "/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs"

print(f"Folder exists: {os.path.exists(folder_path)}")
# 看看有哪些PDF檔案
pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
print(f"Found {len(pdf_files)} PDF files:")
for pdf in pdf_files[:5]:  # 顯示前5個檔名
    print(f"  - {pdf}")

Folder exists: True
Found 24 PDF files:
  - 2023_LLM limitation_Large Language Models Sensitivity to The Order of Options in Multiple-Choice Questions.pdf
  - 2025_LLM limitation_The Order Effect- Investigating Prompt Sensitivity to Input Order in LLMs.pdf
  - 2024_RAG_Evaluation of Retrieval-Augmented Generation- A Survey.pdf
  - 2020_scaling laws_Scaling Laws for Neural Language Models.pdf
  - 2022_LLM limitation_Robustness of Learning from Task Instructions.pdf


2025/09/11 目前進度
- 已經完成所有過程跑過一遍
- 上週五（9/5）發現問題：問「結論」，llm回答「找不到結論」，但明明其中一章就是結論
- 判定問題：語意污染與雜訊過多，llm被不同章節的「結論干擾」，無法回應結論那一章的內容 → 需要定義好每一篇章節分類
- 9/5，原本打算用llm分類再放到每一塊中，實際執行困難，因為llm標注準確率較低
- 改為手動內容篩選，只保留每篇論文的第一頁，只想要 Abstract + Introduction開頭
- 後來改為只想要Abstract，並且手動建立一份ｃｓｖ meta data
- 9/11 進度規劃：
    - 載入檔案時，自動對應metadata
    - 重新跑一遍，並測試效果
    - 

In [16]:

# 1. 簡化版：只要第一頁
import pandas as pd

def load_metadata_mapping(csv_path):
    """讀取CSV檔案建立metadata對應表"""
    df = pd.read_csv(csv_path)
    # 將filename作為key，其他資訊作為value
    metadata_map = {}
    for _, row in df.iterrows():
        metadata_map[row['filename']] = {
            'title': row['title'],
            'year': row['year'],
            'authors': row['authors'],
            'topic': row['topic']
        }
    return metadata_map

def load_all_first_pages_with_csv_metadata(folder_path, csv_path):
    """載入PDF第一頁並從CSV對應metadata"""
    
    # 1. 先讀取metadata對應表
    metadata_map = load_metadata_mapping(csv_path)
    print(f"載入metadata對應表，共 {len(metadata_map)} 筆資料")
    
    all_first_pages = []
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    for pdf_file in pdf_files:
        print(f"處理：{pdf_file}")
        
        file_path = os.path.join(folder_path, pdf_file)
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        
        # 只取第一頁
        first_page_docs = filter_first_page_only(documents)
        
        # 🎯 關鍵：從CSV對應metadata
        for doc in first_page_docs:
            if pdf_file in metadata_map:
                # 找到對應的metadata
                doc.metadata.update(metadata_map[pdf_file])
                print(f"  ✅ 已更新metadata: {metadata_map[pdf_file]['title']}")
            else:
                # 找不到對應資料
                print(f"  ⚠️  警告：{pdf_file} 在CSV中找不到對應資料")
                doc.metadata.update({
                    'title': pdf_file.replace('.pdf', ''),
                    'year': None,
                    'authors': 'Unknown',
                    'topic': 'Unknown'
                })
        
        all_first_pages.extend(first_page_docs)
    
    print(f"總共載入 {len(all_first_pages)} 個第一頁，metadata已更新")
    return all_first_pages



def filter_first_page_only(documents):
    """只保留第一頁（Abstract）"""
    first_page_docs = [doc for doc in documents if doc.metadata['page'] == 0]
    print(f"原始頁數：{len(documents)} → 只保留第一頁：{len(first_page_docs)} 頁")
    return first_page_docs


# 執行
csv_path = "/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/meta_data_correction.csv"

# 載入
all_abstracts_with_metadata = load_all_first_pages_with_csv_metadata(folder_path, csv_path)

# 檢查結果
for i, doc in enumerate(all_abstracts_with_metadata[:3]):
    print(f"\n--- Document {i+1} ---")
    print(f"Title: {doc.metadata.get('title', 'N/A')}")
    print(f"Year: {doc.metadata.get('year', 'N/A')}")
    print(f"Authors: {doc.metadata.get('authors', 'N/A')}")
    print(f"Topic: {doc.metadata.get('topic', 'N/A')}")

KeyError: 'title'

# 第二步測試：分割檔案

In [8]:
# 測試2. 分割檔案
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=50,
    separators=[ "\n\n", ". ", "\n", "(?<=\. )", " ", ""]
    )    
docs = text_splitter.split_documents(all_abstracts)
print(len(docs)) # chunks
print(len(all_abstracts)) # test 11 pages

116
24


In [50]:
print(docs[10].metadata)

{'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-05-12T00:55:20+00:00', 'author': '', 'keywords': '', 'moddate': '2025-05-12T00:55:20+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/Users/mangtinglee/Desktop/2025_gap_careerpath/RAG_LLM/pdfs/2025_LLM limitation_The Order Effect- Investigating Prompt Sensitivity to Input Order in LLMs.pdf', 'total_pages': 13, 'page': 0, 'page_label': '1'}


所以我的理解是這一大段落其實是先抓取原始documents（已經過loader但還沒splits的檔案），先用正則辨識出大概的標題section_headers，在用llm去辨識section_headers屬於哪種分類嗎？
因為我的檔案有些標題無法辨識他是研究結果或方法，例如5 Calibrating LLMs for MCQ Tasks，所以我的想法是，我們是否到時候可以直接先分塊，假設分了54塊，我們再請llm讀取54塊，然後讓他貼標（複數）？

# 第三步測試：embedding 並放入資料庫

In [51]:
# 建立資料庫路徑，已有路徑則可忽略
import os

# 建立資料夾
os.makedirs('./chroma_db', exist_ok=True)
print("finish！")

# 檢查是否成功
print(f"資料夾存在嗎？{os.path.exists('./chroma_db')}")

finish！
資料夾存在嗎？True


In [52]:
# define embedding
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()


In [53]:
# 注意，需要先在自己的環境中建立資料庫路徑
persist_directory = './chroma_db' # 指定資料庫路徑
!rm -rf ./chroma_db  # remove old database files if any

In [54]:
# 建立新的向量資料庫，並將文件放進去
from langchain.vectorstores import Chroma
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [59]:
question = "can you explain llm limitation?"
ans_docs = vectordb.similarity_search(question,k=3)
print(len(ans_docs))
print(ans_docs[2].page_content)

3
. Many of these issues, such as bias (Talat et al., 2022; Motoki
et al., 2023), hallucination (Chen et al., 2023; Sadat et al., 2023), consistency (Tam et al., 2023; Ye
et al., 2023), and reliability (Shen et al., 2023b) have been extensively discussed in the literature.
However, a more fundamental challenge to the long-term success of LLMs is their ability to reason:
the distinguishing factor between probabilistic pattern matching and logical understanding. This
distinction has significant implications for the future of LLMs and how we employ these models in
decision-making.
One necessary requirement for reasoning is order independence. A model should provide the same
consistent response to a query regardless of the order of its content. Historically, LLMs have strug-
gled with this issue. Swapping subsequences within semantically identical inputs often leads to
significant changes in output, a problem that worsens as inputs grow in size and complexity (He
et al., 2024)


In [60]:
# 手動儲存剛剛建立的資料庫
vectordb.persist() 

  vectordb.persist()


In [95]:
# define retriever 改成使用mmr
retriever = vectordb.as_retriever(
    search_type="mmr", 
    search_kwargs={
        "k": 3,
        "fetch_k":20
        }
    )
question = "is there any author named Jean Kaddour ?"
docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)
docs_mmr[2].page_content

'The Prompt Report: A Systematic Survey of Prompt Engineering\nTechniques\nSander Schulhoff1,2∗ Michael Ilie1∗Nishant Balepur1 Konstantine Kahadze1\nAmanda Liu1 Chenglei Si4 Yinheng Li5 Aayush Gupta1 HyoJung Han1 Sevien Schulhoff1\nPranav Sandeep Dulepet1 Saurav Vidyadhara1 Dayeon Ki1 Sweta Agrawal12 Chau Pham13\nGerson Kroiz Feileen Li 1 Hudson Tao1 Ashay Srivastava1 Hevander Da Costa1 Saloni Gupta1\nMegan L'

In [92]:
docs_mmr[1].page_content

'SPANISH AND LLM B ENCHMARKS : IS MMLU L OST IN\nTRANSLATION ?\nA PREPRINT\nIrene Plaza, Nina Melero, Cristina del Pozo, Javier Conde and Pedro Reviriego\nETSI de Telecomunicación\nUniversidad Politécnica de Madrid\n28040 Madrid, Spain\nMarina Mayor-Rocher\nFacultad de Filosofía y Letras\nUniversidad Autónoma de Madrid\n28049 Madrid, Spain\nMaría Grandury\nSomosNLP\n24402, Ponferrada, Spain\nJune 27, 2024\nABSTRACT\nThe evaluation of Large Language Models (LLMs) is a key element in their continuous improvement\nprocess and many benchmarks have been developed to assess the performance of LLMs in different\ntasks and topics. As LLMs become adopted worldwide, evaluating them in languages other than\nEnglish is increasingly important. However, most LLM benchmarks are simply translated using\nan automated tool and then run in the target language. This means that the results depend not\nonly on the LLM performance in that language but also on the quality of the translation'

In [93]:
docs_mmr[2].page_content

'. . . . . . 36\n3.3 Computer Programming . . . . . 37\n*Equal contribution.\n†{jean.kaddour,robert.mchardy}.20@ucl.ac.uk,\njoshua.harris@ukhsa.gov.uk\nDesign\nUnfathomable \nDatasets, \nTokenizer-Reliance,\nFine-Tuning \nOverhead\nScience\n \nEvaluations \nBased \non \nStatic \nHuman-Written \nGround \nTruth,\nLacking \nExperimental \nDesigns,\nLack \nof \nReproducibility\nBehavior\nPrompt \nBrittleness, \nMisaligned \nBehavior,\nOutdated \nKnowledge\nDetecting \nGenerated \nTexts, \nBrittle \nEvaluations\nHigh \nPre-Training \nCosts\nHigh \nInference \nLatency, \nLimited \nContext \nLength, \nHallucinations\nTasks \nNot \nSolvable\nBy \nScale\nFigure 1: Overview of LLM Challenges. Designing\nLLMs relates to decisions taken before deployment. Be-\nhaviorial challenges occur during deployment. Science\nchallenges hinder academic progress.\n3.4 Creative Work . . . . . . . . . . . 39\n3.5 Knowledge Work . . . . . . . . . 40\n3.6 Law . . . . . . . . . . . . . . . . 42\n3.7 Medicine . . . 

In [67]:
# 選擇模型
import datetime
current_date = datetime.datetime.now().date()
if current_date < datetime.date(2023, 9, 2):
    llm_name = "gpt-3.5-turbo-0301"
else:
    llm_name = "gpt-3.5-turbo"
print(llm_name)

# 初始化聊天機器人要用到的llm
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=llm_name, temperature=0)
llm.predict("Hello world!")



gpt-3.5-turbo


  llm = ChatOpenAI(model_name=llm_name, temperature=0)
  llm.predict("Hello world!")


'Hello! How can I assist you today?'

In [68]:
# 新項目：載入ConversationBufferMemory
# 此套件能讓問答機器人記住過往歷史問答
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history", # 告訴 chain 去哪裡找歷史對話
    return_messages=True, # 返回的是物件（長得像json or meta data）
    output_key="answer"
)

  memory = ConversationBufferMemory(


In [69]:
qa = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=llm_name, temperature=0), 
        chain_type="map_reduce", 
        retriever=retriever, 
        return_source_documents=True,
        return_generated_question=True,
        memory=memory
    )

In [84]:
question = "What evaluation methods are used for LLMs?"
result = qa({"question": question})
print(result['answer'])


Evaluation methods for Large Language Models (LLMs) include using benchmarks to assess performance in tasks such as common sense reasoning problems and mathematical questions. Some benchmarks evaluate multiple tasks to provide a more comprehensive evaluation of LLM capabilities. However, evaluations for LLMs are based on static human-written ground truth, which has been criticized for lacking experimental designs and reproducibility. Additionally, evaluations suffer from prompt brittleness, misaligned behavior, outdated knowledge, and challenges in detecting generated texts and conducting evaluations due to various factors like high pre-training costs, high inference latency, limited context length, and hallucinations.


In [90]:
question = "can you show me the reference's name of those three llm's limitations you gave me before?"
result = qa({"question": question})
print(result['answer'])

The names of the references for the limitations of Large Language Models (LLMs) mentioned earlier are Jean Kaddour, Joshua Harris, Maximilian Mozes, Herbie Bradley, Roberta Raileanu, Robert McHardy, Zhao et al., 2021, and Wang et al., 2023b.
