In [None]:
import os

In [None]:
os.chdir("../../../")

In [None]:
import json
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import chain
from langchain.retrievers import BM25Retriever
from langchain.docstore.document import Document

from src.initialization import credential_init
from src.io.path_definition import get_project_dir


# Quickly build a ChatPromptTemplate from a plain dict specification
def build_standard_chat_prompt_template(kwargs):
    """Assemble a ChatPromptTemplate from per-role PromptTemplate arguments.

    Parameters:
    kwargs (dict): May contain the keys 'system' and/or 'human'. Each value is
        a dict that is unpacked into ``PromptTemplate(**value)``.

    Returns:
    ChatPromptTemplate: The system message (if given) followed by the human
        message (if given). Keys other than 'system' and 'human' are ignored.
    """
    # Order matters: the system message always precedes the human message.
    role_to_message_cls = (
        ("system", SystemMessagePromptTemplate),
        ("human", HumanMessagePromptTemplate),
    )

    messages = [
        message_cls(prompt=PromptTemplate(**kwargs[role]))
        for role, message_cls in role_to_message_cls
        if role in kwargs
    ]

    return ChatPromptTemplate.from_messages(messages)


# Quickly build a prompt -> model (-> parser) pipeline
def build_pipeline(model, inputs, parser=None):
    """Compose a chat prompt, a model and an optional output parser.

    Parameters:
    model: The runnable placed after the prompt.
    inputs (dict): Prompt specification passed to
        build_standard_chat_prompt_template.
    parser: Optional runnable appended after the model when truthy.

    Returns:
    The composed ``prompt | model`` pipeline, with ``| parser`` appended when
    a parser is supplied.
    """
    pipeline = build_standard_chat_prompt_template(inputs) | model
    return pipeline | parser if parser else pipeline


# Must run before the model is built: the API key is read from os.environ below
# (credential_init presumably populates it -- confirm in src.initialization).
credential_init()

model = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0,
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

# RAG N-Shot

> 🎯 **本章學完你將能學會什麼：**
> - 了解 RAG (Retrieval-Augmented Generation) 與 N-Shot Prompting 的概念與應用
> - 學會如何讓模型「學風格」──從示例中提取風格特徵並以相同風格生成內容
> - 掌握 分類任務 的不同策略（Zero-Shot、Few-Shot、LLM-based Classification）
> - 能實作 詩詞分類、飛安報告分類 等多樣化應用
> - 了解 非同步資料處理流程 (Async + LCEL)，提升資料擷取與分析效能
> - 學會從 網頁、Word、PDF 中提取文字資料並轉化為可用內容
> - 能建立一個 AI 招募匹配系統：從網頁抓取職缺 → 提取履歷 → 分析 → 自動配對

> 📘 最終你將具備的能力：
> - 能夠獨立設計並實作一個整合檢索、風格模仿、分類與匹配的 AI 自動化工作流，
> - 讓模型不只回答問題，更能以特定風格「思考、生成與決策」。

在 RAG (Retrieval-Augmented Generation) 中，我們不只可以讓模型檢索資料庫來回答問題，還可以透過 N-Shot 提示 (N-Shot Prompting) 的方式，讓模型學習「風格」。

這裡的 N，代表你給模型幾個示例 (Examples)。

1-Shot：只給一個示例，模型會模仿該風格來生成。

Few-Shot (N-Shot)：給多個示例，模型會歸納出共同的風格特徵。

0-Shot：完全沒有示例，模型只能靠內建知識來生成。

透過這種方式，我們可以讓模型不只是「回答問題」，而是「用指定風格來回答問題」。

老樣子，選擇風格鮮明的例子。

## 風格學習: 掄語

In [None]:
# Few-shot corpus. Each row is [source sentence, stylised interpretation];
# row[0] becomes the document text and row[1] is stored under the "翻譯"
# metadata key when the documents are built.
data = [["人不知，而不愠，不亦君子乎", "有人不知道我的大名，可我還沒發怒，這已經很君子了。"], 
        ["君子不重則不威", "君主打人一定要下重手，不然就樹立不了威信。"],
        ["君子愛財，取之有道", "我喜歡錢，所以拿走你的錢，這是很有道理的。"],
        ["既來之，則安之。", "既然來到了這裡，那麼就安葬在這裡吧。"],
        ["子不語怪力亂神。", "夫子不想說話，施展起怪力將人打的神志不清。"],
        ["不義而富且貴，於我如浮雲。", "不正當的錢財，對我來說猶如浮雲一般多。"],
        ["朝聞道，夕可死矣。", "早上聽到我來了，晚上你就得死。"],
        ["三人行，必有我師焉。", "有三個人，只要其中有一個是我，戰力就相當於一個師。"],
        ["凡事豫則立，不豫則廢。", "但凡打架，只要猶豫，對面便站起來了。不猶豫就能直接將對面打廢。"],
        ["孔子東遊，見兩小兒辯日。", "孔子去東邊打架，小孩在討論和孔子打架的人還能不能見到明天的太陽。"],
        ["父母在，不遠遊，遊必有方。", "你父母在我手裡，你跑不了的，就算你跑了，我也有辦法把你抓回來。"],
        ["始作傭者，其無後乎。", "這件事的主謀，已經被我打的絕後了。"],
        ["鬼神敬而遠之", "孔子一旦發威，連鬼神見了都得敬畏的遠離他。"],
        ["力不足者，中道而廢。", "力量不如我的人，在道上就只能被我打廢。"],
        ["不恥下問", "看到我不自愧者，你就去下面問問。"],
        ["三年無改於父之道，可謂孝矣。", "三年不該認我當父親的習慣，可以算作孝了。"],
        ["人之將死，其言也善。", "把人打到瀕死，說的話也就好聽了。"],
        ["知之為知之，不知為不知，是知也。", "該知道的知道，不該知道的少知道，知道嗎?"],
        ["有教無類", "我在教你做事情，不管你是什麼人"],
        ["子在川上曰: 逝者如斯夫，不捨晝夜。", "夫子站在河上說:死的人這麼多，是因為我不分晝夜地打人。"]]

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

# One Document per pair: the source sentence is the searchable text and the
# stylised interpretation travels along in the metadata.
documents = [
    Document(page_content=source_text, metadata={"翻譯": interpretation})
    for source_text, interpretation in data
]

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

In [None]:
# QdrantVectorStore and RetrievalMode are used below but were not imported
# anywhere earlier in the notebook, so this cell failed with NameError on a
# fresh kernel.
from langchain_qdrant import QdrantVectorStore, RetrievalMode

# Local on-disk Qdrant instance (no server needed).
client = QdrantClient(path="/tmp/langchain_qdrant")

collection_name = "掄語"
# The vector size must match the embedding model's output dimension.
dimension = embeddings.client.get_sentence_embedding_dimension()
vectors_config = VectorParams(size=dimension, distance=Distance.COSINE)

try:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vectors_config,
    )
except ValueError:
    # Presumably raised when the collection is left over from an earlier run:
    # drop it and start from an empty collection so re-running this cell does
    # not accumulate duplicate documents.
    client.delete_collection(collection_name=collection_name)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vectors_config,
    )

vectorstore_QVS = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
    retrieval_mode=RetrievalMode.DENSE,
)

vectorstore_QVS.add_documents(documents=documents)

In [None]:
user_query = "其為人也孝弟，而好犯上者，鮮矣"

In [None]:
# Similarity search over the vector store, returning the 3 closest documents.
retriever_QVS = vectorstore_QVS.as_retriever(search_type="similarity", search_kwargs={"k": 3})
retriever_QVS.invoke(user_query)

In [None]:
from textwrap import dedent

# System message: carries the retrieved examples in the {context} slot.
system_template = dedent("""
You are a helpful AI assistant and you will help us interpret the content based on the style of the examples:

{context}
""")

# Human message: the sentence to be interpreted.
human_template = dedent("""
{query}
""")

# Specification consumed by build_standard_chat_prompt_template: one dict of
# PromptTemplate arguments per role.
input_ = {"system": {"template": system_template,
                     "input_variables": ['context']},
          "human": {"template": human_template,
                    "input_variables": ["query"],
                    }}

chat_prompt_template = build_standard_chat_prompt_template(input_)

In [None]:
retrieved_documents = retriever_QVS.invoke(user_query)

In [None]:
# Render every retrieved pair as a "Human / AI" exchange so the model sees
# the examples as few-shot demonstrations.
context = []
for document in retrieved_documents:
    context.append(f"Human: {document.page_content}\nAI: {document.metadata['翻譯']}")

print("\n\n".join(context))

In [None]:
# Join the formatted examples into the single string that fills {context}.
merged_context = "\n\n".join(context)

chat_prompt = chat_prompt_template.invoke({"query": user_query, "context": merged_context})
model.invoke(chat_prompt)

味道不太對...我們有辦法強化生成嗎?

In [None]:
model.invoke(f"Help us analyze the style:\n{merged_context} and reply in traditional Chinese(繁體中文)")

In [None]:
from pydantic import BaseModel, Field
from typing import Literal

from langchain_core.output_parsers import PydanticOutputParser

# Structured-output schema for the style analysis. (Documented with comments
# rather than a docstring so the JSON schema sent to the model is unchanged.)
class requirements(BaseModel):

    style: str = Field(description="The underlying style shown in the content. The output shall be in traditional Chinese (繁體中文).")

output_parser = PydanticOutputParser(pydantic_object=requirements)
format_instructions = output_parser.get_format_instructions()

human_template = dedent("""
                 {query}
                 format instruction: {format_instructions}
                 """)

# The key is "input_variables" (plural), as in every other prompt spec in this
# notebook; the original "input_variable" was a typo and did not name a
# PromptTemplate argument.
input_ = {"human": {"template": human_template,
                    "input_variables": ["query"],
                    "partial_variables": {"format_instructions": 
                                          format_instructions}}}

chat_prompt_template = build_standard_chat_prompt_template(input_)

prompt = chat_prompt_template.invoke({"query": f"Help us analyze the style:\n{merged_context}"})

In [None]:
output = model.invoke(prompt)

In [None]:
output_parser.parse(output.content)

### 方法:

1. 使用檢索器檢索相關訊息
2. 根據檢索出來的內容抽取風格

In [None]:
# `query` is not assigned until a later cell, so displaying it here raised
# NameError on a fresh kernel. Bind it to the running example so this cell and
# the cells below that pass `query` work under Restart & Run All.
query = user_query
query

In [None]:
from operator import itemgetter

from langchain_core.runnables import RunnablePassthrough, chain, RunnableLambda

@chain
def document_2_context(documents):
    """Format retrieved documents as few-shot "Human / AI" examples.

    Parameters:
    documents: Iterable of documents exposing ``page_content`` and a "翻譯"
        entry in ``metadata``.

    Returns:
    str: One "Human: ...\\nAI: ..." pair per document, separated by blank lines.
    """
    examples = []
    for doc in documents:
        examples.append(f"Human: {doc.page_content}\nAI: {doc.metadata['翻譯']}")

    return "\n\n".join(examples)

# Take the "query" value from the input dict, retrieve similar documents, then
# format them into a single few-shot context string.
context_extraction_pipeline = itemgetter("query")|retriever_QVS|document_2_context

# Retrieval
# context_extraction_pipeline.invoke({"query": query})

In [None]:
human_template = dedent("""
                 Help us analyze the style of interpretation shown in the text: 
                 
                 {context}
                 
                 format instruction: {format_instructions}
                 """)

# {format_instructions} is pre-filled as a partial variable, so the only value
# supplied at invoke time is {context}.
input_ = {"human": {"template": human_template,
                    "input_variables": ["context"],
                    "partial_variables": {"format_instructions": 
                                          format_instructions}}}

style_prompt_template = build_standard_chat_prompt_template(input_)

# style_pipeline = RunnablePassthrough.assign(context=itemgetter("query")|retriever_QVS|document_2_context)
# print(style_pipeline.invoke({"query": query}))

In [None]:
from langchain_core.runnables import RunnableLambda

# Keep the incoming dict and add a "context" key computed from its "query".
style_pipeline = RunnablePassthrough.assign(context=context_extraction_pipeline)
# user_query is the example defined above; `query` is only assigned in a later
# cell and is undefined here on a fresh kernel.
style_pipeline.invoke({"query": user_query})

In [None]:
# Prompt -> model -> parsed `requirements` object -> its `style` string.
style_extraction_pipeline = style_prompt_template|model|output_parser|RunnableLambda(lambda x: x.style)

# Step 1 adds "context" (retrieved examples); step 2 adds "style" (the style
# extracted from that context). The original "query" is passed through.
style_pipeline = RunnablePassthrough.assign(context=context_extraction_pipeline)|RunnablePassthrough.assign(style=style_extraction_pipeline)
# user_query is the example defined above; `query` is only assigned in a later
# cell and is undefined here on a fresh kernel.
style_pipeline.invoke({"query": user_query})

#### 現在我們看到風格可以被提取出來了

In [None]:
# System message: the extracted style description steers the interpretation.
system_template = dedent("""
You are a helpful AI assistant and you will help us interpret the user query with this style:
{style}
""")

# Human message: the retrieved examples followed by the sentence to interpret.
human_template = dedent("""
Examples:
{context}

query: {query}
""")

input_ = {"system": {"template": system_template,
                     "input_variables": ['style']},
          "human": {"template": human_template,
                    "input_variables": ["query", "context"],
                    }}



generate_prompt_template = build_standard_chat_prompt_template(input_)

# style_pipeline yields a dict with query, context and style, which are exactly
# the variables the generation prompt needs.
generate_pipeline = style_pipeline|generate_prompt_template|model

print(generate_pipeline.invoke({"query": user_query}))

In [None]:
query = "有顏回者好學，不遷怒，不貳過。不幸短命死矣！"

# Reuse generate_pipeline from the cell above: `input_` has not changed, so
# rebuilding the prompt template and pipeline here only recreated identical
# objects.
print(generate_pipeline.invoke({"query": query}))

In [None]:
query = "子路曰：「衛君待子而為政，子將奚先？」子曰：「必也正名乎！」"

# Reuse generate_pipeline built earlier: `input_` has not changed, so
# rebuilding the prompt template and pipeline here only recreated identical
# objects.
print(generate_pipeline.invoke({"query": query}))

# 分類任務

在機器學習中，分類主要有兩大類：

- 情感分類 (Sentiment Classification)：判斷文本所表達的情感，例如「正面」「中立」「負面」。

- 主題/類別分類 (Topic/Categorical Classification)：將文本歸到特定類別，例如「新聞 → 體育 / 政治 / 財經」。

## 工作原理:

分類模型通常透過 監督學習 (Supervised Learning) 訓練而成。

- 數據與標籤：我們有輸入數據（例如一段文字）和對應標籤（例如「正面」）。

- 學習過程：模型反覆從「題目—答案」對中學習，逐漸掌握輸入與輸出之間的規律。

這更像是學會「抓模式」而不是「死背答案」。

## 挑戰

監督學習分類的實際挑戰包括：

- 耗時：資料標註需要大量人工投入。

- 不一致：不同標註者可能對相同數據有不同理解。

- 昂貴：大規模數據收集與標註成本高。

- 資源密集：模型訓練往往需要強大硬體或雲端服務（AWS、Azure 等）。

- 運營成本：模型在雲端持續運行也需要付出高額費用。

## 大型語言模型 (LLM) 的應用

近年的 大型語言模型（LLM，如 GPT-3、GPT-4） 提供了新的分類方式：

- 少樣本 / 零樣本分類：只需少量範例，甚至僅靠提示，就能完成分類任務。

- 降低標註需求：不必建立龐大的人工標註資料集。

- 可微調：仍可透過微調 (fine-tuning) 針對特定分類任務強化效果。

這使得分類不再完全依賴傳統的監督學習流程，降低了成本與時間。

## 0-Shot

In [None]:
from IPython.display import Image

Image(url="https://substackcdn.com/image/fetch/w_1456,c_limit,f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fb157d0e9-d18e-4835-a601-edeb011f0ee6_721x247.png")

## 飛安事故原因分類

In [None]:
import pandas as pd

df = pd.read_excel(os.path.join(get_project_dir(), 'tutorial', 'LLM+Langchain', 'Week-3', 'Data sample.xlsx'))

In [None]:
df.head(5)

In [None]:
df.iloc[5]

In [None]:
from IPython.display import Image

Image(filename= "tutorial/LLM+Langchain/Week-3/HFACS_Org_Inf.jpg")

In [None]:
system_template = dedent("""
                  You are an AI assistant assigned with a task of safety report 
                  classification based on the content. You are a seasoned 
                  flight safety inspector with deep and extensive knowledge of 
                  aviation safty. 
    
                  You always do the best work you can. You are highly 
                  analytical and pay close attention to details. 
    
                  The candidates of the output are:

                  - `Organizational Influence;Resource Management`
                  - `Organizational Influence;Organizational Climate`
                  - `Organizational Influence;Organizational Process`
                  - `Unsafe Supervisions;Inadequate Supervision`
                  - `Unsafe Supervisions;Planned Inappropriate Operations`
                  - `Unsafe Supervisions;Failed to Correct Problem`
                  - `Unsafe Supervisions;Supervisory Violation`
                  - `Precondition for Unsafe Acts;Environmental Factors;Physical Environment`
                  - `Precondition for Unsafe Acts;Environmental Factors;Technological Environment`
                  - `Precondition for Unsafe Acts;Condition of Operators;Adverse Mental State`
                  - `Precondition for Unsafe Acts;Condition of Operators;Adverse Physiological State`
                  - `Precondition for Unsafe Acts;Condition of Operators;Physical/Mental Limitations`
                  - `Precondition for Unsafe Acts;Personnel Factors;Crew Resource Management`
                  - `Precondition for Unsafe Acts;Personnel Factors;Personal Readiness`
                  - `Unsafe Acts;Errors;Decision Errors`
                  - `Unsafe Acts;Errors;Skill-Based Errors`
                  - `Unsafe Acts;Errors;Perceptual Errors`
                  - `Unsafe Acts;Violations;Routine`
                  - `Unsafe Acts;Violations;Exceptional`
            
                 The output is from one of the candidates. 
                 """)

# The human message is the raw report text.
human_template = "{report}"

# The system prompt has no placeholders, so only "template" is given for it.
input_ = {"system": {"template": system_template},
          "human": {"template": human_template,
                    "input_variables": ["report"]}}

# Zero-shot classifier: prompt -> model -> plain string.
pipeline_ = build_pipeline(model=model, inputs=input_, parser=StrOutputParser())

# chat_prompt_template = build_standard_chat_prompt_template(input_)

# pipeline_ = chat_prompt_template | model | StrOutputParser()

In [None]:
text = df.iloc[3]['Report 1']

print(text)

- 可以想一下，當一天有上百份這種報告的時候，你想要自己閱讀報告得出結論或是將這件工作外包給機器。

In [None]:
output = pipeline_.invoke({"report": text})

print(output)

### 使用parser精煉結果

In [None]:
# Single-field schema: the structured parser looks for one "category" entry.
category_description = dedent("""
                                   The predicted category of the classification
                                   """)
response_schemas = [ResponseSchema(name="category", description=category_description)]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

system_template = dedent("""
                  You are an AI assistant assigned with a task of safety report 
                  classification based on the content. You are a seasoned 
                  flight safety inspector with deep and extensive knowledge of 
                  aviation safty. 
    
                  You always do the best work you can. You are highly 
                  analytical and pay close attention to details. 
    
                  The candidates of the output are:

                  - `Organizational Influence;Resource Management`
                  - `Organizational Influence;Organizational Climate`
                  - `Organizational Influence;Organizational Process`
                  - `Unsafe Supervisions;Inadequate Supervision`
                  - `Unsafe Supervisions;Planned Inappropriate Operations`
                  - `Unsafe Supervisions;Failed to Correct Problem`
                  - `Unsafe Supervisions;Supervisory Violation`
                  - `Precondition for Unsafe Acts;Environmental Factors;Physical Environment`
                  - `Precondition for Unsafe Acts;Environmental Factors;Technological Environment`
                  - `Precondition for Unsafe Acts;Condition of Operators;Adverse Mental State`
                  - `Precondition for Unsafe Acts;Condition of Operators;Adverse Physiological State`
                  - `Precondition for Unsafe Acts;Condition of Operators;Physical/Mental Limitations`
                  - `Precondition for Unsafe Acts;Personnel Factors;Crew Resource Management`
                  - `Precondition for Unsafe Acts;Personnel Factors;Personal Readiness`
                  - `Unsafe Acts;Errors;Decision Errors`
                  - `Unsafe Acts;Errors;Skill-Based Errors`
                  - `Unsafe Acts;Errors;Perceptual Errors`
                  - `Unsafe Acts;Violations;Routine`
                  - `Unsafe Acts;Violations;Exceptional`
            
                 The output is from one of the candidates. 
                 """)

human_template = """
                 {report}; 
                 format instruction: {format_instructions}
                 """


input_ = {"system": {"template": system_template},
          "human": {"template": human_template,
                    "input_variables": ["report"],
                    "partial_variables": {'format_instructions': format_instructions}}}

# Parse with the structured output_parser whose format instructions are injected
# into the prompt above. The original passed StrOutputParser(), which returned
# the raw model text and never applied the parser this section is about.
pipeline_ = build_pipeline(model=model, inputs=input_, parser=output_parser)

# Equivalent explicit form:
# chat_prompt_template = build_standard_chat_prompt_template(input_)

# pipeline_ = chat_prompt_template|model|output_parser

In [None]:
output = pipeline_.invoke({"report": text})

In [None]:
output

## 詩詞分類

把唐詩宋詞拿出來回鍋利用

有沒有一種高中時該好好學習的感覺?

In [None]:
import pandas as pd

# Read the whole anthology file
filename = os.path.join("tutorial", "LLM+Langchain", "Week-1", "唐詩三百首.txt")
with open(filename, "r", encoding="utf-8") as f:
    text = f.read()

# Labels that may open a line inside a poem block, written as "<label>:<value>".
poem_fields = ("詩名", "作者", "詩體", "詩文")

poems = []

# Poems are separated by blank lines
blocks = [b.strip() for b in text.strip().split("\n\n") if b.strip()]

for block in blocks:
    entry = {}
    for line in block.split("\n"):
        for field in poem_fields:
            prefix = f"{field}:"
            if line.startswith(prefix):
                # Slice off only the leading label. str.replace would also
                # delete any later occurrence of the label inside the value.
                entry[field] = line[len(prefix):].strip()
                break
    # Blocks that contain none of the labels are dropped.
    if entry:
        poems.append(entry)

df_poem = pd.DataFrame(poems)

In [None]:
# Classification schema: the answer must be exactly one of the seven forms.
# (Documented with comments rather than a docstring so the JSON schema sent to
# the model is unchanged.)
class Output(BaseModel):

    name: Literal['五言古詩', '七言古詩', '七言律詩', 
                  '五言絕句', '樂府', '七言絕句', '五言律詩'] = Field(description="唐詩詩體")

# Parse against Output, the schema defined just above. The original passed
# `requirements` (the style-analysis schema from the earlier section), so the
# model was asked for a free-text "style" instead of a poem form.
output_parser = PydanticOutputParser(pydantic_object=Output)

format_instructions = output_parser.get_format_instructions()


system_template = dedent("""
                  You are a help AI assistant specialized at Chinese literature, especially the 唐詩。
                  You are assigned with a task of classify the given poem.
                  The outcomes will be from one of these candidates:

                  - 五言古詩
                  - 七言古詩
                  - 七言律詩
                  - 五言絕句
                  - 樂府
                  - 七言絕句
                  - 五言律詩
                  """)

human_template = dedent("""
                 poem: {query}
                 format instruction: {format_instructions}
                 """)

# The system prompt has no placeholders; the human prompt takes {query} at
# invoke time and has {format_instructions} pre-filled.
input_ = {"system": {"template": system_template},
          "human": {"template": human_template,
                    "input_variables": ["query"],
                    "partial_variables": {"format_instructions": 
                                          format_instructions}}}

# prompt -> model -> output_parser
classification_pipeline = build_pipeline(model=model, inputs=input_, parser=output_parser)

# classification_prompt_template = build_standard_chat_prompt_template(input_)

# classification_pipeline = classification_prompt_template|model|output_parser

In [None]:
df_poem[df_poem['作者']=='李白']

In [None]:
query = df_poem.loc[89]['詩文']

In [None]:
query

In [None]:
classification_pipeline.invoke({"query": query})

In [None]:
query = dedent("""
趙客縵胡纓，吳鉤霜雪明；
銀鞍照白馬，颯沓如流星。
十步殺一人，千里不留行；
事了拂衣去，深藏身與名。
閑過信陵飲，脫劒膝前橫；
將炙啖朱亥，持觴勸侯嬴。
三杯吐然諾，五嶽倒爲輕；
眼花耳熱後，意氣素霓生。
救趙揮金槌，邯鄲先震驚；
千秋二壯士，烜赫大梁城。
縱死俠骨香，不慙世上英；
誰能書閤下，白首太玄經。
""")

classification_pipeline.invoke({"query": query})

練習：請修改 RAG 模型，讓它能模仿李白詩風回答現代問題。

### 挑戰宋詞

In [None]:
# Read the whole anthology file
filename = os.path.join("tutorial", "LLM+Langchain", "Week-1", "宋詞三百首.txt")
with open(filename, "r", encoding="utf-8") as f:
    text = f.read()

# Labels that may open a line inside a block, written as "<label>:<value>".
ci_fields = ("詞牌", "作者", "詞文")

poems = []

# Entries are separated by blank lines
blocks = [b.strip() for b in text.strip().split("\n\n") if b.strip()]

for block in blocks:
    entry = {}
    for line in block.split("\n"):
        for field in ci_fields:
            prefix = f"{field}:"
            if line.startswith(prefix):
                # Slice off only the leading label. str.replace would also
                # delete any later occurrence of the label inside the value.
                entry[field] = line[len(prefix):].strip()
                break
    # Blocks that contain none of the labels are dropped.
    if entry:
        poems.append(entry)

df_poem = pd.DataFrame(data=poems)

In [None]:
# Distinct 詞牌 names present in the data; these are the candidate classes.
values = df_poem['詞牌'].dropna().unique().tolist()

# Source-text form of the Literal type; kept because a later cell reads it.
literal_def = f"Literal[{', '.join(repr(v) for v in values)}]"

# Show the resulting type. Subscripting Literal with a tuple builds the same
# type as eval(literal_def) without executing text assembled from file content.
Literal[tuple(values)]

In [None]:
# Classification schema: the answer must be one of the 詞牌 names in `values`.
# (Documented with comments rather than a docstring so the JSON schema sent to
# the model is unchanged.)
class Output(BaseModel):

    # Literal[tuple(values)] builds the same type the original obtained from
    # eval(literal_def), without evaluating text derived from file content.
    name: Literal[tuple(values)] = Field(description="宋詞詞牌")

# Parse against Output, the schema defined just above. The original passed
# `requirements` (the style-analysis schema), so the model was asked for a
# free-text "style" instead of a 詞牌.
output_parser = PydanticOutputParser(pydantic_object=Output)

format_instructions = output_parser.get_format_instructions()


classes = '\n'.join(values)


# Dedent first, then insert the class list. Interpolating the multi-line list
# before dedent (as the original f-string did) leaves lines at column 0, which
# makes dedent a no-op and keeps the source indentation in the prompt.
system_template = dedent("""
                  You are a help AI assistant specialized at Chinese literature, especially the 宋詞。
                  You are assigned with a task of classify the given poem.
                  The outcomes will be from one of these candidates:

                  {classes}
                  """).replace("{classes}", classes)

human_template = dedent("""
                 poem: {query}
                 format instruction: {format_instructions}
                 """)

input_ = {"system": {"template": system_template},
          "human": {"template": human_template,
                    "input_variables": ["query"],
                    "partial_variables": {"format_instructions": 
                                          format_instructions}}}

classification_pipeline = build_pipeline(model=model, inputs=input_, parser=output_parser)

# classification_prompt_template = build_standard_chat_prompt_template(input_)

# classification_pipeline = classification_prompt_template|model|output_parser
In [None]:
classification_pipeline.invoke({"query": df_poem.loc[0]['詞文']})

In [None]:
df_poem.loc[0]

詞牌檢測似乎很困難，可能需要依賴一些專業知識來進行特徵抽取。
猜測原因可能是詞牌對於平仄的格式有強烈的要求，而中文是表意文字，在LLM模型訓練的時候，聲音並不會被記錄。
所以詞牌偵測可能需要先將內容全部轉換為平仄，然後使用BLEU score計算和詞牌的相似性進行預測。

## HR: Job-Applicant Matching

- 效率和速度： LLMs 能夠快速處理和分析大量申請，相較於人工審查，顯著縮短初步篩選所需的時間。

- 一致性和公平性： LLMs 對所有申請應用相同的標準，最小化人為偏見，確保初步篩選過程的公平性。

- 詳細分析： LLMs 能夠分析複雜的語言模式，從簡歷、求職信和其他申請材料中提取相關信息，識別符合工作要求的關鍵技能和資格。

- 自定義和靈活性： LLMs 可以根據具體的工作要求自定義優先考慮的技能和經驗，允許更有針對性的篩選過程。

- 可擴展性： LLMs 能夠同時處理大量申請，非常適合接收大量申請人的組織。

- 成本效益： 通過自動化申請篩選的初始階段，LLMs 可以減少對大量人力資源的需求，從而降低運營成本。

- 持續改進： LLMs 可以根據反饋和新數據持續進行訓練和改進，隨著時間的推移提高其準確性和有效性。

- 提升候選人經驗： 更快的回應時間和更一致的評估可以改善整體候選人經驗，因為申請人更有可能及時收到反饋。

In [None]:
from IPython.display import IFrame

IFrame("https://www.techjobasia.com/zh-Hant/jobs/GMMlhU0qSayr6ZwTB0U6zA---Software-Engineer-(ReactJS)", width=800, height=400)

###  1. 發送 GET 請求到指定的 URL

- 這行程式碼向指定的 URL 發送 HTTP GET 請求，並將響應儲存在 response 變數中。

In [None]:
import requests

from bs4 import BeautifulSoup
from langchain_community.document_loaders import BSHTMLLoader

url = "https://www.techjobasia.com/zh-Hant/jobs/GMMlhU0qSayr6ZwTB0U6zA---Software-Engineer-(ReactJS)"

# Send a GET request to the URL. Without a timeout, requests waits indefinitely
# on an unresponsive server and the notebook appears to hang.
response = requests.get(url, timeout=30)
print(response)

### 2. 獲取響應的內容

- 這行程式碼將響應的內容作為文字字串提取，並將其儲存在 html_content 變數中。

In [None]:
html_content = response.text

### 3. 解析 HTML 內容

- 這行程式碼使用 BeautifulSoup 解析儲存在 html_content 中的 HTML 內容，並創建一個名為 soup 的 BeautifulSoup 對象。

In [None]:
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
# soup

### 4. 移除所有 CSS 樣式標籤

- 這個循環找到解析後的 HTML 內容中的所有 `<style>` 標籤，並使用 `decompose()` 方法將它們移除。

In [None]:
for style in soup.find_all('style'):
    style.decompose()

### 6. 提取並打印僅包含文字的內容

- 這行程式碼從解析後的 HTML 中提取文字內容，使用換行符將元素分隔，並將其儲存在 text_content 變數中。

In [None]:
text_content = soup.get_text(separator='\n')

In [None]:
text_content

### 7. 清理文字內容

- 這行程式碼通過移除每行的首尾空白並丟棄空行來清理提取的文字內容。清理後的文字儲存在 cleaned_text 變數中。
- 將清理後的文字內容打印到控制台。

In [None]:
cleaned_text = '\n'.join(line.strip() for line in text_content.splitlines() if line.strip())

print(cleaned_text)

### 8. 使用LLM提取工作相關訊息

In [None]:
# Single-message prompt: the cleaned page text fills {content}.
template= dedent("""
          Extract the job description part of the text: {content} 
          """)

human_prompt = PromptTemplate(template=template)
human_message = HumanMessagePromptTemplate(prompt=human_prompt)
    
chat_prompt_template = ChatPromptTemplate.from_messages([human_message])

# prompt -> model -> plain string
pipeline_ = chat_prompt_template|model|StrOutputParser()

job_description = pipeline_.invoke({"content": cleaned_text})

print(job_description)

### 9. 將上述步驟打包成函數

In [None]:
from langchain_core.runnables import chain, Runnable

def parsing_process(url, timeout=30):
    """
    Fetches a web page and returns its visible text, one non-empty line per row.

    Parameters:
    url (str): The URL of the web page to fetch and parse.
    timeout (float): Seconds to wait for the server before giving up.
        Defaults to 30.

    Returns:
    str: Text content of the page with <style> elements removed, each line
        stripped of surrounding whitespace and empty lines dropped.

    Raises:
    requests.exceptions.RequestException: If the request fails, times out, or
        the server answers with an HTTP error status (4xx/5xx).
    """
    # A timeout keeps an unresponsive server from hanging the notebook.
    response = requests.get(url, timeout=timeout)

    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # the requested content.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # CSS is not page content; drop it before extracting text.
    for style in soup.find_all('style'):
        style.decompose()

    text_content = soup.get_text(separator='\n')

    # Strip each line and discard the ones that end up empty.
    return '\n'.join(line.strip() for line in text_content.splitlines() if line.strip())

拿一個1111來試試

In [None]:
url = "https://www.1111.com.tw/job/131976692/"
cleaned_text = parsing_process(url)

In [None]:
print(pipeline_.invoke({"content": cleaned_text}))

這個過程中牽涉到從網路上讀取數據，本地數據處理等等。適合用異步流來進行加速:

目前程式是 同步阻塞 的：

- requests.get(url) → 會阻塞直到網路請求完成。

- BeautifulSoup 的解析則是 CPU 本地操作，速度通常不是瓶頸。

所以：

如果你一次只抓單一 URL，沒必要用 async，因為主要瓶頸就是等待那一次請求。

如果要抓 多個 URL，那麼改用 非同步（async/await + aiohttp） 或 多執行緒 / 多處理 才能顯著加速，因為你能同時發送多個請求，避免 I/O 等待造成浪費。

In [None]:
import aiohttp

from bs4 import BeautifulSoup
from langchain_core.runnables import chain

@chain
async def async_parsing_process(url: str) -> str:
    """
    Asynchronously download a page and reduce it to its non-empty text lines.

    Parameters:
    url (str): The URL of the web page to fetch and parse.

    Returns:
    str: Page text with <style> and <script> elements removed, each line
        stripped and empty lines dropped.

    Raises:
    aiohttp.ClientError: If an error occurs while fetching the URL.
    """
    # `requests` has no async support, so the download goes through aiohttp.
    async with aiohttp.ClientSession() as session, session.get(url) as response:
        html_content = await response.text()

    soup = BeautifulSoup(html_content, "html.parser")

    # Remove style and script elements
    for tag in soup.find_all(["style", "script"]):
        tag.decompose()

    # Extract the text, then drop surrounding whitespace and empty lines
    stripped_lines = (
        line.strip() for line in soup.get_text(separator="\n").splitlines()
    )
    return "\n".join(line for line in stripped_lines if line)

若是我們使用LCEL，pipeline 就是這個樣子:

1. 資料提取與清洗
2. 使用LLM進行最後的數據提煉

In [None]:
# Add "content" (cleaned page text fetched from the "url" value) to the input
# dict, then run the extraction prompt, the model and the string parser.
job_description_pipeline = RunnablePassthrough.assign(content=itemgetter("url")|async_parsing_process)|chat_prompt_template|model|StrOutputParser()

job_result = await job_description_pipeline.ainvoke({"url": url})

In [None]:
print(job_result)

## 履歷提取

- 網路上公開資料 (把工作提取的Prompt 稍微修一下就行，網路抓數據的流程是一樣的)
- Word
- PDF

### 提取 Word 內容

In [None]:
!pip install python-docx

In [None]:
from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph

def iter_block_items(parent):
    """
    Walk the children of ``parent.element.body`` in order and yield a Paragraph
    for each paragraph element and a Table for each table element. Children of
    any other type are skipped.
    """
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for child in parent.element.body:
        for xml_type, wrapper in wrappers:
            if isinstance(child, xml_type):
                yield wrapper(child, parent)
                break

# Load the Word document.
# NOTE: `Document` here is python-docx's class imported in this cell; it rebinds
# the name that earlier cells imported from LangChain.
doc = Document(os.path.join("tutorial", "LLM+Langchain", "Week-3", "Resume.docx"))

content = []

for block in iter_block_items(doc):
    if isinstance(block, Table):
        # One output line per table row, cells separated by tabs.
        content.extend(
            "\t".join(cell.text.strip() for cell in row.cells)
            for row in block.rows
        )
    elif isinstance(block, Paragraph):
        stripped = block.text.strip()
        if stripped:  # skip empty paragraphs
            content.append(stripped)

# Combine everything in document order
full_text = "\n".join(content)

print(full_text)

### 打包成異步流Langchain Runnable

python-docx 本身是一個 同步阻塞的函式庫 —— 它在讀檔、解析 .docx 時會佔用 Python 主執行緒。

在 async/await 的應用場景中（例如 LangChain agent、async pipeline、Web server），如果直接呼叫同步的 python-docx，會讓 事件迴圈被卡住，其他 async 任務無法並行進行。

`await asyncio.to_thread(func, *args)`

把這個 同步阻塞的工作丟到背景 thread pool 執行，從而讓事件迴圈保持「非阻塞」。

In [None]:
# 因為python-docx是同步流，所以我們需要用asyncio.to_thread將其重新包裝成異步流
import asyncio
from typing import Dict, Any


class DocxExtractor(Runnable):
    """LangChain Runnable that extracts text (paragraphs + tables) from a Word file.

    Input is the path of a .docx file; output is its text with one line per
    non-empty paragraph and one tab-separated line per table row, in document
    order.
    """

    async def ainvoke(self, filename: str, config: Dict[str, Any] | None = None, **kwargs: Any) -> str:
        """
        Extract the text without blocking the event loop.

        python-docx is synchronous, so the extraction runs in a worker thread
        via asyncio.to_thread. ``config`` and ``**kwargs`` are accepted for
        compatibility with the Runnable interface and are not used.
        """
        return await asyncio.to_thread(self._extract, filename)

    def invoke(self, filename: str, config: Dict[str, Any] | None = None, **kwargs: Any) -> str:
        """Synchronous variant; ``config`` and ``**kwargs`` are accepted but not used."""
        return self._extract(filename)

    def _extract(self, filename: str) -> str:
        """Read ``filename`` and return its paragraph and table text."""
        # Aliased local import: the notebook binds the bare name `Document` to
        # LangChain's class in its first import cell and to python-docx's in a
        # later one, so relying on the global depends on cell execution order.
        from docx import Document as DocxDocument

        doc = DocxDocument(filename)
        content = []

        for block in iter_block_items(doc):
            if isinstance(block, Paragraph):
                text = block.text.strip()
                if text:  # skip empty paragraphs
                    content.append(text)
            elif isinstance(block, Table):
                # One line per row, cells separated by tabs.
                for row in block.rows:
                    row_data = [cell.text.strip() for cell in row.cells]
                    content.append("\t".join(row_data))

        return "\n".join(content)

In [None]:
# `result` (the extracted resume text) is reused by the resume pipeline below.
extractor = DocxExtractor()
result = await extractor.ainvoke(os.path.join("tutorial", "LLM+Langchain", "Week-3", "Resume.docx"))
print(result)

這個 Word 內容提取是通用型的，不限於簡歷，也可以提取公文或其他文件文字內容。它完全基於程式邏輯，不涉及大語言模型，所以不需要額外付費。並非所有應用都需要大語言模型，有時候你只是需要一台 50cc 小型代步車，沒必要自己去打造一台 F1 賽車。

### 提取 PDF 內容

In [None]:
from pypdf import PdfReader

def extract_pdf(filename: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Pages for which no text is extracted are skipped; each page's text is
    stripped of leading and trailing whitespace.
    """
    page_texts = (page.extract_text() for page in PdfReader(filename).pages)
    return "\n".join(page_text.strip() for page_text in page_texts if page_text)

# Example
print(extract_pdf(os.path.join("tutorial", "LLM+Langchain", "Week-3", "Resume.pdf")))

異步流版本:

In [None]:
class PdfExtractor(Runnable):
    """LangChain Runnable that extracts text from a PDF file.

    Input is the path of a PDF; output is the text of its pages joined by
    newlines, skipping pages for which no text is extracted.
    """

    async def ainvoke(self, filename: str, config: Dict[str, Any] | None = None, **kwargs: Any) -> str:
        """
        Extract the text in a worker thread so the event loop is not blocked.

        ``config`` and ``**kwargs`` are accepted for compatibility with the
        Runnable interface and are not used.
        """
        return await asyncio.to_thread(self._extract, filename)

    def invoke(self, filename: str, config: Dict[str, Any] | None = None, **kwargs: Any) -> str:
        """Synchronous variant; ``config`` and ``**kwargs`` are accepted but not used."""
        return self._extract(filename)

    def _extract(self, filename: str) -> str:
        """Read ``filename`` and return the stripped text of each page."""
        reader = PdfReader(filename)
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text.strip() for page_text in page_texts if page_text)

直接使用根據 Docx 提取的內容

技術上確實可以直接在讀取檔案的同時進行匹配，但在系統設計上，將「檔案內容提取」獨立成工作流會更恰當，因為這樣更有利於維護、監控與後續擴展，再將提取結果交由下游流程進行匹配或其他處理。

1. 技術上可行：
    確實可以「on-the-fly」直接讀取檔案，然後立刻做比對或匹配，這在 demo 或小規模應用時沒問題。

2. 務實上的考量：
    在實際系統設計中，把「檔案讀取 / 解析」獨立成一個模組或工作流會更好：

    - 維護性：解析邏輯獨立，容易替換不同檔案類型（Word、PDF、Excel…）。

    - 可監控：可以針對「解析失敗」做監控和錯誤處理，不會和匹配邏輯糾纏在一起。

    - 可擴展：之後不只做匹配，還可能做索引、摘要、分類等，提前抽離流程更有彈性。

3. 流程建議：

    Step 1：檔案 → 讀取 & 提取文字（抽象成 Extractor 工作流）。

    Step 2：抽取內容 → 匹配 / NLP 處理 / 下游任務。

## 內容精煉，輸出內容一致化

In [None]:
from langchain_core.output_parsers import StrOutputParser

system_template = dedent("""
                  I am going to give you a template for your output. 
                  CAPITALIZED WORDS are my placeholders. 
                  Fill in my placeholders with your output. Please preserve 
                  the overall formatting of my template. My template is:

                  *** Working Experience:*** WORKING EXPERIENCE 
                  *** Education:*** EDUCATION
                  *** Skills:*** SKILLS

                  I will give you the data to format in the next prompt. 
                  Create a resume using my template.
                  """)

human_template = """
                 {query}
                 """

# Prompt specification: the system message is the fixed formatting template,
# the human message carries the text passed in as `query`.
input_ = dict(
    system=dict(template=system_template),
    human=dict(template=human_template, input_variables=["query"]),
)

# Equivalent long form:
#   build_standard_chat_prompt_template(input_) | model | StrOutputParser()
resume_pipeline = build_pipeline(inputs=input_, model=model, parser=StrOutputParser())

In [None]:
# Pass the extracted text explicitly as the prompt's only variable, `query`.
resume_output = resume_pipeline.invoke({"query": result})

In [None]:
# Show the text returned by resume_pipeline
print(resume_output)

## 履歷和工作的匹配

In [None]:
# Structured verdict expected back from the matching model.
# NOTE(review): documented with `#` comments on purpose. A class docstring
# would presumably be copied into the JSON schema and therefore into
# `format_instructions` (i.e. into the prompt) -- confirm before adding one.
class Output(BaseModel):

    # Verdict, restricted to exactly 'Yes' or 'No'
    result: Literal['Yes', 'No'] = Field(description="If the candidate is a good fit, either Yes or No")
    # Free-text field, described to the model as "Applicant - Job matching"
    reason: str = Field(description="Applicant - Job matching")
    
# Parser that validates the model's reply against `Output`
output_parser = PydanticOutputParser(pydantic_object=Output)

# Text injected into the prompt as the `format_instructions` partial variable
format_instructions = output_parser.get_format_instructions()

In [None]:
system_template = dedent("""
                  You are an AI assistant acting as an experienced senior 
                  recruiter in IT field.
                  
                  You are assigned a task of identifying if an applicant, 
                  based on the description in the resume, is a good match to the described job. 
                    
                  You always do the best work you can. You are highly 
                  analytical and pay close attention to details. 
                  """)

human_template = dedent("""
                 Job description: {job}
                 resume: {resume}
                 output format instructions: {format_instructions}
                 """)

# `job` and `resume` are supplied at invoke time; `format_instructions` is
# pre-filled from the output parser as a partial variable.
input_ = dict(
    system=dict(template=system_template),
    human=dict(
        template=human_template,
        input_variables=["job", "resume"],
        partial_variables=dict(format_instructions=format_instructions),
    ),
)

match_prompt_template = build_standard_chat_prompt_template(input_)

In [None]:
# Inspect the value passed as `job` to the matching pipeline below
job_result

In [None]:
# Inspect the value passed as `resume` to the matching pipeline below
result

In [None]:
# Long form of the same pipeline: match_prompt_template | model | output_parser
matching_pipeline = build_pipeline(inputs=input_, model=model, parser=output_parser)

In [None]:
# Run the job/resume comparison; the parser turns the reply into an `Output`.
matching_output = matching_pipeline.invoke(
    {
        "job": job_result,
        "resume": result,
    }
)

print(matching_output)

# 其他的大語言模型API提供商

> 🎯 本章學完你將能學會什麼：
> - 理解如何在 LangChain 中整合第三方大語言模型 API（如 Perplexity、DeepSeek）
> - 學會設定與呼叫非 OpenAI 模型的 API（包含自訂 base_url、api_key）
> - 熟悉 Perplexity 的 sonar-* 模型與 DeepSeek 的 deepseek-reasoner 模型在 LangChain 中的使用方式

## Perplexity

https://python.langchain.com/docs/integrations/chat/perplexity/

- sonar-deep-research
- sonar-reasoning-pro
- sonar-reasoning
- sonar-pro
- sonar	128k
- r1-1776

Langchain '似乎'支持Perplexity，但我在使用時發現會出問題，所以需要自己套殼

In [None]:
# Basic usage of the Perplexity API: role/content messages plus a client
# pointed at Perplexity's endpoint. Neither `messages` nor `client` is used
# by the later cells visible in this section.

messages = [
    {
        "role": "system",
        "content": (
            "You are an AI assistant that focuses on equity market analysis and you need to "
            "engage in an accurate, comprehensive, helpful and  polite conversation with a user."
        ),
    },
    {  
        "role": "user",
        "content": (
            "Find the SKU number of Carslan Lasting Cover Foundation N01"
        ),

    },

]

# NOTE(review): `OpenAI` is not imported in the visible cells; presumably
# `from openai import OpenAI` appears earlier in the notebook -- confirm.
client = OpenAI(api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")

In [None]:
from langchain_core.messages import SystemMessage, HumanMessage


@chain
def prompt_template_2_messages(chat_prompt):
    """Convert the messages of a chat prompt into role/content dicts.

    Args:
        chat_prompt: Object exposing a ``messages`` sequence.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts in the original
        message order. ``SystemMessage`` maps to role ``"system"`` and
        ``HumanMessage`` to role ``"user"``; any other message type is skipped.
    """
    role_by_type = ((SystemMessage, "system"), (HumanMessage, "user"))

    return [
        {"role": role, "content": message.content}
        for message in chat_prompt.messages
        for message_type, role in role_by_type
        if isinstance(message, message_type)
    ]


@chain
def messages_2_perplexity(messages):
    """Send chat messages to Perplexity and return the answer with its citations.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts.

    Returns:
        A dict with two keys: ``"content"`` (text of the first choice) and
        ``"citations"`` (the ``citations`` attribute of the response, or
        ``None`` when the response does not carry one).

    Raises:
        KeyError: If the ``PERPLEXITY_API_KEY`` environment variable is not set.
    """
    client = OpenAI(api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")

    response = client.chat.completions.create(
        model="sonar-deep-research",
        messages=messages
    )

    content = response.choices[0].message.content
    # `citations` is a provider-specific extra, not a standard chat-completion
    # field, so read it defensively instead of raising AttributeError when it
    # is missing.
    citations = getattr(response, "citations", None)

    return {"content": content,
            "citations": citations}

In [None]:
# Chat prompt -> role/content dicts -> Perplexity call.
# NOTE(review): `chat_prompt_perplexity` is not defined in the visible cells of
# this section; confirm it is built (e.g. with
# build_standard_chat_prompt_template) before this cell runs.
pipeline_perplexity = chat_prompt_perplexity|prompt_template_2_messages|messages_2_perplexity

## DeepSeek

- Langchain 支援 Deepseek

In [None]:
from langchain_deepseek import ChatDeepSeek

# DeepSeek reasoning model accessed through LangChain's DeepSeek integration
deepseek_r1 = ChatDeepSeek(api_key=os.environ['DEEPSEEK_API_KEY'], temperature=0, model='deepseek-reasoner')

system_template = "You are a helpful assistant."
human_template = "Create a financial report of {ticker} based on:\n {context}"

# Both template variables are supplied at invoke time
input_ = {"system": {"template": system_template},
          "human": {"template": human_template,
                    "input_variables": ["context", "ticker"]}
         }

chat_prompt_deepseek = build_standard_chat_prompt_template(input_)

# The report is free-form text, so parse the reply as a plain string.
# `output_parser` at this point is the Pydantic parser for the Yes/No
# job-matching schema; this prompt never asks for that format, so reusing it
# here would fail to parse the report.
pipeline_deepseek = chat_prompt_deepseek|deepseek_r1|StrOutputParser()

# MLFlow Part 1

>🎯 本章學完你將能學會什麼：
> - 理解 MLflow 的核心概念（Experiment、Run、Model Registry、Artifact 等）
> - 學會在本地端啟動並連接 MLflow Tracking Server
> - 能夠使用 MLflow 追蹤與記錄機器學習模型（以 Logistic Regression + Iris dataset 為例）
> - 掌握如何透過 MLflow 註冊、載入、標記（Tag）與別名（Alias）管理模型版本
> - 熟悉 mlflow.sklearn 與 mlflow.pyfunc 的模型存取方式，並能模擬遠端載入流程
> - 理解如何結合 LangChain 與 MLflow，將 LLMChain pipeline 進行版本化與追蹤
> - 學會設定 Model Signature，明確定義模型的輸入輸出結構，提升模型可解釋性與可移植性
> - 能夠將自訂 Python pipeline 以 MLflow 模型格式部署與重載運行

重啟Notebook來節省硬體資源

- pip install mlflow
- 在CLI: mlflow server --host 127.0.0.1 --port 8080

In [None]:
!pip install mlflow

In [None]:
import mlflow
from mlflow.models import infer_signature

import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Load the Iris dataset
X, y = datasets.load_iris(return_X_y=True)

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model hyperparameters (also logged to MLflow by the tracking cell).
# `multi_class` is deliberately not set: the option is deprecated in recent
# scikit-learn releases (passing it emits a FutureWarning) and "auto" was the
# default behavior anyway.
params = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "random_state": 8888,
}

# Train the model
lr = LogisticRegression(**params)
lr.fit(X_train, y_train)

# Predict on the test set
y_pred = lr.predict(X_test)

# Score the predictions on the held-out samples
accuracy = accuracy_score(y_test, y_pred)

In [None]:
import mlflow

# Tell MLflow where to send the tracking records
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Record the run below under this experiment
mlflow.set_experiment("MLflow Quickstart")

with mlflow.start_run(run_name="Week-3-MLFlow") as run:

    # Keep the run id so later logging can be attached to this same run
    run_id = run.info.run_id

    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the test-set accuracy (a score, not a loss)
    mlflow.log_metric("accuracy", accuracy)

    # Infer the model signature from the training inputs and the model's
    # predictions on them
    signature = infer_signature(X_train, lr.predict(X_train))

    # Log the model into this run and register it in the model registry
    model_info = mlflow.sklearn.log_model(
        sk_model=lr,
        # name of the model inside this run
        name="iris_model",
        signature=signature,
        # A handful of rows is enough to illustrate the input format; passing
        # all of X_train would store the whole training set as the example.
        input_example=X_train[:5],
        # name of the model in the registry
        registered_model_name="tracking-quickstart",
    )

    # Set a tag that we can use to remind ourselves what this model was for
    mlflow.set_logged_model_tags(
        model_info.model_id, {"Training Info": "Basic LR model for iris data"}
    )
    

In [None]:
# URI of the model just logged; the local-execution cell loads the model from it
model_info.model_uri

In [None]:
# Display the object returned by mlflow.sklearn.log_model
model_info

## 本地執行

1. mlflow.pyfunc.load_model(model_info.model_uri)

- 這裡用的是 model_info.model_uri，它指向剛剛在 local run 裡 mlflow.sklearn.log_model(...) 存下來的 artifact 路徑。

- 因為你在 同一個 local tracking server (或預設的本地檔案系統) 執行，所以可以直接載入。

2. predictions = loaded_model.predict(X_test)

- mlflow.pyfunc 會包裝成一個 通用 Python function 模型 (不管底層是 sklearn、pytorch、xgboost...)。

- 所以你可以直接 .predict(...)，得到預測結果。

In [None]:
# Reload the logged model through the generic Python-function interface
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
predictions = loaded_model.predict(X_test)

iris_feature_names = datasets.load_iris().feature_names

# Test features side by side with the true and the predicted labels
result = pd.DataFrame(X_test, columns=iris_feature_names).assign(
    actual_class=y_test,
    predicted_class=predictions,
)

result.head(4)

## 模擬遠端執行

In [None]:
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()

# List every version of one registered model
versions = client.search_model_versions(filter_string="name='tracking-quickstart'")

for v in versions:
    print(
        f"Version: {v.version}",
        f"Stage: {v.current_stage}",
        f"Run ID: {v.run_id}",
        sep=", ",
    )

In [None]:
# Registry URI that addresses the model by registered name and version number
model_uri = "models:/tracking-quickstart/1"    # version 1

# Load it back with the scikit-learn flavor
model_remote = mlflow.sklearn.load_model(model_uri)

In [None]:
# Same comparison table as the local run, using the registry-loaded model
predictions = model_remote.predict(X_test)

iris_feature_names = datasets.load_iris().feature_names

result = pd.DataFrame(X_test, columns=iris_feature_names).assign(
    actual_class=y_test,
    predicted_class=predictions,
)

result.head(4)

### 模型 Alias（別名）

Alias 是一種可變（mutable）的命名引用，可指向某個註冊模型（registered model）的特定版本。這對於隨後更新部署模型卻不想改程式碼時非常方便。例如：

In [None]:
client = mlflow.tracking.MlflowClient()

# Point the "champion" alias at version 1 of the registered model
client.set_registered_model_alias(name="tracking-quickstart", alias="champion", version="1")




In [None]:
# Load the model through its alias (`name@alias` form of the registry URI)
model = mlflow.sklearn.load_model("models:/tracking-quickstart@champion")

也可以用 API 刪除 alias：

In [None]:
# Remove the "champion" alias from the registered model
client.delete_registered_model_alias("tracking-quickstart", "champion")

### 模型 Tags（標籤）

MLflow 支援兩層 Tag：

- Registered model-level tags：整體模型的 metadata，例如用途、團隊等資訊。

- Model version-level tags：針對每個版本做不同註記，例如驗證狀態、效能資訊等。

In [None]:
# Tag attached to the registered model as a whole
client.set_registered_model_tag(name="tracking-quickstart", key="task", value="classification")

# Tag attached to one specific version
client.set_model_version_tag(
    name="tracking-quickstart",
    version="1",
    key="validation_status",
    value="approved",
)

## Langchain with MLflow

In [None]:
import os

# Move up to the project root so that `src...` imports and the
# "tutorial/..." relative paths used below resolve. Guarded so that
# re-running this cell does not climb three more directories each time.
if not os.path.isdir("src"):
    os.chdir("../../../")

In [None]:
import mlflow
from langchain.chains import LLMChain ## Langchain boilerplate
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

from src.initialization import credential_init
credential_init()

# Chat model and single-variable prompt that make up the chain
model = ChatOpenAI(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    model_name="gpt-4o-mini",
    temperature=0,
)

prompt = PromptTemplate(
    template="What is a good name for a company that makes {product}?",
    input_variables=["product"],
)

experiment = "Week-3-Langchain"

# LLMChain wiring the prompt to the model (what the author used originally)
chain = LLMChain(prompt=prompt, llm=model)

# Send tracking data to the local MLflow server, under the experiment named above
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
mlflow.set_experiment(experiment)

with mlflow.start_run(run_name="LLMChain") as run:
    # Log the chain and register it under "LLMChain_Demo"
    logged_model = mlflow.langchain.log_model(
        chain,
        name="langchain_model",
        registered_model_name="LLMChain_Demo",
    )

    # Write the prompt text to disk, then attach that file to the run as an artifact
    prompt_path = os.path.join("tutorial", "LLM+Langchain", "Week-3", "prompt.txt")
    with open(prompt_path, "w") as f:
        f.write(prompt.template)
    mlflow.log_artifact(prompt_path, artifact_path="prompts")


In [None]:
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

# Rebuild the same model, prompt and chain locally
model = ChatOpenAI(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    model_name="gpt-4o-mini",
    temperature=0,
)

prompt = PromptTemplate(
    template="What is a good name for a company that makes {product}?",
    input_variables=["product"],
)

chain = LLMChain(prompt=prompt, llm=model)

In [None]:
# Call the locally built chain with a value for its `product` variable
chain.invoke({"product": "colorful socks"})

If you load the model the way the MLflow tutorial suggests, you will get an error message because:

The core problem is that ChatOpenAI contains fields like openai_api_key: SecretStr (a Pydantic type). When MLflow serializes the chain, those SecretStr objects get dumped to YAML with a tag like:

!!python/object/pydantic.types.SecretStr

But when you later reload the model, MLflow’s YAML loader doesn’t know how to re-instantiate SecretStr.

In [None]:
import mlflow

from src.initialization import credential_init

credential_init()

# Point MLflow at the local tracking server before resolving the registry URI
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Reload chain (structure only)
loaded_model = mlflow.pyfunc.load_model("models:/LLMChain_Demo/1")

# Re-attach the LLM with fresh credentials
# NOTE(review): `ChatOpenAI` is not imported in this cell, so it relies on an
# earlier cell having imported it.
# NOTE(review): this assigns an attribute on the object returned by
# pyfunc.load_model; confirm that it actually reaches the wrapped chain.
loaded_model.llm = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model_name="gpt-4o-mini",
    temperature=0
)

# Predict using the loaded model
print(loaded_model.predict([{"product": "colorful socks"}]))

A Universal Solution

1. Create a Python script file for your pipeline
2. Upload the `model` to your MLflow server

In [None]:
import pandas as pd
import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

在 MLflow 的 log_model 時標明 signature，主要是為了清楚定義模型的輸入與輸出格式。這對模型管理、部署以及後續使用都有實際好處

- 1. 什麼是 signature？

    signature 是一個 Schema 定義，描述了模型預期的 輸入資料型態與欄位結構，以及模型會輸出的 結果型態與欄位結構。 在這個例子裡：

    - 輸入是一個欄位 "input"，型別是 string。
    - 輸出包含兩個欄位："product" 和 "text"，都為 string。
- 2. 為什麼要標明 signature？

    如果不指定，MLflow 會嘗試自動推斷模型的輸入/輸出格式，但這有幾個問題：

    - 推斷可能不準確（特別是在複雜 pipeline 或輸入非標準 pandas/numpy 物件時）。

    - 模型被部署或交付給其他人時，使用者可能不知道要如何準備正確的輸入資料。

    因此 明確指定 signature 可以避免模糊不清。

- 3. 好處

    - 可讀性與可解釋性

        清楚描述「這個模型吃什麼、吐什麼」，讓其他人（或未來的自己）一眼看懂。

    - 驗證輸入輸出

        MLflow 的 pyfunc API 會利用 signature 驗證輸入資料是否符合定義，避免「欄位缺失」或「型態不符」的錯誤。

    - 提升可移植性

        模型部署到 MLflow Serving 或其他 REST API 時，會自動帶上 schema 文件，API 使用者可以直接參考。

    - 幫助自動化工具

        在 AutoML pipeline 或 MLflow Model Registry 中，signature 可以被用來檢查模型相容性（例如 pipeline 不同階段之間輸入/輸出格式是否一致）。

In [None]:
# Declare up front what goes into the model and what comes out of it:
# one string column in, two string columns out.
input_schema = Schema([ColSpec(type="string", name="input")])
output_schema = Schema(
    [ColSpec(type="string", name=column) for column in ("product", "text")]
)
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

In [None]:
from src.io.path_definition import get_project_dir

# Python script that defines the pipeline; it is logged as the model itself
model_path = os.path.join(
    get_project_dir(), 'tutorial', 'LLM+Langchain', "Week-3", "llmchain_mlflow_experiment.py"
)

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# The registry is a separate feature: you must either (a) log with a
# registered_model_name argument, or (b) promote a logged model to the
# registry manually in the UI/CLI.

with mlflow.start_run(run_name="LLMChain") as run:

    # Keep a copy of the script with the run as well
    mlflow.log_artifact(model_path, artifact_path="source_code")

    # One-row frame matching the single "input" column of the signature
    input_example = pd.DataFrame({"input": ["colorful socks"]})

    model_info = mlflow.pyfunc.log_model(
        name="langchain_model",
        python_model=model_path,  # the model is given as a path to a Python file
        signature=signature,
        input_example=input_example,
        registered_model_name="LLMChain_Demo",
    )

In [None]:
import mlflow

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Resolve the newest registered version at run time. A hard-coded version
# number only exists in the registry it was created in, so the cell would fail
# (or load a different model) anywhere else.
registered_versions = mlflow.MlflowClient().search_model_versions("name='LLMChain_Demo'")
if not registered_versions:
    raise RuntimeError("No registered version of 'LLMChain_Demo' found; run the logging cell first.")
latest_version = max(int(v.version) for v in registered_versions)

loaded_model = mlflow.pyfunc.load_model(f"models:/LLMChain_Demo/{latest_version}")

# One-row frame with the single "input" column
input_ = pd.DataFrame(data=[['歐姆尼賽亞的化身，會行走的大教堂: 帝皇級泰坦 (Imperator-class Titan )']], columns=['input'])

loaded_model.predict(input_)

你可以看到檔案會以 Artifact 形式下載，並且覆寫已存在的同名檔案

換句話說 load_model 時要小心控制下載位置

In [None]:
# Show the documentation of load_model, including how to control where files
# are downloaded. `help()` is plain Python, unlike the IPython-only `?` suffix.
help(mlflow.pyfunc.load_model)