# 釣魚信件辨識
利用 LLM 來辨識釣魚信件，主要會使用 LangChain 套件與 HuggingFace 模型來實作。 <br>
並包含以下技術：
1. Prompt Engineering
    * Few shot
    * Chain Of Thought
    * Self-Consistency
2. RAG

## Data Preprocess

In [73]:
# 載入套件
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

from wordcloud import WordCloud # pip install wordcloud
import matplotlib.pyplot as plt

# Global matplotlib defaults for every figure drawn in this notebook:
#   - base font size of 14
#   - a sans-serif font intended to render Chinese text; if the glyphs still do
#     not display, 'Microsoft JhengHei' is a suggested alternative.
# See also: https://pyecontech.com/2020/03/27/python_matplotlib_chinese/
plt.rcParams.update(
    {
        'font.size': 14,
        'font.sans-serif': ['Arial Unicode Ms'],
    }
)

In [74]:
# Load the phishing e-mail dataset; later cells read its "Email Text" and
# "Email Type" columns.
maildata = pd.read_csv("dataset/Phishing_Email.csv")

In [75]:
def _normalize_email_text(raw):
    """Return *raw* as a string with punctuation removed and whitespace collapsed."""
    # Drop every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', str(raw))
    # Turn runs of newlines / underscores / hyphens into a single space.
    text = re.sub(r'[\n_-]+', ' ', text)
    # Squeeze any remaining whitespace run down to one space.
    return re.sub(r'\s+', ' ', text)


maildata["Email Text"] = maildata["Email Text"].apply(_normalize_email_text)

# NOTE: despite the column name, this stores the character length of the
# cleaned text (len of the string), not a count of words.
maildata["Word Count"] = maildata["Email Text"].apply(len)

# Keep only e-mails whose length is strictly between 100 and 600 so that the
# prompts do not end up with too many tokens.
_text_length = maildata["Word Count"]
maildata = maildata[(_text_length > 100) & (_text_length < 600)]

In [76]:
# Build a balanced evaluation set: `nums` e-mails are drawn at random from each
# class, using the same fixed seed so the sample is reproducible.
nums = 25
_is_phishing = maildata['Email Type'] == 'Phishing Email'
_is_safe = maildata['Email Type'] == 'Safe Email'
phishing_emails = maildata[_is_phishing].sample(n=nums, random_state=1116)
safe_emails = maildata[_is_safe].sample(n=nums, random_state=1116)

In [77]:
# Preview the first rows of the sampled phishing e-mails.
phishing_emails.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,Word Count
17721,17722,re 200 000 1st year subscribers company cavalc...,Phishing Email,400
14303,14304,get your babies diapers bill paid for for a ye...,Phishing Email,102
18073,18074,please solve your impotence once and for all h...,Phishing Email,362
3542,3542,mr childers has 716 256 for you and your famil...,Phishing Email,414
16802,16803,this has worked for me marrow enemy i think th...,Phishing Email,385


In [78]:
# Preview the first rows of the sampled safe e-mails.
safe_emails.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type,Word Count
1046,1046,any sumerologists out there are there any sume...,Safe Email,145
17955,17956,jmjmasonorg Justin Mason writes BTW I tried tw...,Safe Email,417
2740,2740,start date 12 21 01 hourahead hour 10 start da...,Safe Email,231
6673,6674,book administrators gas the highlighted names ...,Safe Email,103
7939,7940,URL httpjeremyzawodnycomblogarchives000206html...,Safe Email,302


## 利用 huggingFace + LangChain 使用 LLM

In [79]:
import os
from getpass import getpass

import pandas as pd
import ast
import json
import re
import jieba

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import BitsAndBytesConfig # huggingface 量化

from langchain_community.llms import HuggingFaceEndpoint

from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate

from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from langchain_core.runnables import RunnableLambda

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

### huggingFace Token

In [80]:
# Ask for the Hugging Face access token interactively (getpass does not echo
# the input), then also expose it through the HUGGINGFACEHUB_API_TOKEN
# environment variable.
HUGGINGFACEHUB_API_TOKEN = getpass()
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

### import LLM

### HuggingFacePipeline
將 Model 從 HuggingFace 載入到本地

In [None]:
# Load the model locally.

# Quantization settings used to reduce the model's memory footprint: weights
# are loaded in 4-bit, and modules dispatched to the CPU are kept in fp32.
# `llm_int8_enable_fp32_cpu_offload` is the keyword BitsAndBytesConfig defines;
# the previously used `load_in_8bit_fp32_cpu_offload` is not one of its
# parameters, so the offload request was never applied.
quant_config = BitsAndBytesConfig(load_in_4bit=True, llm_int8_enable_fp32_cpu_offload=True)
# Model to use.
model_id = "google/gemma-2b-it"
# model_id = "google/gemma-7b-it"
# Load the tokenizer that belongs to the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGINGFACEHUB_API_TOKEN)

# Load the causal language model with the quantization settings above;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quant_config, token=HUGGINGFACEHUB_API_TOKEN, device_map="auto")

In [83]:
# Text-generation pipeline around the locally loaded model, so text can be
# produced directly from a prompt.
#   max_new_tokens     - maximum length of the generated text
#   temperature        - how "creative" the answers are (0~1; the larger the
#                        value, the more varied each answer)
#   return_full_text   - False, so the prompt is not echoed back in the output
# NOTE(review): temperature presumably only matters when sampling is enabled
# for generation - confirm against the model's generation settings.
generation_options = dict(
    max_new_tokens=3,
    temperature=1.0,
    repetition_penalty=1.2,
    return_full_text=False,
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_options)
# Wrap the pipeline in HuggingFacePipeline so it matches LangChain's interface.
llm = HuggingFacePipeline(pipeline=pipe)

### HuggingFaceEndpoint
讓 langchain 調動 huggingface model

In [10]:
# Call the model through an endpoint served by the Hugging Face API.
# temperature: how "creative" the answers are (0~1; the larger the value, the
# more varied each answer).
# Note: this assignment rebinds `llm`, so chains built after this cell use the
# endpoint rather than any previously assigned `llm`.
llm = HuggingFaceEndpoint(
    repo_id=model_id, temperature=1.0, model_kwargs={'use_cache':False}, huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
)

  warn_deprecated(


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\win90\.cache\huggingface\token
Login successful


## 創建 LLM Prompt Template

### Normal

In [86]:
def calcu_result(result_list) -> float:
    """Compute the accuracy recorded in *result_list*.

    Args:
        result_list: List of per-sample outcomes, where the string "True"
            marks a correct prediction and "False" an incorrect one. Entries
            with any other value are ignored.

    Returns:
        The share of "True" entries among all "True"/"False" entries, or
        0.0 when the list holds neither (rather than raising
        ZeroDivisionError).
    """
    true_count = result_list.count("True")
    false_count = result_list.count("False")
    total = true_count + false_count
    # Nothing was scored: report 0.0 instead of dividing by zero.
    if total == 0:
        return 0.0
    return true_count / total

In [84]:
# Zero-shot prompt in Gemma's turn format: the user turn carries the
# instruction and the e-mail ({question}); the model turn is pre-filled with
# the answer label so the model only has to complete it.
normal_template = """<start_of_turn>user
你是一位資安專家，將會幫助判斷以下郵件是否為釣魚信件。
如果是釣魚信件，請回答「True」，如果不是釣魚信件，請回答「False」。

問題：{question}<end_of_turn>
<start_of_turn>model
答案："""

normal_prompt = PromptTemplate.from_template(normal_template)

# Chain: the raw input becomes the "question" variable, is rendered into the
# prompt, and the rendered prompt is sent to the LLM.
normal_chain = RunnableParallel(question=RunnablePassthrough()) | normal_prompt | llm

In [85]:
# Outcomes of the zero-shot run, one entry per e-mail. Each entry stores
# whether the model was right ("True") or wrong ("False"), not the label the
# model produced.
normal_result = []

# Phishing e-mails are processed first and should be answered with "True";
# safe e-mails follow and should be answered with "False". The expected word
# (capitalised or lower-case) appearing anywhere in the reply counts as correct.
for emails, expected in ((phishing_emails, "True"), (safe_emails, "False")):
    for email_text in emails["Email Text"]:
        result = normal_chain.invoke(email_text)
        answered_correctly = expected in result or expected.lower() in result
        normal_result.append("True" if answered_correctly else "False")

In [None]:
# Accuracy of the zero-shot prompt over the evaluated e-mails.
normal_acc = calcu_result(normal_result)
normal_acc
# 0.58 (value noted from an earlier run)

In [23]:
# Hard-coded accuracy, presumably so the summary can be built without
# re-running the model.
normal_acc = 0.58

### Few shot

In [13]:
fewshot_template = """<start_of_turn>user
你是一位資安專家，將會幫助判斷以下郵件是否為釣魚信件。
如果是釣魚信件，請回答「True」，如果不是釣魚信件，請回答「False」。

以下為一些例子：
```
郵件：hot teen flasher sex spy over 100  000 voyeur images tons live hidden cams 50  000  extreme movies total invasion of privacy click here for more  sex spy is the number one site for complete invasion of privacy photos  videos and more  these extreme voyeur photos have the hottest women caught on tape doing unthinkable acts  these sluts have no clue we are watching  click here for a free preview of the dumbest whores getting caught on camera  remove me ashtray bertrand ductwork witt secretion lucre lundberg incurring nostalgic papua yakima
答案：True

郵件：2002 it systems development allocations kevin  tim  please review the attached listing and advise if you agree to the projects and amounts listed as billings that enron north america will receive in 2002 for support of these efforts  i  m at x 30352 if you have any questions  thanks 
答案：False

郵件：enrononline executive summary for november 08  2001 following please find the daily enrononline executive summary  note  the executive summary transaction counts have been reduced to reflect the removal of sleeve trade activity 
答案：False
```

問題：{question}<end_of_turn>
<start_of_turn>model
答案："""

# Turn the few-shot template text into a PromptTemplate.
# NOTE(review): this rebinds `fewshot_template` from the raw string to the
# PromptTemplate object, whereas the other sections keep the string and use a
# separate *_prompt name.
fewshot_template = PromptTemplate.from_template(fewshot_template)

# Chain: the raw input becomes the "question" variable, is rendered into the
# few-shot prompt, and the rendered prompt is sent to the LLM.
fewshot_chain = RunnableParallel(question=RunnablePassthrough()) | fewshot_template | llm

In [16]:
# Outcomes of the few-shot run, one entry per e-mail. Each entry stores
# whether the model was right ("True") or wrong ("False"), not the label the
# model produced.
fewshot_result = []

# Phishing e-mails are processed first and should be answered with "True";
# safe e-mails follow and should be answered with "False". The expected word
# (capitalised or lower-case) appearing anywhere in the reply counts as correct.
for emails, expected in ((phishing_emails, "True"), (safe_emails, "False")):
    for email_text in emails["Email Text"]:
        result = fewshot_chain.invoke(email_text)
        answered_correctly = expected in result or expected.lower() in result
        fewshot_result.append("True" if answered_correctly else "False")

In [None]:
# Accuracy of the few-shot prompt over the evaluated e-mails.
fewshot_acc = calcu_result(fewshot_result)
fewshot_acc
# 0.68 (value noted from an earlier run)

0.68

In [24]:
# Hard-coded accuracy, presumably so the summary can be built without
# re-running the model.
fewshot_acc = 0.68

### Chain of Thought

In [13]:
CoT_template = """<start_of_turn>user
你是一位資安專家，將會幫助判斷以下郵件是否為釣魚信件。
請逐步分析郵件的內容，並按照以下流程進行：
1. 提取郵件的主旨與目的。
2. 分析郵件的語言特徵（例如是否使用過於吸引人的措辭、過多的呼籲性語句）。
3. 判斷是否包含潛在的釣魚信件特徵（例如要求用戶提供敏感信息、提供不可信的 URL 或強調緊急性）。
4. 根據分析給出結論，如果是釣魚信件，請回答「True」，如果不是釣魚信件，請回答「False」。

以下為一些例子：
```
郵件：HI As a professional bulk mailer for 5 years I made over 200000 last 12 months selling only one product The Banned CDLuckily for you the Bulker CD 2003 is now for sale I decide to sell all secrets how I made over 20000 per month at home only few hours workWhy dont more people use Bulk Email Marketing1 Lack of knowledge Most people do not know how to set up a marketing campaign let alone set up an effective email marketing campaign Through hard work and trial and error we have developed simple yet successful strategies to send your emails We can show you how to do it properly 2 Fear of getting into trouble Most people do not send email because they have heard negative things about SPAM and that your isp will shut you down This is true if you dont know what you are doing and bulk email to the masses If you dont believe in SPAM we have developed alternative ways to bulk email so that you are sending your emails responsibly without getting into any trouble at all 3 Dont have the necessary equipmentsoftwares To send your emails out you need a computer with specialized email software installed that will send or harvest your emails We are the email marketing software experts The softwares ranging will up to thousands of dollars Buying the correct software for your needs can be confusing Depending on your budget requirements and goals we can help recommend the best software for youBULKER CDROM has everything you need to start bulk Emailing immediately all on this one CD BULKER CDROM is excellent for the beginner as well as the professional Advertising my products have never been easier Please Click the URL for the Detail http7777772E62a6E6E6564c642E6E6574bulkhtmYou are receiving this special offer because you have provided permission to receive third party email communications regarding special online promotions or offers We strongly oppose the use of SPAM email and do not want to send our mailings to anyone who does not wish to receive them If you do not wish to 
receive any further messages from netcommission To Be Removed From Our List http7777772E62a6E6E6564c642E6E6574removehtml
分析：
1. 主旨與目的：該郵件的主要目的是推銷一款名為 "Bulker CDROM" 的軟件，並強調其能幫助用戶實現有效的電子郵件營銷。
2. 語言特徵：郵件使用了吸引人的措辭（例如「我一年內賺了200000美元」、「僅需幾個小時的工作」），並帶有誘惑性言語來吸引目標群體。它還通過羅列「為什麼人們不用批量電子郵件行銷」的原因，試圖建立信任。
3. 潛在釣魚信特徵：
- 提供了 URL：http7777772E62a6E6E6564c642E6E6574bulkhtm，這看起來並非可信且可能存在安全風險。
- 試圖說服用戶購買產品以獲得「快速致富」的方案。
- 提到如果不希望收到郵件，可點擊移除鏈接，但鏈接的真實性無法確認，可能會收集用戶敏感信息。
4. 結論：
該郵件包含了典型的釣魚信件特徵，包括不可信的 URL 和過於誘惑的語言，因此判斷為釣魚信件（True）。
答案：True


郵件：2002 it systems development allocations kevin  tim  please review the attached listing and advise if you agree to the projects and amounts listed as billings that enron north america will receive in 2002 for support of these efforts  i  m at x 30352 if you have any questions  thanks 
分析：
1. 主旨與目的：該郵件的主旨是請求 Kevin 和 Tim 檢視附上的專案清單，並確認其是否同意所列出的項目與金額。
2. 語言特徵：郵件語言直接且清晰，使用了正式且專業的語氣，未包含過於吸引人的措辭或過多的呼籲性語句。
3. 潛在釣魚信特徵：
- 未包含任何不可信的 URL。
- 未要求接收者提供敏感信息（例如賬戶密碼或個人身份資料）。
- 未強調任何緊急性或試圖操控情緒。
4. 結論：
此郵件是安全郵件，未包含任何釣魚信件特徵（False）。
答案：False
```

問題：{question}<end_of_turn>
<start_of_turn>model
答案："""

# Turn the chain-of-thought template text into a PromptTemplate.
# NOTE(review): the template asks for a step-by-step analysis but its model
# turn ends with the answer label ("答案："), so the model is cued to answer
# directly - confirm this is intended.
CoT_prompt = PromptTemplate.from_template(CoT_template)

# Chain: the raw input becomes the "question" variable, is rendered into the
# CoT prompt, and the rendered prompt is sent to the LLM.
CoT_chain = RunnableParallel(question=RunnablePassthrough()) | CoT_prompt | llm

In [None]:
# Outcomes of the chain-of-thought run, one entry per e-mail. Each entry
# stores whether the model was right ("True") or wrong ("False"), not the
# label the model produced.
CoT_result = []

# Phishing e-mails are processed first and should be answered with "True";
# safe e-mails follow and should be answered with "False". The expected word
# (capitalised or lower-case) appearing anywhere in the reply counts as correct.
for emails, expected in ((phishing_emails, "True"), (safe_emails, "False")):
    for email_text in emails["Email Text"]:
        result = CoT_chain.invoke(email_text)
        answered_correctly = expected in result or expected.lower() in result
        CoT_result.append("True" if answered_correctly else "False")

# Hard-coded chain-of-thought outcomes (50 entries, five per row). This
# assignment replaces whatever `CoT_result` held before.
# Presumably the first 25 entries belong to the phishing e-mails and the last
# 25 to the safe e-mails - confirm against the run that produced them.
CoT_result = [
    'True', 'True', 'False', 'True', 'True',
    'True', 'True', 'True', 'True', 'True',
    'True', 'True', 'True', 'True', 'False',
    'True', 'True', 'True', 'True', 'True',
    'True', 'True', 'True', 'True', 'True',
    'False', 'False', 'False', 'False', 'False',
    'False', 'False', 'False', 'False', 'False',
    'False', 'False', 'False', 'False', 'False',
    'False', 'False', 'False', 'False', 'True',
    'False', 'False', 'False', 'True', 'False',
]

In [22]:
# Accuracy of the chain-of-thought prompt over the evaluated e-mails.
CoT_acc = calcu_result(CoT_result)
CoT_acc
# 0.5 (value noted from an earlier run)

0.5

### Self-Consistency

In [13]:
# Repeated-sampling run using the zero-shot chain. Each entry stores whether
# a single reply was right ("True") or wrong ("False").
# NOTE(review): only the first phishing e-mail and the first safe e-mail are
# queried (10 times each) and every reply is scored on its own; no majority
# vote over the samples is taken.
self_result = []

# The phishing e-mail should be answered with "True", the safe one with
# "False"; the expected word (capitalised or lower-case) anywhere in the reply
# counts as correct.
for emails, expected in ((phishing_emails, "True"), (safe_emails, "False")):
    for i in range(10):
        result = normal_chain.invoke(emails["Email Text"].iloc[0])
        answered_correctly = expected in result or expected.lower() in result
        self_result.append("True" if answered_correctly else "False")

In [None]:
# Share of correct replies across the repeated samples.
self_acc = calcu_result(self_result)
self_acc
# 0.65 (value noted from an earlier run)

0.65

## RAG

讀入 PDF 檔


In [26]:
# Load the PDF file, split into one document per page.
loader = PyPDFLoader("./dataset/PhishingMail_Analysis.pdf")
pages = loader.load()

# Split the PDF content into smaller chunks.
text_splitter = RecursiveCharacterTextSplitter(
    # Chunk size, measured with length_function (here: characters)
    chunk_size=1000,
    # Number of characters that neighbouring chunks overlap
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.split_documents(pages)
# Display the first ten chunks.
texts[:10]

[Document(page_content='SMA\nFinal Project悲傷 Peter 與他的快樂⼩伙伴\n組員\nB094020046 ⿈奕瑋\nB094020011  邱亮傑\nB094020007 陳⽂薇\nM124020010 鄭雅云', metadata={'source': './dataset/PhishingMail_Analysis.pdf', 'page': 0}),
 Document(page_content='Overview\nEDA、詞頻分析 01\n情緒分析 04詞性分析 02\n主題模型 03\n分類器 05\n總結 06', metadata={'source': './dataset/PhishingMail_Analysis.pdf', 'page': 1}),
 Document(page_content='平均字數 535.17\n字數標準差 25850.65\n最少字數 0\n25% 字數 74\n50% 字數 160\n75% 字數 354\n最多字數 3527576EDA\n資料集： Kaggle 上的釣⿂信件資料集了解資料集\n資料欄位數： 3 個欄位  – index, Email Text, Email Type\nEmail Type ： Safe Email, Phishing Email\n資料筆數： 18650 筆資料來源：\nhttps://www.kaggle.com/datasets/\nsubhajournal/phishingemails/data', metadata={'source': './dataset/PhishingMail_Analysis.pdf', 'page': 2}),
 Document(page_content='詞頻分析\n分類觀察：總信件、安全信件、釣⿂信件計算去除停⽤字後的斷詞詞頻\nenron  事件\n使⽤者設定  languageenron  事件\n使⽤者設定  languagecompany\nfree、 get 、 click 、 money詞頻分析結果與後續分析結果幾乎⼀致：釣⿂信件具明顯特徵', metadata={'source': './dataset/PhishingMail_Analysis.pdf', 'page': 3

將 PDF 內資料 embedding，並存入向量資料庫

In [None]:
# Create the embedding function.
embedding_function = SentenceTransformerEmbeddings(model_name="lier007/xiaobu-embedding")
# Create the vector database (Chroma) and store `texts` in it as vectors.
vectorstore = Chroma.from_documents(documents=texts, embedding=embedding_function)
# The retriever finds the text chunks most relevant to a query.
retriever = vectorstore.as_retriever()

In [None]:
# Sanity-check the vector store with a sample query and print the content of
# the first document returned.
query = "釣魚信件有哪些主題"
docs = vectorstore.similarity_search(query)

print(docs[0].page_content)

BERTopic 主題模型 
BERTopic 分類結果
各主題代表字
主題⼀、釣⿂信件的常⾒⽤詞
      free、 company 、 information 、 report 、 get
主題⼆、國家與政府
      enenkio 、 islands 、 kingdom 、 marshall 、 atoll
主題三、科技技術
      fuel、 battery 、 cell 、 box 、 ones
主題四、網⾴程式碼
     function 、 documentwritett 、 easy 、 var 、 pattern主題五、宗教
      cns、 counseling 、 christian 、 crditos 、 theological
主題六、學術領域
      karpenkov 、 dcenter 、 align 、 occurrences 、 conductors
主題七、健康與醫學
      acts、 organs 、 pathway 、 endocrine 、 neuro
BERTopic 更詳細的分割不同主題，我們可以知道除了  LDA 所分出的常⾒主題外，釣⿂信還包含了各式各樣的主題。
這也提醒我們需要更加警覺，因為釣⿂信會變換多種形式和內容進⾏釣⿂。


In [None]:
def format_docs(docs):
    """Join the ``page_content`` of every item in *docs*, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# RAG prompt in Gemma's turn format. {context} receives the retrieved
# reference text and {question} the e-mail to classify.
# Two formatting defects of the earlier prompt are fixed here:
#   * the stray "<</SYS>>" marker (not part of the <start_of_turn> /
#     <end_of_turn> turn format used by this file's prompts) is removed;
#   * the model turn now starts on its own line after "<start_of_turn>model",
#     matching the other templates in this notebook.
# The variable name (including its spelling) is kept so existing references
# continue to work.
rag_prompt_templete = """<start_of_turn>user
"角色": "資安專家",
"工作內容":
你是一位資安專家，將會幫助判斷以下郵件是否為釣魚信件。\n
現在給予以下相關文件, 你可以參考以下[文件][文件]中的相關文件，來判斷郵件是否為釣魚信件。\n
請逐步分析郵件的內容，並按照以下流程進行：
1. 提取郵件的主旨與目的。
2. 分析郵件的語言特徵（例如是否使用過於吸引人的措辭、過多的呼籲性語句）。
3. 判斷是否包含潛在的釣魚信件特徵（例如要求用戶提供敏感信息、提供不可信的 URL 或強調緊急性）。
4. 根據分析給出結論，如果是釣魚信件，請回答「True」，如果不是釣魚信件，請回答「False」。
\n
[文件]
{context}
[文件]

現在，基於上述的文件，分析 () 中的郵件\n[郵件]:({question})\n<end_of_turn>
<start_of_turn>model
[結果]:"""

rag_prompt = PromptTemplate.from_template(rag_prompt_templete)

# Chain: the input e-mail is used twice - as the retriever query (the retrieved
# documents are flattened to text by format_docs and become {context}) and,
# unchanged, as {question}. The rendered prompt is then sent to the LLM.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
)
# rag_chain.get_graph().print_ascii()

In [None]:
# Outcomes of the RAG run, one entry per e-mail. Each entry stores whether
# the model was right ("True") or wrong ("False"), not the label the model
# produced.
rag_result = []

# Phishing e-mails are processed first and should be answered with "True";
# safe e-mails follow and should be answered with "False". The expected word
# (capitalised or lower-case) appearing anywhere in the reply counts as correct.
for emails, expected in ((phishing_emails, "True"), (safe_emails, "False")):
    for email_text in emails["Email Text"]:
        result = rag_chain.invoke(email_text)
        answered_correctly = expected in result or expected.lower() in result
        rag_result.append("True" if answered_correctly else "False")

# Hard-coded RAG outcomes (50 entries, five per row). This assignment
# replaces whatever `rag_result` held before.
# Presumably the first 25 entries belong to the phishing e-mails and the last
# 25 to the safe e-mails - confirm against the run that produced them.
rag_result = [
    'True', 'True', 'True', 'True', 'False',
    'True', 'True', 'True', 'True', 'True',
    'True', 'True', 'False', 'True', 'True',
    'True', 'True', 'True', 'True', 'True',
    'True', 'True', 'True', 'True', 'True',
    'True', 'True', 'True', 'True', 'False',
    'False', 'False', 'True', 'False', 'False',
    'True', 'True', 'False', 'False', 'True',
    'False', 'False', 'False', 'False', 'True',
    'False', 'False', 'True', 'True', 'False',
]

In [None]:
# Accuracy of the RAG prompt over the evaluated e-mails.
rag_acc = calcu_result(rag_result)
rag_acc
# 0.68 (value noted from an earlier run)

0.68

In [None]:
# Collect the accuracy of every method, in matching order.
data = {
    "Method": ["Normal", "Few-shot", "CoT", "Self-consistency", "RAG"],
    "Accuracy": [normal_acc, fewshot_acc, CoT_acc, self_acc, rag_acc]
}

# Convert to a DataFrame.
df = pd.DataFrame(data)

# Show the comparison table.
print(df)

             Method  Accuracy
0            Normal      0.58
1          Few-shot      0.68
2               CoT      0.50
3  Self-consistency      0.65
4               RAG      0.68


根據結果來看，RAG 以及 Few-shot 有著最好的結果。<br>
CoT 的結果沒有預期來得好；從 result 來看，模型幾乎將所有信件都判斷為釣魚信件。