In [1]:
# 自定義語氣分類詞典
tone_map = {
    "立即": "催促",
    "馬上": "催促",
    "請在": "催促",
    "限時": "催促",
    "否則": "威脅",
    "通緝": "恐嚇",
    "警察": "權威假冒",
    "法院": "權威假冒",
    "客服": "假冒正規",
    "司法": "恐嚇",
    "監管": "恐嚇",
    "保證金": "誘導",
    "中獎": "誘導",
    "轉帳": "指示",
    "ATM": "指示",
    "帳戶": "指示",
    "醫藥費": "求助",
    "救我": "求助",
}


In [5]:
import jieba
import pandas as pd
from collections import Counter

with open("關鍵字.txt", "r", encoding="utf-8") as f:
    text = f.read()

# 自定義停用詞（可擴充）
stopwords = set([
    '我們', '您', '你的', '我的', '是', '在', '了', '就', '都', '以', '為', '請', '的', 
    '這裡', '您好', '您好！', '您好，', '若', '若您', '請您', '現在', '將', '可能'
])

tokens = jieba.lcut(text)
tokens = [word for word in tokens if word.strip() not in stopwords and len(word.strip()) > 1]

# 詞頻
token_counts = Counter(tokens)

# 加語氣分類
tone_map = {
    "立即": "催促", "馬上": "催促", "請在": "催促", "限時": "催促",
    "否則": "威脅", "通緝": "恐嚇", "警察": "權威假冒", "法院": "權威假冒",
    "客服": "假冒正規", "司法": "恐嚇", "監管": "恐嚇", "保證金": "誘導",
    "中獎": "誘導", "轉帳": "指示", "ATM": "指示", "帳戶": "指示",
    "醫藥費": "求助", "救我": "求助",
}

df = pd.DataFrame(token_counts.items(), columns=["詞語", "出現次數"])
df["語氣分類"] = df["詞語"].map(tone_map).fillna("其他/未分類")
df = df.sort_values(by="出現次數", ascending=False)

df.to_csv("去除贅詞_詐騙語氣統計.csv", index=False)
print(df.head(20))


     詞語  出現次數    語氣分類
324  支付    60  其他/未分類
896  保單    53  其他/未分類
214  提供    45  其他/未分類
148  立即    40      催促
144  帳戶    39      指示
2    需要    38  其他/未分類
866  保險    37  其他/未分類
419  公司    24  其他/未分類
962  保費    23  其他/未分類
222  通知    23  其他/未分類
193  XX    22  其他/未分類
911  投保    21  其他/未分類
468  領取    21  其他/未分類
225  辦理    21  其他/未分類
143  銀行    19  其他/未分類
94   才能    19  其他/未分類
168  資料    18  其他/未分類
294  客服    17    假冒正規
346  避免    16  其他/未分類
13   一筆    16  其他/未分類


In [3]:
pip install jieba

Collecting jieba
  Downloading jieba-0.42.1.tar.gz (19.2 MB)
     ---------------------------------------- 0.0/19.2 MB ? eta -:--:--
     ----- ---------------------------------- 2.6/19.2 MB 16.7 MB/s eta 0:00:01
     ---------------- ----------------------- 8.1/19.2 MB 22.9 MB/s eta 0:00:01
     ---------------------------- ---------- 13.9/19.2 MB 24.9 MB/s eta 0:00:01
     --------------------------------------  19.1/19.2 MB 25.7 MB/s eta 0:00:01
     --------------------------------------- 19.2/19.2 MB 23.3 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: jieba
  Building wheel for jieba (setup.py): started
  Building wheel for jieba (setup.py): finished with status 'done'
  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314527 sha256=c25bcdeb1987ceaa798da2b47f937849eeeff189a84837324cc6bbd4848b4324
  Stored in directory: c:\users\robby1206\appdata\loc

In [6]:
import csv

def process_fraud_keywords(csv_file):
    """
    Processes a CSV file containing fraud keywords, adds tone markers,
    and filters the list based on frequency and relevance.

    Args:
        csv_file (str): Path to the CSV file.

    Returns:
        list: A list of tone-marked fraud keywords.
    """

    fraud_keywords = []
    tone_mapping = {
        "催促": "(緊急)",
        "威脅": "(威脅)",
        "誘導": "(誘餌)",
        "假冒正規": "(假冒)",
        "指示": "(指示)",
        "恐嚇": "(恐嚇)",
        "權威假冒": "(假冒權威)"
    }

    with open(csv_file, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip header row
        for row in reader:
            try:
                word, frequency, category = row
                frequency = int(frequency)
                tone = tone_mapping.get(category, "")  # Get tone from mapping
                if tone:
                    word = f"{word} {tone}"  # Add tone marker
                # Filter based on frequency (adjust thresholds as needed)
                if frequency >= 3:  # Keep words with frequency >= 3
                    fraud_keywords.append((word, frequency))
            except ValueError as e:
                print(f"Skipping row due to error: {row} - {e}")
            except Exception as e:
                print(f"An unexpected error occurred: {e}")

    # Sort by frequency (descending)
    fraud_keywords.sort(key=lambda item: item[1], reverse=True)

    # Further filtering and reduction (customize as needed)
    filtered_keywords = []
    for word, frequency in fraud_keywords:
        #Remove generic terms (customize the list)
        generic_terms = ["資料", "公司", "需要", "可以", "真的", "系統", "完成"]
        if word.split(" ")[0] not in generic_terms: #check the base word, not including tone
            filtered_keywords.append(word)

    return filtered_keywords[:500] # Limit to 500 words

# Example usage:
csv_file = "去除贅詞_詐騙語氣統計.csv"
processed_keywords = process_fraud_keywords(csv_file)

for keyword in processed_keywords:
    print(keyword)

支付
保單
提供
立即 (緊急)
帳戶 (指示)
保險
保費
通知
XX
投保
領取
辦理
銀行
才能
客服 (假冒)
避免
一筆
確認
否則 (威脅)
安全
涉及
以免
保障
費用
客戶
繳納
申請
發現
活動
000
調查
身分
資金
聯絡
保證
手續費
帳號
資訊以
扣款
萬元
中心
交易
訂單
退款
資訊
盜用
重新
不然
指示
理賠
配合
異常
保證金 (誘餌)
未繳
無法
付款
收益
包裹
信用卡
50
處理
更新
所有
登入
獲得
資格
這是
取消
醫療
繳清
電話
專案
提醒
操作
現金
問題
罰款
協助
獎品
投資
受益人
10
即可
驗證
一次
升級
凍結
身份
犯罪
中獎 (誘餌)
如需
商品
連結
感謝
VIP
繳費
平台
手續
只要
監管 (恐嚇)
利息
證明
退費
需繳
要求
保險理
程序
參加
不及
目前
失效
30
不想
以便
即將
網站
每年
收取
顯示
政府
本金
錯過
手機
案件
今日
今天
電腦
支持
號碼
盡快
撥款
抽獎
回饋
500
有人
懷疑
5%
海外
親屬
一定
貸款
額度
按照
生效
出現
幸運
差額
偵測
匯款
已經
近期
意外
尚未
請先
1000
購買
投入
機會
提醒您
金額
欠款
100
指定
翻倍
必須
獎金
退回
補繳
密碼
損失
購物
最新
一份
我現
恭喜
準備
幫忙
遇到
領回
賠償
過期
醫院
不是
地址
助您
繳保費
學校
正在
非法
即刻
簽署
詐騙
實性
機構
之前
建議
加保
增加
沒有
可疑
卡號
利用
旅遊
名額
核准
證金
放款
一位
影響
額外
訴訟
存款
警告
寄送
重要
明天
墊付
請將
放棄
一年
加入
回報
證件
涉嫌
需先
親愛的
小時
10%
流程
檢察官
一起
領獎
依法
先繳
相關
2000
不要
填寫
成功
平安
強制
本案
孩子
請點擊
鎖定
停機
自動
相信
運費
法律
軟體
人壽
每月
網路
以確
解鎖
專員
審查
優惠
提交
如果
逾期
費以
不法
可選擇
退還
服務
一張
退保
終止
監控
立刻
暫時
使用
方案
錯誤
分紅
ATM (指示)
最近
我們將
文件
效力
權益
價值
推薦
不會
停用
高額
不足
本行
專線
交保
用以
年度
和解
先借


In [7]:
import csv

def process_fraud_keywords(csv_file, output_file):
    """
    Processes a CSV file containing fraud keywords, adds tone markers,
    filters the list based on frequency and relevance, and saves the
    results to a new CSV file.

    Args:
        csv_file (str): Path to the input CSV file.
        output_file (str): Path to the output CSV file.
    """

    fraud_keywords = []
    tone_mapping = {
        "催促": "(緊急)",
        "威脅": "(威脅)",
        "誘導": "(誘餌)",
        "假冒正規": "(假冒)",
        "指示": "(指示)",
        "恐嚇": "(恐嚇)",
        "權威假冒": "(假冒權威)"
    }

    with open(csv_file, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip header row
        for row in reader:
            try:
                word, frequency, category = row
                frequency = int(frequency)
                tone = tone_mapping.get(category, "")  # Get tone from mapping
                if tone:
                    word = f"{word} {tone}"  # Add tone marker

                if frequency >= 3:  # Keep words with frequency >= 3
                    fraud_keywords.append((word, frequency))
            except ValueError as e:
                print(f"Skipping row due to error: {row} - {e}")
            except Exception as e:
                print(f"An unexpected error occurred: {e}")

    # Sort by frequency (descending)
    fraud_keywords.sort(key=lambda item: item[1], reverse=True)

    # Further filtering and reduction (customize as needed)
    filtered_keywords = []
    for word, frequency in fraud_keywords:
        #Remove generic terms (customize the list)
        generic_terms = ["資料", "公司", "需要", "可以", "真的", "系統", "完成"]
        if word.split(" ")[0] not in generic_terms: #check the base word, not including tone
            filtered_keywords.append(word)

    filtered_keywords = filtered_keywords[:500] # Limit to 500 words

    # Save to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["keyword"])  # Header row
        for keyword in filtered_keywords:
            writer.writerow([keyword])

# Example usage:
csv_file = "去除贅詞_詐騙語氣統計.csv"
output_file = "processed_fraud_keywords.csv"
process_fraud_keywords(csv_file, output_file)

print(f"Processed keywords saved to {output_file}")

Processed keywords saved to processed_fraud_keywords.csv


In [8]:
import csv

def process_fraud_keywords(csv_file, output_file):
    """
    Processes a CSV file containing fraud keywords, adds tone markers,
    filters the list based on frequency and relevance, and saves the
    results to a new CSV file.

    Args:
        csv_file (str): Path to the input CSV file.
        output_file (str): Path to the output CSV file.
    """

    fraud_keywords = []
    tone_mapping = {
        "催促": "(緊急)",
        "威脅": "(威脅)",
        "誘導": "(誘餌)",
        "假冒正規": "(假冒)",
        "指示": "(指示)",
        "恐嚇": "(恐嚇)",
        "權威假冒": "(假冒權威)"
    }

    with open(csv_file, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip header row
        for row in reader:
            try:
                word, frequency, category = row
                frequency = int(frequency)
                tone = tone_mapping.get(category, "")  # Get tone from mapping
                if tone:
                    word = f"{word} {tone}"  # Add tone marker

                if frequency >= 2:  # Keep words with frequency >= 2 (Reduced Threshold)
                    fraud_keywords.append((word, frequency))
            except ValueError as e:
                print(f"Skipping row due to error: {row} - {e}")
            except Exception as e:
                print(f"An unexpected error occurred: {e}")

    # Sort by frequency (descending)
    fraud_keywords.sort(key=lambda item: item[1], reverse=True)

    # Further filtering and reduction (customize as needed)
    filtered_keywords = []
    for word, frequency in fraud_keywords:
        #Remove generic terms (customize the list)
        generic_terms = ["資料", "公司", "需要", "可以", "真的", "系統"] # Reduced Generic Terms
        if word.split(" ")[0] not in generic_terms: #check the base word, not including tone
            filtered_keywords.append(word)

    filtered_keywords = filtered_keywords[:500] # Limit to 500 words

    # Save to CSV
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["keyword"])  # Header row
        for keyword in filtered_keywords:
            writer.writerow([keyword])

# Example usage:
csv_file = "去除贅詞_詐騙語氣統計.csv"
output_file = "processed_fraud_keywords.csv"
process_fraud_keywords(csv_file, output_file)

print(f"Processed keywords saved to {output_file}")

Processed keywords saved to processed_fraud_keywords.csv


In [9]:
# 重新載入檔案（因為執行狀態重置）
import pandas as pd

# 嘗試以多種編碼打開 CSV 檔案
encodings_to_try = ["utf-8", "utf-8-sig", "big5", "cp950", "gbk"]
file_path = "500精簡詐騙字詞.csv"

for enc in encodings_to_try:
    try:
        df = pd.read_csv(file_path, encoding=enc)
        detected_encoding = enc
        break
    except UnicodeDecodeError:
        continue

# 重新儲存為 UTF-8 格式
output_path = "500精簡詐騙字詞_UTF8.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")

output_path


'500精簡詐騙字詞_UTF8.csv'