In [4]:
import pandas as pd

# 載入資料
ptt_df = pd.read_csv("ptt_語料_處理後.csv")
mobile01_df = pd.read_csv("mobile01_處理後.csv")
finfo_df = pd.read_csv("finfo_posts_產險_壽險_投資型.csv")
scam_keywords_df = pd.read_csv("cleaned/500精簡詐騙字詞_UTF8.csv", header=None)

# 合併所有來源
combined_df = pd.concat([ptt_df, mobile01_df, finfo_df], ignore_index=True)

# 正規化欄位名稱
combined_df.columns = [col.strip().lower() for col in combined_df.columns]
scam_keywords = scam_keywords_df[0].dropna().unique().tolist()

# 取出所有帳號資料（發文、留言、Mobile01）
post_authors = combined_df['發文者帳號'].dropna().astype(str)
comment_authors = combined_df['留言帳號'].dropna().astype(str)
mobile01_authors = combined_df['author'].dropna().astype(str)

# 統計帳號出現次數
all_accounts = pd.concat([post_authors, comment_authors, mobile01_authors], ignore_index=True)
account_counts = all_accounts.value_counts()
single_appearance_accounts = set(account_counts[account_counts == 1].index)

# 整合所有文字內容
combined_df['完整內容'] = combined_df[['內容', '發文內容', '留言內容', 'content']].fillna('').agg(' '.join, axis=1)

# 篩選帳號只出現一次的樣本
account_related_records = combined_df[
    combined_df['發文者帳號'].isin(single_appearance_accounts) |
    combined_df['留言帳號'].isin(single_appearance_accounts) |
    combined_df['author'].isin(single_appearance_accounts)
].copy()

# 篩掉包含詐騙關鍵字的樣本
account_related_records['包含詐騙詞'] = account_related_records['完整內容'].apply(
    lambda x: any(keyword in x for keyword in scam_keywords)
)
filtered_records = account_related_records[~account_related_records['包含詐騙詞']]

# 最終樣本數
print(f"收斂後樣本數量：{len(filtered_records)}")


收斂後樣本數量：5926


In [5]:
# Export the filtered (收斂後) records to a CSV file
output_path = "收斂樣本_無詐騙詞_帳號只出現一次.csv"
filtered_records.to_csv(output_path, index=False)

output_path

'收斂樣本_無詐騙詞_帳號只出現一次.csv'

In [3]:
# 準備完整 .ipynb 所需的程式碼內容
notebook_code = {
    "cells": [
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# 詐騙帳號偵測特徵建構與異常分析\n",
                "本 notebook 將處理收斂過的樣本資料，建構詐騙特徵，並使用 Isolation Forest 與 One-Class SVM 進行異常行為偵測。"
            ]
        },
        {
            "cell_type": "code",
            "metadata": {},
            "source": [
                "import pandas as pd\n",
                "from sklearn.feature_extraction.text import TfidfVectorizer\n",
                "from sklearn.ensemble import IsolationForest\n",
                "from sklearn.svm import OneClassSVM\n",
                "from IPython.display import display\n",
                "\n",
                "# 讀取資料\n",
                "df = pd.read_csv(\"收斂樣本_無詐騙詞_帳號只出現一次.csv\")\n",
                "scam_keywords = pd.read_csv(\"cleaned/500精簡詐騙字詞_UTF8.csv\", header=None)[0].dropna().unique().tolist()\n",
                "df.columns = [col.strip().lower() for col in df.columns]"
            ]
        },
        {
            "cell_type": "code",
            "metadata": {},
            "source": [
                "# 整合欄位與時間處理\n",
                "df['帳號'] = df[['發文者帳號', '留言帳號', 'author']].bfill(axis=1).iloc[:, 0]\n",
                "df['內容'] = df[['內容', '發文內容', '留言內容', 'content']].fillna('').agg(' '.join, axis=1)\n",
                "df['是否主文'] = df['是否主文'].fillna(False).astype(int)\n",
                "df['時間'] = df[['發文時間', '留言時間', 'post_time', 'time']].bfill(axis=1).iloc[:, 0]\n",
                "df['時間'] = pd.to_datetime(df['時間'], errors='coerce')"
            ]
        },
        {
            "cell_type": "code",
            "metadata": {},
            "source": [
                "# 特徵工程\n",
                "account_counts = df['帳號'].value_counts()\n",
                "df['帳號出現次數'] = df['帳號'].map(account_counts)\n",
                "df['是否凌晨'] = df['時間'].dt.hour.apply(lambda x: 1 if pd.notnull(x) and 0 <= x < 6 else 0)\n",
                "df['是否含詐騙字詞'] = df['內容'].apply(lambda x: any(word in x for word in scam_keywords)).astype(int)\n",
                "df['內容長度'] = df['內容'].apply(len)\n",
                "來源_dummies = pd.get_dummies(df['來源'], prefix='來源')"
            ]
        },
        {
            "cell_type": "code",
            "metadata": {},
            "source": [
                "# TF-IDF 特徵\n",
                "tfidf_vectorizer = TfidfVectorizer(max_features=100)\n",
                "tfidf_matrix = tfidf_vectorizer.fit_transform(df['內容'].fillna(''))\n",
                "tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])])"
            ]
        },
        {
            "cell_type": "code",
            "metadata": {},
            "source": [
                "# 合併所有特徵\n",
                "feature_set = pd.concat([\n",
                "    df[['帳號', '帳號出現次數', '是否凌晨', '是否含詐騙字詞', '內容長度', '是否主文']].reset_index(drop=True),\n",
                "    來源_dummies.reset_index(drop=True),\n",
                "    tfidf_df.reset_index(drop=True)\n",
                "], axis=1)"
            ]
        },
        {
            "cell_type": "code",
            "metadata": {},
            "source": [
                "# 異常偵測模型\n",
                "iso_model = IsolationForest(random_state=42)\n",
                "feature_set['isolation_anomaly_score'] = iso_model.fit_predict(feature_set.drop(columns=['帳號']))\n",
                "\n",
                "svm_model = OneClassSVM(gamma='scale', nu=0.1)\n",
                "feature_set['svm_anomaly_score'] = svm_model.fit_predict(feature_set.drop(columns=['帳號']))"
            ]
        },
        {
            "cell_type": "code",
            "metadata": {},
            "source": [
                "# 顯示前幾筆結果\n",
                "display(feature_set.head())"
            ]
        },
        {
            "cell_type": "code",
            "metadata": {},
            "source": [
                "# 匯出為 CSV\n",
                "feature_set.to_csv(\"完整詐騙偵測特徵表.csv\", index=False)"
            ]
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "name": "python",
            "version": "3.8"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 5
}

# 儲存為 .ipynb 檔案
import json

notebook_path = "詐騙特徵建構與異常偵測.ipynb"
with open(notebook_path, "w", encoding="utf-8") as f:
    json.dump(notebook_code, f, ensure_ascii=False, indent=2)

notebook_path



'詐騙特徵建構與異常偵測.ipynb'

In [4]:
# 假設你已經訓練好這個 vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['內容'].fillna(''))

# 查看每個 tfidf 特徵名稱（對應詞語）
feature_names = tfidf_vectorizer.get_feature_names_out()

# 建立對應字典
tfidf_feature_mapping = {f'tfidf_{i}': word for i, word in enumerate(feature_names)}

# 顯示前幾個看看
for i in range(10):
    print(f'tfidf_{i} => {tfidf_feature_mapping[f"tfidf_{i}"]}')


NameError: name 'df' is not defined

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# 重新讀取原始內容資料（非特徵表，避免重複 tfidf）
df = pd.read_csv("收斂樣本_無詐騙詞_帳號只出現一次.csv")
df.columns = [col.strip().lower() for col in df.columns]

# 整合內容欄位
df['內容'] = df[['內容', '發文內容', '留言內容', 'content']].fillna('').agg(' '.join, axis=1)

# 重建 TfidfVectorizer，這次保留原始詞彙作為欄位名
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['內容'].fillna(''))

# 使用詞彙名稱建立欄位名
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df_named = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# 輸出新的 TF-IDF 特徵表（含詞彙名）
output_path = "TFIDF_詞彙命名特徵表.csv"
tfidf_df_named.to_csv(output_path, index=False)

output_path


'TFIDF_詞彙命名特徵表.csv'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 重新載入原始收斂樣本資料
df = pd.read_csv("收斂樣本_無詐騙詞_帳號只出現一次.csv")
df.columns = [col.strip().lower() for col in df.columns]

# 整合內容欄位
df['內容'] = df[['內容', '發文內容', '留言內容', 'content']].fillna('').agg(' '.join, axis=1)

# 建立乾淨版本的 TfidfVectorizer
clean_vectorizer = TfidfVectorizer(
    max_features=100,
    stop_words=['com', 'http', 'https', 'imgur', 'wrote', '恕刪', '分享', '笑死'],
    token_pattern=r"(?u)\b\w{2,}\b"  # 只保留長度 >= 2 的詞，排除純數字與符號
)

# 擬合與轉換
clean_tfidf_matrix = clean_vectorizer.fit_transform(df['內容'].fillna(''))
clean_feature_names = clean_vectorizer.get_feature_names_out()

# 建立 DataFrame 並輸出
clean_tfidf_df = pd.DataFrame(clean_tfidf_matrix.toarray(), columns=clean_feature_names)
clean_tfidf_output_path = ".csv"
clean_tfidf_df.to_csv(clean_tfidf_output_path, index=False)

clean_tfidf_output_path


'TFIDF_詞彙命名特徵表.csv'