In [5]:
pip install transformers

Collecting transformers
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp313-cp313-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.51.2-py3-none-any.whl (10.4 MB)
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
   ------ --------------------------------- 1.6/10.4 MB 10.2 MB/s eta 0:00:01
   ------------------- -------------------- 5.0/10.4 MB 12.4 MB/s eta 0:00:01
 


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# 使用 DistilBERT 預測帳號是否為潛在詐騙者（適用全量資料）
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import joblib
from captum.attr import IntegratedGradients

# 檢查 GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("使用裝置：", device)

# 載入資料
mobile_df = pd.read_csv("mobile01_處理後.csv")
ptt_df = pd.read_csv("ptt_語料_處理後.csv")
finfo_df = pd.read_csv("finfo_posts_產險_壽險_投資型.csv")

# 平台標記
mobile_df['平台'] = 'Mobile01'
ptt_df['平台'] = 'PTT'
finfo_df['平台'] = 'Finfo'

# 帳號欄位統一化
for df in [mobile_df, ptt_df, finfo_df]:
    if '留言帳號' in df.columns and '發文者帳號' in df.columns:
        df['帳號'] = df['留言帳號'].fillna(df['發文者帳號'])
    elif '帳號' in df.columns:
        df['帳號'] = df['帳號']
    elif 'author' in df.columns:
        df['帳號'] = df['author']
    else:
        df['帳號'] = '未知帳號'

    df['留言內容'] = df['留言內容'] if '留言內容' in df.columns else ''
    df['發文內容'] = df['發文內容'] if '發文內容' in df.columns else ''
    df['text'] = df['留言內容'].fillna('') + ' ' + df['發文內容'].fillna('')

    if '詐騙關鍵詞次數' not in df.columns:
        df['詐騙關鍵詞次數'] = 0

# 合併資料
combined_df = pd.concat([
    mobile_df[['平台', '帳號', 'text', '詐騙關鍵詞次數']],
    ptt_df[['平台', '帳號', 'text', '詐騙關鍵詞次數']],
    finfo_df[['平台', '帳號', 'text', '詐騙關鍵詞次數']]
], ignore_index=True)

combined_df = combined_df.dropna(subset=['帳號'])
combined_df = combined_df[combined_df['text'].str.strip() != '']
combined_df['label'] = (combined_df['詐騙關鍵詞次數'] > 0).astype(int)

# 分割資料
train_texts, test_texts, train_labels, test_labels, train_accounts, test_accounts = train_test_split(
    combined_df['text'].values,
    combined_df['label'].values,
    combined_df['帳號'].values,
    test_size=0.2,
    random_state=42
)
test_text_raw = test_texts

# Tokenizer
model_name = 'ckiplab/bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dataset 類別
class ScamDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=64)
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = ScamDataset(train_texts, train_labels)
test_dataset = ScamDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4)

# 模型定義
class DistilBertClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 1)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]
        dropped = self.dropout(pooled_output)
        return torch.sigmoid(self.classifier(dropped)).squeeze()

model = DistilBertClassifier().to(device)

# Captum 整合解釋函數 + 視覺化 + 匯出
class WrapperModel(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
    def forward(self, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask).unsqueeze(0)

def explain_with_captum(text, visualize=True, save_csv=True):
    model.eval()
    wrapper = WrapperModel(model).to(device)
    ig = IntegratedGradients(wrapper)

    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=64)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    baseline = torch.zeros_like(input_ids).to(device)

    attributions, delta = ig.attribute(inputs=input_ids, baselines=baseline, additional_forward_args=attention_mask, return_convergence_delta=True)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    scores = attributions.squeeze(0).sum(dim=1).detach().cpu().numpy()

    df = pd.DataFrame({'Token': tokens, 'Importance': scores})
    df = df[df['Token'].str.startswith('▁') | df['Token'].str.len() == 1]  # 過濾特殊token（選擇性）
    df = df[df['Token'] != '[PAD]']
    df = df[df['Importance'] != 0.0]

    if visualize:
        plt.figure(figsize=(10, 4))
        sns.barplot(x='Importance', y='Token', data=df.sort_values(by='Importance', ascending=False))
        plt.title("Token 特徵貢獻視覺化")
        plt.tight_layout()
        plt.show()

    if save_csv:
        df.to_csv("captum_token_contributions.csv", index=False, encoding='utf-8-sig')
        print("已儲存 captum_token_contributions.csv")

    return df

# 範例
# explain_with_captum("這是一個投資報酬高的免費保單")


使用裝置： cpu


Some weights of BertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# 使用 DistilBERT 預測帳號是否為潛在詐騙者（適用全量資料）
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import joblib
from captum.attr import IntegratedGradients

# Captum 整合解釋函數 + 視覺化 + 匯出
class WrapperModel(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
    def forward(self, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask).unsqueeze(0)

def explain_with_captum(text, wrapper_model, model, tokenizer, device, visualize=True, save_csv=True, plot_file=None):
    model.eval()
    ig = IntegratedGradients(wrapper_model)

    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=64)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    baseline = torch.zeros_like(input_ids).to(device)

    attributions, delta = ig.attribute(inputs=input_ids, baselines=baseline, additional_forward_args=attention_mask, return_convergence_delta=True)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    scores = attributions.squeeze(0).sum(dim=1).detach().cpu().numpy()

    df = pd.DataFrame({'Token': tokens, 'Importance': scores})
    df = df[df['Token'].str.startswith('▁') | df['Token'].str.len() == 1]  # 過濾特殊token
    df = df[df['Token'] != '[PAD]']
    df = df[df['Importance'] != 0.0]

    if visualize:
        plt.figure(figsize=(10, 4))
        sns.barplot(x='Importance', y='Token', data=df.sort_values(by='Importance', ascending=False))
        plt.title("Token 特徵貢獻視覺化")
        plt.tight_layout()
        if plot_file:
            plt.savefig(plot_file, dpi=300)
        plt.close()

    if save_csv:
        df.to_csv(f"captum_token_contributions.csv", index=False, encoding='utf-8-sig')

    return df

# 批次執行 Top 10 高風險預測文字分析
def run_batch_captum_analysis(model, test_texts, test_labels, test_preds, tokenizer):
    wrapper = WrapperModel(model).to(device)
    high_risk_indices = np.argsort(test_preds)[-10:][::-1]
    top10_records = []

    for idx, i in enumerate(high_risk_indices):
        text = test_texts[i]
        label = test_labels[i]
        prob = test_preds[i]
        df = explain_with_captum(
            text, wrapper, model, tokenizer, device,
            visualize=True, save_csv=False, plot_file=f"captum_explain_{idx}.png"
        )
        df['Text_Index'] = i
        df['Original_Text'] = text
        df['True_Label'] = label
        df['Predicted_Prob'] = prob
        top10_records.append(df)

    final_df = pd.concat(top10_records, ignore_index=True)
    final_df.to_csv("captum_top10_tokens.csv", index=False, encoding='utf-8-sig')
    print("已匯出 top10 Captum 解釋結果與圖檔")


In [5]:
# 預測並啟用 Captum 批次分析
model.eval()
preds, trues, probs = [], [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        probs.extend(outputs.cpu().numpy())
        pred_labels = (outputs > 0.5).int().cpu().numpy()
        preds.extend(pred_labels)
        trues.extend(labels.cpu().numpy())

probs = np.array(probs)
preds = np.array(preds)
trues = np.array(trues)

run_batch_captum_analysis(model, test_texts, trues, probs, tokenizer)


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [2]:

pip install captum

Collecting captum
  Using cached captum-0.8.0-py3-none-any.whl.metadata (26 kB)
Using cached captum-0.8.0-py3-none-any.whl (1.4 MB)
Installing collected packages: captum
Successfully installed captum-0.8.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
!pip install transformers


# # 使用 DistilBERT 預測帳號是否為潛在詐騙者（適用全量資料）
# import pandas as pd
# import numpy as np
# import torch
# from torch import nn
# from torch.utils.data import Dataset, DataLoader
# from transformers import AutoTokenizer, AutoModel
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import classification_report, accuracy_score, f1_score
# from collections import defaultdict
# import matplotlib.pyplot as plt
# import seaborn as sns
# import gc
# import joblib
# from captum.attr import IntegratedGradients

Collecting transformers
  Using cached transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Using cached transformers-4.51.2-py3-none-any.whl (10.4 MB)
Using cached huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl (274 kB)
Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl (308 kB)
Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl (2.4 MB)
Installing collected packages: safetensors, regex, huggingface-hub, tokenizers, transf