In [2]:
import pandas as pd
import re

# === File paths ===
mobile01_path = '../mobile01_full_articles_f291.csv'
scam_path = '500精簡詐騙字詞_UTF8.csv'
output_path = './mobile01_處理後.csv'

# === Keyword lists ===
# Insurance terms are fixed; scam terms come from the '詞語' column of the
# scam-term CSV, with missing cells dropped and surrounding whitespace removed.
insurance_keywords = ['產險', '投資型', '壽險', '保單']
scam_keywords_df = pd.read_csv(scam_path)
scam_keywords = [term.strip() for term in scam_keywords_df['詞語'].dropna().astype(str)]

# === Load the forum data ===
mobile01_df = pd.read_csv(mobile01_path)

def count_keywords(text, keywords):
    """Return the total number of keyword occurrences found in ``text``.

    Parameters
    ----------
    text : object
        Value to scan. Missing values (``None`` / ``NaN``) yield 0; any other
        non-string value is converted with ``str`` before counting.
    keywords : iterable of str
        Substrings to look for. Empty strings are skipped.

    Returns
    -------
    int
        Sum over all keywords of ``str.count`` (non-overlapping matches).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return 0
        # e.g. a numeric cell: count on its textual form instead of raising
        # AttributeError on the missing ``.count`` method.
        text = str(text)
    # ``text.count('')`` is ``len(text) + 1``, so an empty keyword (such as a
    # whitespace-only cell after stripping) would inflate the result.
    return sum(text.count(k) for k in keywords if k)

# === Feature engineering ===
# Constant tag naming the source forum.
mobile01_df['論壇'] = 'Mobile01'
# Keep the original URL under the output column name.
mobile01_df['連結'] = mobile01_df['url']
# A row is flagged as the main post when its `page` value equals 1.
mobile01_df['是否主文'] = mobile01_df['page'].eq(1)

# 發文與留言帳號
def extract_author(text):
    """Return the word that directly precedes ``wrote:`` in ``text``.

    Only the first ``<word> wrote:`` occurrence is considered. ``None`` is
    returned when ``text`` is not a string or contains no such pattern.

    NOTE(review): this presumably picks up the name in a quote header
    ("X wrote:") -- confirm that it really identifies the row's own author.
    """
    if not isinstance(text, str):
        return None
    found = re.search(r'(\w+)\s+wrote:', text)
    if found is None:
        return None
    return found.group(1)

# Boolean mask: True for main-post rows, False for reply rows.
is_main_post = mobile01_df['是否主文']

# Poster / commenter account.
# The account is parsed once per row and then routed to the matching column
# with the mask. Column-wise `Series.where` replaces the former row-wise
# `DataFrame.apply(axis=1)`, which returns a DataFrame (not a Series) on an
# empty frame and therefore breaks the column assignment.
# Rows that do not belong to a column hold NaN; this is written to CSV as an
# empty field exactly like the previous None marker.
parsed_author = mobile01_df['content'].apply(extract_author)
mobile01_df['發文者帳號'] = parsed_author.where(is_main_post)
mobile01_df['留言帳號'] = parsed_author.where(~is_main_post)

# Post / comment time
mobile01_df['發文時間'] = mobile01_df['post_time'].where(is_main_post)
mobile01_df['留言時間'] = mobile01_df['post_time'].where(~is_main_post)

# Post / comment content
mobile01_df['發文內容'] = mobile01_df['content'].where(is_main_post)
mobile01_df['留言內容'] = mobile01_df['content'].where(~is_main_post)

# Keyword flags
mobile01_df['保險關鍵詞次數'] = mobile01_df['content'].apply(lambda x: count_keywords(x, insurance_keywords))
mobile01_df['詐騙關鍵詞次數'] = mobile01_df['content'].apply(lambda x: count_keywords(x, scam_keywords))
# str(x) turns a missing cell into 'nan', which contains neither company name.
mobile01_df['是否提及特定公司'] = mobile01_df['content'].apply(lambda x: any(k in str(x) for k in ['國泰', '大樹']))

# === Export ===
# utf-8-sig writes a BOM at the start of the file.
mobile01_df.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f'✅ 已完成清洗並輸出：{output_path}')


✅ 已完成清洗並輸出：./mobile01_處理後.csv


In [5]:
# 現在詐騙關鍵字詞檔案已上傳，重新處理三份 mobile01 原始資料
import pandas as pd
import re

# === File paths ===
# Raw Mobile01 exports to process, followed by the scam-term list.
file_paths = [
    "../mobile01_full_articles_f291.csv",
    "../mobile01_full_articles_802.csv",
    "../mobile01_articles804_full.csv",
]
scam_path = "500精簡詐騙字詞_UTF8.csv"

# === Insurance and scam keywords ===
# Scam terms come from the '詞語' column: missing cells are dropped and
# surrounding whitespace is removed from each term.
insurance_keywords = ['產險', '投資型', '壽險', '保單']
scam_keywords_df = pd.read_csv(scam_path)
scam_keywords = [term.strip() for term in scam_keywords_df['詞語'].dropna().astype(str)]

def count_keywords(text, keywords):
    """Return the total number of keyword occurrences found in ``text``.

    Parameters
    ----------
    text : object
        Value to scan. Missing values (``None`` / ``NaN``) yield 0; any other
        non-string value is converted with ``str`` before counting.
    keywords : iterable of str
        Substrings to look for. Empty strings are skipped.

    Returns
    -------
    int
        Sum over all keywords of ``str.count`` (non-overlapping matches).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return 0
        # e.g. a numeric cell: count on its textual form instead of raising
        # AttributeError on the missing ``.count`` method.
        text = str(text)
    # ``text.count('')`` is ``len(text) + 1``, so an empty keyword (such as a
    # whitespace-only cell after stripping) would inflate the result.
    return sum(text.count(k) for k in keywords if k)

def extract_author(text):
    """Return the word that directly precedes ``wrote:`` in ``text``.

    Only the first ``<word> wrote:`` occurrence is considered. ``None`` is
    returned when ``text`` is not a string or contains no such pattern.

    NOTE(review): this presumably picks up the name in a quote header
    ("X wrote:") -- confirm that it really identifies the row's own author.
    """
    if not isinstance(text, str):
        return None
    found = re.search(r'(\w+)\s+wrote:', text)
    if found is None:
        return None
    return found.group(1)

def clean_mobile01(df):
    """Add the derived Mobile01 feature columns to ``df``.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. The columns ``page``, ``url``, ``post_time`` and ``content``
        are read.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns added or overwritten: 論壇, 是否主文,
        連結, 發文者帳號, 留言帳號, 發文時間, 留言時間, 發文內容, 留言內容,
        保險關鍵詞次數, 詐騙關鍵詞次數, 是否提及特定公司.

    Notes
    -----
    The post/comment split is done column-wise with ``Series.where`` on the
    main-post mask. The former row-wise ``DataFrame.apply(axis=1)`` returns a
    DataFrame instead of a Series for an empty frame, which made the column
    assignment fail, and it ran a Python lambda per row for every column.
    Cells that do not belong to a column hold NaN; ``to_csv`` writes them as
    empty fields exactly like the previous None marker.

    Relies on the module-level ``extract_author``, ``count_keywords``,
    ``insurance_keywords`` and ``scam_keywords``.
    """
    df['論壇'] = 'Mobile01'
    # A row is flagged as the main post when its `page` value equals 1.
    is_main_post = df['page'] == 1
    df['是否主文'] = is_main_post
    df['連結'] = df['url']

    # Parse the account once per row, then route it by the mask.
    parsed_author = df['content'].apply(extract_author)
    df['發文者帳號'] = parsed_author.where(is_main_post)
    df['留言帳號'] = parsed_author.where(~is_main_post)

    df['發文時間'] = df['post_time'].where(is_main_post)
    df['留言時間'] = df['post_time'].where(~is_main_post)
    df['發文內容'] = df['content'].where(is_main_post)
    df['留言內容'] = df['content'].where(~is_main_post)

    df['保險關鍵詞次數'] = df['content'].apply(lambda x: count_keywords(x, insurance_keywords))
    df['詐騙關鍵詞次數'] = df['content'].apply(lambda x: count_keywords(x, scam_keywords))
    # str(x) turns a missing cell into 'nan', which contains neither company name.
    df['是否提及特定公司'] = df['content'].apply(lambda x: any(k in str(x) for k in ['國泰', '大樹']))
    return df

# Process every input file: read each CSV and run it through clean_mobile01,
# collecting the results in the order of `file_paths`.
cleaned_dfs = []
for path in file_paths:
    df = pd.read_csv(path)
    cleaned_df = clean_mobile01(df)
    cleaned_dfs.append(cleaned_df)

# Concatenate the cleaned frames (ignore_index=True renumbers the rows) and
# write the result to a single CSV; utf-8-sig adds a BOM at the start of the file.
final_mobile_df = pd.concat(cleaned_dfs, ignore_index=True)
output_path = "../mobile01_處理後.csv"
final_mobile_df.to_csv(output_path, index=False, encoding='utf-8-sig')
output_path


'../mobile01_處理後.csv'