In [3]:
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
import json


def get_web_page(page_url):
    resp = requests.get(page_url)
    if resp.status_code != 200:
        print("Invalid url:", resp.url)
        return None
    else:
        return resp.text


def get_articles(dom):
    soup = BeautifulSoup(dom, "html.parser")

    # 若 <a>有 href 屬性, 代表有超連結
    prev_link = soup.find("div", "btn-group-paging").find_all("a")[1]
    prev_link = prev_link["href"] if "href" in prev_link.attrs else None

    # 瀏覽每一個文章所在區塊
    positve_posts = []
    negative_posts = []
    for d in soup.find_all("div", "r-ent"):
        href = d.find("div", "title").a["href"]
        title = d.find("div", "title").text.strip()
        # 若標題為 [] 開頭, e.g., [好雷]XXXX
        if re.match(r"\[.*\]", title):
            tag = re.match(r"\[.*\]", title).group(0)
            # 標前內含'好'為好評; 含'負'或'爛'為負評
            if "好" in tag:
                positve_posts.append([title, href])
            if "負" in tag or "爛" in tag:
                negative_posts.append([title, href])
    return prev_link, positve_posts, negative_posts


def sanitize(txt):
    # 保留英數字、中文(\u4e00-\u9fa5)、中文標點符號、部分特殊符號
    # ^ 表示非括弧內指定的字元
    expr = re.compile(r'[^\u4e00-\u9fa5。﹔，：＂（）、？「」『』【】\s\w:/\-.()"]')
    txt = re.sub(expr, "", txt)
    # 用空白取代中英文標點
    txt = re.sub(r'[。﹔，：＂（）、？「」『』【】:/\-.()"]', " ", txt)
    txt = re.sub(r'(\s)+', ' ', txt)  # 用單一空白取代多個換行或 tab 符號
    txt = txt.replace("--", "")
    txt.lower()  # 英文字轉為小寫
    return txt


def get_post(post_url):
    resp = requests.get(url=post_url, cookies={"over18": "1"})
    soup = BeautifulSoup(resp.text, "html5lib")
    main_content = soup.find("div", id="main-content")

    # 把非本文的部分()標題區及推文區移除
    # 移除標題區塊
    for meta in main_content.find_all("div", "article-metaline"):
        meta.extract()
    # 移除推文區塊
    for push in main_content.find_all("div", "push"):
        push.extract()

    parsed = []

    # 移除 '※ 發信站:', '--'開頭, 本文區最後一行的文章網址
    for txt in main_content.stripped_strings:
        if txt[0] == "※" or txt[:2] == "--" or post_url in txt:
            continue
        txt = sanitize(txt)
        if txt:
            parsed.append(txt)
    return " ".join(parsed)

def get_article_body(csv_file):
    id_to_body = {}
    with open(csv_file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            print("處理", row["title"], row["href"])
            title = " ".join(row["title"].split("]")[1:])
            title = sanitize(title)
            body = get_post(PTT_URL + row["href"])
            # 以文章超連結為 key, 標題 + 本文為 value
            id_to_body[row["href"]] = title + " " + body
            time.sleep(1)  # 放慢爬蟲速度
    return id_to_body


# 主程式流程
PTT_URL = "https://www.ptt.cc"
start_url = PTT_URL + "/bbs/movie/search?page=1&q=黑豹"
page = get_web_page(start_url)
positve_posts, negative_posts = [], []
if page:

    prev_link, pos, neg = get_articles(page)
    positve_posts += pos
    negative_posts += neg

    while prev_link:
        url = PTT_URL + prev_link
        next_page = get_web_page(url)
        prev_link, pos, neg = get_articles(next_page)
        positve_posts += pos
        negative_posts += neg

# 顯示 正負評的數量及前三篇貼文
print(len(positve_posts), positve_posts[:3])
print(len(negative_posts), negative_posts[:3])

with open("mov_pos.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "href"])
    writer.writerows(positve_posts[:27])  # 因為負評文章只有27篇，為了讓機器學習沒有偏向某一邊，所以已取27篇好評

with open("mov_neg.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "href"])
    writer.writerows(negative_posts)

d1 = get_article_body("mov_pos.csv")
d2 = get_article_body("mov_neg.csv")
id_to_body = {**d1, **d2}  # 將兩個 dict 合併成一個
with open("id_to_body.json", "w", encoding="utf-8") as f:
    json.dump(id_to_body, f, indent=2, ensure_ascii=False)

61 [['[好雷] 《暴走曼哈頓》黑豹化身正義猛警抽絲剝繭', '/bbs/movie/M.1575041152.A.537.html'], ['[好雷]  黑豹 --其實蠻好看的', '/bbs/movie/M.1552276420.A.F6F.html'], ['[好雷?] 黑豹 自慰片的新高度', '/bbs/movie/M.1545317279.A.EFC.html']]
27 [['[負雷]黑豹-失衡的烏托邦', '/bbs/movie/M.1529245622.A.AF5.html'], ['[負雷] 四不像的黑豹', '/bbs/movie/M.1527918611.A.56C.html'], ['[微負雷] 黑豹有點讓人失望....', '/bbs/movie/M.1527839684.A.EF0.html']]
處理 [好雷] 《暴走曼哈頓》黑豹化身正義猛警抽絲剝繭 /bbs/movie/M.1575041152.A.537.html
處理 [好雷]  黑豹 --其實蠻好看的 /bbs/movie/M.1552276420.A.F6F.html
處理 [好雷?] 黑豹 自慰片的新高度 /bbs/movie/M.1545317279.A.EFC.html
處理 [二刷好雷] 水行俠真的不是黑豹 /bbs/movie/M.1545065816.A.46A.html
處理 [好雷] 盲點：關於《黑豹》的奧克蘭也關於你我的故事 /bbs/movie/M.1538060300.A.8FD.html
處理 [好雷] 瘋狂亞洲富豪─絕不是新加坡黑豹 /bbs/movie/M.1536676591.A.C8F.html
處理 [微好雷]《雷神索爾3諸神黃昏》＆《黑豹》 /bbs/movie/M.1536217925.A.B6E.html
處理 [好雷] 比黑豹好看的蟻人與黃蜂女 /bbs/movie/M.1531217457.A.6C8.html
處理 [好雷]黑豹 — 符合台灣政治的隱喻分析 /bbs/movie/M.1525369009.A.CFA.html
處理 [好雷] 黑豹：一部政治寓言 /bbs/movie/M.1521128014.A.19B.html
處理 [好雷]黑豹，好看但可惜的反派 /bbs/movie/M.1520467155.A.8E5

In [6]:
import jieba
import csv
import json

def load_data(a_csv, b_json, label):
    a_ids = []
    with open(a_csv, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            a_ids.append(row["href"])
    with open(b_json, "r", encoding="utf-8") as f:
        id_to_body = json.load(f)
    
    data = []
    for a_id in a_ids:
        tokenized_post = []
        txt = id_to_body[a_id]
        for sent in txt.split(): # 將文章切成句子
            # 斷句後的結果, 若非空白且長度為 2 以上,則列入詞庫
            filterd = [t for t in jieba.cut(sent) if t.split() and len(t) > 1]
            tokenized_post += filterd
        data.append([tokenized_post, label])
    return data

# 主程式流程
pos_data = load_data("mov_pos.csv", "id_to_body.json", "正評")
neg_data = load_data("mov_neg.csv", "id_to_body.json", "負評")

# 印出正評與負評文章前幾個字，確認資料無誤
for post, label in pos_data[:3]:
    print(post[:5], label)
for post, label in neg_data[:3]:
    print(post[:5], label)

['暴走', '曼哈頓', '黑豹', '化身', '正義猛警'] 正評
['黑豹', '其實', '好看', 'movie', '最近'] 正評
['黑豹', '自慰', '高度', 'movie', '最近'] 正評
['黑豹', '失衡', '烏托邦', 'movie', '以下'] 負評
['四不像', '黑豹', 'movie', '首先', '主角'] 負評
['黑豹', '有點', '失望', 'movie', '前陣子'] 負評


In [11]:
import random

# 打亂資料順序
random.seed(42)
random.shuffle(pos_data)
random.shuffle(neg_data)

x_train, y_train, x_test, y_test = [], [], [], []

# 前 22 筆資料及答案放進 training set
# 建立資料時要用空白將斷好的詞組成一個字串
# 以便以後用 scikit learn 建立字典並將文字資料轉換成向量
for i in range(22):
    x_train.append(" ".join(pos_data[i][0]))
    x_train.append(" ".join(neg_data[i][0]))
    y_train.append(pos_data[i][1])
    y_train.append(neg_data[i][1])

# 最後 5 筆資料及答案放進 testing set
for i in range(22, len(pos_data)):
    x_test.append(" ".join(pos_data[i][0]))
    x_test.append(" ".join(neg_data[i][0]))
    y_test.append(pos_data[i][1])
    y_test.append(neg_data[i][1])

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
# CountVectorizer 及 TfidfTransformer 模組是用來計算 TF-IDF
# TF-IDF（Term Frequency-InversDocument Frequency）是一种常用于信息处理和数据挖掘的加权技术。
# 该技术采用一种统计方法，根据字词的在文本中出现的次数和在整个语料中出现的文档频率来计算一个字词在整个语料中的重要程度。
# 它的优点是能过滤掉一些常见的却无关紧要本的词语，同时保留影响整个文本的重要字词。计算方法如下面公式所示。
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(x_train)
transformer = TfidfTransformer()
x_train = transformer.fit_transform(x_train)
clf = SGDClassifier(random_state=42) # 宣告SVM (Support Vector Machine)模型
clf.fit(x_train, y_train)
x_test = vectorizer.transform(x_test)
x_test = transformer.transform(x_test)
y_pred = clf.predict(x_test)
print("預測結果:", list(y_pred))
print("正確答案:", y_test)
print("正確率:", accuracy_score(y_test, y_pred))



預測結果: ['正評', '負評', '負評', '負評', '正評', '負評', '負評', '負評', '正評', '負評']
正確答案: ['正評', '負評', '正評', '負評', '正評', '負評', '正評', '負評', '正評', '負評']
正確率: 0.8


In [18]:
sentences = ["這部 電影 給我 很大 的 啟發", 
             "真的 覺得 浪費 錢"]
analyze = vectorizer.build_analyzer()
print(analyze(sentences[0]))
print(analyze(sentences[1]))
custom_data = transformer.transform(vectorizer.transform(sentences))
print(clf.predict(custom_data))

['這部', '電影', '給我', '很大', '啟發']
['真的', '覺得', '浪費']
['正評' '負評']
