In [None]:
%pip install selenium

In [13]:
## Import Package
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options

# Headless mode was meant to speed things up by skipping rendering, but it
# caused bugs, so it stays disabled.
options = Options()
#options.add_argument("--headless")
options.add_argument("--disable-gpu")
# Chrome's --window-size switch takes "width,height" (comma-separated).
options.add_argument("--window-size=1920,1080")
options.add_argument("--log-level=3")
options.add_argument("--disable-blink-features=AutomationControlled")

import pandas as pd
#from webdriver_manager.chrome import ChromeDriverManager
import time

In [34]:
def scrape_imdb_reviews(url, output_filename="imdb_reviews.csv", review_limit=5):
    """Scrape user reviews from an IMDb reviews page and append them to a CSV.

    For each star rating from 1 to 10 the page's rating filter is set, the
    "load all" button is clicked when it can be found, and every visible
    review article that exposes both a rating and a text body is recorded.
    Ratings 1-3 are labelled "negative", 4-6 "neutral" and 7-10 "positive".

    Args:
        url: Address of the reviews page to open.
        output_filename: CSV file to append to. The header row is written
            only when the file does not exist yet.
        review_limit: Accepted for backward compatibility; the function does
            not read it, so every loaded review is collected.

    Returns:
        None. Rows with the columns Sentiment, Rating and Review are appended
        to ``output_filename`` and progress is printed.

    Raises:
        Exception: Whatever Selenium raises when the page cannot be opened or
            the rating filter cannot be selected after one retry. The browser
            is shut down before the exception propagates and nothing is
            written to the CSV in that case.
    """
    import os  # function-scope import; only the header check below needs it

    filter_locator = (By.ID, "user-review-rating-filter")
    load_all_locator = (
        By.XPATH,
        '//*[@id="__next"]/main/div/section/div/section/div/div[1]/section[1]/div[3]/div/span[2]/button',
    )
    review_locator = (By.XPATH, '//section[1]/article')

    ratings = []
    reviews = []
    sentiments = []

    # Initialize Chrome driver
    chrome_browser = webdriver.Chrome(options=options)
    try:
        chrome_browser.get(url)
        wait = WebDriverWait(chrome_browser, 5)

        def select_star_rating(stars):
            """Set the page's rating filter to ``stars``."""
            star_filter = wait.until(EC.presence_of_element_located(filter_locator))
            Select(star_filter).select_by_value(str(stars))

        # Walk through every star rating from 1 to 10.
        for stars in range(1, 11):
            if stars <= 3:
                sentiment = "negative"
            elif stars <= 6:
                sentiment = "neutral"
            else:
                sentiment = "positive"

            try:
                select_star_rating(stars)
            except Exception:
                # The page refreshes after a filter change, so the first
                # attempt can fail; a single retry after a short pause is
                # enough. A second failure propagates to the caller.
                time.sleep(0.5)
                select_star_rating(stars)

            try:
                # Click "load all" so every review is rendered before reading.
                load_all = wait.until(EC.presence_of_element_located(load_all_locator))
                chrome_browser.execute_script("arguments[0].click();", load_all)
                time.sleep(10)  # wait for the reviews to load
            except Exception:
                # Best effort: if the button cannot be found or clicked, fall
                # through and read whatever is already on the page.
                pass

            try:
                review_blocks = wait.until(EC.visibility_of_all_elements_located(review_locator))
            except Exception:
                # Nothing became visible within the wait for this rating.
                review_blocks = []

            print(f"Fetching reviews for {stars} stars... Total blocks: {len(review_blocks)}")

            review_num = 0
            for block in review_blocks:
                try:
                    # The rating text is split on "/" and the first part kept.
                    rating = block.find_element(By.XPATH, './/span[contains(@class, "rating")]').text.split("/")[0]
                    review = block.find_element(By.XPATH, './/div[@class="ipc-html-content-inner-div"]').text
                except Exception:
                    # Blocks where either lookup fails are skipped.
                    continue

                ratings.append(rating)
                reviews.append(review)
                sentiments.append(sentiment)
                review_num += 1
            print(f"{review_num} reviews collected")
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and finally guarantees it runs on errors too.
        chrome_browser.quit()

    df = pd.DataFrame({"Sentiment": sentiments, "Rating": ratings, "Review": reviews})

    # Append to the CSV, emitting the header only when creating the file.
    write_header = not os.path.exists(output_filename)
    df.to_csv(output_filename, mode='a', index=False, header=write_header, encoding="utf-8")

    print(f"Scraping complete! Reviews saved to {output_filename}.")


In [5]:
# Collect the review-page links of every title listed on a chart page
def get_imdb_movie_reviews_urls(url):
    """Return the reviews-page URL of every title linked from a chart page.

    Args:
        url: Address of the chart page to open.

    Returns:
        list: One URL of the form
        ``https://www.imdb.com/title/tt0111161/reviews/`` per matching link,
        in the order the links are found on the page.

    Raises:
        Exception: Whatever Selenium raises while opening or reading the
            page. The browser is shut down before the exception propagates.
    """
    title_prefix = 'https://www.imdb.com/title/'
    movie_urls = []

    chrome_browser = webdriver.Chrome(options=options)
    try:
        chrome_browser.get(url)
        time.sleep(10)  # fixed pause before reading the links

        for movie in chrome_browser.find_elements(By.CLASS_NAME, 'ipc-title-link-wrapper'):
            movie_url = movie.get_attribute('href')

            # get_attribute returns None when the element has no href.
            if not movie_url or not movie_url.startswith(title_prefix):
                continue

            # Drop the query string and normalise the trailing slash so the
            # result is always ".../title/<id>/reviews/".
            title_url = movie_url.split('?')[0].rstrip('/')
            movie_urls.append(title_url + '/reviews/')
    finally:
        # Always release the browser, even when the page fails to load.
        chrome_browser.quit()

    return movie_urls

# Chart pages whose listed titles will be scraped for reviews.
top_250_url = "https://www.imdb.com/chart/top"
bottom_250_url = "https://www.imdb.com/chart/bottom"  # named "250", but the page lists only 100
top_tv_250_url = "https://www.imdb.com/chart/toptv/?ref_=chttp_ql_6"

# One browser session per chart page.
top_250_movies = get_imdb_movie_reviews_urls(top_250_url)
bottom_250_movies = get_imdb_movie_reviews_urls(bottom_250_url)
top_250_tv = get_imdb_movie_reviews_urls(top_tv_250_url)

movies = [*top_250_movies, *bottom_250_movies, *top_250_tv]

# Persist the combined list, one URL per line.
with open("movie_urls.txt", "w", encoding="utf-8") as file:
    for url in movies:
        file.write(f"{url}\n")

print(movies[0])


https://www.imdb.com/title/tt0111161/reviews/


In [25]:
# Reload the saved URL list, stripping surrounding whitespace from each line.
with open("movie_urls.txt", "r", encoding="utf-8") as file:
    movies = [line.strip() for line in file]

# Sanity check: report the first entry that is not a title URL and stop.
for movie in movies:
    if not movie.startswith('https://www.imdb.com/title/'):
        print("Failed")
        break
print(len(movies))

600


In [35]:
# Scrape every collected title. Each call appends to the CSV, so a failure on
# one page is recorded and the batch continues rather than aborting the run
# (re-running the whole cell would duplicate rows already written).
failed_urls = []
for url in movies:
    try:
        scrape_imdb_reviews(url)
    except Exception as exc:
        failed_urls.append(url)
        print(f"Skipping {url}: {exc!r}")

if failed_urls:
    print(f"{len(failed_urls)} of {len(movies)} URLs failed.")

Fetching reviews for 1 stars... Total blocks: 75
55 reviews collected
Fetching reviews for 2 stars... Total blocks: 26
18 reviews collected
Fetching reviews for 3 stars... Total blocks: 36
24 reviews collected
Fetching reviews for 4 stars... Total blocks: 59
41 reviews collected
Fetching reviews for 5 stars... Total blocks: 120
86 reviews collected
Fetching reviews for 6 stars... Total blocks: 170
131 reviews collected
Fetching reviews for 7 stars... Total blocks: 199
148 reviews collected
Fetching reviews for 8 stars... Total blocks: 200
146 reviews collected
Fetching reviews for 9 stars... Total blocks: 149
117 reviews collected
Fetching reviews for 10 stars... Total blocks: 150
126 reviews collected
Scraping complete! Reviews saved to imdb_reviews.csv.
Fetching reviews for 1 stars... Total blocks: 142
128 reviews collected
Fetching reviews for 2 stars... Total blocks: 48
40 reviews collected
Fetching reviews for 3 stars... Total blocks: 41
33 reviews collected
Fetching reviews for 4