In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from webdriver_manager.chrome import ChromeDriverManager
import nltk
from nltk.corpus import stopwords
import string
from tqdm import tqdm
import re
import imdb

# Make sure the NLTK 'stopwords' corpus is available locally before it is used.
# The recorded output shows the package is reported as already up to date when cached.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# English stop words as a set, for O(1) membership tests during token filtering.
stop_words = {*stopwords.words("english")}
# Translation table that deletes every ASCII punctuation character.
table = str.maketrans(dict.fromkeys(string.punctuation))
# Matches runs of emoji / pictographic symbols so they can be stripped from text.
# Every range below is confined to a symbol block. The previous catch-all range
# (U+24C2 through U+1F251) also spanned CJK, Hangul, kana and many other scripts,
# which silently deleted ordinary non-Latin text.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & pictographs
    "\U0001F680-\U0001F6FF"  # Transport & map symbols
    "\U0001F700-\U0001F77F"  # Alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric shapes
    "\U0001F800-\U0001F8FF"  # Supplemental arrows
    "\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
    "\U0001FA00-\U0001FA6F"  # Symbols & pictographs extended-A
    "\U0001FA70-\U0001FAFF"  # Symbols & pictographs extended-B
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2"             # Circled latin capital M
    "\U00002600-\U000026FF"  # Miscellaneous symbols
    "\U00002B00-\U00002BFF"  # Miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # Variation selectors (emoji presentation)
    "\U0001F000-\U0001F251"  # Mahjong/domino/cards, enclosed alphanumerics (flags), enclosed ideographs
    "]+",
    flags=re.UNICODE,
)

def preprocess_text(text):
    """Normalize a piece of review text for downstream analysis.

    Steps, in order: strip HTML markup, remove emojis, lowercase, pad ',', '.',
    '-' and '/' with spaces so they split into their own tokens, delete
    punctuation, and drop stop words.

    Args:
        text: Raw text, possibly containing HTML tags and emojis. ``None`` or an
            empty string yields ``""``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not text:
        return ""

    # Strip markup before touching punctuation: padding "/" first would turn
    # "</b>" into "< / b>", which the HTML parser no longer treats as a tag.
    text = BeautifulSoup(text, "html.parser").get_text()
    text = emoji_pattern.sub("", text).lower()
    for mark in (",", ".", "-", "/"):
        text = text.replace(mark, f" {mark} ")

    tokens = []
    for word in text.split():
        # Check the raw token first so stop words containing an apostrophe
        # (e.g. "don't") are recognised before punctuation is removed.
        if word in stop_words:
            continue
        cleaned = word.translate(table)
        # Skip tokens that were pure punctuation (now empty) and stop words
        # that only differed by attached punctuation (e.g. "the!").
        if cleaned and cleaned not in stop_words:
            tokens.append(cleaned)
    return " ".join(tokens)

def scrape_imdb_reviews(movie_ids, max_load_more=5, csv_filename="data/IMDb_Cleaned_Reviews.csv"):
    """Scrape user reviews for each IMDb title, clean them and save them to CSV.

    For every ID the reviews page is opened in headless Chrome, the "see more"
    button is pressed up to ``max_load_more`` times, and each review's title,
    body and date are extracted. Titles and bodies are passed through
    ``preprocess_text``. All rows are written to ``csv_filename`` and the first
    rows are printed.

    Args:
        movie_ids: Iterable of IMDb title IDs such as ``"tt1375666"``.
        max_load_more: Maximum number of times to press the load-more button
            per title.
        csv_filename: Path of the CSV file to write; its parent directory is
            created if it does not exist.

    Returns:
        None. Results are written to ``csv_filename``.
    """
    import os  # local import: only needed to create the output directory

    from selenium.common.exceptions import WebDriverException

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")

    # To use a locally installed chromedriver instead of webdriver_manager, build
    # the service as Service("/opt/homebrew/bin/chromedriver") (path is machine-specific).
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    all_reviews = []

    # try/finally guarantees the browser process is shut down even when a page
    # load or parse step raises part-way through the loop.
    try:
        for movie_id in movie_ids:
            url = f"https://www.imdb.com/title/{movie_id}/reviews"
            driver.get(url)

            wait = WebDriverWait(driver, 10)

            load_more_clicked = 0
            # Press the load-more button until the limit is reached or the button is gone.
            while load_more_clicked < max_load_more:
                try:
                    load_more_button = driver.find_element(By.CSS_SELECTOR, "button.ipc-see-more__button")
                    # Sending a newline is used here in place of .click().
                    load_more_button.send_keys('\n')
                    wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".ipl-load-more__load-indicator")))
                    time.sleep(2)
                    load_more_clicked += 1
                except WebDriverException:
                    # Button missing / not interactable / wait timed out: there is
                    # nothing more to load, so continue with what the page has.
                    break
            print(load_more_clicked)

            # Parse the fully expanded page.
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Extract movie name, falling back to a placeholder containing the ID.
            movie_name_tag = soup.select_one("[data-testid='subtitle']")
            movie_name = movie_name_tag.text.strip() if movie_name_tag else f"Unknown ({movie_id})"

            # Extract review data; the three lists are kept the same length.
            title_reviews = []
            content_reviews = []
            review_date = []

            for review_block in soup.select("article.user-review-item"):
                try:
                    title = review_block.select_one("h3.ipc-title__text").text.strip()
                    content = review_block.select_one("div.ipc-html-content-inner-div").text.strip()
                    date = review_block.select_one("li.review-date").text.strip()
                except AttributeError:
                    # select_one returned None for at least one field; skip the
                    # whole review so the lists stay aligned.
                    print("Skipping a review that cannot be parsed")
                    continue
                title_reviews.append(title)
                content_reviews.append(content)
                review_date.append(date)

            # Check the number of extracted reviews
            print(f"number of review titles: {len(title_reviews)}")
            print(f"number of review content: {len(content_reviews)}")
            print(f"number of review date: {len(review_date)}")

            processed_titles = [preprocess_text(title) for title in title_reviews]
            processed_reviews = [preprocess_text(review) for review in content_reviews]

            for title, content, date in zip(processed_titles, processed_reviews, review_date):
                all_reviews.append({"Movie Name": movie_name, "Review Title": title, "Review Content": content, "Review Date": date})
    finally:
        driver.quit()

    # Fixing the columns keeps the CSV header present even when nothing was scraped.
    columns = ["Movie Name", "Review Title", "Review Content", "Review Date"]
    reviews_df = pd.DataFrame(all_reviews, columns=columns)

    # to_csv does not create missing directories, so make sure the target exists.
    output_dir = os.path.dirname(csv_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    reviews_df.to_csv(csv_filename, index=False)
    print(reviews_df.head())

# movie_ids = ["tt1375666","tt6320628"]  
# scrape_imdb_reviews(movie_ids)

In [None]:
def get_imdb_ids(movie_names):
    """Look up the IMDb title ID for each movie name.

    Each name is searched through the ``imdb`` client and the first search
    result is taken. Names with no search results are reported on stdout and
    left out of the returned list.

    Args:
        movie_names: Iterable of movie titles to search for.

    Returns:
        List of ID strings of the form ``"tt<movieID>"``, in input order.
    """
    client = imdb.IMDb()
    found_ids = []

    for title in movie_names:
        matches = client.search_movie(title)
        if not matches:
            print(f"No IMDb ID found for: {title}")
            continue
        # The top-ranked search hit is assumed to be the intended movie.
        found_ids.append(f"tt{matches[0].movieID}")

    return found_ids

# movie_names = ["Inception", "The Dark Knight", "Forrest Gump", "The Matrix", "Interstellar"]
# movie_ids = get_imdb_ids(movie_names)
# print(movie_ids)

In [None]:
def get_imdb_movie_list_names(url, card_selector='div.sc-f30335b4-0', wait_seconds=3):
    """Collect the movie titles shown on an IMDb list/chart page.

    The page is loaded in headless Chrome, given ``wait_seconds`` to render,
    and then parsed: for every element matching ``card_selector`` the text of
    its ``h3.ipc-title__text`` child is taken as the title.

    Args:
        url: Address of the IMDb list or chart page.
        card_selector: CSS selector for one movie card. The default is a
            generated class name and may need updating when the site changes.
        wait_seconds: Seconds to sleep after navigation before reading the page.

    Returns:
        List of title strings in page order; cards without a title element are
        skipped with a message on stdout.
    """
    movie_names = []

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")

    # To use a locally installed chromedriver instead of webdriver_manager, build
    # the service as Service("/opt/homebrew/bin/chromedriver") (path is machine-specific).
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # try/finally makes sure the browser is closed even if navigation fails.
    try:
        driver.get(url)

        # Give the page time to render its dynamic content.
        time.sleep(wait_seconds)

        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()

    # Extract movie titles
    for card in soup.select(card_selector):
        try:
            title = card.select_one('h3.ipc-title__text').text.strip()
        except AttributeError:
            # select_one returned None: this card has no title element.
            print("Skipping a movie that cannot be parsed.")
            continue
        movie_names.append(title)

    return movie_names

# url = "https://www.imdb.com/chart/moviemeter/"
# movie_names = get_imdb_movie_list_names(url)
# print(movie_names)


In [None]:
# End-to-end run: chart page -> movie titles -> IMDb IDs -> scraped reviews CSV.
url = "https://www.imdb.com/chart/moviemeter/" # Most Popular Movies list
movie_names = get_imdb_movie_list_names(url)

# Extract only the first ten movie names for a test run
first_ten_movies = movie_names[:10]
print(first_ten_movies)
# Resolve each title to an IMDb ID by name search; titles with no search hit are dropped.
movie_ids = get_imdb_ids(first_ten_movies)
print(movie_ids)
# Scrape, clean and save the reviews for those IDs (the function writes a CSV file).
scrape_imdb_reviews(movie_ids)


['Anora', 'The Brutalist']
['tt28607951', 'tt8999762']
5
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
Skipping a review that cannot be parsed
