In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
from webdriver_manager.chrome import ChromeDriverManager
import nltk
from nltk.corpus import stopwords
import string
from tqdm import tqdm
import re
import imdb

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\remote
[nltk_data]     desktop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
stop_words = set(stopwords.words('english'))

#List of negation words (with and without "t")
negation_words = {
    "not", "no", "nor",
    "don't", "do", "doesn't", "does", "doesn","don",
    "won't","won", "wouldn't", "would", "wouldn",
    "can't", "couldn't", "could", "couldn","can",
    "isn't", "is", "aren't", "are", "wasn't", "was", "weren't", "were","isn","aren","wasn","weren",
    "shouldn't", "should", "shouldn",
    "shan't", "shall","shan",
    "mustn't", "must", "mustn",
    "hadn't", "had", "hasn't", "has", "haven't", "have", "haven","hadn","hasn",
    "needn't", "need", "needn",
    "mightn't", "might", "mightn",
    "didn't", "did", "didn"
}
stop_words = stop_words - negation_words

table = str.maketrans('', '', string.punctuation)
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # Emoticons
                           u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # Transport & map symbols
                           u"\U0001F700-\U0001F77F"  # Alchemical symbols
                           u"\U0001F780-\U0001F7FF"  # Geometric shapes
                           u"\U0001F800-\U0001F8FF"  # Supplemental arrows
                           u"\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
                           u"\U0001FA00-\U0001FA6F"  # Symbols & pictographs extended-A
                           u"\U0001FA70-\U0001FAFF"  # Symbols & pictographs extended-B
                           u"\U00002702-\U000027B0"  # Dingbats
                           u"\U000024C2-\U0001F251" 
                           "]+", flags=re.UNICODE)

# Remove stopwords，punctuations，html tags，emojis and transform into lowercase
def preprocess_text(text):
    text = emoji_pattern.sub(r'', text)
    text = text.lower()
    text = text.replace(",", " , ").replace(".", " . ").replace("-", " - ").replace("/", " / ")
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    words = text.split()
    filtered_sentence = " ".join(word.translate(table) for word in words if word not in stop_words)
    return filtered_sentence.strip()

def scrape_imdb_reviews(movie_ids):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Manually set local chromedriver path
    # CHROMEDRIVER_PATH = "/opt/homebrew/bin/chromedriver"
    # service = Service(CHROMEDRIVER_PATH)
    # driver = webdriver.Chrome(service=service, options=options)

    all_reviews = []

    for movie_id in movie_ids:

        url = f"https://www.imdb.com/title/{movie_id}/reviews"
        driver.get(url)

        # Wait for the page to load
        wait = WebDriverWait(driver, 10)

        load_more_clicked = 0
        # Click the btn to load more reviews
        while load_more_clicked < 5:
            try:
                load_more_button = driver.find_element(By.CSS_SELECTOR, "button.ipc-see-more__button")
                # load_more_button.click()
                load_more_button.send_keys('\n')
                wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".ipl-load-more__load-indicator")))
                time.sleep(2)
                load_more_clicked +=1
            except Exception as e:
                # print(e)
                break
        print(load_more_clicked)

        # Get the full HTML 
        soup = BeautifulSoup(driver.page_source, "html.parser") 

        #Extract movie name
        movie_name_tag = soup.select_one("[data-testid='subtitle']")
        movie_name = movie_name_tag.text.strip() if movie_name_tag else f"Unknown ({movie_id})"

        # Extract review data
        title_reviews = []
        content_reviews = []
        review_date = []

        for review_block in soup.select("article.user-review-item"):
            try:
                title = review_block.select_one("h3.ipc-title__text").text.strip()
                content = review_block.select_one("div.ipc-html-content-inner-div").text.strip()
                date = review_block.select_one("li.review-date").text.strip()
                title_reviews.append(title)
                content_reviews.append(content)
                review_date.append(date)
            except AttributeError:
                print("Skipping a review that cannot be parsed")

        # Check the number of extracted reviews
        print(f"number of review titles: {len(title_reviews)}")
        print(f"number of review content: {len(content_reviews)}")
        print(f"number of review date: {len(review_date)}")

        processed_titles = [preprocess_text(title) for title in title_reviews]
        processed_reviews = [preprocess_text(review) for review in content_reviews]

        for title, content, date in zip(processed_titles, processed_reviews, review_date):
            all_reviews.append({"Movie Name": movie_name, "Review Title": title, "Review Content": content, "Review Date": date})

    driver.quit() 

    reviews_df = pd.DataFrame(all_reviews)
    csv_filename = "data/IMDb_Cleaned_Reviews.csv"
    reviews_df.to_csv(csv_filename, index=False)
    print(reviews_df.head()) 

# movie_ids = ["tt1375666","tt6320628"]  
# scrape_imdb_reviews(movie_ids)

In [None]:
def get_imdb_ids(movie_names):
    ia = imdb.IMDb()
    movie_ids = []

    for name in movie_names:
        search = ia.search_movie(name)
        if search:
            movie_id = f"tt{search[0].movieID}" 
            movie_ids.append(movie_id)  # Store as a string
        else:
            print(f"No IMDb ID found for: {name}")

    return movie_ids

# movie_names = ["Inception", "The Dark Knight", "Forrest Gump", "The Matrix", "Interstellar"]
# movie_ids = get_imdb_ids(movie_names)
# print(movie_ids)

In [None]:
def get_imdb_movie_list_names(url):
    movie_names = []

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Manually set local chromedriver path
    # CHROMEDRIVER_PATH = "/opt/homebrew/bin/chromedriver" 
    # service = Service(CHROMEDRIVER_PATH)
    # driver = webdriver.Chrome(service=service, options=options)

    driver.get(url)

    # Wait for the page to load
    time.sleep(3) 

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()  

    # Extract movie titles
    for card in soup.select('div.sc-f30335b4-0'):
        try:
            title = card.select_one('h3.ipc-title__text').text.strip()
            movie_names.append(title)
        except AttributeError:
            print("Skipping a movie that cannot be parsed.")

    return movie_names 

# url = "https://www.imdb.com/chart/moviemeter/"
# movie_names = get_imdb_movie_list_names(url)
# print(movie_names)


In [None]:
url = "https://www.imdb.com/chart/moviemeter/" #Most Popular Movies List
movie_names = get_imdb_movie_list_names(url)

# Extract only the first ten movie names for test
first_ten_movies = movie_names[:10]
print(first_ten_movies)
movie_ids = get_imdb_ids(first_ten_movies)
print(movie_ids)
scrape_imdb_reviews(movie_ids)


['Anora', 'The Brutalist']
['tt28607951', 'tt8999762']
5
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed


Random titles

In [6]:
def scrape_imdb_reviews(movie_id):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    url = f"https://www.imdb.com/title/{movie_id}/reviews"
    driver.get(url)

    # Wait for the page to load
    wait = WebDriverWait(driver, 10)

    load_more_clicked = 0
    # Click the btn to load more reviews
    while load_more_clicked < 1:
        try:
            load_more_button = driver.find_element(By.CSS_SELECTOR, "button.ipc-see-more__button")
            # load_more_button.click()
            load_more_button.send_keys('\n')
            wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".ipl-load-more__load-indicator")))
            time.sleep(2)
            load_more_clicked +=1
        except Exception as e:
            # print(e)
            break
    print(load_more_clicked)

    # Get the full HTML 
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()  

    # #Extract movie name
    # movie_name = soup.select_one("[data-testid='subtitle']")

    # Extract review data
    title_reviews = []
    content_reviews = []
    review_date = []

    for review_block in soup.select("article.user-review-item"):
        try:
            title = review_block.select_one("h3.ipc-title__text").text.strip()
            content = review_block.select_one("div.ipc-html-content-inner-div").text.strip()
            date = review_block.select_one("li.review-date").text.strip()
            title_reviews.append(title)
            content_reviews.append(content)
            review_date.append(date)
        except AttributeError:
            print("Skipping a re view that cannot be parsed")

    # check the number of extracted reviews
    print(f"number of review titles: {len(title_reviews)}")
    print(f"number of review content: {len(content_reviews)}")
    print(f"number of review date: {len(review_date)}")

    processed_titles = [preprocess_text(title) for title in title_reviews]
    processed_reviews = [preprocess_text(review) for review in content_reviews]

    reviews_df = pd.DataFrame({
        "Movie ID": movie_id,
        "Review Title": processed_titles,
        "Review Content": processed_reviews,
        "Review Date": review_date
    })

    # csv_filename = f"{movie_id}_Cleaned_Reviews.csv"
    # reviews_df.to_csv(csv_filename, index=False)

    return reviews_df

In [None]:
import re
import pandas as pd
from utils import get_random_movie_id, scrape_worldwide_box_office

scraped = 0
to_scrape =50
min_votes = 1000  # Only select movies with at least 1000 votes

# Create an empty DataFrame to store all movie reviews
all_reviews_df = pd.DataFrame()

while scraped < to_scrape:
    # Get a random movie ID that meets the minimum votes criteria.
    random_movie_id = get_random_movie_id(min_votes=min_votes)
    
    # Scrape IMDb reviews for the movie.
    reviews_df = scrape_imdb_reviews(random_movie_id)
    
    # If reviews_df is empty, skip this movie and continue.
    if reviews_df.empty:
        continue

    # Try to get the worldwide box office figure; if it fails, skip to the next movie.
    try:
        box_office_str = scrape_worldwide_box_office(random_movie_id)
    except Exception as e:
        continue

    # Extract the number from the box office string, e.g., "$47,680,966" becomes 47680966.
    match = re.search(r'\$([\d,]+)', box_office_str)
    if match:
        box_office_value = int(match.group(1).replace(',', ''))
    else:
        continue  # If no number can be extracted, skip this movie.

    # Add the worldwide box office value as a new column to the reviews DataFrame.
    reviews_df['Worldwide BO'] = box_office_value

    # Append the movie's reviews to the cumulative DataFrame.
    all_reviews_df = pd.concat([all_reviews_df, reviews_df], ignore_index=True)

    scraped += 1
    print(f"Scraped {scraped} movies.")

# Optionally, save the cumulative DataFrame to a CSV file.
all_reviews_df.to_csv("data/IMDb_Cleaned_Reviews.csv", index=False)

GZ file already exists. Skipping download.
TSV file already exists. Skipping extraction.
Loading movie IDs from the ratings dataset...
Loaded 95260 movie IDs with at least 1000 votes.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 34
number of review content: 34
number of review date: 34
Scraped 1 movies.
1
Skipping a re view that cannot be parsed
Skipping a re 

  soup = BeautifulSoup(text, "html.parser")


1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 22
number of review content: 22
number of review date: 22
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 2
number of review content: 2
number of review date: 2
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a 

  soup = BeautifulSoup(text, "html.parser")


1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 30
number of review content: 30
number of review date: 30
Scraped 4 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 19
number of review content: 19
number of review date: 19
Scraped 5 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of rev

  soup = BeautifulSoup(text, "html.parser")


1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 34
number of review content: 34
number of review date: 34
Scraped 6 movies.
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 1
number of review content: 1
number of review date: 1
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 17
number of review content: 17
number of review date: 17


  soup = BeautifulSoup(text, "html.parser")


1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 40
number of review content: 40
number of review date: 40
Scraped 7 movies.
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 11
number of review content: 11
number of review date: 11
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 2
number of review content: 2
number of review date: 2
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 23
num

  soup = BeautifulSoup(text, "html.parser")


Scraped 8 movies.
0
number of review titles: 2
number of review content: 2
number of review date: 2
0
Skipping a re view that cannot be parsed
number of review titles: 2
number of review content: 2
number of review date: 2
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 23
number of review content: 23
number of review date: 23
0
Skipping a re view that cannot be parsed
number of review titles: 2
number of review content: 2
number of review date: 2
0
Skipping a re view that cannot be parsed
number of review titles: 2
number of review content: 2
number of review date: 2
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 10
number of review content: 10
number of review date: 10
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot 

  soup = BeautifulSoup(text, "html.parser")


0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 3
number of review content: 3
number of review date: 3
0
number of review titles: 0
number of review content: 0
number of review date: 0
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 20
number of review content: 20
number of review date: 20
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a 

  soup = BeautifulSoup(text, "html.parser")


Scraped 13 movies.
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 12
number of review content: 12
number of review date: 12
Scraped 14 movies.
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 8
number of review content: 8
number of review date: 8


  soup = BeautifulSoup(text, "html.parser")


1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 46
number of review content: 46
number of review date: 46
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 41
number of review content: 41
number of review date: 41
Scraped 15 movies.
0
number of review titles: 0
number of review content: 0
number of review date: 0
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot

  soup = BeautifulSoup(text, "html.parser")


Scraped 17 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 47
number of review content: 47
number of review date: 47


  soup = BeautifulSoup(text, "html.parser")


0
Skipping a re view that cannot be parsed
number of review titles: 2
number of review content: 2
number of review date: 2
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 35
number of review content: 35
number of review date: 35
0
Skipping a re view that cannot be parsed
number of review titles: 2
number of review content: 2
number of review date: 2
0
number of review titles: 2
number of review content:

  soup = BeautifulSoup(text, "html.parser")


1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 27
number of review content: 27
number of review date: 27
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a 

  soup = BeautifulSoup(text, "html.parser")


0
number of review titles: 3
number of review content: 3
number of review date: 3
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 38
number of review content: 38
number of review date: 38
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 15
number of review content: 15
number of review date: 15
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number o

  soup = BeautifulSoup(text, "html.parser")


Scraped 22 movies.
0
number of review titles: 5
number of review content: 5
number of review date: 5
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 18
number of review content: 18
number of review date: 18


  soup = BeautifulSoup(text, "html.parser")


0
number of review titles: 9
number of review content: 9
number of review date: 9
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 14
number of review content: 14
number of review date: 14
0
Skipping a re view that cannot be parsed
number of review titles: 1
number of review content: 1
number of review date: 1
Scraped 23 movies.
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 0
number of review content: 0
number of review date: 0
0
Skipping a re view that cannot be parsed
number of review titles: 11
number of review content: 11
number of review date: 11
Scraped 24 movies.
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 16
number of review content: 16
number of review date: 16
0
Skipping a re view that cannot be parsed
Skipping

  soup = BeautifulSoup(text, "html.parser")


Scraped 28 movies.
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 14
number of review content: 14
number of review date: 14
0
number of review titles: 0
number of review content: 0
number of review date: 0
0
number of review titles: 12
number of review content: 12
number of review date: 12
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot

  soup = BeautifulSoup(text, "html.parser")


Scraped 31 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 18
number of review content: 18
number of review date: 18
0
number of review titles: 0
number of review content: 0
number of review date: 0
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 24
number of review content: 24
number of review date: 24
Scraped 32 movies.
1
Skipping a re view that cannot be parsed
Skipping a 

  soup = BeautifulSoup(text, "html.parser")


Scraped 33 movies.
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 5
number of review content: 5
number of review date: 5
0
number of review titles: 14
number of review content: 14
number of review date: 14
Scraped 34 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 42
number of review content: 42
number of review date: 42
Scraped 35 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be

  soup = BeautifulSoup(text, "html.parser")


0
number of review titles: 6
number of review content: 6
number of review date: 6
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 12
number of review content: 12
number of review date: 12
Scraped 37 movies.
0
number of review titles: 7
number of review content: 7
number of review date: 7


  soup = BeautifulSoup(text, "html.parser")


0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 10
number of review content: 10
number of review date: 10
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 26
number of review content: 26
number of review date: 26
0
Skipping a re view that cannot be parsed
number of review titles: 15
number of review content: 15
number of review date: 15
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 0
number of review content: 0
number of review date: 0
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 42
number of review content: 42
number of review date: 42
0
Sk

  soup = BeautifulSoup(text, "html.parser")


0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 6
number of review content: 6
number of review date: 6
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 37
number of review content: 37
number of review date: 37
1
Skipping a re view that cannot be parsed
Skipping a 

  soup = BeautifulSoup(text, "html.parser")


Scraped 40 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 43
number of review content: 43
number of review date: 43
0
number of review titles: 2
number of review content: 2
number of review date: 2
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 13
number of review content: 13
number of review date: 13
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 2
number of review content: 2
number of review date: 2
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 4
num

  soup = BeautifulSoup(text, "html.parser")


Scraped 41 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 35
number of review content: 35
number of review date: 35


  soup = BeautifulSoup(text, "html.parser")


0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 15
number of review content: 15
number of review date: 15
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 6
number of review content: 6
number of review date: 6
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 1
number of review content: 1
number of review date: 1
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a 

  soup = BeautifulSoup(text, "html.parser")


Scraped 43 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 41
number of review content: 41
number of review date: 41


  soup = BeautifulSoup(text, "html.parser")


Scraped 44 movies.
0
Skipping a re view that cannot be parsed
number of review titles: 7
number of review content: 7
number of review date: 7
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 26
number of review content: 26
number of review date: 26
Scraped 45 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re 

  soup = BeautifulSoup(text, "html.parser")


Scraped 46 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 44
number of review content: 44
number of review date: 44
Scraped 47 movies.
1
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 44
number of review content: 44
number of review date: 44
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 3
number of review content: 3
number of review date: 3
0
number of review titles: 1
number of review content: 1
number of review date: 1
0
Skipping a re view that cannot be parsed
Skipping a 

  soup = BeautifulSoup(text, "html.parser")


0
Skipping a re view that cannot be parsed
number of review titles: 2
number of review content: 2
number of review date: 2
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 11
number of review content: 11
number of review date: 11
0
Skipping a re view that cannot be parsed
number of review titles: 1
number of review content: 1
number of review date: 1
0
Skipping a re view that cannot be parsed
number of review titles: 10
number of review content: 10
number of review date: 10
Scraped 48 movies.
0
Skipping a re view that cannot be parsed
number of review titles: 6
number of review content: 6
number of review date: 6
0
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 14
number of review content: 14
number of review date: 14
Scraped 49 movies.
0
number o