In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

Data yang saya gunakan adalah review 10 film yang ada pada website Letterboxd.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from requests.exceptions import RequestException

def _extract_reviews(html, film_title):
    """Parse one review-listing page into a list of review records.

    Args:
        html: HTML source of the listing page.
        film_title: Label stored in the "film" field of every record.

    Returns:
        A list of dicts with the keys "film", "username", "review" and
        "rating", one per ``div.film-detail-content`` element found.
        The list is empty when the page contains no such element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    records = []

    for block in soup.find_all('div', class_='film-detail-content'):
        username_tag = block.find('strong', class_='name')
        # NOTE(review): find_next() keeps searching after `block` ends, so a
        # review without its own body-text div would pick up the text of a
        # later review. block.find() is probably what is intended -- confirm
        # against the live markup before changing it.
        review_text_tag = block.find_next('div', class_='body-text')
        rating_tag = block.find('span', class_='rating')

        records.append({
            "film": film_title,
            "username": username_tag.text.strip() if username_tag else "N/A",
            "review": review_text_tag.text.strip() if review_text_tag else "N/A",
            "rating": rating_tag.text.strip() if rating_tag else "No rating",
        })

    return records


def _fetch_page(url, headers, page, film_title, max_retries):
    """GET ``url``, retrying transient failures with a linear backoff.

    Network errors (``RequestException``) and the HTTP statuses 429 and 5xx
    are retried, up to ``max_retries`` attempts in total. Any other non-200
    status is treated as permanent and returned immediately.

    Returns:
        A ``(status_code, html)`` tuple. ``html`` is ``None`` unless the
        server answered 200. ``status_code`` is ``None`` when the last attempt
        raised a network error (or when ``max_retries`` is 0).
    """
    status = None

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except RequestException as e:
            status = None
            problem = f"Error on page {page} of {film_title}: {e}."
        else:
            status = response.status_code
            if status == 200:
                return status, response.text
            if status != 429 and status < 500:
                # Client errors such as 403/404 will not change on retry.
                print(f"Status code {status} on page {page} of {film_title}")
                return status, None
            problem = f"Status code {status} on page {page} of {film_title}."

        if attempt == max_retries:
            # No attempt left, so do not waste time sleeping.
            print(f"{problem} Giving up after {max_retries} attempts.")
            break

        wait = 5 * attempt  # Linear backoff: 5s, 10s, 15s, ...
        print(f"{problem} Retrying in {wait} seconds... ({attempt}/{max_retries})")
        time.sleep(wait)

    return status, None


def scrape_reviews(base_url, film_title, page_limit=256, max_retries=5):
    """Scrape paginated film reviews into a DataFrame.

    Pages ``1..page_limit`` are requested in order, with a 2 second pause
    between pages. Pagination stops early when a page returns HTTP 404 or
    contains no reviews, since later pages cannot hold data either. A page
    that still fails after ``max_retries`` attempts is skipped and scraping
    continues with the next page.

    Args:
        base_url: URL template with one ``{}`` placeholder for the page number.
        film_title: Label stored in the "film" column and used in log output.
        page_limit: Highest page number to request.
        max_retries: Maximum number of attempts per page.

    Returns:
        A ``pandas.DataFrame`` with the columns "film", "username", "review"
        and "rating" (present even when no review was collected).
    """
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    reviews = []

    for page in range(1, page_limit + 1):
        print(f"[{film_title}] Scraping page {page}...")

        status, html = _fetch_page(
            base_url.format(page), headers, page, film_title, max_retries
        )

        if html is None:
            if status == 404:
                print(f"[{film_title}] Page {page} does not exist; stopping.")
                break
        else:
            page_reviews = _extract_reviews(html, film_title)
            if not page_reviews:
                print(f"[{film_title}] No reviews found on page {page}; stopping.")
                break
            reviews.extend(page_reviews)

        # Be polite to the server, but do not wait after the final page.
        if page < page_limit:
            time.sleep(2)

    return pd.DataFrame(reviews, columns=["film", "username", "review", "rating"])

In [3]:
# Films to scrape. Each entry pairs a display title with a review-listing URL
# template whose "{}" placeholder is later filled with the page number.
film_list = [
    {
        "title": title,
        "url": f"https://letterboxd.com/film/{slug}/reviews/by/activity/page/{{}}/",
    }
    for title, slug in (
        ("Dune (2021)", "dune-2021"),
        ("Interstellar (2014)", "interstellar"),
        ("2001: A Space Odyssey (1968)", "2001-a-space-odyssey"),
        ("Blade Runner 2049 (2017)", "blade-runner-2049"),
        ("Joker: Folie a Deux (2024)", "joker-folie-a-deux"),
        ("Venom (2018)", "venom-2018"),
        ("Don't Worry Darling (2022)", "dont-worry-darling"),
        ("The Gift (2015)", "the-gift-2015-1"),
        ("Oppenheimer (2023)", "oppenheimer-2023"),
        ("Barbie (2023)", "barbie"),
    )
]

In [4]:
# Scrape every film in turn, producing one DataFrame per film.
all_dfs = [
    scrape_reviews(film["url"], film["title"], page_limit=256)
    for film in film_list
]

# Stack the per-film frames into a single table with a fresh 0..n-1 index,
# then persist it without writing that index as a column.
final_df = pd.concat(all_dfs, ignore_index=True)
final_df.to_csv("all_film_reviews.csv", index=False)

[Dune (2021)] Scraping page 1...
[Dune (2021)] Scraping page 2...
[Dune (2021)] Scraping page 3...
[Dune (2021)] Scraping page 4...
[Dune (2021)] Scraping page 5...
[Dune (2021)] Scraping page 6...
[Dune (2021)] Scraping page 7...
[Dune (2021)] Scraping page 8...
[Dune (2021)] Scraping page 9...
[Dune (2021)] Scraping page 10...
[Dune (2021)] Scraping page 11...
[Dune (2021)] Scraping page 12...
[Dune (2021)] Scraping page 13...
[Dune (2021)] Scraping page 14...
[Dune (2021)] Scraping page 15...
[Dune (2021)] Scraping page 16...
[Dune (2021)] Scraping page 17...
[Dune (2021)] Scraping page 18...
[Dune (2021)] Scraping page 19...
[Dune (2021)] Scraping page 20...
[Dune (2021)] Scraping page 21...
[Dune (2021)] Scraping page 22...
[Dune (2021)] Scraping page 23...
[Dune (2021)] Scraping page 24...
[Dune (2021)] Scraping page 25...
[Dune (2021)] Scraping page 26...
[Dune (2021)] Scraping page 27...
[Dune (2021)] Scraping page 28...
[Dune (2021)] Scraping page 29...
[Dune (2021)] Scraping 

In [5]:
# Preview the first 10 rows of the combined review table.
final_df.head(10)

Unnamed: 0,film,username,review,rating
0,Dune (2021),Jay,how you gonna birth the chosen one with unlimi...,★★★½
1,Dune (2021),Karsten,got the 4D experience by forgetting to drink w...,★★★★½
2,Dune (2021),cookie,not bad if u ever just feel like staring at th...,★★½
3,Dune (2021),molly,I relate to paul because i also have recurring...,★★★★
4,Dune (2021),Julien Debaker,I'm sorry but I can't take Timothée Chalamet s...,★★★½
5,Dune (2021),demi adejuyigbe,crowd was dead silent for the entire film– exc...,No rating
6,Dune (2021),•lily•,"So i’m sitting there, sand on my titties",★★½
7,Dune (2021),Lucy,love love love when a blockbuster is almost al...,★★★★
8,Dune (2021),sree,mf named PAUL,★★★★★
9,Dune (2021),Matt Singer,I will never watch any kind of making-of docum...,★★★★½


In [6]:
# Print the frame's structure: row count, columns, non-null counts and dtypes.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30720 entries, 0 to 30719
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   film      30720 non-null  object
 1   username  30720 non-null  object
 2   review    30720 non-null  object
 3   rating    30720 non-null  object
dtypes: object(4)
memory usage: 960.1+ KB


In [7]:
# Summarise every column; include='all' also covers the object (string) columns.
final_df.describe(include='all')

Unnamed: 0,film,username,review,rating
count,30720,30720,30720,30720
unique,10,12353,30648,11
top,Dune (2021),Jake,Masterpiece,★★★★★
freq,3072,61,6,9322
