In [None]:
import re
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
# CSV file that receives the scraped rows (one row per successfully scraped film URL).
output_file = "letterboxd_urls_and_ratings.csv"


# ------------------------
# FUNCTION TO SCRAPE DATA
# ------------------------
def parse_shorthand_number(text):
    """Convert a shorthand count such as ``"1.2K"``, ``"3M"`` or ``"1,234"`` to an int.

    The input is lower-cased, stripped of commas and surrounding whitespace.
    A trailing ``k`` multiplies by 1,000 and a trailing ``m`` by 1,000,000;
    anything else is parsed directly with ``int``.

    Raises:
        ValueError: if the numeric part cannot be parsed.
    """
    cleaned = text.lower().replace(",", "").strip()
    # Suffixes are checked in the same order as the scale grows: k, then m.
    for suffix, factor in (("k", 1000), ("m", 1_000_000)):
        if cleaned.endswith(suffix):
            return int(float(cleaned[:-1]) * factor)
    return int(cleaned)

def parse_rating_bar_text(rating_text):
    """Parse a histogram-bar tooltip into a ``(star_label, count)`` pair.

    The text must start with a count (digits, optionally with commas),
    followed by whitespace and a run of ``★`` / ``½`` glyphs.  Each ``★``
    is worth 1 and each ``½`` is worth 0.5; the label is that sum followed
    by ``★`` (for example ``"3.5★"``).

    Returns:
        ``(label, count)`` on a match, otherwise ``(None, 0)``.
    """
    found = re.match(r'([\d,]+)\s+([★½]+)', rating_text)
    if found is None:
        return None, 0

    raw_count, glyphs = found.groups()
    count = int(raw_count.replace(',', ''))
    # int + float always yields a float, so whole stars render as e.g. "4.0★".
    score = glyphs.count('★') + 0.5 * glyphs.count('½')
    return f"{score}★", count

def scrape_movie_data(driver, url):
    """Scrape one film page and return its fields as a flat dict.

    Every field is looked up independently; when a lookup fails the field
    falls back to a placeholder (``"N/A"``, an empty list, or ``["N/A"]``
    for the cast) instead of aborting the whole page.  Progress is printed
    to stdout as each field is extracted.

    Args:
        driver: A Selenium WebDriver instance used to load and query the page.
        url: Address of the film page to load.

    Returns:
        dict with the keys ``title``, ``director``, ``release_year``,
        ``runtime``, ``subtitle``, ``description``, ``genres``, ``tags``,
        ``cast``, ``watched``, ``avg_rating``, ``rating_counts``, ``fans``,
        ``liked`` plus one ``X<stars>`` key per half-star step
        (``X0.5`` ... ``X5.0``) holding that step's rating count.
        List-valued fields are joined with ``"---"``; ``rating_counts`` is
        the ten per-step counts joined in ascending star order.
    """
    driver.get(url)
    # Fixed pause after navigation — presumably to let the page finish
    # rendering before elements are queried.
    time.sleep(3)

    # NOTE: handlers use ``except Exception`` rather than a bare ``except``
    # so that Ctrl-C (KeyboardInterrupt) still stops a long scrape.

    # Title
    try:
        title = driver.find_element(By.CLASS_NAME, "headline-1").text
    except Exception:
        title = "N/A"
    print("\n\n\nTitle:", title)

    # Director
    try:
        director = driver.find_element(By.XPATH, '//p[@class="credits"]//span[@class="prettify"]').text
    except Exception:
        director = "N/A"
    print("Director:", director)

    # Release Year
    try:
        release_year = driver.find_element(By.CSS_SELECTOR, "span.releasedate a").text
    except Exception:
        release_year = "N/A"
    print("Release Year:", release_year)

    # Average Rating (stripped: some drivers return the text padded with spaces)
    try:
        avg_rating = driver.find_element(By.CLASS_NAME, "average-rating").text.strip()
    except Exception:
        avg_rating = "N/A"
    print("Average Rating:", avg_rating)

    # Individual star rating counts, keyed by label such as "3.5★".
    bars = driver.find_elements(By.CSS_SELECTOR, "li.rating-histogram-bar a")
    rating_dict = {}
    for bar in bars:
        try:
            rating_text = bar.get_attribute("data-original-title")
            star, count = parse_rating_bar_text(rating_text)
        except Exception:
            # A bar without a usable tooltip is skipped; its step stays at 0.
            continue
        if star:
            rating_dict[star] = count

    # One column per half-star step, 0 where the page had no bar for it.
    star_levels = [f"{x / 2:.1f}" for x in range(1, 11)]
    rating_columns = {f"X{level}": rating_dict.get(f"{level}★", 0) for level in star_levels}
    # Counts as strings, ascending by star level, for the joined summary field.
    rating_counts = [str(count) for count in rating_columns.values()]

    # Fans
    try:
        fans_elem = driver.find_element(By.CSS_SELECTOR, 'section.ratings-histogram-chart a.all-link.more-link')
        fans_text = fans_elem.text.split()[0]
        fans = parse_shorthand_number(fans_text)
    except Exception:
        fans = "N/A"
    print("Fans:", fans)

    # Watched
    try:
        watched_attr = driver.find_element(
            By.XPATH,
            '//a[contains(@href, "/members/") and contains(@data-original-title, "Watched by")]'
        ).get_attribute("data-original-title")
        # Keep only the digits of the tooltip text.
        watched = re.sub(r"[^\d]", "", watched_attr)
    except Exception:
        watched = "N/A"
    print("Watched:", watched)

    # Likes
    try:
        liked_attr = driver.find_element(
            By.XPATH,
            '//a[contains(@href, "/likes/") and contains(@data-original-title, "Liked by")]'
        ).get_attribute("data-original-title")
        # Keep only the digits of the tooltip text.
        liked = re.sub(r"[^\d]", "", liked_attr)
    except Exception:
        liked = "N/A"
    print("Liked:", liked)

    # Genres (textContent is used so that links in a hidden tab are still read)
    try:
        genres = [a.get_attribute("textContent").strip() for a in driver.find_elements(
            By.XPATH, '//div[@id="tab-genres"]//h3[span[text()="Genres"]]/following-sibling::div[1]//a')]
    except Exception:
        genres = []
    print("Genres:", genres)

    # Themes
    try:
        tags = [a.get_attribute("textContent").strip() for a in driver.find_elements(
            By.XPATH, '//div[@id="tab-genres"]//h3[span[text()="Themes"]]/following-sibling::div[1]//a')]
    except Exception:
        tags = []
    print("Tags:", tags)

    # Cast (blank names are dropped)
    try:
        cast_elements = driver.find_elements(By.XPATH, '//div[@class="cast-list text-sluglist"]//a[@class="text-slug tooltip"]')
        cast_names = (el.get_attribute("textContent").strip() for el in cast_elements)
        cast = [name for name in cast_names if name]
    except Exception:
        cast = ["N/A"]
    print("Cast:", cast)

    # Runtime: first run of digits found in the footer text.
    try:
        footer_elem = driver.find_element(By.CLASS_NAME, "text-footer")
        full_footer_text = footer_elem.get_attribute("textContent")
        runtime = re.search(r"\d+", full_footer_text).group()
    except Exception:
        runtime = "N/A"
    print("Runtime:", runtime)

    # Subtitle/tagline
    try:
        subtitle = driver.find_element(By.CLASS_NAME, "tagline").text
    except Exception:
        subtitle = "N/A"
    print("Subtitle:", subtitle)

    # Movie Description
    try:
        description_elem = driver.find_element(
            By.XPATH, '//section//div[contains(@class, "truncate")]/p'
        )
        description = description_elem.text.strip()
    except Exception:
        description = "N/A"

    return {
        "title": title,
        "director": director,
        "release_year": release_year,
        "runtime": runtime,
        "subtitle": subtitle,
        "description": description,
        "genres": "---".join(genres),
        "tags": "---".join(tags),
        "cast": "---".join(cast),
        "watched": watched,
        "avg_rating": avg_rating,
        "rating_counts": "---".join(rating_counts),
        **rating_columns,
        "fans": fans,
        "liked": liked
    }

In [2]:
# Read the film URLs first, so a missing input file fails before a browser
# session is started.  letterboxd_urls.txt is the file produced by
# URL_extracter.ipynb (Hannes: replaces the earlier letterboxd_movie_data.csv
# input, which no script in the project creates).
with open("letterboxd_urls.txt", "r", encoding="utf-8") as f:
    movie_urls = [line.strip() for line in f if line.strip()]

# Hannes: switched from Chrome to Safari because Chrome is not installed here.
# To go back to headless Chrome:
#   options = Options(); options.add_argument("--headless")
#   driver = webdriver.Chrome(service=Service(), options=options)
driver = webdriver.Safari()

movie_info = []
try:
    for url in movie_urls:
        try:
            print(f"Scraping: {url}")
            row = scrape_movie_data(driver, url)
            movie_info.append(row)
            # Rewrite the CSV after every film so that progress survives a
            # crash or an interrupted run.
            df = pd.DataFrame(movie_info)
            df.to_csv(output_file, index=False)

        except Exception as e:
            # Best effort: report the failure and carry on with the next URL.
            print(f"Failed to scrape {url}: {e}")
finally:
    # Always close the browser session, even on Ctrl-C or an unexpected error.
    driver.quit()

Scraping: https://letterboxd.com/film/time-for-revenge



Title: Time for Revenge
Director: Adolfo Aristarain
Release Year: 1981
Average Rating:  4.1 
bars: [<selenium.webdriver.remote.webelement.WebElement (session="38BFAA16-9C54-4493-A61A-08764329670D", element="node-ECAFD4BC-016C-4DDF-AE44-FA92ACFA7C75")>, <selenium.webdriver.remote.webelement.WebElement (session="38BFAA16-9C54-4493-A61A-08764329670D", element="node-CC55BE71-F663-4F46-A214-B595CDD9F011")>, <selenium.webdriver.remote.webelement.WebElement (session="38BFAA16-9C54-4493-A61A-08764329670D", element="node-B139F6F4-17EB-48C4-A9C1-4D58743F4F9E")>, <selenium.webdriver.remote.webelement.WebElement (session="38BFAA16-9C54-4493-A61A-08764329670D", element="node-56326ADF-F2C3-42B2-952E-CD83324C546C")>, <selenium.webdriver.remote.webelement.WebElement (session="38BFAA16-9C54-4493-A61A-08764329670D", element="node-F6EC6628-C5B1-47D2-9234-FBDF82B2D253")>, <selenium.webdriver.remote.webelement.WebElement (session="38BFAA16-9C54-4493