In [4]:
import re
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
# Destination CSV for the scraped movie rows; written with DataFrame.to_csv in the scraping loop.
output_file = "letterboxd_urls_and_ratings.csv"


# ------------------------
# FUNCTION TO SCRAPE DATA
# ------------------------
def parse_shorthand_number(text):
    """Convert a shorthand count such as "1,234", "12.5k" or "3m" into an int.

    Args:
        text: Count string. Matching is case-insensitive; thousands commas and
            surrounding whitespace are ignored. A trailing "k" multiplies by
            1,000 and a trailing "m" by 1,000,000.

    Returns:
        The count as an int.

    Raises:
        ValueError: If the numeric part cannot be parsed (e.g. an empty string).
    """
    multipliers = {"k": 1_000, "m": 1_000_000}
    cleaned = text.lower().replace(",", "").strip()
    suffix = cleaned[-1:]  # slice, not index, so an empty string does not raise IndexError
    if suffix in multipliers:
        # round() instead of int(): the float product can land a hair below the
        # true value (1.001 * 1000 == 1000.9999999999999), and int() would
        # truncate that to an off-by-one result.
        return round(float(cleaned[:-1]) * multipliers[suffix])
    return int(cleaned)

def parse_rating_bar_text(rating_text):
    """Split a histogram label like "1,234 ★★★½ ..." into a star label and a count.

    The text must begin with a (possibly comma-separated) number, followed by
    whitespace and a run of "★" / "½" glyphs. Each "★" counts as 1 and each
    "½" as 0.5.

    Returns:
        A ``(label, count)`` tuple, e.g. ``("3.5★", 1234)``; ``(None, 0)`` when
        the text does not start with that pattern.
    """
    parsed = re.match(r'([\d,]+)\s+([★½]+)', rating_text)
    if parsed is None:
        return None, 0

    raw_count, glyphs = parsed.groups()
    votes = int(raw_count.replace(',', ''))
    star_value = glyphs.count('★') + 0.5 * glyphs.count('½')
    return f"{star_value}★", votes

def _find_text_or_na(driver, by, selector):
    """Return the text of the first element matching (by, selector), or "N/A" if the lookup fails."""
    try:
        return driver.find_element(by, selector).text
    except Exception:
        return "N/A"


def _tooltip_digits_or_na(driver, xpath):
    """Return only the digits of the matched element's data-original-title attribute, or "N/A" on failure."""
    try:
        tooltip = driver.find_element(By.XPATH, xpath).get_attribute("data-original-title")
        return re.sub(r"[^\d]", "", tooltip)
    except Exception:
        return "N/A"


def _link_texts(driver, xpath):
    """Return the stripped textContent of every element matching xpath, or [] on failure."""
    try:
        return [
            link.get_attribute("textContent").strip()
            for link in driver.find_elements(By.XPATH, xpath)
        ]
    except Exception:
        return []


def scrape_movie_data(driver, url):
    """Load a film page in the given WebDriver and collect its metadata.

    Every field is looked up independently and on a best-effort basis: if a
    lookup fails, that field falls back to a placeholder ("N/A" for scalar
    fields, an empty list for genres/tags, ["N/A"] for cast) instead of
    aborting the whole scrape. Only ``Exception`` subclasses are swallowed, so
    Ctrl-C / kernel interrupts still stop a long scraping run.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the film page to scrape.

    Returns:
        dict with the keys ``title``, ``director``, ``release_year``,
        ``runtime``, ``subtitle``, ``description``, ``genres``, ``tags``,
        ``cast``, ``watched``, ``avg_rating``, ``rating_counts``, ``fans`` and
        ``liked``. List-valued fields are joined into one string with "---".
    """
    driver.get(url)
    # Fixed pause after navigation; presumably gives the page time to render
    # before elements are queried -- TODO confirm / replace with an explicit wait.
    time.sleep(3)

    # Simple text fields
    title = _find_text_or_na(driver, By.CLASS_NAME, "headline-1")
    director = _find_text_or_na(
        driver, By.XPATH, '//p[@class="credits"]//span[@class="prettify"]'
    )
    release_year = _find_text_or_na(driver, By.CSS_SELECTOR, "span.releasedate a")
    avg_rating = _find_text_or_na(driver, By.CLASS_NAME, "average-rating")

    # Hannes: Individual star rating counts (one raw label per histogram bar)
    rating_counts = []
    for bar in driver.find_elements(By.CSS_SELECTOR, "li.rating-histogram-bar"):
        try:
            rating_counts.append(bar.find_element(By.TAG_NAME, "a").text)
        except Exception:
            rating_counts.append("No ratings for this amount of stars")

    # Fans: first whitespace-separated token of the link text, e.g. "1.2k"
    try:
        fans_elem = driver.find_element(
            By.CSS_SELECTOR, 'section.ratings-histogram-chart a.all-link.more-link'
        )
        fans = parse_shorthand_number(fans_elem.text.split()[0])
    except Exception:
        fans = "N/A"

    # Watched / Likes: counts are read from tooltip attributes, keeping digits only
    watched = _tooltip_digits_or_na(
        driver,
        '//a[contains(@href, "/members/") and contains(@data-original-title, "Watched by")]',
    )
    liked = _tooltip_digits_or_na(
        driver,
        '//a[contains(@href, "/likes/") and contains(@data-original-title, "Liked by")]',
    )

    # Genres and themes: textContent is used instead of .text, which would be
    # empty for elements that are not currently displayed.
    genres = _link_texts(
        driver,
        '//div[@id="tab-genres"]//h3[span[text()="Genres"]]/following-sibling::div[1]//a',
    )
    tags = _link_texts(
        driver,
        '//div[@id="tab-genres"]//h3[span[text()="Themes"]]/following-sibling::div[1]//a',
    )

    # Cast: keep only non-empty names
    try:
        cast_elements = driver.find_elements(
            By.XPATH, '//div[@class="cast-list text-sluglist"]//a[@class="text-slug tooltip"]'
        )
        names = (el.get_attribute("textContent").strip() for el in cast_elements)
        cast = [name for name in names if name]
    except Exception:
        cast = ["N/A"]

    # Runtime: first run of digits in the footer text; a missing match raises
    # AttributeError on .group(), which falls through to "N/A".
    try:
        footer_elem = driver.find_element(By.CLASS_NAME, "text-footer")
        full_footer_text = footer_elem.get_attribute("textContent")
        runtime = re.search(r"\d+", full_footer_text).group()
    except Exception:
        runtime = "N/A"

    # Subtitle/tagline
    subtitle = _find_text_or_na(driver, By.CLASS_NAME, "tagline")

    # Movie description
    try:
        description = driver.find_element(
            By.XPATH, '//section//div[contains(@class, "truncate")]/p'
        ).text.strip()
    except Exception:
        description = "N/A"

    return {
        "title": title,
        "director": director,
        "release_year": release_year,
        "runtime": runtime,
        "subtitle": subtitle,
        "description": description,
        "genres": "---".join(genres),
        "tags": "---".join(tags),
        "cast": "---".join(cast),
        "watched": watched,
        "avg_rating": avg_rating,
        "rating_counts": "---".join(rating_counts),
        "fans": fans,
        "liked": liked
    }

In [5]:
# Hannes: switched from Chrome to Safari because Chrome is not installed here.
# To go back to headless Chrome, use:
#   options = Options()
#   options.add_argument("--headless")
#   driver = webdriver.Chrome(service=Service(), options=options)
#
# Hannes: the original script read letterboxd_movie_data.csv, but no other
# script appears to create it. That section was removed; this cell instead
# loads the URL list that URL_extracter.ipynb is said to produce.

# Read the URL list first, skipping blank lines and "NO URL FOUND" placeholders,
# so that a missing file fails before a browser session is opened.
with open("letterboxd_urls.txt", "r") as f:
    movie_urls = [
        url for url in (line.strip() for line in f)
        if url and url != "NO URL FOUND"
    ]

movie_info = []
driver = webdriver.Safari()
try:
    # Loop through movie URLs
    for i, url in enumerate(movie_urls):
        try:
            print(f"\n\nScraping: {url} ({i + 1}/{len(movie_urls)})")
            row = scrape_movie_data(driver, url)
            movie_info.append(row)
            # Checkpoint: rewrite the full CSV after every movie so partial
            # results survive a crash or interrupt.
            df = pd.DataFrame(movie_info)
            df.to_csv(output_file, index=False)

        except Exception as e:
            print(f"\n\nFailed to scrape {url}: {e}")
finally:
    # Always release the browser session, even if the run is interrupted.
    driver.quit()



Scraping: https://letterboxd.com/film/time-for-revenge (1/5746)


Scraping: https://letterboxd.com/film/my-old-ass (2/5746)


Scraping: https://letterboxd.com/film/ohms (3/5746)


Scraping: https://letterboxd.com/film/double-platinum (4/5746)


Scraping: https://letterboxd.com/film/dia-branco (5/5746)


Scraping: https://letterboxd.com/film/rossini-la-donna-del-lago (6/5746)


Scraping: https://letterboxd.com/film/the-colours-of-my-father-a-portrait-of-sam-borenstein (7/5746)


Scraping: https://letterboxd.com/film/sesame-street-elmo-says-boo/** (8/5746)


Scraping: https://letterboxd.com/film/play-dead-2011 (9/5746)


Scraping: https://letterboxd.com/film/another-you-2017 (10/5746)


Scraping: https://letterboxd.com/film/50-nuts-2 (11/5746)


Scraping: https://letterboxd.com/film/wwe-history-of-the-hardcore-championship-247 (12/5746)


Scraping: https://letterboxd.com/film/the-fish-the-fly-4-streamers (13/5746)


Scraping: https://letterboxd.com/film/our-man-in-jamaica (14/5746)


S