In [None]:
import os
import csv
import re
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# ------------------------
# FUNCTION TO SCRAPE DATA
# ------------------------
def parse_shorthand_number(text):
    """Convert a shorthand count such as "1.2K", "3M" or "1,234" to an int.

    The input is lower-cased, stripped of commas and surrounding whitespace,
    and then interpreted according to its optional one-letter suffix:
    ``k`` (thousand), ``m`` (million) or ``b`` (billion).  Text without a
    recognised suffix must be a plain integer literal.

    Args:
        text: The string to parse.

    Returns:
        The parsed value as an ``int``.

    Raises:
        ValueError: If the text is empty or is not a valid number.
    """
    multipliers = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}

    cleaned = text.lower().replace(",", "").strip()
    if not cleaned:
        raise ValueError("cannot parse an empty string as a number")

    multiplier = multipliers.get(cleaned[-1])
    if multiplier is None:
        # No suffix: keep exact integer parsing.
        return int(cleaned)

    # round() instead of int(): binary floating point can land just below the
    # intended value (1.001 * 1000 == 1000.9999999999999), and truncating
    # would silently drop one from the count.
    return round(float(cleaned[:-1]) * multiplier)

def _element_text(driver, by, locator):
    """Return ``.text`` of the first element matching the locator, or "N/A"."""
    try:
        return driver.find_element(by, locator).text
    except Exception:
        return "N/A"


def _tooltip_digits(driver, xpath):
    """Return only the digits of a link's ``data-original-title``, or "N/A"."""
    try:
        tooltip = driver.find_element(By.XPATH, xpath).get_attribute("data-original-title")
        return re.sub(r"[^\d]", "", tooltip)
    except Exception:
        return "N/A"


def _link_texts(driver, xpath):
    """Return the stripped ``textContent`` of every element matching xpath, or []."""
    try:
        return [
            a.get_attribute("textContent").strip()
            for a in driver.find_elements(By.XPATH, xpath)
        ]
    except Exception:
        return []


def scrape_movie_data(driver, url):
    """Load a film page in the browser and extract one row of movie data.

    Every field is extracted independently and best-effort: if a lookup
    fails, that field falls back to a placeholder instead of aborting the
    whole scrape.  Only ``Exception`` subclasses are suppressed, so Ctrl-C
    (``KeyboardInterrupt``) and ``SystemExit`` still propagate.

    Args:
        driver: A Selenium WebDriver used to load and query the page.
        url: Address of the film page to scrape.

    Returns:
        A list in this order: title, director, runtime, subtitle,
        description, genres, themes, cast, watched, average rating, fans,
        liked.  Genres, themes and cast are "; "-joined strings.  Missing
        scalar fields are "N/A"; ``fans`` is an int when it could be parsed.
    """
    driver.get(url)
    # Fixed pause to give the page time to render before querying it.
    time.sleep(3)

    title = _element_text(driver, By.CLASS_NAME, "headline-1")
    director = _element_text(
        driver, By.XPATH, '//p[@class="credits"]//span[@class="prettify"]'
    )
    avg_rating = _element_text(driver, By.CLASS_NAME, "average-rating")

    # Fans: the link text looks like "1.2K FANS"; only the first token is parsed.
    try:
        fans_elem = driver.find_element(
            By.CSS_SELECTOR, 'section.ratings-histogram-chart a.all-link.more-link'
        )
        print("Fans raw text:", fans_elem.text)
        fans_text = fans_elem.text.split()[0]
        fans = parse_shorthand_number(fans_text)
    except Exception as e:
        print("Could not extract fans:", e)
        fans = "N/A"

    # Watched / Likes: the counts are read from the links' tooltip attribute.
    watched = _tooltip_digits(
        driver,
        '//a[contains(@href, "/members/") and contains(@data-original-title, "Watched by")]',
    )
    liked = _tooltip_digits(
        driver,
        '//a[contains(@href, "/likes/") and contains(@data-original-title, "Liked by")]',
    )

    # Genres / Themes: textContent is used rather than .text, which only
    # returns visible text.
    genres = _link_texts(
        driver,
        '//div[@id="tab-genres"]//h3[span[text()="Genres"]]/following-sibling::div[1]//a',
    )
    tags = _link_texts(
        driver,
        '//div[@id="tab-genres"]//h3[span[text()="Themes"]]/following-sibling::div[1]//a',
    )

    # Cast: blank entries are dropped.
    try:
        cast_elements = driver.find_elements(
            By.XPATH, '//div[@class="cast-list text-sluglist"]//a[@class="text-slug tooltip"]'
        )
        cast_names = (el.get_attribute("textContent").strip() for el in cast_elements)
        cast = [name for name in cast_names if name]
    except Exception:
        cast = ["N/A"]

    # Runtime: the first run of digits found in the footer text.
    try:
        footer_elem = driver.find_element(By.CLASS_NAME, "text-footer")
        full_footer_text = footer_elem.get_attribute("textContent")
        runtime = re.search(r"\d+", full_footer_text).group()
    except Exception:
        runtime = "N/A"

    # Subtitle/tagline
    subtitle = _element_text(driver, By.CLASS_NAME, "tagline")

    # Movie Description
    try:
        description_elem = driver.find_element(
            By.XPATH, '//section//div[contains(@class, "truncate")]/p'
        )
        description = description_elem.text.strip()
        print("Description:", description)
    except Exception as e:
        description = "N/A"
        print(" Description NOT found:", e)

    return [
        title,
        director,
        runtime,
        subtitle,
        description,
        "; ".join(genres),
        "; ".join(tags),
        "; ".join(cast),
        watched,
        avg_rating,
        fans,
        liked,
    ]

# ------------------------
# MAIN SCRIPT
# ------------------------

# Movie URLs
movie_urls = [
    
]

# Set up headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(), options=options)

# Everything that follows runs under try/finally so the browser process is
# always shut down, even if reading or writing the CSV raises.
try:
    # CSV file path
    desktop_path = os.path.expanduser("~/Desktop/letterboxd_movie_data.csv")
    file_exists = os.path.exists(desktop_path)
    # An existing but empty file still needs a header row.
    needs_header = not file_exists or os.path.getsize(desktop_path) == 0

    # Titles already present in the CSV, used to skip duplicates.
    existing_titles = set()
    if file_exists:
        with open(desktop_path, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip header
            for row in reader:
                if row and row[0] != "N/A":
                    existing_titles.add(row[0].strip())

    # Write to CSV (append mode)
    with open(desktop_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)

        # Write header only if the file is new (or empty)
        if needs_header:
            writer.writerow([
                "Movie",
                "Director",
                "Runtime (minutes)",
                "Subtitle",
                "Description",
                "Genres",
                "Themes",
                "Cast",
                "Watched",
                "Average Rating",
                "Fans",
                "Liked",
            ])

        # Loop through movie URLs
        for url in movie_urls:
            try:
                print(f"Scraping: {url}")
                row = scrape_movie_data(driver, url)

                # Skip if title is missing (which means the scrape failed)
                if not row or row[0] == "N/A":
                    print(f"Skipping row for: {url} (scrape failed)")
                    continue

                # Skip duplicates. Titles are stored stripped, so compare the
                # stripped form as well.
                title_key = row[0].strip()
                if title_key in existing_titles:
                    print(f"Skipping duplicate movie: {row[0]}")
                    continue

                writer.writerow(row)
                existing_titles.add(title_key)

            except Exception as e:
                # One bad page must not abort the remaining URLs.
                print(f"Failed to scrape {url}: {e}")
finally:
    driver.quit()

print(f"\n Done! Data saved to: {desktop_path}")

Scraping: https://letterboxd.com/film/xxx-return-of-xander-cage/
Fans raw text: 32 FANS
Description: Xander Cage is left for dead after an incident, though he secretly returns to action for a new, tough assignment with his handler Augustus Gibbons.
Scraping: https://letterboxd.com/film/the-mist/
Fans raw text: 1.2K FANS
Description: After a violent storm, a dense cloud of mist envelops a small Maine town, trapping David Drayton and his five-year-old son in a local grocery store with other local residents. They soon discover that the mist conceals deadly horrors that threaten their lives, and worse, their sanity.
Scraping: https://letterboxd.com/film/toy-story-3/
Fans raw text: 5.4K FANS
Description: Woody, Buzz, and the rest of Andy’s toys haven’t been played with in years. With Andy about to go to college, the gang find themselves accidentally left at a nefarious day care center. The toys must band together to escape and return home to Andy.
Scraping: https://letterboxd.com/film/made/

In [2]:
def perplexity(prompt, system_prompt):
    """Send a system/user prompt pair to the Perplexity chat endpoint.

    The API key is read from the ``PERPLEXITY_API_KEY`` environment variable
    (a ``KeyError`` is raised if it is not set).

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message.

    Returns:
        A ``(content, citations)`` tuple taken from the response, or
        ``("", [])`` if the response lacks one of the expected attributes.
    """
    api_key = os.environ["PERPLEXITY_API_KEY"]
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    client = OpenAI(api_key=api_key, base_url="https://api.perplexity.ai")
    resp = client.chat.completions.create(
        model="llama-3.1-sonar-large-128k-online",
        messages=conversation,
    )

    try:
        answer = resp.choices[0].message.content
        sources = resp.citations
    except AttributeError as err:
        # Report the malformed response and fall back to an empty result.
        print(err)
        print(f"Error with response: {resp}")
        return ("", [])
    return (answer, sources)