# Table of Contents

- [Import Libraries](#section1)
- [Load Datasets](#section2)
- [Scrape User Information](#section3)
- [Scrape Movie Information](#section4)
- [Aspects Sentiment](#section5)


## 1. Import necessary libraries <a id="section1"></a>

In [None]:
import numpy as np
import pandas as pd
from letterboxdpy.user import User
import time
import requests
from bs4 import BeautifulSoup
from letterboxdpy.movie import Movie, movie_details, movie_watchers
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import asyncio
import aiohttp
import json
from tqdm.asyncio import tqdm



## 2. Load Train and Test Datasets <a id="section2"></a>

In [7]:
# Load the out-of-period test and train splits. index_col=0 makes the first
# CSV column the DataFrame index instead of a regular data column.
# NOTE: both paths are absolute and specific to one machine.
df_test = pd.read_csv("C:/Users/melika/Desktop/master/Lisa_Thesis/Thesis/out-of-peirod/test_out_of_period.csv", index_col=0)
df_train = pd.read_csv("C:/Users/melika/Desktop/master/Lisa_Thesis/Thesis/out-of-peirod/Train_Big_out_of_period.csv", index_col=0)
# Last expression of the cell: displays the first rows of the train split.
df_train.head()

Unnamed: 0,user_id,film,comment,date,rating,year
4,/dustymoth/,jfk,It was so long that I stopped caring who kille...,2024-10-22,2.0,2024
6,/dustymoth/,salems-lot-2024,Garbage,2024-10-06,1.0,2024
7,/dustymoth/,alien-3,Assembly Cut,2024-09-07,4.0,2024
8,/dustymoth/,lock-stock-and-two-smoking-barrels,The tall tales of toxic men,2024-09-06,0.5,2024
9,/dustymoth/,morocco,Most of those points are for the tuxedo scene,2024-08-21,3.0,2024


In [8]:
print(f"length of train dataframe is: {len(df_train)} and length of test dataframe is: {len(df_test)}")

length of train dataframe is: 540039 and length of test dataframe is: 10021


#### Now, for convenience, we combine the two dataframes.

In [9]:
# Concatenate vertically (default axis=0): train rows first, then test rows.
# ignore_index=True discards both original indexes and renumbers rows from 0.
# NOTE(review): in the recorded output `year` is shown as float (2024.0) after
# this step while it was an integer in df_train -- presumably the test split
# has missing or non-integer years; confirm.
df= pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Row count of the combined frame, followed by a preview of its first rows.
print(len(df))
df.head()

550060


Unnamed: 0,user_id,film,comment,date,rating,year
0,/dustymoth/,jfk,It was so long that I stopped caring who kille...,2024-10-22,2.0,2024.0
1,/dustymoth/,salems-lot-2024,Garbage,2024-10-06,1.0,2024.0
2,/dustymoth/,alien-3,Assembly Cut,2024-09-07,4.0,2024.0
3,/dustymoth/,lock-stock-and-two-smoking-barrels,The tall tales of toxic men,2024-09-06,0.5,2024.0
4,/dustymoth/,morocco,Most of those points are for the tuxedo scene,2024-08-21,3.0,2024.0


#### Extract unique users and unique movies for the next step.

In [10]:
# Get unique user IDs (raw values, still wrapped in slashes, e.g. '/gogor/')
unique_users = df['user_id'].unique()

# Remove the leading and trailing slashes from each user_id
# (str.strip("/") strips both ends), giving plain usernames.
users= [user.strip("/") for user in unique_users]

# Print the raw unique user IDs and how many there are.
# Note that this shows `unique_users`, not the cleaned `users` list.
print(unique_users)
print(len(unique_users))

['/dustymoth/' '/gogor/' '/tomato33/' ... '/sawtvik/' '/shayjordann/'
 '/nobodywaldorf/']
6747


In [11]:
# Get the unique film identifiers; later cells pass these values to
# Movie(...) and get_reviews(...).
films = df['film'].unique()

# Print the unique film identifiers and how many there are.
print(films)
print(len(films))

['jfk' 'salems-lot-2024' 'alien-3' ... 'midnight-runners'
 'eye-of-the-beholder' 'elite-short-stories-carla-samuel']
9131


## 3. Scrape user information <a id="section3"></a>

In [24]:
# Creates and prints a Letterboxd user instance for "gogor" using letterboxdpy.  
user_instance = User("gogor")
print(user_instance)

{
    "username": "gogor",
    "watchlist_length": "67",
    "favorites": [
        [
            "Everything Everywhere All at Once",
            "everything-everywhere-all-at-once"
        ],
        [
            "Rebel Without a Cause",
            "rebel-without-a-cause"
        ],
        [
            "La La Land",
            "la-la-land"
        ],
        [
            "Schindler's List",
            "schindlers-list"
        ]
    ],
    "stats": {
        "Films": "2,577",
        "This year": "53",
        "Lists": "62",
        "Following": "46",
        "Followers": "49"
    }
}


In [27]:
total = len(users)  # Get total number of users to process
row = 1  # Initialize row counter for progress tracking
data = []  # Initialize an empty list to store user data

def safe_int(value):
    """Convert a scraped counter to ``int``, treating missing values as 0.

    The profile stats arrive as strings with thousands separators
    (e.g. ``"2,577"``), so commas are removed before conversion.

    Args:
        value: A string such as ``"2,577"``, a number, or ``None``.

    Returns:
        The integer value, or 0 when ``value`` is ``None``, empty, zero or a
        blank string.

    Raises:
        ValueError: If ``value`` is a non-blank string that is not an integer.
    """
    if isinstance(value, str):
        cleaned = value.replace(",", "").strip()
        # int("") raises ValueError, so blank strings get the same default
        # as None instead of aborting the row being built.
        return int(cleaned) if cleaned else 0
    return int(value) if value else 0

# Fetch the profile of every user and collect one flat record per user.
# Users whose profile cannot be fetched are remembered so they can be retried
# later instead of silently disappearing from the result.
failed_users = []

for position, user in enumerate(users, start=1):
    print(f"{position}/{total}")  # Print progress
    try:
        user_instance = User(user)  # Create a User instance for each user

        # Counters are scraped as strings (possibly with thousands
        # separators), so they all go through safe_int.
        watchlist_length = safe_int(user_instance.watchlist_length)

        # Each favorite is a [title, slug] pair; only the title is kept.
        favorites = [fav[0] for fav in user_instance.favorites] if user_instance.favorites else []

        stats = user_instance.stats or {}  # Default to empty dictionary if None

        # Append the extracted data for this user to the list
        data.append({
            "user": user_instance.username,
            "watchlist_length": watchlist_length,
            "favorites": favorites,
            "films_watched": safe_int(stats.get("Films", 0)),
            "films_this_year": safe_int(stats.get("This year", 0)),
            "lists_created": safe_int(stats.get("Lists", 0)),
            "following": safe_int(stats.get("Following", 0)),
            "followers": safe_int(stats.get("Followers", 0)),
        })

    except Exception as e:
        # Best-effort scrape: report the failure, record the user and move on
        # so one bad profile does not abort the whole run.
        print(f"Error fetching data for {user}: {e}")
        failed_users.append(user)

if failed_users:
    print(f"{len(failed_users)} of {total} users could not be fetched")

# Create a DataFrame from the gathered data
users_info = pd.DataFrame(data)

# Display the DataFrame
users_info.head()


1/5
2/5
3/5
4/5
5/5


Unnamed: 0,user,watchlist_length,favorites,films_watched,films_this_year,lists_created,following,followers
0,dustymoth,73,"[Withnail & I, Eraserhead, The Naked Civil Ser...",2406,52,84,32,26
1,gogor,67,"[Everything Everywhere All at Once, Rebel With...",2577,53,62,46,49
2,tomato33,0,"[Camera Buff, The Empire Strikes Back, Frances...",108,10,3,4,3
3,areyoujoebanks,118,"[It's a Wonderful Life, Twin Peaks: The Return...",712,38,0,128,76
4,glynk,2,"[Mr. Smith Goes to Washington, Your Name., Mag...",2376,99,0,3,0


In [31]:
len(users_info)

6400

In [None]:
#save the user_info DataFrame to a CSV file
users_info.to_csv("C:/Users/melika/Desktop/master/Lisa_Thesis/Thesis/Machine_Learning/Data/users_info.csv")


## 4. Scrape movie information <a id="section4"></a>

In [36]:
def _extract_summary(soup):
    """Return the summary paragraph of a parsed film page, or None if absent."""
    review_div = soup.find("div", class_="review body-text -prose -hero prettify")
    if not review_div:
        return None
    summary_div = review_div.find("div", class_="truncate")
    paragraph = summary_div.find("p") if summary_div else None
    return paragraph.text.strip() if paragraph else None


def _extract_cast(soup):
    """Return a {actor name: absolute URL} dict built from the page's cast tab."""
    cast_div = soup.find("div", id="tab-cast", class_="tabbed-content-block")
    if not cast_div:
        return {}
    cast = {}
    for link in cast_div.find_all("a", class_="text-slug tooltip"):
        href = link.get("href")
        if not href:
            # An anchor without a target would make the string concatenation
            # below raise and discard the whole film; skip just that entry.
            continue
        cast[link.text.strip()] = "https://letterboxd.com" + href
    return cast


# Initialize the session with retries. 429 (Too Many Requests) is retried as
# well, so a rate-limit response is backed off instead of failing the film.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

# Unique film identifiers to scrape
unique_films = films
total_movies = len(unique_films)

# List to store data for all movies
all_movie_data = []

# Iterate through each unique film
for index, film in enumerate(unique_films, start=1):
    # Progress tracking
    print(f"[{index}/{total_movies}] Fetching data for: {film}...")
    try:
        # Fetch movie details from Letterboxd
        movie_instance = Movie(film)
        details = movie_details(movie_instance)
        watchers = movie_watchers(movie_instance)

        # Fetch the film page itself for the fields scraped by hand
        movie_url = movie_instance.url
        response = session.get(movie_url, timeout=10)
        response.raise_for_status()  # Raise an error for bad responses (4xx and 5xx)
        soup = BeautifulSoup(response.content, "html.parser")

        # Collect all the data
        movie_data = {
            "title": movie_instance.title,
            "url": movie_instance.url,
            "directors": movie_instance.directors,
            "rating": movie_instance.rating,
            "year": movie_instance.year,
            "genres": movie_instance.genres,
            "country": details.get("Country", []),
            "studio": details.get("Studio", []),
            "language": details.get("Language", []),
            "watch_count": watchers.get("watch_count", "0"),
            "fan_count": watchers.get("fan_count", "0"),
            "like_count": watchers.get("like_count", "0"),
            "review_count": watchers.get("review_count", "0"),
            "list_count": watchers.get("list_count", "0"),
            "cast_info": _extract_cast(soup),
            "summary": _extract_summary(soup),
        }

        all_movie_data.append(movie_data)
        print(f"[{index}/{total_movies}] Successfully fetched data for: {film}")  # Success message

    except Exception as e:
        # Best-effort scrape: report the failure and continue with the next film.
        print(f"[{index}/{total_movies}] Error processing {film}: {e}")

    # Sleep to avoid hitting rate limits. This runs after failures too:
    # errors are exactly when the site is most likely to be throttling us.
    time.sleep(5)

# Convert the collected data into a DataFrame
movies_info = pd.DataFrame(all_movie_data)
movies_info.head()


[1/9131] Fetching data for: jfk...
[1/9131] Successfully fetched data for: jfk
[2/9131] Fetching data for: salems-lot-2024...
[2/9131] Successfully fetched data for: salems-lot-2024
[3/9131] Fetching data for: alien-3...
[3/9131] Successfully fetched data for: alien-3
[4/9131] Fetching data for: lock-stock-and-two-smoking-barrels...
[4/9131] Successfully fetched data for: lock-stock-and-two-smoking-barrels
[5/9131] Fetching data for: morocco...
[5/9131] Successfully fetched data for: morocco
[6/9131] Fetching data for: maxxxine...
[6/9131] Successfully fetched data for: maxxxine
[7/9131] Fetching data for: a-view-to-a-kill...
[7/9131] Successfully fetched data for: a-view-to-a-kill
[8/9131] Fetching data for: gone-baby-gone...
[8/9131] Successfully fetched data for: gone-baby-gone
[9/9131] Fetching data for: 101-dalmatians...
[9/9131] Successfully fetched data for: 101-dalmatians
[10/9131] Fetching data for: poor-things-2023...
[10/9131] Successfully fetched data for: poor-things-2023


Unnamed: 0,title,url,directors,rating,year,genres,country,studio,language,watch_count,fan_count,like_count,review_count,list_count,cast_info,summary
0,jfk,https://letterboxd.com/film/jfk/,[Oliver Stone],4.05 out of 5,1991,"[Drama, History, Thriller]","[USA, France]","[Warner Bros. Pictures, Regency Enterprises, A...","[English, English, Spanish]",173490,1041,42468,21290,56259,{'Kevin Costner': 'https://letterboxd.com/acto...,Follows the investigation into the assassinati...
1,salems-lot-2024,https://letterboxd.com/film/salems-lot-2024/,[Gary Dauberman],2.39 out of 5,2024,[Horror],[USA],"[New Line Cinema, Atomic Monster, Vertigo Ente...",[English],104715,16,14213,32580,27655,{'Lewis Pullman': 'https://letterboxd.com/acto...,Author Ben Mears returns to his childhood home...
2,alien-3,https://letterboxd.com/film/alien-3/,[David Fincher],2.82 out of 5,1992,"[Horror, Science Fiction, Action]",[USA],"[20th Century Fox, Brandywine Productions]",[English],487939,269,68239,61964,116891,{'Sigourney Weaver': 'https://letterboxd.com/a...,After escaping with Newt and Hicks from the al...
3,lock-stock-and-two-smoking-barrels,https://letterboxd.com/film/lock-stock-and-two...,[Guy Ritchie],3.99 out of 5,1998,"[Comedy, Crime]","[UK, USA]","[The Steve Tisch Company, SKA Films, Handmade ...",[English],363721,3624,96479,21158,55552,{'Vinnie Jones': 'https://letterboxd.com/actor...,A card shark and his unwillingly-enlisted frie...
4,morocco,https://letterboxd.com/film/morocco/,[Josef von Sternberg],3.58 out of 5,1930,"[Romance, Drama]",[USA],[Paramount Pictures],"[English, Arabic, English, French, Italian, Sp...",19531,63,4734,3366,16305,{'Gary Cooper': 'https://letterboxd.com/actor/...,"Mogador, Morocco. Late 1920s. A complex romanc..."


In [None]:
movies_info.to_csv("C:/Users/melika/Desktop/master/Lisa_Thesis/Thesis/Machine_Learning/Data/movies_info.csv")

## 5. Aspect-Based Sentiment Analysis for Movies <a id="section5"></a>

#### First, we need to scrape reviews for each movie.

In [3]:
def get_reviews(film_name):
    """Scrape the popular-review texts shown on a film's Letterboxd page.

    Args:
        film_name: Film identifier as used in the page URL (e.g. ``'anora'``).

    Returns:
        A list of review strings. The list is empty when the page has no
        popular-review section or when the page could not be fetched.
    """
    # Construct the URL using the provided film name
    url = f'https://letterboxd.com/film/{film_name}/'

    try:
        # Without a timeout a single stalled connection can hang a long
        # scraping run indefinitely.
        response = requests.get(url, timeout=10)
        # Do not parse an error page as if it were the film page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching reviews for {film_name}: {e}")
        return []

    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the element containing the reviews
    container = soup.find("ul", {"class": ["film-popular-review"]})
    if container is None:
        return []

    # Collect the text of every review body found in that section
    reviews = []
    for item in container.find_all("div", {"class": ["film-detail-content"]}):
        body = item.find("div", {"class": ['body-text']})
        if body:
            reviews.append(body.text.strip())

    return reviews

# Example usage:
film_name = 'anora'  # Replace with any film name
reviews = get_reviews(film_name)
print(reviews)


['A terrifying tale of dating a mama’s boy', "That's why you shouldn't trust twinks", 'it’s really sad to me that a character driven piece about a woman fails to make a real character out of her. we do not see anora’s desires or emotions outside of a punchline or even outside of her husband who she met the week before. as someone who has actually done sex work (in nyc) before, i was deeply disappointed in baker’s treatment of anora. in the film, the distinction between anora’s work and personal lives is blurred to the point where it’s nonexistent. i believe this is where the majority of my problems with the film arise. the only thing she really wants in this film is to find her husband and stay with him. the way it’s…', 'Sean Baker does for sex workers what Martin Scorsese did for Gangsters', 'Once Igor appears on screen the movie shifts to his perspective. (Or, in a way, it clarifies the perspective it already had.) Baker expects us to be impressed that Igor views Ani as a human being

In [12]:
# Scrape the popular reviews of every film into one flat list of
# {'film': ..., 'review': ...} rows.
results = []
total = len(films)

for row, film in enumerate(films, start=1):
    print(f'Processing film {row}/{total}')
    try:
        film_reviews = get_reviews(film)
    except Exception as e:
        # Same best-effort policy as the other scraping loops: one failing
        # film must not abort a run over thousands of films.
        print(f"Error fetching reviews for {film}: {e}")
        continue
    results.extend({'film': film, 'review': review} for review in film_reviews)

# Create a DataFrame from the results
reviews_df = pd.DataFrame(results)

reviews_df.head()

Processing film 1/9131
Processing film 2/9131
Processing film 3/9131
Processing film 4/9131
Processing film 5/9131
Processing film 6/9131
Processing film 7/9131
Processing film 8/9131
Processing film 9/9131
Processing film 10/9131
Processing film 11/9131
Processing film 12/9131
Processing film 13/9131
Processing film 14/9131
Processing film 15/9131
Processing film 16/9131
Processing film 17/9131
Processing film 18/9131
Processing film 19/9131
Processing film 20/9131
Processing film 21/9131
Processing film 22/9131
Processing film 23/9131
Processing film 24/9131
Processing film 25/9131
Processing film 26/9131
Processing film 27/9131
Processing film 28/9131
Processing film 29/9131
Processing film 30/9131
Processing film 31/9131
Processing film 32/9131
Processing film 33/9131
Processing film 34/9131
Processing film 35/9131
Processing film 36/9131
Processing film 37/9131
Processing film 38/9131
Processing film 39/9131
Processing film 40/9131
Processing film 41/9131
Processing film 42/9131
P

Unnamed: 0,film,review
0,jfk,Enough of this Letterboxd bullshit I gotta fig...
1,jfk,"100/100 ""It's a mystery wrapped in a riddle in..."
2,jfk,STUDY THE PASTBlown away that a movie that cos...
3,jfk,100Maybe one too many scenes of Kevin Costner ...
4,jfk,Joe Pesci being not only good but outstanding ...


In [13]:
print(len(reviews_df))

108177


In [14]:
reviews_df.to_csv("C:/Users/melika/Desktop/master/Lisa_Thesis/Thesis/Machine_Learning/Data/movies_reviews.csv")

#### Now that we have collected reviews for each movie, the volume is overwhelming, so we randomly select up to three reviews per movie to keep it manageable.

In [None]:
# Keep at most 3 randomly chosen reviews per film.
#
# The frame is shuffled once and the first rows of each film group are taken.
# This avoids `groupby(...).apply(...)` followed by `reset_index(drop=True)`,
# which only keeps the `film` column as long as `apply` passes the grouping
# column through to the sampled frames (newer pandas releases deprecate that).
MAX_REVIEWS_PER_FILM = 3
SAMPLE_SEED = 42  # fixed seed so the sampled subset is reproducible between runs

shuffled = reviews_df.sample(frac=1, random_state=SAMPLE_SEED)
result_df = shuffled.groupby('film', sort=False).head(MAX_REVIEWS_PER_FILM)

# Order the rows by film (as the grouped result was) and flatten the index
result_df = result_df.sort_values('film', kind='stable').reset_index(drop=True)

# Display the resulting DataFrame
result_df.head()

#### Now we process the movie reviews by sending them to a language model (DeepSeek), which maps each review to a set of predefined aspects (such as Acting, Direction, Cinematography, etc.) and assigns a sentiment (positive or negative) to each aspect. For example, a review such as "The acting was fantastic, but the story was very weak" would be parsed into aspects like {"Acting": "positive", "Story": "negative"}.

In [None]:
# DeepSeek API settings
import os

# Read the key from the DEEPSEEK_API_KEY environment variable so a real secret
# never has to be saved inside the notebook. The placeholder remains the
# default when the variable is not set.
API_KEY = os.environ.get("DEEPSEEK_API_KEY", "your_api_key")
BASE_URL = "https://api.deepseek.com/v1/chat/completions"

# System prompt
system_prompt = """
The user will provide a review about a movie. Please parse the output in JSON format, identifying multiple aspects and their sentiments.

Valid aspects are: ['Acting', 'Direction', 'Screenplay', 'Cinematography', 'Editing', 'Music/Soundtrack', 'Special Effects', 'Production Design', 'Story', 'Movie Length', 'General'].

EXAMPLE INPUT: 
The acting was fantastic, but the story was very weak. The cinematography was stunning.

EXAMPLE JSON OUTPUT:
{
    "aspects": [
        {"aspect": "Acting", "sentiment": "positive"},
        {"aspect": "Story", "sentiment": "negative"},
        {"aspect": "Cinematography", "sentiment": "positive"}
    ]
}
"""

async def process_reviews_batch(session, reviews):
    """Analyse all reviews of one batch concurrently.

    One ``fetch_review_analysis`` coroutine is created per review and all of
    them are awaited together with ``asyncio.gather``.

    Args:
        session: Client session forwarded to ``fetch_review_analysis``.
        reviews: Iterable of review values for this batch.

    Returns:
        A list with one analysis result per review, in input order.
    """
    pending = []
    for text in reviews:
        pending.append(fetch_review_analysis(session, text))
    batch_results = await asyncio.gather(*pending)
    return batch_results

async def fetch_review_analysis(session, review):
    """Ask the chat-completions endpoint for the aspect sentiments of one review.

    Args:
        session: Client session whose ``post`` is used to send the request.
        review: The review text.

    Returns:
        The JSON object decoded from the reply's message content, or ``None``
        when ``review`` is not a non-blank string or when the request or the
        decoding fails.
    """
    # Missing values (e.g. NaN) and blank texts carry nothing to analyse, so
    # no API call is spent on them.
    if not isinstance(review, str) or not review.strip():
        return None

    payload = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": review}
        ],
        "response_format": {"type": "json_object"}
    }
    headers = {"Authorization": f"Bearer {API_KEY}"}

    try:
        async with session.post(BASE_URL, json=payload, headers=headers) as response:
            # Surface HTTP failures (bad key, rate limit, ...) instead of
            # parsing the error body as if it were a completion.
            response.raise_for_status()
            result = await response.json()
        return json.loads(result.get("choices", [{}])[0].get("message", {}).get("content", "{}"))
    except Exception as e:
        # Best-effort: report the failure and let the caller treat this
        # review as having no parsed aspects.
        print(f"Error processing review: {e}")
        return None

async def process_selected_reviews(df_subset, batch_size=5):
    """Analyse the ``review`` column of ``df_subset`` batch by batch.

    A single client session is opened for the whole run. Batches are handled
    one after another; each batch is handed to ``process_reviews_batch``.

    Args:
        df_subset: DataFrame with a ``review`` column.
        batch_size: Number of reviews per batch.

    Returns:
        A flat list with one analysis result per row, in row order.
    """
    collected = []
    async with aiohttp.ClientSession() as session:
        batch_starts = range(0, len(df_subset), batch_size)
        for start in tqdm(batch_starts, desc="Processing Batches"):
            stop = start + batch_size
            # Positional slice of the 'review' column for this batch
            chunk = df_subset["review"].iloc[start:stop].tolist()
            analysed = await process_reviews_batch(session, chunk)
            collected.extend(analysed)
    return collected

async def process_result_df(result_df, batch_size=5):
    """Attach the parsed aspect sentiments to ``result_df``.

    Every review is analysed through ``process_selected_reviews``; the
    ``"aspects"`` entry of each result is stored in a new ``parsed_aspects``
    column. Rows whose result is empty or has no ``"aspects"`` key get an
    empty list.

    Args:
        result_df: DataFrame with a ``review`` column. It is modified in place.
        batch_size: Number of reviews per batch.

    Returns:
        The same ``result_df`` object, now with the ``parsed_aspects`` column.
    """
    review_results = await process_selected_reviews(result_df, batch_size)

    # One entry per row: the aspects list when present, otherwise a fresh
    # empty list (failed request or reply without aspects).
    parsed_aspects = [
        outcome["aspects"] if outcome and "aspects" in outcome else []
        for outcome in review_results
    ]

    result_df["parsed_aspects"] = parsed_aspects
    return result_df


In [None]:
result_df = await process_result_df(result_df, batch_size=10)

In [16]:
result_df.head()

Unnamed: 0,film,review,aspects_sentiment
0,10000-bc,"OK. I've endured 10 mins of this, now where's ...","[{'aspect': 'General', 'sentiment': 'negative'}]"
1,10000-bc,••••First time watching. iTunes rental. + D'le...,"[{'aspect': 'Acting', 'sentiment': 'neutral'},..."
2,10000-bc,10.000 BC... A tribe talking in english...What...,"[{'aspect': 'Story', 'sentiment': 'negative'},..."
3,10000-for-a-massacre,"Great off-brand Django. Violent, gruff, and ma...","[{'aspect': 'Acting', 'sentiment': 'positive'}..."
4,10000-for-a-massacre,"""Passionate - Violent - the man with no heart ...","[{'aspect': 'Acting', 'sentiment': 'positive'}..."


In [18]:
len(result_df)

27040

In [19]:
len(result_df["film"].unique())

9082

In [17]:
result_df.to_csv("C:/Users/melika/Desktop/master/Lisa_Thesis/Thesis/Machine_Learning/Data/aspects_sentiment.csv")