# Prep

## Imports

In [2]:
import pandas as pd
import numpy as np
import requests
import pprint
import json
import os
import re

from dotenv import load_dotenv
from bs4 import BeautifulSoup

## Auth

In [3]:
# Load the TMDB credentials from a local .env file into the process environment.
# NOTE(review): the path is absolute and machine-specific, and because this is a
# raw string the doubled backslashes are kept literally — confirm it resolves as
# intended and consider a path relative to the repository instead.
load_dotenv(r"C:\\Users\\User\\Documents\\GitHub\\movies\\tmdb_auth.env")

# os.getenv returns None when a variable is not set, so a missing .env file is
# not reported here.
api_key = os.getenv("API_KEY")
access_token = os.getenv("ACCESS_TOKEN")

# TMDB API Data

In [4]:
# Rated-movies endpoint for one hard-coded TMDB account id; the query string asks
# for page 1 only, sorted by created_at ascending.
tmdb_url = "https://api.themoviedb.org/3/account/21623434/rated/movies?language=en-US&page=1&sort_by=created_at.asc"

# Bearer-token headers built from the ACCESS_TOKEN read in the auth cell.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {access_token}"
}

# The request itself is currently disabled.
#response = requests.get(tmdb_url, headers=headers)


In [5]:
#data = json.loads(response.text)
#data['results']

# Letterboxd Scraping Data

## Functions

### Extraction and general structuring

In [6]:
# reads through a Letterboxd list and gets the url for each movie in it

def get_film_urls(list_url):
    """Return the film links found on a Letterboxd list page.

    Only the single page at ``list_url`` is requested; no further pages are
    followed.

    Args:
        list_url: Full URL of the list page to download.

    Returns:
        list[str]: The ``data-target-link`` attribute of every
        ``<div class="film-poster">`` on the page, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
        KeyError: If a matching div has no ``data-target-link`` attribute.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(list_url, timeout=30)
    # Fail loudly on error pages instead of silently returning an empty list.
    response.raise_for_status()

    # Name the parser explicitly: a generic 'html' lets bs4 pick whichever HTML
    # parser is installed, so results could differ between machines. This also
    # matches the parser used by get_raw_film_html.
    soup = BeautifulSoup(response.text, 'html.parser')

    return [
        div['data-target-link']
        for div in soup.find_all('div', class_='film-poster')
    ]

In [7]:
# extracts the complete, raw HTML from a URL

def get_raw_film_html(film_url):
    """Download one Letterboxd film page and return it parsed.

    Args:
        film_url: Site-relative path of the film; it is appended to
            "https://letterboxd.com".

    Returns:
        BeautifulSoup: The page parsed with the built-in 'html.parser'.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    url = "https://letterboxd.com" + film_url

    # A timeout keeps one stalled request from blocking a whole scraping run.
    response = requests.get(url, timeout=30)
    # Stop here on error pages; parsing them would only fail later with an
    # unrelated AttributeError/TypeError in the extraction functions.
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html.parser')

In [8]:
# goes through the raw HTML. extracts and structures general data and metadata about the film 

def get_general_film_data(soup):
    """Extract the general data and metadata of a film from its parsed page.

    Args:
        soup: Parsed film page (as returned by get_raw_film_html).

    Returns:
        dict: String values under the keys letterboxd_id,
        letterboxd_shorttitle, letterboxd_longtitle, letterboxd_slug,
        letterboxd_url, imdb_url, tmdb_url, tmdb_id, release_year, duration
        and avg_rating, in that order.

    Raises:
        AttributeError / TypeError: If an expected element is missing from the
            page, or the footer text contains no "<number> mins" run.
    """
    # Footer text with non-breaking spaces normalised; the runtime is read from it.
    footer_text = soup.find(class_='text-footer').get_text()
    footer_text = footer_text.replace('\xa0', ' ').strip()

    # The backdrop element carries both the numeric id and the slug.
    backdrop = soup.find(id='backdrop')
    film_id = backdrop['data-film-id']
    short_title = soup.find('h1', class_='filmtitle').get_text()
    long_title = soup.find(property='og:title')['content']
    slug = backdrop['data-film-slug']

    letterboxd_url = soup.find(property='og:url')['content']
    imdb_url = soup.find('a', {'data-track-action': 'IMDb'})['href']
    tmdb_url = soup.find('a', {'data-track-action': 'TMDb'})['href']

    year = soup.find(class_='releaseyear').find('a').get_text(strip=True)
    minutes = re.search(r'(\d+)\s+mins', footer_text).group(1)

    # The meta content is expected to read "<rating> out of ..."; keep the number.
    rating_text = soup.find('meta', attrs={'name': 'twitter:data2'})['content']
    rating = rating_text.split(' out')[0]

    # Second-to-last path segment of the TMDb link, e.g. ".../movie/183/" -> "183".
    tmdb_id = tmdb_url.split('/')[-2]

    return {
        'letterboxd_id': film_id,
        'letterboxd_shorttitle': short_title,
        'letterboxd_longtitle': long_title,
        'letterboxd_slug': slug,
        'letterboxd_url': letterboxd_url,
        'imdb_url': imdb_url,
        'tmdb_url': tmdb_url,
        'tmdb_id': tmdb_id,
        'release_year': year,
        'duration': minutes,
        'avg_rating': rating,
    }

In [9]:
# goes through the raw HTML. extracts and structures data about the movie's cast

def get_film_cast(soup):
    """Extract the cast of a film from its parsed page.

    Args:
        soup: Parsed film page (as returned by get_raw_film_html).

    Returns:
        list[dict]: One dict per ``<a class="tooltip">`` inside the
        ``<div class="cast-list">`` element, with the keys ``name`` (link
        text), ``link`` (href) and ``character_name`` (the ``title``
        attribute, or None when the link has none). An empty list is returned
        when the page has no cast-list element.

    Raises:
        KeyError: If a cast link has no ``href`` attribute.
    """
    cast_container = soup.find(name='div', class_='cast-list')
    if cast_container is None:
        # No cast section on the page: report "no cast" rather than crashing
        # with an AttributeError on None.
        return []

    return [
        {
            'name': member.get_text(strip=True),
            'link': member['href'],
            # .get() yields None for a missing title without a bare `except:`
            # that would also hide unrelated errors.
            'character_name': member.get('title'),
        }
        for member in cast_container.find_all('a', class_='tooltip')
    ]

In [10]:
# goes through the raw HTML. extracts and structures data about the movie's crew

def get_film_crew(soup):
    """Extract the crew of a film from its parsed page.

    Args:
        soup: Parsed film page (as returned by get_raw_film_html).

    Returns:
        list[dict]: One dict per link inside the element with id ``tab-crew``,
        with the keys ``name`` (link text), ``role`` (second "/"-separated
        piece of the href, e.g. "director" for "/director/some-name/") and
        ``link`` (the href itself).
    """
    def _describe(anchor):
        # Builds the record for a single crew link.
        href = anchor['href']
        pieces = href.split('/')
        return {
            'name': anchor.get_text(strip=True),
            'role': pieces[1],
            'link': href,
        }

    crew_links = soup.find(id='tab-crew').find_all('a')
    return [_describe(anchor) for anchor in crew_links]

In [11]:
# goes through the raw HTML. extracts and structures data about other details concerning the movie

def get_film_details(soup):
    """Extract the studio / country / language links of a film.

    Args:
        soup: Parsed film page (as returned by get_raw_film_html).

    Returns:
        list[dict]: One dict per link inside the element with id
        ``tab-details``, with the keys ``key``, ``value`` (link text) and
        ``link`` (href). ``key`` is the first of "studio", "country",
        "language" that occurs as a substring of the href, or "ERROR" when
        none of them does.
    """
    # Checked in this order; the first substring hit wins.
    known_kinds = ('studio', 'country', 'language')

    records = []
    for anchor in soup.find(id='tab-details').find_all('a'):
        href = anchor['href']
        kind = next((name for name in known_kinds if name in href), 'ERROR')
        records.append({
            'key': kind,
            'value': anchor.get_text(strip=True),
            'link': href,
        })

    return records

In [12]:
# goes through the raw HTML. extracts and structures data about the movie's genres and themes

def get_film_genres(soup):
    """Extract the genre and theme labels of a film.

    Args:
        soup: Parsed film page (as returned by get_raw_film_html).

    Returns:
        list[str]: Stripped text of every link inside the element with id
        ``tab-genres`` except the last one.
    """
    labels = []
    for anchor in soup.find(id='tab-genres').find_all('a'):
        labels.append(anchor.get_text(strip=True))

    # The final link is dropped — presumably a non-genre link such as a
    # "show all" entry; TODO confirm against the page markup.
    return labels[:-1]

In [13]:
# calls the previous functions to extract all the relevant data for one film and unify it in a dict

def get_complete_film_data(film_url):
    """Scrape one film page and bundle every extracted section in one dict.

    Args:
        film_url: Site-relative path of the film, passed to get_raw_film_html.

    Returns:
        dict: The keys ``general_data``, ``cast``, ``crew``, ``details`` and
        ``genres_and_themes``, each holding the result of the matching
        extraction function applied to the same parsed page.
    """
    page = get_raw_film_html(film_url)

    # (section name, extractor) pairs, applied in this order to the same page.
    extractors = (
        ('general_data', get_general_film_data),
        ('cast', get_film_cast),
        ('crew', get_film_crew),
        ('details', get_film_details),
        ('genres_and_themes', get_film_genres),
    )

    return {section: extract(page) for section, extract in extractors}

In [14]:
# loops through all URLs in a list, extracting and structuring data from all of them

def get_all_films(url_list):
    """Scrape every film in ``url_list``.

    Args:
        url_list: Iterable of site-relative film paths.

    Returns:
        list[dict]: The get_complete_film_data result for each path, in input
        order.
    """
    return [get_complete_film_data(film_url) for film_url in url_list]

In [15]:
# transforms the data dictionaries into dataframes

def dicts_to_dfs(data):
    """Turn the scraped film dicts into one DataFrame per section.

    Args:
        data: List of film dicts shaped like the output of
            get_complete_film_data (keys ``general_data``, ``cast``, ``crew``,
            ``details``, ``genres_and_themes``).

    Returns:
        dict[str, pandas.DataFrame]: The keys ``df_gdata``, ``df_cast``,
        ``df_crew``, ``df_details`` and ``df_gthemes``. ``df_gdata`` has one
        row per film. Every other frame has one row per entry of the matching
        section plus ``film_id`` and ``film_title`` columns identifying the
        film. The genres/themes list holds plain strings, so its value column
        is named ``0``. Each frame has a fresh 0..n-1 row index.

    Raises:
        ValueError: From ``pd.concat`` when ``data`` is empty.
    """
    # Section key in each film dict -> key of the frame built from it.
    section_to_frame = {
        'cast': 'df_cast',
        'crew': 'df_crew',
        'details': 'df_details',
        'genres_and_themes': 'df_gthemes',
    }

    per_film_frames = {'df_gdata': []}
    per_film_frames.update({frame_key: [] for frame_key in section_to_frame.values()})

    for film in data:
        general = film['general_data']
        # Named film_id rather than `id` so the builtin is not shadowed.
        film_id = general['letterboxd_id']
        film_title = general['letterboxd_shorttitle']

        per_film_frames['df_gdata'].append(pd.DataFrame([general]))

        for section, frame_key in section_to_frame.items():
            per_film_frames[frame_key].append(
                pd.DataFrame(film[section]).assign(film_id=film_id, film_title=film_title)
            )

    # ignore_index=True gives each combined frame unique row labels; without
    # it every film restarts at 0 and label-based lookups become ambiguous.
    return {
        frame_key: pd.concat(frames, ignore_index=True)
        for frame_key, frames in per_film_frames.items()
    }

### Other treatments

## Extract, create and treat DFs

In [16]:
# Collect the film links of the Letterboxd list (performs a network request).
film_urls = get_film_urls("https://letterboxd.com/dromemario/list/fff-film-fueled-friends/")

# Full scrape, kept disabled: when enabled it scrapes every film and writes the
# result to films_data.json, the file the next cell loads.
#films_data = get_all_films(film_urls)

#with open("films_data.json", "w") as json_file:
#    json.dump(films_data, json_file, indent=4)

In [17]:
# Load the previously scraped film data from films_data.json (path relative to
# the working directory) instead of scraping again.
with open("films_data.json", "r") as json_file:
    films_data = json.load(json_file)

In [18]:
# Build one DataFrame per section (general data, cast, crew, details, genres/themes).
all_dfs_dict = dicts_to_dfs(films_data)

In [68]:
# General data: fix the column order and cast the numeric columns, which are
# stored as strings in the scraped data.
df_gdata = (
    all_dfs_dict['df_gdata'][[
        'letterboxd_id',
        'letterboxd_shorttitle',
        'letterboxd_longtitle',
        'letterboxd_slug',
        'tmdb_id',
        'release_year',
        'duration',
        'avg_rating',
        'letterboxd_url',
        'tmdb_url',
        'imdb_url'
    ]]
    .astype({
        'release_year': 'int64',
        'duration': 'int64',
        'avg_rating': 'float64',
    })
    .reset_index(drop=True)
)

# The remaining frames only get a column order with the film identifiers first.
# Each one also gets a fresh 0..n-1 index, like df_gdata, so row labels are
# unique and label-based lookups (.loc, idxmin/idxmax) are unambiguous.
df_cast = (
    all_dfs_dict['df_cast'][[
        'film_id',
        'film_title',
        'name',
        'link',
        'character_name'
    ]]
    .reset_index(drop=True)
)

df_crew = (
    all_dfs_dict['df_crew'][[
        'film_id',
        'film_title',
        'name',
        'role',
        'link',
    ]]
    .reset_index(drop=True)
)

df_details = (
    all_dfs_dict['df_details'][[
        'film_id',
        'film_title',
        'key',
        'value',
        'link',
    ]]
    .reset_index(drop=True)
)

# The genres/themes frame is built from plain strings, so its value column is
# named 0 until it is renamed here.
df_gthemes = (
    all_dfs_dict['df_gthemes'].rename(columns={0: 'value'})[[
        'film_id',
        'film_title',
        'value'
    ]]
    .reset_index(drop=True)
)

# Analysis

In [93]:
# Median of the Letterboxd average ratings, and the film whose rating is closest
# to it (the first such film when several are equally close).
median_rating = df_gdata['avg_rating'].median()
# idxmin() returns an index *label*, so the row must be fetched with .loc;
# positional .iloc only gives the same row while the index is exactly 0..n-1.
closest_to_median_row = df_gdata.loc[(df_gdata['avg_rating'] - median_rating).abs().idxmin()]

# Summary metrics of the list. The best/worst/median entries hold film titles,
# not rating values.
metrics_dict = {
    'movies_watched': len(df_gdata),
    'minutes_watched': df_gdata['duration'].sum(),
    'hours_watched': (df_gdata['duration'].sum() / 60).round(2),
    'avg_movie_length': df_gdata['duration'].mean().round(2),
    'avg_lbxd_rating': df_gdata['avg_rating'].mean().round(2),
    # Single .loc[row, column] lookups instead of chained indexing.
    'best_lbxd_rating': df_gdata.loc[df_gdata['avg_rating'].idxmax(), 'letterboxd_shorttitle'],
    'worst_lbxd_rating': df_gdata.loc[df_gdata['avg_rating'].idxmin(), 'letterboxd_shorttitle'],
    'median_lbxd_rating': closest_to_median_row['letterboxd_shorttitle']
}


In [94]:
# Print each metric as its name, its value and a blank line.
for metric_name, metric_value in metrics_dict.items():
    print(f"{metric_name}\n{metric_value}\n")

movies_watched
70

minutes_watched
7531

hours_watched
125.52

avg_movie_length
107.59

avg_lbxd_rating
3.64

best_lbxd_rating
Dune: Part Two

worst_lbxd_rating
Cats

median_lbxd_rating
Pig



In [30]:
# Display the general-data frame (one row per film).
df_gdata

Unnamed: 0,letterboxd_id,letterboxd_shorttitle,letterboxd_longtitle,letterboxd_slug,tmdb_id,release_year,duration,avg_rating,letterboxd_url,tmdb_url,imdb_url
0,51869,The Wizard,The Wizard (1989),the-wizard,183,1989,100,2.92,https://letterboxd.com/film/the-wizard/,https://www.themoviedb.org/movie/183/,http://www.imdb.com/title/tt0098663/maindetails
0,565852,The Northman,The Northman (2022),the-northman,639933,2022,137,3.79,https://letterboxd.com/film/the-northman/,https://www.themoviedb.org/movie/639933/,http://www.imdb.com/title/tt11138512/maindetails
0,43552,Rad,Rad (1986),rad,13841,1986,93,3.23,https://letterboxd.com/film/rad/,https://www.themoviedb.org/movie/13841/,http://www.imdb.com/title/tt0091817/maindetails
0,353117,Get Out,Get Out (2017),get-out-2017,419430,2017,104,4.16,https://letterboxd.com/film/get-out-2017/,https://www.themoviedb.org/movie/419430/,http://www.imdb.com/title/tt5052448/maindetails
0,508037,RRR,RRR (2022),rrr,579974,2022,185,4.20,https://letterboxd.com/film/rrr/,https://www.themoviedb.org/movie/579974/,http://www.imdb.com/title/tt8178634/maindetails
...,...,...,...,...,...,...,...,...,...,...,...
0,223818,Dude Bro Party Massacre III,Dude Bro Party Massacre III (2015),dude-bro-party-massacre-iii,296503,2015,101,3.34,https://letterboxd.com/film/dude-bro-party-mas...,https://www.themoviedb.org/movie/296503/,http://www.imdb.com/title/tt3699692/maindetails
0,47171,The Fly,The Fly (1986),the-fly-1986,9426,1986,96,3.99,https://letterboxd.com/film/the-fly-1986/,https://www.themoviedb.org/movie/9426/,http://www.imdb.com/title/tt0091064/maindetails
0,890105,Humanist Vampire Seeking Consenting Suicidal P...,Humanist Vampire Seeking Consenting Suicidal P...,humanist-vampire-seeking-consenting-suicidal-p...,988402,2023,91,3.89,https://letterboxd.com/film/humanist-vampire-s...,https://www.themoviedb.org/movie/988402/,http://www.imdb.com/title/tt24216998/maindetails
0,51471,Nosferatu,Nosferatu (1922),nosferatu,653,1922,95,3.88,https://letterboxd.com/film/nosferatu/,https://www.themoviedb.org/movie/653/,http://www.imdb.com/title/tt0013442/maindetails


In [23]:
# Sanity check: total runtime in minutes, the same expression as metrics_dict['minutes_watched'].
print(df_gdata['duration'].sum())

7531


# Tests

In [187]:
# Peek at the first rows of the genres/themes frame.
# NOTE(review): the stored output still shows a column named 0, so it looks like
# it predates the rename to 'value' — re-run the notebook top to bottom to confirm.
df_gthemes.head()

Unnamed: 0,0,film_id,film_title
0,Adventure,51869,The Wizard
1,Family,51869,The Wizard
2,Drama,51869,The Wizard
3,Comedy,51869,The Wizard
4,Underdogs and coming of age,51869,The Wizard


In [180]:
# Pretty-print the general data of the first scraped film, keeping the keys in
# insertion order (sort_dicts=False).
pprint.pprint(films_data[0]['general_data'], sort_dicts=False)

{'letterboxd_id': '51869',
 'letterboxd_shorttitle': 'The Wizard',
 'letterboxd_longtitle': 'The Wizard (1989)',
 'letterboxd_slug': 'the-wizard',
 'letterboxd_url': 'https://letterboxd.com/film/the-wizard/',
 'imdb_url': 'http://www.imdb.com/title/tt0098663/maindetails',
 'tmdb_url': 'https://www.themoviedb.org/movie/183/',
 'tmdb_id': '183',
 'release_year': '1989',
 'duration': '100',
 'avg_rating': '2.92'}
