# Prep

## Imports

In [2]:
import pandas as pd
import numpy as np
import requests
import pprint
import json
import os
import re

from dotenv import load_dotenv
from bs4 import BeautifulSoup

## Auth

In [3]:
# Location of the dotenv file that holds the TMDB credentials.
# The TMDB_ENV_FILE environment variable overrides it, so the notebook can run
# on a machine where the original absolute Windows path does not exist; when
# the variable is unset the original path is used unchanged.
env_file = os.getenv(
    "TMDB_ENV_FILE",
    r"C:\\Users\\User\\Documents\\GitHub\\movies\\tmdb_auth.env",
)
load_dotenv(env_file)

# os.getenv returns None for a variable that is not set.
api_key = os.getenv("API_KEY")
access_token = os.getenv("ACCESS_TOKEN")

# TMDB API Data

In [4]:
# TMDB endpoint listing the movies rated by one account: page 1, en-US,
# sorted by created_at ascending.
# NOTE(review): the account id and the page number are hard-coded in the URL.
tmdb_url = "https://api.themoviedb.org/3/account/21623434/rated/movies?language=en-US&page=1&sort_by=created_at.asc"

# Request headers: ask for JSON and send the access token read from the
# environment as a bearer token.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {access_token}"
}

# The request itself is currently disabled.
#response = requests.get(tmdb_url, headers=headers)


In [5]:
#data = json.loads(response.text)
#data['results']

# Letterboxd Scraping Data

## Functions

### Extraction and general structuring

In [6]:
# reads through a Letterboxd list and gets the url for each movie in it

def get_film_urls(list_url, timeout=30):
    """Collect the film links found on a Letterboxd list page.

    Args:
        list_url: Full URL of the list page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: The ``data-target-link`` attribute of every
        ``<div class="film-poster">`` on the page, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (instead of silently parsing the error page into an empty list).
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If a poster div has no ``data-target-link`` attribute.
    """
    response = requests.get(list_url, timeout=timeout)
    response.raise_for_status()

    # Name the parser explicitly (same one get_raw_film_html uses) so the
    # result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    return [div['data-target-link'] for div in soup.find_all('div', class_='film-poster')]

In [7]:
# extracts the complete, raw HTML from a film's Letterboxd URL

def get_raw_film_html(film_url, timeout=30):
    """Download a film's Letterboxd page and return it parsed.

    Args:
        film_url: Site-relative path of the film; it is appended to
            ``https://letterboxd.com``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        BeautifulSoup: The page parsed with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (instead of handing an error page to the extraction functions).
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://letterboxd.com" + film_url
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html.parser')

In [8]:
# goes through the raw HTML. extracts and structures general data and metadata about the film 

def get_general_film_data(soup):
    """Extract the general data and metadata of a film from its parsed page.

    Args:
        soup: Parsed film page (anything offering BeautifulSoup's ``find``).

    Returns:
        dict: String values under the keys ``letterboxd_id``,
        ``letterboxd_shorttitle``, ``letterboxd_longtitle``,
        ``letterboxd_slug``, ``letterboxd_url``, ``imdb_url``, ``tmdb_url``,
        ``tmdb_id``, ``release_year``, ``duration`` and ``avg_rating``.

    Raises:
        AttributeError / TypeError: If an expected element is missing from
            the page or the footer holds no ``<digits> mins`` duration.
    """
    # The footer separates its parts with non-breaking spaces; normalise them
    # so the duration regex can rely on ordinary whitespace.
    footer_text = soup.find(class_='text-footer').get_text().replace('\xa0', ' ').strip()

    backdrop = soup.find(id='backdrop')
    film_id = backdrop['data-film-id']
    short_title = soup.find('h1', class_='filmtitle').get_text()
    long_title = soup.find(property='og:title')['content']
    slug = backdrop['data-film-slug']
    page_url = soup.find(property='og:url')['content']
    imdb_link = soup.find('a', {'data-track-action': 'IMDb'})['href']
    tmdb_link = soup.find('a', {'data-track-action': 'TMDb'})['href']
    year = soup.find(class_='releaseyear').find('a').get_text(strip=True)
    minutes = re.search(r'(\d+)\s+mins', footer_text).group(1)
    # The meta content has the form "<rating> out of ..."; keep the rating only.
    rating = soup.find('meta', attrs={'name': 'twitter:data2'})['content'].split(' out')[0]

    return {
        'letterboxd_id': film_id,
        'letterboxd_shorttitle': short_title,
        'letterboxd_longtitle': long_title,
        'letterboxd_slug': slug,
        'letterboxd_url': page_url,
        'imdb_url': imdb_link,
        'tmdb_url': tmdb_link,
        # Second-to-last "/"-separated piece of the TMDB link.
        'tmdb_id': tmdb_link.split('/')[-2],
        'release_year': year,
        'duration': minutes,
        'avg_rating': rating,
    }

In [9]:
# goes through the raw HTML. extracts and structures data about the movie's cast

def get_film_cast(soup):
    """Extract the cast of a film from its parsed page.

    Args:
        soup: Parsed film page (anything offering BeautifulSoup's ``find``).

    Returns:
        list[dict]: One dict per ``<a class="tooltip">`` inside the
        ``<div class="cast-list">``, with the keys ``name`` (link text),
        ``link`` (``href``) and ``character_name`` (``title`` attribute, or
        ``None`` when the link has none). An empty list is returned when the
        page has no cast section at all.

    Raises:
        KeyError: If a cast link has no ``href`` attribute.
    """
    cast_section = soup.find(name='div', class_='cast-list')
    if cast_section is None:
        # No cast block on the page: report "no cast" instead of crashing.
        return []

    return [
        {
            'name': member.get_text(strip=True),
            'link': member['href'],
            # .get() yields None for a missing title without a bare
            # ``except`` that would also hide unrelated errors.
            'character_name': member.get('title'),
        }
        for member in cast_section.find_all('a', class_='tooltip')
    ]

In [10]:
# goes through the raw HTML. extracts and structures data about the movie's crew

def get_film_crew(soup):
    """Extract the crew of a film from its parsed page.

    Args:
        soup: Parsed film page (anything offering BeautifulSoup's ``find``).

    Returns:
        list[dict]: One dict per link inside the element with id
        ``tab-crew``, with the keys ``name`` (link text), ``role`` (the
        piece of the ``href`` between its first and second ``/``) and
        ``link`` (the ``href`` itself).
    """
    crew_links = soup.find(id='tab-crew').find_all('a')

    return [
        {
            'name': anchor.get_text(strip=True),
            # e.g. "/director/todd-holland-1/" -> "director"
            'role': anchor['href'].split('/')[1],
            'link': anchor['href'],
        }
        for anchor in crew_links
    ]

In [11]:
# goes through the raw HTML. extracts and structures data about other details concerning the movie

def get_film_details(soup):
    """Extract the studio / country / language details of a film.

    Args:
        soup: Parsed film page (anything offering BeautifulSoup's ``find``).

    Returns:
        list[dict]: One dict per link inside the element with id
        ``tab-details``, with the keys ``key``, ``value`` (link text) and
        ``link`` (``href``). ``key`` is ``'studio'``, ``'country'`` or
        ``'language'`` depending on which of those words the ``href``
        contains, or ``'ERROR'`` when it contains none of them.
    """
    # Tested in this order; the first word found in the href wins.
    known_keys = ('studio', 'country', 'language')

    details_list = []
    for detail in soup.find(id='tab-details').find_all('a'):
        href = detail['href']
        key = next((marker for marker in known_keys if marker in href), 'ERROR')
        details_list.append({
            'key': key,
            'value': detail.get_text(strip=True),
            'link': href,
        })

    return details_list

In [12]:
# goes through the raw HTML. extracts and structures data about the movie's genres and themes

def get_film_genres(soup):
    """Extract the genre and theme labels of a film from its parsed page.

    Args:
        soup: Parsed film page (anything offering BeautifulSoup's ``find``).

    Returns:
        list[str]: The stripped text of every link inside the element with
        id ``tab-genres`` except the last one.
    """
    labels = []
    for anchor in soup.find(id='tab-genres').find_all('a'):
        labels.append(anchor.get_text(strip=True))

    # The final link is always discarded.
    # NOTE(review): presumably it is a non-genre "show all" style link - confirm.
    return labels[:-1]

In [13]:
# calls the previous functions on a single film to extract all the relevant data and unify it in a dict

def get_complete_film_data(film_url):
    """Download one film page and run every extractor on it.

    Args:
        film_url: Film location, passed unchanged to ``get_raw_film_html``.

    Returns:
        dict: The extractor results under the keys ``general_data``,
        ``cast``, ``crew``, ``details`` and ``genres_and_themes``.
    """
    page = get_raw_film_html(film_url)

    # Section name -> extractor; the page is fetched once and shared by all.
    extractors = {
        'general_data': get_general_film_data,
        'cast': get_film_cast,
        'crew': get_film_crew,
        'details': get_film_details,
        'genres_and_themes': get_film_genres,
    }

    return {section: extract(page) for section, extract in extractors.items()}

In [14]:
# loops through all URLs in a list, extracting and structuring data from all of them

def get_all_films(url_list):
    """Extract the structured data of every film in ``url_list``.

    Args:
        url_list: Iterable of film locations, each passed to
            ``get_complete_film_data``.

    Returns:
        list[dict]: One ``get_complete_film_data`` result per entry, in the
        same order as ``url_list`` (empty when ``url_list`` is empty).
    """
    return [get_complete_film_data(film_url) for film_url in url_list]

In [15]:
# transforms the data dictionaries into dataframes

def dicts_to_dfs(data):
    """Turn the per-film dictionaries into one DataFrame per topic.

    Args:
        data: Iterable of film dicts holding the keys ``general_data``
            (a dict), ``cast``, ``crew``, ``details`` (lists of dicts) and
            ``genres_and_themes`` (a list of values).

    Returns:
        dict[str, pandas.DataFrame]: The keys ``df_gdata``, ``df_cast``,
        ``df_crew``, ``df_details`` and ``df_gthemes``. ``df_gdata`` has one
        row per film; every other frame has one row per list entry plus the
        columns ``film_id`` and ``film_title`` of the film it came from.
        Each film's rows keep their own 0-based index, so index values
        repeat across films.

    Raises:
        ValueError: Raised by ``pandas.concat`` when ``data`` is empty.
    """
    # Output frame name -> key of the per-film list it is built from.
    section_of = {
        'df_cast': 'cast',
        'df_crew': 'crew',
        'df_details': 'details',
        'df_gthemes': 'genres_and_themes',
    }
    frames = {'df_gdata': []}
    frames.update({frame_name: [] for frame_name in section_of})

    for film in data:
        general = film['general_data']
        film_id = general['letterboxd_id']
        film_title = general['letterboxd_shorttitle']

        frames['df_gdata'].append(pd.DataFrame([general]))

        for frame_name, section in section_of.items():
            # Tag every row with the film it belongs to.
            frames[frame_name].append(
                pd.DataFrame(film[section]).assign(film_id=film_id, film_title=film_title)
            )

    return {frame_name: pd.concat(parts) for frame_name, parts in frames.items()}

### Other treatments

## Extract, create and treat DFs

In [16]:
film_urls = get_film_urls("https://letterboxd.com/dromemario/list/fff-film-fueled-friends/")

# Scrape every film only when the local JSON cache is missing. A fresh
# "Restart & Run All" then works without un-commenting code, and a run with
# the cache in place does not download every film page again.
if not os.path.exists("films_data.json"):
    films_data = get_all_films(film_urls)

    with open("films_data.json", "w") as json_file:
        json.dump(films_data, json_file, indent=4)

In [17]:
# Load the scraped films from the JSON cache in the working directory.
with open("films_data.json", "r") as json_file:
    films_data = json.load(json_file)

In [18]:
# Build one DataFrame per topic (general data, cast, crew, details, genres/themes) from the loaded films.
all_dfs_dict = dicts_to_dfs(films_data)

In [170]:
# General data: fix the column order and give the numeric fields (scraped as
# strings) numeric dtypes.
df_gdata = (
    all_dfs_dict['df_gdata']
    .loc[:, [
        'letterboxd_id',
        'letterboxd_shorttitle',
        'letterboxd_longtitle',
        'letterboxd_slug',
        'tmdb_id',
        'release_year',
        'duration',
        'avg_rating',
        'letterboxd_url',
        'tmdb_url',
        'imdb_url',
    ]]
    .astype({
        'release_year': 'int64',
        'duration': 'int64',
        'avg_rating': 'float64',
    })
    .reset_index(drop=True)
)

# Cast: film identifiers first, then the person and the character played.
df_cast = (
    all_dfs_dict['df_cast']
    .loc[:, ['film_id', 'film_title', 'name', 'link', 'character_name']]
    .reset_index(drop=True)
)

# Crew: film identifiers first, then the person, the role and the link.
df_crew = (
    all_dfs_dict['df_crew']
    .loc[:, ['film_id', 'film_title', 'name', 'role', 'link']]
    .reset_index(drop=True)
)

# Details: film identifiers first, then the key / value / link triple.
df_details = (
    all_dfs_dict['df_details']
    .loc[:, ['film_id', 'film_title', 'key', 'value', 'link']]
    .reset_index(drop=True)
)

# Genres and themes: the unnamed column 0 holds the label; call it 'value'.
df_gthemes = (
    all_dfs_dict['df_gthemes']
    .rename(columns={0: 'value'})
    .loc[:, ['film_id', 'film_title', 'value']]
    .reset_index(drop=True)
)

# Analysis

## General data

In [171]:
median_rating = df_gdata['avg_rating'].median()
# idxmin() returns an index *label*, so the row is fetched with .loc (label
# based) rather than .iloc (position based); the two only coincide while the
# index happens to be 0..n-1.
closest_to_median_row = df_gdata.loc[(df_gdata['avg_rating'] - median_rating).abs().idxmin()]

# Summary figures for the whole list. The three *_lbxd_rating title entries
# hold the short title of the best-rated film, the worst-rated film and the
# film whose rating is closest to the median.
metrics_dict = {
    'movies_watched': len(df_gdata),
    'minutes_watched': df_gdata['duration'].sum(),
    'hours_watched': (df_gdata['duration'].sum() / 60).round(2),
    'avg_movie_length': df_gdata['duration'].mean().round(2),
    'avg_lbxd_rating': df_gdata['avg_rating'].mean().round(2),
    # Single .loc[row, column] lookups instead of chained df.loc[row][column].
    'best_lbxd_rating': df_gdata.loc[df_gdata['avg_rating'].idxmax(), 'letterboxd_shorttitle'],
    'worst_lbxd_rating': df_gdata.loc[df_gdata['avg_rating'].idxmin(), 'letterboxd_shorttitle'],
    'median_lbxd_rating': closest_to_median_row['letterboxd_shorttitle']
}


In [96]:
#[print(f"{k}\n{metrics_dict[k]}\n") for k in metrics_dict]

## Cast

In [172]:
# movie_count = number of distinct films each cast link appears in. Counting
# distinct film_id values (rather than rows) keeps the figure a true film
# count even if a person is listed more than once in the same film's cast.
df_cast['movie_count'] = df_cast.groupby('link')['film_id'].transform('nunique')

In [173]:
# people that appeared in most movies, as well as which movies these were

(
    df_cast
    .query('movie_count > 2')
    # Most frequent people first; ties broken alphabetically by name.
    .sort_values(by=['movie_count', 'name'], ascending=[False, True])
)

Unnamed: 0,film_id,film_title,name,link,character_name,movie_count
800,51387,Evil Dead II,Bruce Campbell,/actor/bruce-campbell/,Ash,4
1241,51386,Army of Darkness,Bruce Campbell,/actor/bruce-campbell/,Ash,4
1380,46897,Bubba Ho-tep,Bruce Campbell,/actor/bruce-campbell/,Elvis Presley / Sebastian Haff,4
1458,51781,Fargo,Bruce Campbell,/actor/bruce-campbell/,Soap Opera Actor on TV (uncredited),4
1132,598882,The Banshees of Inisherin,Brendan Gleeson,/actor/brendan-gleeson/,Colm Doherty,3
1707,277023,Paddington 2,Brendan Gleeson,/actor/brendan-gleeson/,Knuckles McGinty,3
1882,51881,28 Days Later,Brendan Gleeson,/actor/brendan-gleeson/,Frank,3
1175,44952,Invasion of the Body Snatchers,Jeff Goldblum,/actor/jeff-goldblum/,Jack Bellicec,3
2408,333448,Isle of Dogs,Jeff Goldblum,/actor/jeff-goldblum/,Duke (voice),3
3063,47171,The Fly,Jeff Goldblum,/actor/jeff-goldblum/,Seth Brundle,3


## Crew

In [174]:
# movie_count = number of distinct films each crew link appears in. Counting
# distinct film_id values (rather than rows) keeps the figure a true film
# count even if the same link is listed more than once for one film.
df_crew['movie_count'] = df_crew.groupby('link')['film_id'].transform('nunique')

In [175]:
# Peek at the first rows to check the newly added movie_count column.
df_crew.head(3)

Unnamed: 0,film_id,film_title,name,role,link,movie_count
0,51869,The Wizard,Todd Holland,director,/director/todd-holland-1/,1
1,51869,The Wizard,David Chisholm,producer,/producer/david-chisholm/,1
2,51869,The Wizard,Ken Topolsky,producer,/producer/ken-topolsky/,1


In [205]:
main_roles = ['director', 'producer', 'executive-producer', 'writer', 'editor', 'cinematography', 'composer']

# One boolean mask, used as-is and negated, splits the crew into two
# complementary subsets: main roles and everything else.
is_main_role = df_crew['role'].isin(main_roles)

df_crew_mainroles = df_crew.loc[is_main_role]
df_crew_secondaryroles = df_crew.loc[~is_main_role]

In [206]:
dfs_mainroles = {}

# Keep only main-role credits whose link occurs in more than one film, then
# cut that subset per role.
recurring_main_crew = df_crew_mainroles.loc[df_crew_mainroles['movie_count'] > 1]

for role in main_roles:
    dfs_mainroles[role] = (
        recurring_main_crew
        .loc[recurring_main_crew['role'] == role]
        # Both sort keys descending.
        .sort_values(['movie_count', 'name'], ascending=False)
    )

In [209]:
# Secondary-role credits with movie_count > 2, sorted by movie_count and then name (both descending).
# NOTE(review): .iloc[90:110] is a hand-picked window (rows 90-109) into that sorted result.
df_crew_secondaryroles.loc[df_crew_secondaryroles['movie_count'] > 2].sort_values(['movie_count', 'name'], ascending=False).iloc[90:110]

Unnamed: 0,film_id,film_title,name,role,link,movie_count
549,46759,Cobra,Ben Scott,stunts,/stunts/ben-scott-2/,3
815,574385,The Unbearable Weight of Massive Talent,Ben Scott,stunts,/stunts/ben-scott-2/,3
494,47798,They Live,Becky Sullivan,sound,/sound/becky-sullivan/,3
1247,46087,Child's Play,Becky Sullivan,sound,/sound/becky-sullivan/,3
3479,48823,The Fugitive,Becky Sullivan,sound,/sound/becky-sullivan/,3
1858,276291,The Meg,Augie Davis,stunts,/stunts/augie-davis/,3
3265,680358,X,Augie Davis,stunts,/stunts/augie-davis/,3
3329,853822,Pearl,Augie Davis,stunts,/stunts/augie-davis/,3
243,353117,Get Out,Aaron Becker,title-design,/title-design/aaron-becker/,3
2489,591053,Past Lives,Aaron Becker,title-design,/title-design/aaron-becker/,3


## Details

## Genres and Themes

# Tests