# Prep

## Imports

In [71]:
import pandas as pd
import numpy as np
import requests
import pprint
import json
import os
import re

from dotenv import load_dotenv
from bs4 import BeautifulSoup

## Constants

In [72]:
# Folder prefix for the film-club CSV exports (it is concatenated directly with
# the file names in the export cell further down).
# NOTE(review): this is a raw string that ALSO doubles its backslashes, so the
# value holds two backslashes per separator - confirm that is intended.
FILMCLUB_FOLDER = r"C:\\Users\\User\\Documents\\GitHub\\movies\\film_club_data\\"

## Auth

In [73]:
# Load the variables stored in the local .env file into the process environment.
load_dotenv(r"C:\\Users\\User\\Documents\\GitHub\\movies\\tmdb_auth.env")

# os.getenv returns None when the variable is not set, so both names may be None.
api_key = os.getenv("API_KEY")
access_token = os.getenv("ACCESS_TOKEN")

# TMDB API Data

In [74]:
# Disabled TMDB request setup, kept inside a string literal so that running the
# cell only assigns text to `hold`; nothing in here is executed.
hold = '''
tmdb_url = "https://api.themoviedb.org/3/account/21623434/rated/movies?language=en-US&page=1&sort_by=created_at.asc"

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {access_token}"
}
'''

#response = requests.get(tmdb_url, headers=headers)


In [75]:
#data = json.loads(response.text)
#data['results']

# Scrape Letterboxd Data

## Functions

### Extract raw HTML data

In [76]:
# reads through a Letterboxd list and gets the url for each movie in it

def get_film_urls_lbxdlist(list_url):
    """Return the film links found on a Letterboxd list page.

    Args:
        list_url: Full URL of the list page to read.

    Returns:
        The ``data-target-link`` value of every ``div`` with class
        ``film-poster`` on the page, in document order. Only the page at
        ``list_url`` itself is fetched.

    Raises:
        KeyError: If a poster ``div`` has no ``data-target-link`` attribute.
    """
    content = requests.get(list_url).text
    # Name the parser explicitly, like the other scrapers in this file. The
    # generic 'html' feature lets Beautiful Soup pick whichever HTML parser
    # happens to be installed, so the parse could differ between machines.
    soup = BeautifulSoup(content, 'html.parser')

    return [
        poster['data-target-link']
        for poster in soup.find_all('div', class_='film-poster')
    ]

In [77]:
# loops through all the pages of films watched in a user's account and gets the urls of each one

def get_film_urls_lbxduser(username):
    """Collect the film links from every page of a user's films listing.

    Progress is printed once per page, followed by "Finished.".

    Args:
        username: Letterboxd username, interpolated into the listing URL.

    Returns:
        The ``data-target-link`` value of every ``div`` with class
        ``film-poster``, page after page, in document order.

    Raises:
        ValueError: If the last pagination item does not contain a number.
        KeyError: If a poster ``div`` has no ``data-target-link`` attribute.
    """
    first_page = BeautifulSoup(
        requests.get(f'https://letterboxd.com/{username}/films/').text,
        'html.parser',
    )

    # The page count is read from the last pagination item. When the page has
    # no pagination items at all (presumably because every film fits on one
    # page), indexing [-1] would raise IndexError, so fall back to one page.
    page_items = first_page.find_all('li', 'paginate-page')
    pages = int(page_items[-1].get_text()) if page_items else 1

    url_list = []
    for page in range(1, pages + 1):
        print(f"Extracting page {page} out of {pages}.")
        url = f'https://letterboxd.com/{username}/films/page/{page}/'
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')

        url_list.extend(
            poster['data-target-link']
            for poster in soup.find_all('div', 'film-poster')
        )

    print("Finished.")

    return url_list

In [78]:
# extracts the complete, raw HTML from a URL

def get_raw_film_html(film_url):
    """Download a film page and return it as a parsed document.

    Args:
        film_url: Site-relative film path (for example ``/film/vertigo/``);
            it is appended to ``https://letterboxd.com``.

    Returns:
        The page parsed by BeautifulSoup with the ``html.parser`` backend.
    """
    page_address = "https://letterboxd.com" + film_url
    response = requests.get(page_address)

    return BeautifulSoup(response.text, 'html.parser')

### Structuring

In [96]:
# goes through the raw HTML. extracts and structures general data and metadata about the film 

def get_general_film_data(soup):
    """Extract a film's identifiers and general metadata from its parsed page.

    Args:
        soup: Parsed film page (the object returned by ``get_raw_film_html``).

    Returns:
        A dict of strings with the keys ``letterboxd_id``,
        ``letterboxd_shorttitle``, ``letterboxd_longtitle``,
        ``letterboxd_slug``, ``letterboxd_url``, ``imdb_url``, ``tmdb_url``,
        ``tmdb_id``, ``release_year``, ``duration`` and ``avg_rating``.
        ``imdb_url``, ``duration`` and ``avg_rating`` are optional and are
        ``''`` when the page does not provide them.

    Raises:
        TypeError, AttributeError: If a required element is missing from the
            page (the lookup returns ``None``).
        KeyError: If a required element lacks the expected attribute.
    """
    backdrop = soup.find(id='backdrop')
    tmdb_url = soup.find('a', {'data-track-action': 'TMDb'})['href']

    general_data = {
        'letterboxd_id': backdrop['data-film-id'],
        'letterboxd_shorttitle': soup.find('h1', class_='filmtitle').get_text(),
        'letterboxd_longtitle': soup.find(property='og:title')['content'],
        'letterboxd_slug': backdrop['data-film-slug'],
        'letterboxd_url': soup.find(property='og:url')['content'],
        'imdb_url': '',
        'tmdb_url': tmdb_url,
        # Drop a trailing slash first so the id is the last path segment
        # whether or not the link ends with one.
        'tmdb_id': tmdb_url.rstrip('/').split('/')[-1],
        'release_year': soup.find(class_='releaseyear').find('a').get_text(strip=True),
        'duration': '',
        'avg_rating': '',
    }

    # The remaining fields are optional: each one keeps its '' placeholder
    # unless the page really contains the element. Explicit None checks
    # replace the former bare `except:` clauses, which would also have hidden
    # unrelated errors (and KeyboardInterrupt).
    footer = soup.find(class_='text-footer')
    if footer is not None:
        # The footer uses non-breaking spaces; normalise them before matching.
        footer_text = footer.get_text().replace('\xa0', ' ').strip()
        minutes = re.search(r'(\d+)\s+mins', footer_text)
        if minutes is not None:
            general_data['duration'] = minutes.group(1)

    rating_meta = soup.find('meta', attrs={'name': 'twitter:data2'})
    if rating_meta is not None:
        # Keep only the text before " out" (e.g. "3.9 out of 5" -> "3.9").
        general_data['avg_rating'] = rating_meta.get('content', '').split(' out')[0]

    imdb_link = soup.find('a', {'data-track-action': 'IMDb'})
    if imdb_link is not None:
        general_data['imdb_url'] = imdb_link.get('href', '')

    return general_data

In [80]:
# goes through the raw HTML. extracts and structures data about the movie's cast

def get_film_cast(soup):
    """Extract the cast entries from a parsed film page.

    Args:
        soup: Parsed film page.

    Returns:
        One dict per ``a.tooltip`` link inside the ``div.cast-list`` element,
        with the keys ``name`` (link text), ``link`` (``href``) and
        ``character_name`` (``title`` attribute). ``link`` and
        ``character_name`` are ``None`` when the attribute is absent. The list
        is empty when the page has no cast section.
    """
    cast_section = soup.find(name='div', class_='cast-list')
    if cast_section is None:
        return []

    # Attributes are read with .get() so that one incomplete anchor yields
    # None for that field; previously any error was swallowed by a bare
    # `except:` that also threw away every entry collected so far.
    return [
        {
            'name': member.get_text(strip=True),
            'link': member.get('href'),
            'character_name': member.get('title'),
        }
        for member in cast_section.find_all('a', class_='tooltip')
    ]

In [90]:
# goes through the raw HTML. extracts and structures data about the movie's crew

def get_film_crew(soup):
    """Extract the crew entries from a parsed film page.

    Args:
        soup: Parsed film page.

    Returns:
        One dict per link inside the ``tab-crew`` element, with the keys
        ``name`` (link text), ``role`` (the first path segment of the link,
        e.g. ``director`` for ``/director/some-name/``) and ``link``
        (``href``). ``role`` is ``None`` when the link has no such segment.
        The list is empty when the page has no crew tab.
    """
    crew_tab = soup.find(id='tab-crew')
    if crew_tab is None:
        return []

    crew_list = []
    for member in crew_tab.find_all('a'):
        link = member.get('href')
        # A site-relative link splits into ['', role, slug, ''], so index 1 is
        # the role. Anything without that shape gets role=None instead of
        # raising inside a bare `except:` that emptied the whole list.
        segments = link.split('/') if link else []
        crew_list.append({
            'name': member.get_text(strip=True),
            'role': segments[1] if len(segments) > 1 else None,
            'link': link,
        })

    return crew_list

In [82]:
# goes through the raw HTML. extracts and structures data about other details concerning the movie

def get_film_details(soup):
    """Extract studio, country and language entries from a parsed film page.

    Args:
        soup: Parsed film page.

    Returns:
        One dict per link inside the ``tab-details`` element, with the keys
        ``key``, ``value`` (link text) and ``link`` (``href``). ``key`` is
        ``'studio'``, ``'country'`` or ``'language'`` depending on which of
        those words the link contains, or ``'ERROR'`` when none matches. The
        list is empty when the page has no details tab, matching what the cast
        and crew extractors return for a missing section.

    Raises:
        KeyError: If a link inside the tab has no ``href`` attribute.
    """
    details_tab = soup.find(id='tab-details')
    if details_tab is None:
        return []

    # Checked in this order; the first word found in the link wins.
    known_keys = ('studio', 'country', 'language')

    details_list = []
    for detail in details_tab.find_all('a'):
        link = detail['href']
        details_list.append({
            'key': next((key for key in known_keys if key in link), 'ERROR'),
            'value': detail.get_text(strip=True),
            'link': link,
        })

    return details_list

In [83]:
# goes through the raw HTML. extracts and structures data about the movie's genres and themes

def get_film_genres(soup):
    """Extract the genre and theme labels from a parsed film page.

    Args:
        soup: Parsed film page.

    Returns:
        The text of every link inside the ``tab-genres`` element except the
        last one, in document order. The list is empty when the page has no
        genres tab, matching what the cast and crew extractors return for a
        missing section.
    """
    genres_tab = soup.find(id='tab-genres')
    if genres_tab is None:
        return []

    labels = [link.get_text(strip=True) for link in genres_tab.find_all('a')]

    # The final link is always dropped - presumably it is a trailing
    # "show all"-style link rather than a genre; TODO confirm against the page.
    return labels[:-1]

### Extraction Loops

In [84]:
# fetches a single film page and runs the previous functions on it, unifying all the relevant data in a dict

def get_complete_film_data(film_url):
    """Fetch one film page and gather everything the extractors find on it.

    Args:
        film_url: Site-relative film path, passed on to ``get_raw_film_html``.

    Returns:
        A dict with the keys ``general_data``, ``cast``, ``crew``, ``details``
        and ``genres_and_themes``, each holding the result of the matching
        extractor applied to the same parsed page.
    """
    page = get_raw_film_html(film_url)

    # (section name, extractor) pairs, applied in this order.
    extractors = (
        ('general_data', get_general_film_data),
        ('cast', get_film_cast),
        ('crew', get_film_crew),
        ('details', get_film_details),
        ('genres_and_themes', get_film_genres),
    )

    return {section: extract(page) for section, extract in extractors}

In [85]:
# loops through all URLs in a list, extracting and structuring data from all of them

def get_all_films(url_list):
    """Scrape every film in ``url_list`` and return the results in order.

    A progress message with the zero-based position and the URL is printed
    before each film is processed.

    Args:
        url_list: Iterable of site-relative film paths.

    Returns:
        A list with one ``get_complete_film_data`` result per URL.
    """
    collected = []

    for position, film_url in enumerate(url_list):
        print(f"Extracting from URL #{position}:\n{film_url}\n")
        collected.append(get_complete_film_data(film_url))

    return collected

### Dataframing

In [86]:
# transforms the data dictionaries into dataframes

def dicts_to_dfs(data):
    """Turn scraped film dicts into one DataFrame per kind of data.

    Args:
        data: Iterable of film dicts with the keys ``general_data`` (a dict
            that includes ``letterboxd_id`` and ``letterboxd_shorttitle``),
            ``cast``, ``crew``, ``details`` and ``genres_and_themes`` (lists).

    Returns:
        A dict with the keys ``df_gdata``, ``df_cast``, ``df_crew``,
        ``df_details`` and ``df_gthemes``. ``df_gdata`` has one row per film;
        the others have one row per list entry plus ``film_id`` and
        ``film_title`` columns identifying the film. The genres frame keeps
        its values in a column named ``0``. Per-film frames are concatenated
        without resetting the index. When ``data`` is empty every value is an
        empty DataFrame.
    """
    # Output name -> key of the per-film list that frame is built from.
    section_keys = {
        'df_cast': 'cast',
        'df_crew': 'crew',
        'df_details': 'details',
        'df_gthemes': 'genres_and_themes',
    }

    frames = {'df_gdata': []}
    frames.update({name: [] for name in section_keys})

    for film in data:
        general = film['general_data']
        # Named film_id (not `id`) so the builtin is not shadowed.
        film_id = general['letterboxd_id']
        film_title = general['letterboxd_shorttitle']

        frames['df_gdata'].append(pd.DataFrame([general]))
        for name, key in section_keys.items():
            frames[name].append(
                pd.DataFrame(film[key]).assign(film_id=film_id, film_title=film_title)
            )

    # pd.concat raises ValueError when given an empty list, so an empty input
    # is mapped to empty frames instead.
    return {
        name: pd.concat(parts) if parts else pd.DataFrame()
        for name, parts in frames.items()
    }

# Extract, create and treat DFs - Film Club Movies

In [87]:
#filmclub_film_urls = get_film_urls_lbxdlist("https://letterboxd.com/dromemario/list/fff-film-fueled-friends/")

#filmclub_films_data = get_all_films(film_urls)

#all_dfs_dict = dicts_to_dfs(films_data)

In [88]:
# Disabled post-processing for the film-club DataFrames, kept inside a string
# literal so that running the cell only assigns text to `notfornow`.
# The text selects/reorders columns, casts dtypes, prefixes links with
# 'letterboxd.com' and resets the index of each frame in `all_dfs_dict`.
# NOTE(review): if this is re-enabled, the int64/float64 casts of 'duration'
# and 'avg_rating' will fail for films where get_general_film_data stored ''.
notfornow = '''
df_generaldata = (
    all_dfs_dict['df_gdata'][[
        'letterboxd_id',
        'letterboxd_shorttitle',
        'letterboxd_longtitle',
        'letterboxd_slug',
        'tmdb_id',
        'release_year',
        'duration',
        'avg_rating',
        'letterboxd_url',
        'tmdb_url',
        'imdb_url'
        ]]
    .astype({
        'release_year': 'int64',
        'duration': 'int64',
        'avg_rating': 'float64',
        'letterboxd_url': 'string',
        'tmdb_url': 'string',
        'imdb_url': 'string'
        })
    .reset_index(drop=True)
)

df_cast = (
    all_dfs_dict['df_cast'][[
        'film_id',
        'film_title',
        'name',
        'link',
        'character_name'
    ]]
    .assign(link = 'letterboxd.com' + all_dfs_dict['df_cast']['link'])
    .reset_index(drop=True)
    .astype({'link': 'string'})
)

df_crew = (
    all_dfs_dict['df_crew'][[
        'film_id',
        'film_title',
        'name',
        'role',
        'link',
    ]]
    .assign(link = 'letterboxd.com' + all_dfs_dict['df_crew']['link'])
    .reset_index(drop=True)
    .astype({'link': 'string'})
)

df_details = (
    all_dfs_dict['df_details'][[
        'film_id',
        'film_title',
        'key',
        'value',
        'link',
    ]]
    .assign(link = 'letterboxd.com' + all_dfs_dict['df_details']['link'])
    .reset_index(drop=True)
    .astype({'link': 'string'})
)

df_genresthemes = (
    all_dfs_dict['df_gthemes'].rename(columns={0: 'value'})[[
        'film_id',
        'film_title',
        'value'
    ]]
    .reset_index(drop=True)
)
'''

In [54]:
# Disabled CSV export of the film-club DataFrames, kept inside a string literal
# so that running the cell only assigns text to `notfornowagain`.
# The text writes five ';'-separated files, without the index, under FILMCLUB_FOLDER.
notfornowagain = '''
df_generaldata.to_csv(f'{FILMCLUB_FOLDER}fc_generaldata.csv', sep=';', index=False)
df_cast.to_csv(f'{FILMCLUB_FOLDER}fc_cast.csv', sep=';', index=False)
df_crew.to_csv(f'{FILMCLUB_FOLDER}fc_crew.csv', sep=';', index=False)
df_details.to_csv(f'{FILMCLUB_FOLDER}fc_details.csv', sep=';', index=False)
df_genresthemes.to_csv(f'{FILMCLUB_FOLDER}fc_genresthemes.csv', sep=';', index=False)
'''

# Extract, create and treat DFs - Single User

In [55]:
# Collect the site-relative link of every film on this user's films pages.
user_film_urls = get_film_urls_lbxduser('dromemario')

Extracting page 1 out of 15.
Extracting page 2 out of 15.
Extracting page 3 out of 15.
Extracting page 4 out of 15.
Extracting page 5 out of 15.
Extracting page 6 out of 15.
Extracting page 7 out of 15.
Extracting page 8 out of 15.
Extracting page 9 out of 15.
Extracting page 10 out of 15.
Extracting page 11 out of 15.
Extracting page 12 out of 15.
Extracting page 13 out of 15.
Extracting page 14 out of 15.
Extracting page 15 out of 15.
Finished.


In [None]:
# Scrape every collected film page (one request per film); progress is printed per URL.
wholedata = get_all_films(user_film_urls)

Extracting from URL #0:
/film/repulsion/

Extracting from URL #1:
/film/the-umbrellas-of-cherbourg/

Extracting from URL #2:
/film/dr-strangelove-or-how-i-learned-to-stop-worrying-and-love-the-bomb/

Extracting from URL #3:
/film/from-russia-with-love/

Extracting from URL #4:
/film/barren-lives/

Extracting from URL #5:
/film/8-half/

Extracting from URL #6:
/film/harakiri/

Extracting from URL #7:
/film/antoine-and-colette/

Extracting from URL #8:
/film/cleo-from-5-to-7/

Extracting from URL #9:
/film/la-jetee/

Extracting from URL #10:
/film/through-a-glass-darkly/

Extracting from URL #11:
/film/psycho/

Extracting from URL #12:
/film/pickpocket/

Extracting from URL #13:
/film/hiroshima-mon-amour/

Extracting from URL #14:
/film/the-400-blows/

Extracting from URL #15:
/film/vertigo/

Extracting from URL #16:
/film/witness-for-the-prosecution-1957/

Extracting from URL #17:
/film/12-angry-men/

Extracting from URL #18:
/film/the-seventh-seal/

Extracting from URL #19:
/film/forbi

# Tests

In [97]:
# Fetch a single film page (position 66 in the user's list) as a sample for manual checks.
testsoup = get_raw_film_html(user_film_urls[66])

In [98]:
# Display the general data extracted from the sample page.
get_general_film_data(testsoup)

{'letterboxd_id': '882450',
 'letterboxd_shorttitle': 'Top Gunner: Danger Zone',
 'letterboxd_longtitle': 'Top Gunner: Danger Zone (2022)',
 'letterboxd_slug': 'top-gunner-danger-zone',
 'letterboxd_url': 'https://letterboxd.com/film/top-gunner-danger-zone/',
 'imdb_url': 'http://www.imdb.com/title/tt20726444/maindetails',
 'tmdb_url': 'https://www.themoviedb.org/movie/980083/',
 'tmdb_id': '980083',
 'release_year': '2022',
 'duration': '86',
 'avg_rating': ''}