# Prep

## Imports

In [156]:
import pandas as pd
import numpy as np
import requests
import pprint
import json
import os
import re

from dotenv import load_dotenv
from bs4 import BeautifulSoup

## Auth

In [157]:
# Location of the .env file holding the TMDB credentials. Set the
# TMDB_ENV_FILE environment variable to use a different file; the fallback is
# the original hard-coded path, kept so existing setups behave as before.
# NOTE(review): the fallback is a raw string, so every "\\" in it stays as two
# backslashes in the path handed to load_dotenv -- confirm that is intended.
env_file = os.getenv(
    "TMDB_ENV_FILE",
    r"C:\\Users\\User\\Documents\\GitHub\\movies\\tmdb_auth.env",
)
load_dotenv(env_file)

# os.getenv returns None for a variable that is not set, so both names are
# None when the credentials could not be loaded.
api_key = os.getenv("API_KEY")
access_token = os.getenv("ACCESS_TOKEN")

# TMDB API Data

In [158]:
# Headers sent with TMDB API calls: ask for JSON and authenticate with the
# bearer token read from the environment in the Auth cell.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {access_token}"
}

# First page of the rated-movies listing for one specific account; the
# account id, language, page and sort order are all fixed in the URL.
tmdb_url = "https://api.themoviedb.org/3/account/21623434/rated/movies?language=en-US&page=1&sort_by=created_at.asc"

# The request itself is currently disabled:
#response = requests.get(tmdb_url, headers=headers)


In [159]:
#data = json.loads(response.text)
#data['results']

# Letterboxd Scraping Data

## Functions

In [160]:
def get_film_urls(list_url, timeout=30):
    """Return the film links found on one Letterboxd list page.

    Args:
        list_url: Full URL of the list page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the ``data-target-link`` attribute of every
        ``div.film-poster`` on the page, in document order. Only the single
        page at ``list_url`` is downloaded.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a poster ``div`` has no ``data-target-link`` attribute.
    """
    response = requests.get(list_url, timeout=timeout)
    # Fail loudly on an error status instead of parsing the error page into
    # an empty list.
    response.raise_for_status()

    # Name the parser explicitly (the same one get_raw_film_html uses) so the
    # result does not depend on which HTML parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    return [div['data-target-link'] for div in soup.find_all('div', class_='film-poster')]

In [161]:
def get_raw_film_html(film_url, timeout=30):
    """Download one Letterboxd film page and return it parsed.

    Args:
        film_url: Site-relative path of the film page; it is appended to
            ``https://letterboxd.com`` as-is.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` parser.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get("https://letterboxd.com" + film_url, timeout=timeout)
    # Stop here on an error status; otherwise the extractors would receive an
    # error page and fail later with a much less helpful exception.
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html.parser')

In [162]:
def get_general_film_data(soup):
    """Extract the headline facts about a film from its parsed page.

    Args:
        soup: Parsed film page, as returned by ``get_raw_film_html``.

    Returns:
        A dict with the keys ``letterboxd_id``, ``letterboxd_shorttitle``,
        ``letterboxd_longtitle``, ``letterboxd_slug``, ``letterboxd_url``,
        ``imdb_url``, ``tmdb_url``, ``tmdb_id``, ``release_year``,
        ``duration`` and ``avg_rating``, in that order. Values are kept as
        the text found in the page (no numeric conversion). ``imdb_url``,
        ``tmdb_url``, ``tmdb_id``, ``release_year``, ``duration`` and
        ``avg_rating`` are ``None`` when the page element they come from is
        missing.

    Raises:
        TypeError, AttributeError: If one of the identifying elements
            (backdrop, title heading, ``og:title``, ``og:url``) is missing.
    """
    def attr_or_none(tag, attr_name):
        # Attribute of a tag that may not have been found on the page.
        return tag[attr_name] if tag is not None else None

    backdrop = soup.find(id='backdrop')

    # The runtime is embedded in the footer text, e.g. "100 mins".
    duration = None
    footer = soup.find(class_='text-footer')
    if footer is not None:
        duration_string = footer.get_text().replace('\xa0', ' ').strip()
        # Accept the singular "min" as well as "mins".
        duration_match = re.search(r'(\d+)\s+mins?\b', duration_string)
        if duration_match is not None:
            duration = duration_match.group(1)

    release_year = None
    year_holder = soup.find(class_='releaseyear')
    year_link = year_holder.find('a') if year_holder is not None else None
    if year_link is not None:
        release_year = year_link.get_text(strip=True)

    tmdb_url = attr_or_none(soup.find('a', {'data-track-action': 'TMDb'}), 'href')
    # The id is the last path segment; stripping the trailing slash first
    # makes this work whether or not the link ends in "/".
    tmdb_id = tmdb_url.rstrip('/').split('/')[-1] if tmdb_url else None

    # The rating text looks like "<number> out of ..."; keep only the number.
    rating_text = attr_or_none(soup.find('meta', attrs={'name': 'twitter:data2'}), 'content')
    avg_rating = rating_text.split(' out')[0] if rating_text is not None else None

    return {
        'letterboxd_id': backdrop['data-film-id'],
        'letterboxd_shorttitle': soup.find('h1', class_='filmtitle').get_text(),
        'letterboxd_longtitle': soup.find(property='og:title')['content'],
        'letterboxd_slug': backdrop['data-film-slug'],
        'letterboxd_url': soup.find(property='og:url')['content'],
        'imdb_url': attr_or_none(soup.find('a', {'data-track-action': 'IMDb'}), 'href'),
        'tmdb_url': tmdb_url,
        'tmdb_id': tmdb_id,
        'release_year': release_year,
        'duration': duration,
        'avg_rating': avg_rating,
    }

In [163]:
def get_film_cast(soup):
    """List the cast members linked from a parsed film page.

    Args:
        soup: Parsed film page, as returned by ``get_raw_film_html``.

    Returns:
        One dict per ``a.tooltip`` link inside the ``div.cast-list`` element,
        in document order, with the keys ``name`` (link text, stripped),
        ``link`` (the ``href``) and ``character_name`` (the ``title``
        attribute, or ``None`` when the link has none).

    Raises:
        AttributeError: If the page has no ``div.cast-list`` element.
        KeyError: If a cast link has no ``href`` attribute.
    """
    cast = soup.find(name='div', class_='cast-list').find_all('a', class_='tooltip')

    return [
        {
            'name': member.get_text(strip=True),
            'link': member['href'],
            # .get() gives None for a missing title without needing a
            # catch-all except that would also hide unrelated errors.
            'character_name': member.get('title'),
        }
        for member in cast
    ]

In [164]:
def get_film_crew(soup):
    """List the crew members linked from a parsed film page.

    Every link inside the element with id ``tab-crew`` becomes one dict with
    the keys ``name`` (link text, stripped), ``role`` (the first path segment
    of the link's ``href``) and ``link`` (the ``href`` itself).
    """
    def describe(anchor):
        # For an href shaped like "/role/person/", splitting on "/" puts the
        # role at index 1 (index 0 is the empty string before the first "/").
        href_parts = anchor['href'].split('/')
        return {
            'name': anchor.get_text(strip=True),
            'role': href_parts[1],
            'link': anchor['href'],
        }

    crew_anchors = soup.find(id='tab-crew').find_all('a')
    return [describe(anchor) for anchor in crew_anchors]

In [165]:
def get_film_details(soup):
    """List the studio / country / language links of a parsed film page.

    Every link inside the element with id ``tab-details`` becomes one dict
    with the keys ``key``, ``value`` (link text, stripped) and ``link`` (the
    ``href``). ``key`` is the first of ``studio``, ``country``, ``language``
    that occurs anywhere in the ``href``, or ``'ERROR'`` when none does.
    """
    # Checked in this order, so an href containing several of these words is
    # labelled with the earliest one.
    known_kinds = ('studio', 'country', 'language')

    rows = []
    for anchor in soup.find(id='tab-details').find_all('a'):
        href = anchor['href']
        kind = next((word for word in known_kinds if word in href), 'ERROR')
        rows.append({
            'key': kind,
            'value': anchor.get_text(strip=True),
            'link': href,
        })

    return rows

In [166]:
def get_film_genres(soup):
    """Return the link texts from the ``tab-genres`` element, minus the last.

    The stripped text of every link inside the element with id ``tab-genres``
    is collected in document order, and the final entry is then dropped.
    """
    labels = []
    for anchor in soup.find(id='tab-genres').find_all('a'):
        labels.append(anchor.get_text(strip=True))

    # NOTE(review): the last link is always discarded -- presumably it is a
    # navigation link rather than a genre or theme. If a page ever ends with
    # a real genre link instead, that genre is lost here; confirm against the
    # page markup.
    return labels[:-1]

In [167]:
def get_complete_film_data(film_url):
    """Download one film page and run every extractor on it.

    The page is fetched once with ``get_raw_film_html`` and the same parsed
    page is handed to each extractor. The result is a dict with the keys
    ``general_data``, ``cast``, ``crew``, ``details`` and
    ``genres_and_themes``, in that order.
    """
    page = get_raw_film_html(film_url)

    # (output key, extractor) pairs; they run top to bottom.
    sections = (
        ('general_data', get_general_film_data),
        ('cast', get_film_cast),
        ('crew', get_film_crew),
        ('details', get_film_details),
        ('genres_and_themes', get_film_genres),
    )

    return {section: extract(page) for section, extract in sections}

In [168]:
def get_all_films(url_list):
    """Collect the complete data dict for every film URL, in input order.

    Each entry of ``url_list`` is passed to ``get_complete_film_data``; the
    results are returned as a list of the same length.
    """
    return [get_complete_film_data(film_url) for film_url in url_list]

In [181]:
def dicts_to_dfs(data):
    """Turn the per-film dicts into one DataFrame per kind of information.

    Args:
        data: Iterable of film dicts shaped like the output of
            ``get_complete_film_data`` (keys ``general_data``, ``cast``,
            ``crew``, ``details``, ``genres_and_themes``).

    Returns:
        A dict with the keys ``df_gdata``, ``df_cast``, ``df_crew``,
        ``df_details`` and ``df_gthemes``, in that order.

        * ``df_gdata`` has one row per film, built from ``general_data``.
        * The other four have one row per list entry, plus ``film_id`` and
          ``film_title`` columns copied from the film's ``letterboxd_id`` and
          ``letterboxd_shorttitle``.
        * ``df_gthemes`` is built from a plain list of strings, so its value
          column is labelled ``0``.

        The per-film frames are concatenated without resetting the index, so
        index values start again at 0 for every film. When ``data`` is empty
        every entry is an empty DataFrame.
    """
    # Output key -> key of the per-film list it is built from.
    list_sections = {
        'df_cast': 'cast',
        'df_crew': 'crew',
        'df_details': 'details',
        'df_gthemes': 'genres_and_themes',
    }

    # Insertion order here fixes the key order of the returned dict.
    parts = {'df_gdata': []}
    for out_key in list_sections:
        parts[out_key] = []

    for film in data:
        general = film['general_data']
        film_id = general['letterboxd_id']
        film_title = general['letterboxd_shorttitle']

        # Wrapping the dict in a list makes it a single row.
        parts['df_gdata'].append(pd.DataFrame([general]))

        for out_key, section in list_sections.items():
            parts[out_key].append(
                pd.DataFrame(film[section]).assign(film_id=film_id, film_title=film_title)
            )

    # pd.concat raises on an empty list, so fall back to an empty frame.
    return {
        out_key: pd.concat(frames) if frames else pd.DataFrame()
        for out_key, frames in parts.items()
    }

## Extraction

In [175]:
film_urls = get_film_urls("https://letterboxd.com/dromemario/list/fff-film-fueled-friends/")

# Scrape every film only when the cached JSON is missing: a fresh checkout can
# then rebuild films_data.json, while later runs skip the slow scrape and load
# the file in the next cell. Delete films_data.json to force a refresh.
if not os.path.exists("films_data.json"):
    films_data = get_all_films(film_urls)

    with open("films_data.json", "w") as json_file:
        json.dump(films_data, json_file, indent=4)

In [182]:
# Read the previously scraped film data back from the JSON cache in the
# current working directory.
with open("films_data.json") as json_file:
    films_data = json.loads(json_file.read())

In [183]:
# Convert the list of per-film dicts loaded from JSON into a dict of
# DataFrames, one per kind of information.
all_dfs_dict = dicts_to_dfs(films_data)

In [184]:
# Give each combined table its own notebook-level name.
df_gdata, df_cast, df_crew, df_details, df_gthemes = (
    all_dfs_dict[frame_key]
    for frame_key in ('df_gdata', 'df_cast', 'df_crew', 'df_details', 'df_gthemes')
)

In [185]:
# Show the combined cast table: one row per cast link per film, with the
# index starting again at 0 for each film.
df_cast

Unnamed: 0,name,link,character_name,film_id,film_title
0,Luke Edwards,/actor/luke-edwards/,Jimmy,51869,The Wizard
1,Vince Trankina,/actor/vince-trankina/,Tate,51869,The Wizard
2,Wendy Phillips,/actor/wendy-phillips/,Christine,51869,The Wizard
3,Dea McAllister,/actor/dea-mcallister/,Counselor,51869,The Wizard
4,Sam McMurray,/actor/sam-mcmurray/,Bateman,51869,The Wizard
...,...,...,...,...,...
18,Wong Kim-Wai,/actor/wong-kim-wai/,Hitman,45489,Fallen Angels
19,Wong Kim-Bun,/actor/wong-kim-bun/,Hitman,45489,Fallen Angels
20,Choi Kwok-Keung,/actor/choi-kwok-keung/,Hitman,45489,Fallen Angels
21,Lee Tat-Chiu,/actor/lee-tat-chiu-1/,,45489,Fallen Angels


# Tests

In [187]:
# Peek at the first rows of the genres/themes table; its value column is
# labelled 0 because it is built from a plain list of strings.
df_gthemes.head()

Unnamed: 0,0,film_id,film_title
0,Adventure,51869,The Wizard
1,Family,51869,The Wizard
2,Drama,51869,The Wizard
3,Comedy,51869,The Wizard
4,Underdogs and coming of age,51869,The Wizard


In [180]:
# Print the general-data dict of the first film, keeping the keys in their
# stored order instead of sorting them.
pprint.pprint(films_data[0]['general_data'], sort_dicts=False)

{'letterboxd_id': '51869',
 'letterboxd_shorttitle': 'The Wizard',
 'letterboxd_longtitle': 'The Wizard (1989)',
 'letterboxd_slug': 'the-wizard',
 'letterboxd_url': 'https://letterboxd.com/film/the-wizard/',
 'imdb_url': 'http://www.imdb.com/title/tt0098663/maindetails',
 'tmdb_url': 'https://www.themoviedb.org/movie/183/',
 'tmdb_id': '183',
 'release_year': '1989',
 'duration': '100',
 'avg_rating': '2.92'}
