# Prep

## Imports

In [115]:
import pandas as pd
import numpy as np
import requests
import json
import os
import re

from dotenv import load_dotenv
from bs4 import BeautifulSoup

## Auth

In [27]:
# Load the TMDB credentials from a local .env file so they never appear in the notebook.
# A raw string already keeps backslashes literally, so the path uses single
# backslashes; doubling them inside r"..." put two backslashes between every
# path component.
load_dotenv(r"C:\Users\User\Documents\GitHub\movies\tmdb_auth.env")

# os.getenv returns None for a variable that is not set.
api_key = os.getenv("API_KEY")
access_token = os.getenv("ACCESS_TOKEN")

# TMDB API Data

In [None]:
# Rated-movies endpoint for one TMDB account (first page, oldest ratings first).
tmdb_url = (
    "https://api.themoviedb.org/3/account/21623434/rated/movies"
    "?language=en-US&page=1&sort_by=created_at.asc"
)

# JSON response requested; the bearer token comes from the Auth cell.
headers = dict(
    accept="application/json",
    Authorization=f"Bearer {access_token}",
)

# The request itself is currently disabled.
#response = requests.get(tmdb_url, headers=headers)


In [77]:
#data = json.loads(response.text)
#data['results']

# Letterboxd Scraping Data

## Functions

In [132]:
def get_film_urls(list_url, timeout=30):
    """Collect the film links found on a list page.

    Args:
        list_url: Full URL of the page to download.
        timeout: Seconds to wait for the server before giving up
            (forwarded to ``requests.get``).

    Returns:
        A list with the ``data-target-link`` value of every
        ``<div class="film-poster">`` that carries that attribute, in
        document order. Empty when the page has no such element.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    response = requests.get(list_url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty list.
    response.raise_for_status()

    # ``response.text`` is already a decoded str, so passing ``from_encoding``
    # would be ignored (BeautifulSoup warns about it). The parser is named
    # explicitly, matching get_raw_film_html.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Posters lacking the attribute are skipped rather than raising KeyError.
    return [
        div['data-target-link']
        for div in soup.find_all('div', class_='film-poster')
        if div.has_attr('data-target-link')
    ]

In [123]:
def get_raw_film_html(film_url, timeout=30):
    """Download a film page and return it as a parsed document.

    Args:
        film_url: Site-relative path of the film page; it is appended to
            ``https://letterboxd.com`` as-is.
        timeout: Seconds to wait for the server before giving up
            (forwarded to ``requests.get``).

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` parser.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    url = "https://letterboxd.com" + film_url
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Do not hand an error page to the extractor functions.
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html.parser')

In [144]:
def get_general_film_data(soup):
    """Extract identifiers, links, release year and runtime from a film page.

    Args:
        soup: Parsed film page (``BeautifulSoup`` object).

    Returns:
        A dict with the keys ``letterboxd_id``, ``letterboxd_link``,
        ``imdb_link``, ``tmdb_link``, ``tmdb_id``, ``release_year`` and
        ``duration``. All values are strings. ``letterboxd_link`` is not
        populated here and stays ``''``. Any other value is ``None`` when
        the element it is read from is missing from the page.
    """
    def _attr(tag, name):
        # A missing element or attribute yields None instead of raising.
        return tag.get(name) if tag is not None else None

    footer = soup.find(class_='text-footer')
    # Non-breaking spaces are normalised so the regex can rely on plain whitespace.
    duration_string = (
        footer.get_text().replace('\xa0', ' ').strip() if footer is not None else ''
    )
    # "mins?" also accepts the singular form (e.g. "1 min").
    duration_match = re.search(r'(\d+)\s+mins?', duration_string)

    release_year = soup.find(class_='releaseyear')
    year_link = release_year.find('a') if release_year is not None else None

    tmdb_link = _attr(soup.find('a', {'data-track-action': 'TMDb'}), 'href')

    return {
        'letterboxd_id': _attr(soup.find(id='backdrop'), 'data-film-id'),
        'letterboxd_link': '',
        'imdb_link': _attr(soup.find('a', {'data-track-action': 'IMDb'}), 'href'),
        'tmdb_link': tmdb_link,
        # The id is the last path component; stripping the trailing slash first
        # makes this work whether or not the link ends with "/".
        'tmdb_id': tmdb_link.rstrip('/').split('/')[-1] if tmdb_link else None,
        'release_year': year_link.get_text(strip=True) if year_link is not None else None,
        'duration': duration_match.group(1) if duration_match else None,
    }

In [None]:
def get_film_cast(soup):
    """List the cast members shown on a film page.

    Args:
        soup: Parsed film page (``BeautifulSoup`` object).

    Returns:
        A list of dicts, one per ``<a class="tooltip">`` inside the
        ``<div class="cast-list">`` element, with the keys ``name`` (link
        text), ``link`` (``href`` attribute) and ``character_name``
        (``title`` attribute). ``link`` and ``character_name`` are ``None``
        when the attribute is absent. Returns ``[]`` when the page has no
        cast list.
    """
    cast_container = soup.find(name='div', class_='cast-list')
    if cast_container is None:
        # No cast section on this page: nothing to extract.
        return []

    return [
        {
            'name': member.get_text(strip=True),
            # .get() keeps one incomplete anchor from aborting the whole list.
            'link': member.get('href'),
            'character_name': member.get('title'),
        }
        for member in cast_container.find_all('a', class_='tooltip')
    ]

In [53]:
def get_film_crew(soup):
    """List the crew members shown on a film page.

    Args:
        soup: Parsed film page (``BeautifulSoup`` object).

    Returns:
        A list of dicts, one per ``<a>`` inside the element with
        ``id="tab-crew"``, with the keys ``name`` (link text), ``role`` and
        ``link`` (``href`` attribute). ``role`` is the second item of the
        href split on ``/`` (for ``/director/some-name/`` that is
        ``director``); it is ``None`` when the href is missing or has no
        ``/``. Returns ``[]`` when the page has no crew tab.
    """
    crew_tab = soup.find(id='tab-crew')
    if crew_tab is None:
        # No crew section on this page: nothing to extract.
        return []

    crew_list = []
    for member in crew_tab.find_all('a'):
        href = member.get('href')
        split_link = href.split('/') if href else []

        crew_list.append({
            'name': member.get_text(strip=True),
            # Guard the index so a short or missing href cannot raise IndexError.
            'role': split_link[1] if len(split_link) > 1 else None,
            'link': href,
        })

    return crew_list

In [58]:
def get_film_details(soup):
    """List the entries of a film page's details tab.

    Args:
        soup: Parsed film page (``BeautifulSoup`` object).

    Returns:
        A list of dicts, one per ``<a>`` inside the element with
        ``id="tab-details"``, with the keys ``key``, ``value`` (link text)
        and ``link`` (``href`` attribute, ``None`` when absent). ``key`` is
        the first of ``studio``, ``country``, ``language`` that occurs in
        the href, or ``'ERROR'`` when none does. Returns ``[]`` when the
        page has no details tab.
    """
    details_tab = soup.find(id='tab-details')
    if details_tab is None:
        # No details section on this page: nothing to extract.
        return []

    # Checked in this order; the first substring hit wins.
    known_keys = ('studio', 'country', 'language')

    details_list = []
    for detail in details_tab.find_all('a'):
        href = detail.get('href')
        key = next((k for k in known_keys if href and k in href), 'ERROR')

        details_list.append({
            'key': key,
            'value': detail.get_text(strip=True),
            'link': href,
        })

    return details_list

In [33]:
def get_film_genres(soup):
    """List the genre/theme labels shown on a film page.

    Args:
        soup: Parsed film page (``BeautifulSoup`` object).

    Returns:
        The stripped text of every ``<a>`` inside the element with
        ``id="tab-genres"`` except the last one, in document order.
        Returns ``[]`` when the page has no genres tab.
    """
    genres_tab = soup.find(id='tab-genres')
    if genres_tab is None:
        # No genres section on this page: nothing to extract.
        return []

    genres = [a_tag.get_text(strip=True) for a_tag in genres_tab.find_all('a')]

    # The final anchor is dropped, as before.
    # NOTE(review): presumably it is a trailing non-genre link - confirm against the page markup.
    return genres[:-1]

In [145]:
def get_all_film_data(film_url):
    """Download one film page and run every extractor on it.

    Args:
        film_url: Site-relative path of the film page, as accepted by
            ``get_raw_film_html``.

    Returns:
        A dict with the keys ``general_data``, ``cast``, ``crew``,
        ``details`` and ``genres_and_themes``, each holding the result of
        the matching ``get_*`` function for this page.
    """
    # The page is fetched once and shared by all extractors.
    film_soup = get_raw_film_html(film_url)

    film_data = {
        'general_data': get_general_film_data(film_soup),
        'cast': get_film_cast(film_soup),
        'crew': get_film_crew(film_soup),
        # Previously left out, although get_film_details is defined alongside
        # the other extractors.
        'details': get_film_details(film_soup),
        'genres_and_themes': get_film_genres(film_soup)
    }

    return film_data

## Extraction

In [35]:
# Collect the film links from this list page (performs a network request).
film_urls = get_film_urls("https://letterboxd.com/dromemario/list/fff-film-fueled-friends/")

In [136]:
# Download and parse the page of the first film in the list.
# Raises IndexError if film_urls is empty.
soup_wizard = get_raw_film_html(film_urls[0])

In [None]:
# Run each extractor separately on the single parsed page so that the
# individual results can be inspected one by one.
gdata_wizard = get_general_film_data(soup_wizard)
cast_wizard = get_film_cast(soup_wizard)
crew_wizard = get_film_crew(soup_wizard)
details_wizard = get_film_details(soup_wizard)
genres_wizard = get_film_genres(soup_wizard)

In [147]:
#soup_wizard.find(name='div', class_='cast-list')#.find_all('a', class_='tooltip')
#soup_wizard.find(id='tab-crew')#.find_all('a')
#soup_wizard.find(id='tab-details')#.find_all('a')
#soup_wizard.find(id='tab-genres')#.find_all('a')