In [2]:
from dotenv import load_dotenv
import requests
import numpy as np
import pandas as pd
import os
import re
from bs4 import BeautifulSoup
import time

In [None]:
# load_dotenv() (python-dotenv) runs first so the lookups below can see any
# values it adds to the process environment.
load_dotenv()

# TMDB credentials and API base URL. Each is None when the variable is unset.
THE_MOVIE_DB_API_KEY = os.environ.get("THE_MOVIE_DB_API_KEY")
THE_MOVIE_DB_ACCESS_TOKEN = os.environ.get("THE_MOVIE_DB_ACCESS_TOKEN")
THE_MOVIE_DB_BASE_URL = os.environ.get("THE_MOVIE_DB_BASE_URL")

# URL template for the TMDB external-ids endpoint; filled in later with
# .format(id=<tmdb movie id>, key=<api key>).
THE_MOVIE_DB_CONVERTER_ID = "https://api.themoviedb.org/3/movie/{id}/external_ids?api_key={key}"

In [4]:
# Headers sent with the TMDB discover requests: ask for JSON and authenticate
# with the bearer access token.
headers = dict(
    accept="application/json",
    Authorization=f"Bearer {THE_MOVIE_DB_ACCESS_TOKEN}",
)

In [6]:
#initial crawl
def get_movies(num_pages: int, max_retries: int = 5) -> pd.DataFrame:
    """Download pages 1..num_pages of the TMDB popularity-sorted discover feed.

    Each page is requested up to ``max_retries`` times, pausing one second
    after every response whose status code is not 200.

    Args:
        num_pages: Number of result pages to fetch, starting at page 1.
        max_retries: Maximum number of attempts per page.

    Returns:
        A DataFrame with one row per entry of every page's ``"results"`` list.

    Raises:
        Exception: If a page never returns HTTP 200 within ``max_retries``
            attempts.
    """
    page_url = THE_MOVIE_DB_BASE_URL + "/discover/movie?include_adult=false&include_video=false&language=en-US&page={page}&sort_by=popularity.desc"  # type: ignore

    def fetch_page(page_number):
        # Return the first HTTP 200 response for this page, or give up loudly.
        for _attempt in range(max_retries):
            reply = requests.get(page_url.format(page=page_number), headers=headers)
            if reply.status_code == 200:
                return reply
            time.sleep(1)
        raise Exception("Failed to get data after maximum retries")

    collected = []
    for page_number in range(1, num_pages + 1):
        collected.extend(fetch_page(page_number).json()["results"])

    return pd.DataFrame(collected)

In [None]:
# Crawl the first 30 pages of TMDB discover results (network-bound).
raw_data = get_movies(30)

In [8]:
# Persist the crawl to raw_data.csv (index column omitted) so it can be reloaded later.
raw_data.to_csv("raw_data.csv", index=False)

In [None]:
def parse_money(money_tag):
    """Convert a dollar amount such as ``"$1,234,567"`` to an ``int``.

    Args:
        money_tag: ``None``, a string, or any object exposing a ``text``
            attribute that holds the amount.

    Returns:
        The amount as an integer with ``$`` and ``,`` removed, or ``None``
        when ``money_tag`` is ``None``.
    """
    if money_tag is None:
        return None
    # Tag-like objects carry the amount in .text; plain strings are used as-is.
    raw_amount = getattr(money_tag, 'text', money_tag)
    digits_only = raw_amount.replace('$', '').replace(',', '')
    return int(digits_only)

def parse_summary(summary_tag):
    """Turn label/value rows of the summary section into a ``{label: value}`` dict.

    Args:
        summary_tag: Iterable of row elements, each exposing
            ``find_all('span')``. The first span found is used as the label
            and the second as the value holder.

    Returns:
        A dict mapping each label's stripped text to the stripped text of the
        first child of the value span, or to ``None`` when the value span has
        no children. Rows with fewer than two spans are skipped.
    """
    summary_dict = {}
    for item in summary_tag:
        spans = item.find_all('span')
        if len(spans) < 2:
            # Not a label/value row; indexing it would raise IndexError and
            # abort the whole scrape, so ignore it instead.
            continue
        label_span, value_span = spans[0], spans[1]
        key = label_span.get_text(strip=True)
        children = value_span.contents
        summary_dict[key] = children[0].get_text(strip=True) if len(children) > 0 else None
    return summary_dict

def parse_duration(duration_str):
    """Convert a running-time string to a total number of minutes.

    Accepts the combined form (``"2 hr 5 min"``) as well as hours-only
    (``"2 hr"``) and minutes-only (``"45 min"``) strings.

    Args:
        duration_str: Text containing the running time, or a falsy value.

    Returns:
        Total minutes as an ``int``, or ``None`` when ``duration_str`` is
        falsy or contains neither an "hr" nor a "min" component.
    """
    if not duration_str:
        return None

    # Preferred form: "<h> hr <m> min".
    combined = re.search(r'(\d+)\s*hr\s*(\d+)\s*min', duration_str)
    if combined:
        return int(combined.group(1)) * 60 + int(combined.group(2))

    # Fallback: exact-hour films have no "min" part, and films shorter than
    # an hour have no "hr" part, so look for each component on its own.
    hours = re.search(r'(\d+)\s*hr', duration_str)
    minutes = re.search(r'(\d+)\s*min', duration_str)
    if hours is None and minutes is None:
        return None

    total_minutes = 0
    if hours is not None:
        total_minutes += int(hours.group(1)) * 60
    if minutes is not None:
        total_minutes += int(minutes.group(1))
    return total_minutes

def extract_release_date(release_date_str) -> pd.Timestamp | None:
    if release_date_str and re.search(r'\w+ \d{1,2}, \d{4}', release_date_str):
        date_str = re.search(r'\w+ \d{1,2}, \d{4}', release_date_str).group() #type: ignore
        return pd.to_datetime(date_str)
    return None

# CSS selector used to pick the rows of the performance summary table.
gross_selector = (
    '.a-section.a-spacing-none.mojo-performance-summary-table'
    ' .a-section.a-spacing-none'
)
# CSS selector used to pick the label/value rows of the summary-values section.
summary_selector = (
    '.a-section.a-spacing-none.mojo-summary-values.mojo-hidden-from-mobile'
    ' .a-section.a-spacing-none'
)

In [None]:
def box_office_mojo_linker(themoviedbID: str, max_retries: int = 5) -> dict | None:
    """Scrape Box Office Mojo figures for a movie identified by its TMDB id.

    The TMDB external-ids endpoint is queried for the movie's IMDb id, which
    is used to build the Box Office Mojo title URL. That page is requested up
    to ``max_retries`` times (one second apart) until it returns HTTP 200,
    then parsed for grosses, release date, running time and domestic opening.

    Args:
        themoviedbID: TMDB movie id substituted into the external-ids URL.
        max_retries: Maximum number of attempts to fetch the Box Office Mojo
            page.

    Returns:
        A dict with keys ``id``, ``imdb_id``, ``domestic_gross``,
        ``international_gross``, ``worldwide_gross``, ``release_date``,
        ``duration_minutes`` and ``domestic_openening``; or ``None`` when
        TMDB reports no IMDb id or the page never returned HTTP 200.
    """
    url = THE_MOVIE_DB_CONVERTER_ID.format(id=themoviedbID, key=THE_MOVIE_DB_API_KEY)
    response = requests.get(url)
    imdb_id = response.json().get('imdb_id')
    if not imdb_id:
        return None

    imdb_url = f'https://www.boxofficemojo.com/title/{imdb_id}'
    for _ in range(max_retries):
        response = requests.get(imdb_url)
        if response.status_code == 200:
            break
        time.sleep(1)
    else:
        # Every attempt failed; treat the movie as having no box-office data.
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    summary = parse_summary(soup.select(summary_selector))

    gross_values = [parse_money(tag.select_one('.money')) for tag in soup.select(gross_selector)]
    if len(gross_values) != 3:
        # The domestic/international/worldwide mapping relies on exactly three
        # rows; with any other count the values cannot be attributed reliably,
        # so report them as missing instead of raising on the unpack below.
        gross_values = [None, None, None]
    domestic_gross, international_gross, worldwide_gross = gross_values

    release_date = extract_release_date(summary.get('Release Date') or summary.get('Earliest Release Date'))
    duration_minutes = parse_duration(summary.get('Running Time'))
    domestic_openening = parse_money(summary.get('Domestic Opening'))

    return {
        # Echo the caller's TMDB id (previously this was the builtin `id` function).
        'id': themoviedbID,
        'imdb_id': imdb_id,
        'domestic_gross': domestic_gross,
        'international_gross': international_gross,
        'worldwide_gross': worldwide_gross,
        'release_date': release_date,
        'duration_minutes': duration_minutes,
        # Key spelling kept as-is: it is part of the returned schema.
        'domestic_openening': domestic_openening
    }


In [None]:
# Reload the saved TMDB crawl from disk.
csv_file_path = "raw_data.csv"
df = pd.read_csv(csv_file_path)

def add_box_office_info(row):
    """Return ``row`` extended with Box Office Mojo data for its movie.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Args:
        row: A Series for one movie; its ``'id'`` entry is passed to
            ``box_office_mojo_linker``.

    Returns:
        The original ``row`` when no box-office data is available, otherwise
        a copy of it with one extra entry per key of the scraped dict.
    """
    box_office_data = box_office_mojo_linker(row['id'])
    if not box_office_data:
        return row

    # pandas does not support mutating the object handed to apply(), so the
    # new fields are written to a copy.
    enriched = row.copy()
    for key, value in box_office_data.items():
        if key == 'id':
            # The row already carries its own id; never let the scraper's
            # echoed value overwrite it.
            continue
        enriched[key] = value
    return enriched

# Enrich the table row by row (axis=1 passes one row at a time to the function).
df = df.apply(add_box_office_info, axis=1)

# Save the enriched table to updated_file.csv without the index column.
df.to_csv("updated_file.csv", index=False)

print("CSV file updated successfully.")