# Map ANCINE and TMDB movie identifiers

In [None]:
import os
from urllib.parse import urljoin

import dotenv
import pandas as pd
import requests

import utils
from utils import tmdb

# Called before the token lookup below; presumably this loads variables such as
# TMDB_BEARER_TOKEN from a local .env file — confirm against python-dotenv.
dotenv.load_dotenv()

# Root URL onto which the TMDB endpoint paths used in this file are joined.
TMDB_BASE_URL: str = "https://api.themoviedb.org/3/"
# Authorization header attached to the TMDB sessions created further down.
# NOTE(review): os.getenv returns None for an unset variable, which would turn
# this value into "Bearer None" — confirm the token is always provided.
TMDB_REQUEST_HEADER: dict[str, str] = {"Authorization": f"Bearer {os.getenv('TMDB_BEARER_TOKEN')}"}

In [None]:
# ANCINE movie listing, reduced to four columns and indexed by "id_ancine".
# Rows with a missing value in any of the four columns are discarded.
RAW_ANCINE_MOVIES: pd.DataFrame = pd.read_csv(
    "../data/raw/ancine-movies.csv",
    # File layout.
    sep=";",
    header=1,
    nrows=4842,
    # Columns are picked by position and given the names below.
    usecols=[1, 2, 6, 11],
    names=["title", "id_ancine", "release_date", "public"],
    # Value parsing: extra missing-value markers, day-first dates, and
    # "." / "," as thousands / decimal separators.
    na_values=["Sem CPB", "Sem ROE", "ND"],
    parse_dates=["release_date"],
    dayfirst=True,
    thousands=".",
    decimal=",",
    dtype_backend="pyarrow",
).dropna().set_index("id_ancine")

# Director names from the "brazil" ANCINE directors file, as a Series indexed
# by the ANCINE movie identifier.
# NOTE(review): the "mbcs" codec is documented as Windows-only and follows the
# system ANSI code page, so this read fails on other platforms — confirm the
# file's real encoding and consider naming it explicitly.
RAW_ANCINE_DIRECTORS_BRAZIL: pd.Series = pd.read_csv(
    "../data/raw/ancine-directors-brazil.csv",
    engine="python",  # regular-expression separators need the Python parser
    sep=r"\s{0,};",  # a semicolon together with any whitespace right before it
    encoding="mbcs",
    header=0,  # the file's own header row is replaced by `names`
    usecols=[0, 2],
    names=["id_ancine", "director"],
    index_col="id_ancine",  # movie identifier
    dtype_backend="pyarrow",
).squeeze(axis="columns")

# Director names from the "foreign" ANCINE directors file, as a Series indexed
# by the ANCINE movie identifier.
# NOTE(review): the "mbcs" codec is documented as Windows-only and follows the
# system ANSI code page, so this read fails on other platforms — confirm the
# file's real encoding and consider naming it explicitly.
RAW_ANCINE_DIRECTORS_FOREIGN: pd.Series = pd.read_csv(
    "../data/raw/ancine-directors-foreign.csv",
    sep=";",
    encoding="mbcs",
    header=0,  # the file's own header row is replaced by `names`
    usecols=[0, 2],
    names=["id_ancine", "director"],
    index_col="id_ancine",  # movie identifier
    dtype_backend="pyarrow",
).squeeze(axis="columns")

# Hand-maintained ANCINE -> TMDB identifier mapping, read with "id_ancine" as
# the index and squeezed along the column axis (presumably leaving a single
# TMDB id column as a Series — confirm against the CSV).
IDS_ANCINE_TMDB_MANUALLY_MAPPED: pd.Series = pd.read_csv(
    "../data/manually-mapped/movies-ids-ancine-tmdb.csv",
    dtype_backend="pyarrow",
    index_col="id_ancine",
).squeeze(axis="columns")

In [None]:
# Stack both director files and collapse them to one entry per movie: all names
# recorded under the same "id_ancine" are joined with a single space.
RAW_ANCINE_DIRECTORS: pd.Series = pd.concat(
    (RAW_ANCINE_DIRECTORS_BRAZIL, RAW_ANCINE_DIRECTORS_FOREIGN)
).groupby("id_ancine").agg(" ".join)

In [None]:
# Working table: the raw movie listing with the director column attached by
# joining on the shared "id_ancine" index.
movies_ancine: pd.DataFrame = RAW_ANCINE_MOVIES.copy().join(RAW_ANCINE_DIRECTORS)

# Remove a parenthesised four-digit number such as "(1999)" from the title, then
# apply the project's title sanitizer (helper defined in utils, not shown here).
movies_ancine["title"] = (
    movies_ancine["title"]
    .str.replace(r"\([0-9]{4}\)", "", regex=True)
    .apply(utils.sanitize_movie_title)
)

# Apply the project's director-name sanitizer to the ANCINE director names.
movies_ancine["director"] = movies_ancine["director"].apply(utils.sanitize_director_name)

In [None]:
# Query the TMDB movie-search endpoint once per ANCINE movie (title + release
# year) and store the identifier extracted from each response in "id_tmdb".
URL: str = urljoin(TMDB_BASE_URL, "search/movie")

with requests.Session() as session:
    # One session carries the authorization header for every request below.
    session.headers.update(TMDB_REQUEST_HEADER)
    for index, row in movies_ancine.iterrows():
        response: requests.Response = session.get(
            URL,
            params={
                "query": row["title"],
                "year": row["release_date"].year,
                "region": "BR",  # needed to find movies by release year in the specified region
            },
            # requests applies no timeout unless one is given; bound the wait so
            # that a single stalled connection cannot hang the whole loop.
            timeout=30,
        )
        # Any HTTP error status aborts the loop with requests.HTTPError.
        response.raise_for_status()
        content: dict = response.json()
        # Helper from utils.tmdb (not shown); presumably yields the id of the
        # first search hit or a missing value — confirm.
        movies_ancine.at[index, "id_tmdb"] = tmdb.extract_first_movie_tmdb_id(content)

In [None]:
# For every movie that received a TMDB id, fetch its credits and store the
# director extracted from the response in "director_tmdb".
MASK: pd.Series = movies_ancine["id_tmdb"].notna()
URL: str = urljoin(TMDB_BASE_URL, "movie/{}/credits")  # "{}" is filled with the TMDB id

with requests.Session() as session:
    # One session carries the authorization header for every request below.
    session.headers.update(TMDB_REQUEST_HEADER)
    for index, row in movies_ancine[MASK].iterrows():
        response: requests.Response = session.get(
            URL.format(row["id_tmdb"]),
            # requests applies no timeout unless one is given; bound the wait so
            # that a single stalled connection cannot hang the whole loop.
            timeout=30,
        )
        # Any HTTP error status aborts the loop with requests.HTTPError.
        response.raise_for_status()
        content: dict = response.json()
        # Helper from utils.tmdb (not shown); presumably picks the director out
        # of the credits payload — confirm.
        movies_ancine.at[index, "director_tmdb"] = tmdb.extract_movie_director(content)

In [None]:
# Run the TMDB director names through the same sanitizer used for the ANCINE ones.
movies_ancine["director_tmdb"] = movies_ancine["director_tmdb"].apply(utils.sanitize_director_name)

# Rows where neither director value is missing, i.e. both sides can be compared.
MASK: pd.Series = ~(movies_ancine["director"].isna() | movies_ancine["director_tmdb"].isna())

# Record, per comparable row, the verdict of the project's director comparison.
for index, row in movies_ancine[MASK].iterrows():
    movies_ancine.at[index, "same_director"] = utils.is_same_director(row["director"], row["director_tmdb"])

# Keep a TMDB id only where "same_director" holds; the rest is replaced by None.
# NOTE(review): rows that were never compared carry no flag and presumably lose
# their id as well — confirm this is intended.
movies_ancine["id_tmdb"] = movies_ancine["id_tmdb"].where(
    cond=movies_ancine["same_director"],
    other=None,
)

In [None]:
# Start from the automatically matched ids; the copy keeps the in-place update
# below from touching the "id_tmdb" column of movies_ancine.
ids_ancine_tmdb: pd.Series = movies_ancine["id_tmdb"].copy()

# Overlay the manually mapped ids in place. Series.update aligns on the index
# and only takes non-missing values from its argument; labels that are absent
# from ids_ancine_tmdb are not added.
ids_ancine_tmdb.update(IDS_ANCINE_TMDB_MANUALLY_MAPPED)

# Export only the movies that ended up with a TMDB id.
ids_ancine_tmdb.dropna().to_csv("../data/movies-ids-ancine-tmdb.csv")