# Map ANCINE and IMDB movie identifiers


In [1]:
import os
from urllib.parse import urljoin

import dotenv
import pandas as pd
import requests

from utils import tmdb

# Pull variables from a .env file (if one is found) into the environment
# before the TMDB token is read below.
dotenv.load_dotenv()

# Root of the TMDB v3 REST API; the trailing slash matters for urljoin().
TMDB_BASE_URL: str = "https://api.themoviedb.org/3/"

# Bearer token read from the environment; None when the variable is unset.
_tmdb_bearer_token: str | None = os.environ.get("TMDB_BEARER_TOKEN")

# Authorization header sent with every TMDB request.
TMDB_REQUEST_HEADER: dict[str, str] = {"Authorization": f"Bearer {_tmdb_bearer_token}"}

In [2]:
# Raw ANCINE movie listing, parsed from the semicolon-separated export.
_raw_ancine_listing: pd.DataFrame = pd.read_csv(
    "../data/raw/ancine-movies.csv",
    # --- file layout ---
    sep=";",
    header=1,  # line at index 1 is the header row; its labels are replaced by `names`
    nrows=4842,  # read at most this many data rows (presumably skips trailing non-data lines; confirm)
    # --- column selection: four positional columns, renamed ---
    usecols=[1, 2, 6, 11],
    names=["title", "id_ancine", "release_date", "public"],
    # --- value parsing ---
    na_values=["Sem CPB", "Sem ROE", "ND"],  # placeholders treated as missing
    parse_dates=["release_date"],
    dayfirst=True,
    thousands=".",
    decimal=",",
    dtype_backend="pyarrow",
)

# Keep only rows without any missing value and index them by ANCINE identifier.
RAW_ANCINE_MOVIES: pd.DataFrame = _raw_ancine_listing.dropna().set_index("id_ancine")

def _read_ancine_id_mapping(csv_path: str) -> pd.Series:
    """Read a CSV indexed by ``id_ancine`` and squeeze it along the column axis.

    The result is a Series when the file holds a single data column.
    """
    mapping = pd.read_csv(csv_path, index_col="id_ancine", dtype_backend="pyarrow")
    return mapping.squeeze("columns")


# ANCINE -> TMDB identifier mapping.
IDS_ANCINE_TMDB: pd.Series = _read_ancine_id_mapping("../data/movies-ids-ancine-tmdb.csv")

# ANCINE -> IMDB identifier mapping from the "manually-mapped" data folder.
IDS_ANCINE_IMDB_MANUALLY_MAPPED: pd.Series = _read_ancine_id_mapping(
    "../data/manually-mapped/movies-ids-ancine-imdb.csv"
)

In [3]:
# Working copy of the ANCINE listing with the TMDB identifiers joined on the
# shared index; rows without a match keep a missing value.
movies_ancine: pd.DataFrame = RAW_ANCINE_MOVIES.copy().join(IDS_ANCINE_TMDB)

In [4]:
# Resolve the IMDB identifier of every movie that already has a TMDB identifier
# by querying TMDB's "external ids" endpoint.
MASK: pd.Series = movies_ancine["id_tmdb"].notna()
URL: str = urljoin(TMDB_BASE_URL, "movie/{}/external_ids")
# Seconds to wait for TMDB before giving up; without an explicit timeout
# `requests` may block indefinitely on a stalled connection.
REQUEST_TIMEOUT: float = 30.0

# Create the target column up front so it exists even when no movie has a TMDB
# identifier (movies_ancine["id_imdb"] is read unconditionally afterwards).
movies_ancine["id_imdb"] = None

with requests.Session() as session:
    # One session for all lookups: the auth header is set once for every request.
    session.headers.update(TMDB_REQUEST_HEADER)
    # Only the TMDB identifier is needed, so iterate that column rather than full rows.
    for index, id_tmdb in movies_ancine.loc[MASK, "id_tmdb"].items():
        response: requests.Response = session.get(URL.format(id_tmdb), timeout=REQUEST_TIMEOUT)
        # Fail fast on any HTTP error instead of storing a bogus identifier.
        response.raise_for_status()
        content: dict = response.json()
        movies_ancine.at[index, "id_imdb"] = tmdb.extract_movie_imdb_id(content)

In [5]:
# Start from the identifiers fetched from TMDB, then overwrite them in place
# with the manually mapped ones. Series.update only touches index labels that
# already exist and ignores missing values in the other series.
ids_ancine_imdb: pd.Series = movies_ancine["id_imdb"].copy()
ids_ancine_imdb.update(IDS_ANCINE_IMDB_MANUALLY_MAPPED)

# Export only the movies that ended up with an IMDB identifier.
_resolved_ids: pd.Series = ids_ancine_imdb[ids_ancine_imdb.notna()]
_resolved_ids.to_csv("../data/movies-ids-ancine-imdb.csv")