# Map ANCINE and Classind movie identifiers

In [1]:
import pandas as pd

import utils

In [2]:
# ANCINE movie records, indexed by `id_ancine`.
# Only the columns at positions 1, 2, 6 and 11 are read; the file's own header
# lines are discarded and replaced by the names given below.
RAW_ANCINE_MOVIES: pd.DataFrame = (
    pd.read_csv(
        "../data/raw/ancine-movies.csv",
        # --- file layout ---
        sep=";",
        header=1,
        # NOTE(review): hard-coded row count — presumably stops before trailing
        # non-data lines in the file; confirm against the raw CSV.
        nrows=4842,
        usecols=[1, 2, 6, 11],
        names=["title", "id_ancine", "release_date", "public"],
        # --- value parsing ---
        thousands=".",
        decimal=",",
        dayfirst=True,
        parse_dates=["release_date"],
        na_values=["Sem CPB", "Sem ROE", "ND"],  # placeholders read as missing
        dtype_backend="pyarrow",
    )
    # Discard every row with a missing value in any of the four columns.
    .dropna()
    .set_index("id_ancine")
)

# Director names from the ANCINE "brazil" directors file, as a Series indexed
# by the movie identifier (`id_ancine`).
RAW_ANCINE_DIRECTORS_BRAZIL: pd.Series = (
    pd.read_csv(
        "../data/raw/ancine-directors-brazil.csv",
        # The separator is a regular expression (";" plus any whitespace right
        # before it), which requires the python parsing engine.
        engine="python",
        sep=r"\s{0,};",
        # NOTE(review): the "mbcs" codec exists only on Windows, so this cell
        # fails on other platforms — confirm the file's actual encoding.
        encoding="mbcs",
        header=0,
        usecols=[0, 2],
        names=["id_ancine", "director"],
        index_col="id_ancine",  # movie identifier
        dtype_backend="pyarrow",
    )
    # With the identifier used as index a single column is left; turn it into a Series.
    .squeeze("columns")
)

# Director names from the ANCINE "foreign" directors file, as a Series indexed
# by the movie identifier (`id_ancine`).
RAW_ANCINE_DIRECTORS_FOREIGN: pd.Series = (
    pd.read_csv(
        "../data/raw/ancine-directors-foreign.csv",
        sep=";",
        # NOTE(review): the "mbcs" codec exists only on Windows, so this cell
        # fails on other platforms — confirm the file's actual encoding.
        encoding="mbcs",
        header=0,
        usecols=[0, 2],
        names=["id_ancine", "director"],
        index_col="id_ancine",  # movie identifier
        dtype_backend="pyarrow",
    )
    # With the identifier used as index a single column is left; turn it into a Series.
    .squeeze("columns")
)

# Classind records, indexed by `id_classind`.
# The file is read as having no header row (`header=None`); the seven selected
# column positions are named in the order given in `names`.
RAW_CLASSIND_MOVIES: pd.DataFrame = pd.read_csv(
    "../data/raw/classind-movies.csv",
    header=None,
    usecols=[1, 2, 7, 11, 13, 15, 16],
    names=[
        "id_classind",
        "title",
        "director",
        "category",
        "market",
        "rating_intended",
        "rating_assigned",
    ],
    index_col="id_classind",
    dtype_backend="pyarrow",
)

# Hand-curated identifier pairs, loaded as a Series indexed by `id_ancine`.
# The column names come from the file's own header row.
IDS_ANCINE_CLASSIND_MANUALLY_MAPPED: pd.Series = (
    pd.read_csv(
        "../data/manually-mapped/movies-ids-ancine-classind.csv",
        dtype_backend="pyarrow",
        index_col="id_ancine",
    )
    # Collapse the remaining single column into a Series.
    .squeeze("columns")
)

In [3]:
# One entry per `id_ancine`: the Brazilian and foreign director Series are
# stacked, then all names sharing an identifier are joined with a single space.
RAW_ANCINE_DIRECTORS: pd.Series = (
    pd.concat([RAW_ANCINE_DIRECTORS_BRAZIL, RAW_ANCINE_DIRECTORS_FOREIGN])
    .groupby(level="id_ancine")
    .agg(" ".join)  # several directors of the same movie become one string
)

In [None]:
# Attach the aggregated director string to every ANCINE movie. `join` aligns on
# the `id_ancine` index and returns a new frame, leaving the raw table untouched.
movies_ancine: pd.DataFrame = RAW_ANCINE_MOVIES.join(RAW_ANCINE_DIRECTORS)

# Strip a parenthesised four-digit number (e.g. a year such as "(2019)") from
# the title before handing it to the project's title sanitizer.
titles_without_year: pd.Series = movies_ancine["title"].str.replace(
    r"\([0-9]{4}\)", "", regex=True
)
movies_ancine["title"] = titles_without_year.apply(utils.sanitize_movie_title)

movies_ancine["director"] = movies_ancine["director"].apply(utils.sanitize_director_name)

In [None]:
# Classind `category` values that are kept.
CATEGORIES: list[str] = [
    "Curta Metragem",
    "Documentário",
    "Filme",
    "Longa Metragem e Trailer",
    "Longa Metragem",
    "Média Metragem",
]

# Classind `market` values that are kept.
MARKETS: list[str] = [
    "Cinema",
    "Mostra/Festival",
]

# Work on a copy with `id_classind` turned back into a regular column.
movies_classind: pd.DataFrame = RAW_CLASSIND_MOVIES.reset_index().copy()

# Trim surrounding whitespace so the membership tests below compare clean values.
movies_classind["category"] = movies_classind["category"].str.strip()
movies_classind["market"] = movies_classind["market"].str.strip()

# Normalise the text fields with the same project helpers used for the ANCINE data.
movies_classind["title"] = movies_classind["title"].apply(utils.sanitize_movie_title)
movies_classind["director"] = movies_classind["director"].apply(utils.sanitize_director_name)

movies_classind: pd.DataFrame = (
    movies_classind.dropna(subset=["title", "director", "category", "market"])
    # Keep only the whitelisted categories and markets.
    .loc[lambda frame: frame["category"].isin(CATEGORIES) & frame["market"].isin(MARKETS)]
    .drop_duplicates(subset="id_classind")
    # Pair every remaining Classind record with the ANCINE movies that share its
    # title; the overlapping `director` column is split into
    # `director_classind` / `director_ancine`.
    .merge(
        movies_ancine.reset_index(),
        how="inner",
        on="title",
        suffixes=["_classind", "_ancine"],
    )
)

In [None]:
# Rows of the title-matched table where both sources name a director; only
# these pairs can be compared.
mask: pd.Series = (
    movies_classind["director_classind"].notna() & movies_classind["director_ancine"].notna()
)

# Flag every row in a single pass. Rows lacking a director on either side are
# flagged False instead of being left as NaN: a column that contains NaN cannot
# be used as a boolean indexer below, and if no row were comparable at all the
# column would never be created. `utils.is_same_director` is only called for
# comparable rows; its result is coerced to a plain bool (assumes it returns a
# truthy/falsy value — confirm against `utils`).
movies_classind["same_director"] = [
    bool(comparable and utils.is_same_director(director_classind, director_ancine))
    for comparable, director_classind, director_ancine in zip(
        mask,
        movies_classind["director_classind"],
        movies_classind["director_ancine"],
    )
]

# Attach the matched `id_classind` to each ANCINE movie. When an ANCINE movie
# matched several Classind records, only the first match is kept; movies
# without a confirmed match get a missing `id_classind`.
ancine_movies: pd.DataFrame = movies_ancine.join(
    movies_classind.loc[movies_classind["same_director"], ["id_ancine", "id_classind"]]
    .drop_duplicates(subset="id_ancine")
    .set_index("id_ancine")
)

In [None]:
# Automatically matched identifiers: `id_ancine` (index) -> `id_classind`.
ids_ancine_classind: pd.Series = ancine_movies.loc[:, "id_classind"].copy()

# Let the hand-curated pairs override the automatic ones. `Series.update` works
# in place and only touches identifiers already present in the index.
ids_ancine_classind.update(IDS_ANCINE_CLASSIND_MANUALLY_MAPPED)

# Persist only the movies that ended up with a Classind identifier.
(
    ids_ancine_classind
    .dropna()
    .to_csv("../data/movies-ids-ancine-classind.csv")
)