## update_scripts_dataset.ipynb

Description: 
- Update film script dataset to correct missing or incorrect data.

External Dependencies:
- springfield_movie_scripts.csv from "dataset_film_scripts\\springfield_movie_scripts.csv"

Returns:
- updated springfield_movie_scripts.csv
- updated df_spaces_upload.csv

In [None]:
import pandas as pd
import numpy as np
import requests
from build_df_spaces_upload import build_df_spaces_upload

In [None]:
# define updating functions
def df_update(df, movie_id, update_dict):
    """Apply column updates to one existing row of the scripts dataframe.

    Every key of ``update_dict`` that names a column of ``df`` is written to
    the row labelled ``movie_id``. Keys that are not columns are skipped with
    a printed warning. Updating ``imdb_id`` also rebuilds ``imdb_link``, and
    updating ``tmdb_id`` also re-fetches ``tmdb_poster_link``.

    Args:
        df: DataFrame whose index labels are movie ids; modified in place.
        movie_id: Index label of the row to update.
        update_dict: Mapping of column name to new value.

    Returns:
        The same ``df`` object, after the updates.

    Raises:
        KeyError: If ``movie_id`` is not present in ``df.index``.
    """
    # Assigning through .loc with an unknown label would silently append a
    # brand-new, mostly empty row instead of updating an existing movie.
    if movie_id not in df.index:
        raise KeyError(f"movie_id {movie_id!r} not found in dataframe index.")

    for jcol, new_value in update_dict.items():
        if jcol not in df.columns:
            # f-string keeps the message text but tolerates non-string keys
            print(f"WARNING:{jcol} is not a valid column name.")
            continue

        # make the update
        df.loc[movie_id, jcol] = new_value

        # keep the derived link columns in sync with their source ids
        if jcol == "imdb_id":
            df.loc[movie_id, "imdb_link"] = construct_imdb_link(
                df.loc[movie_id, "imdb_id"]
            )
        if jcol == "tmdb_id":
            df.loc[movie_id, "tmdb_poster_link"] = fetch_tmdb_poster_link(
                df.loc[movie_id, "tmdb_id"]
            )

    return df


def construct_imdb_link(imdb_id):
    """Return the IMDb title page URL for the given ``imdb_id`` string."""
    url_parts = ("https://www.imdb.com/title/", imdb_id, "/")
    return "".join(url_parts)


def locate_movie_id(df, col_name, col_value):
    """Return the rows of ``df`` whose ``col_name`` column equals ``col_value``.

    The result is a DataFrame, so the matching movie ids are its index labels.
    """
    # "@col_value" is resolved by DataFrame.query from this function's locals
    expression = col_name + " == @col_value"
    matches = df.query(expression)
    return matches


def fetch_tmdb_poster_link(tmdb_id):
    """Return the TMDB poster URL for ``tmdb_id``, or ``"NULL"`` on failure.

    Calls the themoviedb.org movie endpoint and builds a w500 image URL from
    the ``poster_path`` field of the JSON response.

    Args:
        tmdb_id: Movie identifier interpolated into the API URL.

    Returns:
        The poster URL string, or the string ``"NULL"`` when the request
        fails (connection error, timeout, non-2xx status such as 404), the
        body is not valid JSON, or no usable ``poster_path`` is present.
    """
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration or an environment variable instead.
    url = "https://api.themoviedb.org/3/movie/{}?api_key=075d83b3063def6fdd12763959a9086e&language=en-US".format(
        tmdb_id
    )
    try:
        # requests never times out by default; bound the wait so a stalled
        # connection cannot hang the notebook
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        poster_path = response.json()["poster_path"]
        return "https://image.tmdb.org/t/p/w500/" + poster_path
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup: request/HTTP errors, undecodable JSON, a missing
        # key, or a null poster_path all map to "NULL". KeyboardInterrupt and
        # SystemExit are deliberately no longer swallowed.
        return "NULL"

In [None]:
# Step 1: Load in current script dataset
# A forward slash keeps this relative path valid on Windows, Linux and macOS;
# the backslash form only resolves on Windows.
df = pd.read_csv(
    "dataset_film_scripts/springfield_movie_scripts.csv",
    index_col=[0],  # first CSV column becomes the row index (the movie_id used below)
)
col_names = df.columns.tolist()
print(f"Valid Column names to update:\n{col_names}")

In [None]:
# Step 2: Find the movie_id (row index) by movie title, imdb_id, or tmdb_id
col_name = "movie_title"  # column to search: "movie_title", "imdb_id", or "tmdb_id"
col_value = "Elf"  # value to look for in that column
# the matching rows are displayed as the cell output
locate_movie_id(df, col_value=col_value, col_name=col_name)

In [None]:
# Step 3: Perform update
movie_id = 33385  # row index label of the movie to correct (found in Step 2)
update_dict = {"movie_title": "WALL-E", "imdb_id": "tt0910970", "tmdb_id": 10681}

# Show the row before the change; script_text is dropped to keep the table short.
# No space after the newline, otherwise the grid's top border is shifted by
# one column relative to the rest of the table.
print(
    f'Original entry:\n{df.loc[movie_id].drop("script_text").to_markdown(tablefmt="grid")}\n'
)

df = df_update(df=df, movie_id=movie_id, update_dict=update_dict)

# Show the same row after the change for a visual before/after comparison.
print(
    f'Updated entry:\n{df.loc[movie_id].drop("script_text").to_markdown(tablefmt="grid")}'
)

In [None]:
# Step 4: Push changes to the csv file and update synthesized scripts and imdb dataset
# A forward slash keeps this relative path valid on Windows, Linux and macOS;
# the backslash form only resolves on Windows.
df.to_csv("dataset_film_scripts/springfield_movie_scripts.csv")

# Rebuild the upload dataframe and write it next to the notebook.
df_new = build_df_spaces_upload()
df_new.to_csv("df_spaces_upload.csv")