## Update film script dataset as needed

Description: Functions for updating the dataset on an ongoing basis are provided (similar to SQL's `UPDATE` command).

In [1]:
import os

import numpy as np
import pandas as pd
import requests

from dataset_recsys_upload import dataset_recsys_upload

In [2]:
# define updating functions
def df_update(df, movie_id, update_dict):
    """Update selected columns of one existing row of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to modify. Rows are addressed by index label via ``df.loc``.
    movie_id : hashable
        Index label of the row to update. Non-hashable selectors (e.g. a list
        of labels) are passed to ``.loc`` without the existence check.
    update_dict : dict
        Maps column name -> new value. Keys that are not columns of ``df`` are
        skipped with a printed warning. Updating ``"imdb_id"`` also rewrites
        ``"imdb_link"``; updating ``"tmdb_id"`` also rewrites
        ``"tmdb_poster_link"``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Assigning through ``.loc`` with an unknown label appends a brand-new row
    # instead of failing, which would turn an "update" into an insert.
    # Mirror SQL UPDATE semantics instead: no matching row -> nothing changes.
    try:
        row_missing = movie_id not in df.index
    except TypeError:
        # Unhashable selector: cannot be checked here, let ``.loc`` handle it.
        row_missing = False
    if row_missing:
        print(f"WARNING: movie_id {movie_id!r} not found; no update performed.")
        return df

    for jcol, new_value in update_dict.items():
        if jcol not in df.columns:
            # f-string so that non-string keys are reported instead of raising.
            print(f"WARNING:{jcol} is not a valid column name.")
            continue

        # make the update
        df.loc[movie_id, jcol] = new_value

        # Derived columns are rebuilt from the supplied value rather than from
        # a read-back of the frame, so a dtype cast on assignment cannot leak
        # into the generated link.
        if jcol == "imdb_id":
            df.loc[movie_id, "imdb_link"] = construct_imdb_link(new_value)
        elif jcol == "tmdb_id":
            df.loc[movie_id, "tmdb_poster_link"] = fetch_tmdb_poster_link(new_value)

    return df

def construct_imdb_link(imdb_id):
    """Return the IMDb title-page URL for the given IMDb id string."""
    base_url = "https://www.imdb.com/title/"
    return base_url + imdb_id + "/"

def locate_movie_id(df, col_name, col_value):
    """Return the rows of ``df`` whose ``col_name`` equals ``col_value``.

    The index labels of the returned frame are the values that can be used to
    address those rows with ``df.loc``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    col_name : str
        Column to compare. Names that are not columns of ``df`` are handed to
        ``DataFrame.query`` as before.
    col_value : object
        Value to match with ``==``.

    Returns
    -------
    pandas.DataFrame
        The matching rows (empty if nothing matches).
    """
    if col_name in df.columns:
        # Direct boolean mask: works for any column label, including ones with
        # spaces or punctuation that a query expression cannot parse.
        return df.loc[df[col_name] == col_value]

    # Not a plain column label: keep the original query-based behaviour.
    return df.query(col_name + ' == @col_value')

def fetch_tmdb_poster_link(tmdb_id, timeout=10):
    """Look up a movie on themoviedb.org and return a link to its poster image.

    Parameters
    ----------
    tmdb_id : int or str
        TMDB movie id, interpolated into the API URL.
    timeout : float, optional
        Seconds to wait for the API before giving up (default 10).

    Returns
    -------
    str
        ``"https://image.tmdb.org/t/p/w500/" + poster_path`` on success, or the
        string ``"NULL"`` when the request fails (e.g. a 404), times out, or the
        response has no usable ``poster_path``.
    """
    # SECURITY NOTE(review): an API key is committed in this notebook. Prefer
    # supplying it through the TMDB_API_KEY environment variable; the literal is
    # kept only as a fallback so existing runs keep working and should be
    # rotated and removed.
    api_key = os.environ.get("TMDB_API_KEY", "075d83b3063def6fdd12763959a9086e")
    url = "https://api.themoviedb.org/3/movie/{}".format(tmdb_id)

    try:
        response = requests.get(
            url,
            params={"api_key": api_key, "language": "en-US"},
            timeout=timeout,  # without a timeout a stalled connection hangs the cell
        )
        response.raise_for_status()
        poster_path = response.json()["poster_path"]
        # The stored links in the dataset show a "w500//..." form, i.e.
        # poster_path appears to carry its own leading slash; the concatenation
        # is kept unchanged so new links match the existing rows.
        return "https://image.tmdb.org/t/p/w500/" + poster_path
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
        # Best effort: HTTP/network errors, undecodable JSON, a missing
        # "poster_path" key, or a null poster_path all map to "NULL".
        # KeyboardInterrupt/SystemExit are deliberately no longer swallowed.
        return "NULL"

In [3]:
# Step 1: Load in current script dataset
# Forward slashes are used so the relative path resolves on both Windows and
# POSIX systems (a backslash is only a separator on Windows).
df = pd.read_csv(
    "data_cleaning_and_synthesis_out/springfield_movie_scripts_2023_01_13_clean.csv",
    index_col=[0],  # first CSV column becomes the index used to address rows
)

# List the columns that can be passed as keys of an update dictionary.
col_names = df.columns.tolist()
print(f'Valid Column names to update:\n{col_names}')

Valid Column names to update:
['movie_title', 'movie_year', 'springfield_link', 'script_text', 'imdb_id', 'imdb_link', 'tmdb_poster_link', 'tmdb_id']


In [4]:
# Step 2: Locate the movie_id (the dataframe index label) by searching one column
col_name = 'movie_title'  # one of: 'movie_title', 'imdb_id', 'tmdb_id'
col_value = 'Elf'

# The index of the displayed row(s) is the movie_id to use in Step 3.
locate_movie_id(df, col_name=col_name, col_value=col_value)

Unnamed: 0,movie_title,movie_year,springfield_link,script_text,imdb_id,imdb_link,tmdb_poster_link,tmdb_id
9005,Elf,2003,https://www.springfieldspringfield.co.uk/movie...,"Oh, hello. Youre, uh, youre probably here Abo...",tt0319343,https://www.imdb.com/title/tt0319343/,https://image.tmdb.org/t/p/w500//oOleziEempUPu...,10719


In [5]:
# Step 3: Perform update
# movie_id is the index label of the row to change; update_dict maps
# column name -> new value.
movie_id = 33385
update_dict = {"movie_title": "WALL-E", "imdb_id": "tt0910970", "tmdb_id": 10681}

# Show the row before the update. script_text is dropped from the display
# (presumably because the full script is too long to print).
# No space after "\n": a leading space shifts the first grid line out of
# alignment with the rest of the table.
print(
    f'Original entry:\n{df.loc[movie_id].drop("script_text").to_markdown(tablefmt="grid")}\n'
)

# df_update modifies df and returns it; updating imdb_id / tmdb_id also
# refreshes imdb_link / tmdb_poster_link.
df = df_update(df=df, movie_id=movie_id, update_dict=update_dict)

# Show the same row after the update for comparison.
print(
    f'Updated entry:\n{df.loc[movie_id].drop("script_text").to_markdown(tablefmt="grid")}'
)

Original entry:
 +------------------+------------------------------------------------------------------------+
|                  | 33385                                                                  |
| movie_title      | WALL-E                                                                 |
+------------------+------------------------------------------------------------------------+
| movie_year       | 2008                                                                   |
+------------------+------------------------------------------------------------------------+
| springfield_link | https://www.springfieldspringfield.co.uk/movie_script.php?movie=wall-e |
+------------------+------------------------------------------------------------------------+
| imdb_id          | tt0910970                                                              |
+------------------+------------------------------------------------------------------------+
| imdb_link        | https://www.imdb.com/t

In [8]:
# Step 4: Push changes to the csv file and update synthesized scripts and imdb dataset
# Forward slashes are used so the relative paths resolve on both Windows and
# POSIX systems (a backslash is only a separator on Windows).

# Overwrite the cleaned script dataset with the updated dataframe.
df.to_csv(
    "data_cleaning_and_synthesis_out/springfield_movie_scripts_2023_01_13_clean.csv"
)

# Rebuild the upload dataset and write it out.
# NOTE(review): dataset_recsys_upload is imported from a project module and is
# presumed to re-read the csv written above — confirm.
df_new = dataset_recsys_upload()
df_new.to_csv("data_preprocessing_eda_out/df_spaces_upload.csv")

  df_imdb_basics = pd.read_csv(path_to_imdb_titles_tsv, sep="\t")


Unnamed: 0,movie_title,movie_year,genre,average_rating,num_votes,is_adult,imdb_id,imdb_link,tmdb_poster_link
0,A 2nd Hand Lover,2015,Romance,4.7,6.0,0,tt10919164,https://www.imdb.com/title/tt10919164/,https://image.tmdb.org/t/p/w500//SZZNNCqEJw4il...
1,A Aa,2016,"Comedy,Drama,Romance",6.8,3310.0,0,tt5684466,https://www.imdb.com/title/tt5684466/,https://image.tmdb.org/t/p/w500//pDkmxToe0sNQG...
2,A Baby at Any Cost,2022,Thriller,5.6,147.0,0,tt15331880,https://www.imdb.com/title/tt15331880/,https://image.tmdb.org/t/p/w500//8osrDpWMzf7w6...
3,A Bad Idea Gone Wrong,2017,Comedy,5.8,1246.0,0,tt5212918,https://www.imdb.com/title/tt5212918/,https://image.tmdb.org/t/p/w500//zGA4aQzypmHiA...
4,A Bad Moms Christmas,2017,Comedy,5.6,54137.0,0,tt6359956,https://www.imdb.com/title/tt6359956/,https://image.tmdb.org/t/p/w500//gPNHolu7AGnrB...
