## Update film script dataset as needed

Description: This notebook provides functions for updating the dataset in place, one record at a time (similar to SQL's `UPDATE` command).

In [2]:
import pandas as pd
import numpy as np
import requests

In [4]:
def _fetch_tmdb_poster_link(tmdb_id, api_key):
    """Look up the poster image URL for a TMDB movie id.

    Returns the poster URL on success, or the string "NULL" when the request
    fails or the response carries no usable ``poster_path``.
    """
    url = "https://api.themoviedb.org/3/movie/{}".format(tmdb_id)
    try:
        response = requests.get(
            url,
            params={"api_key": api_key, "language": "en-US"},
            timeout=10,  # never let a stalled connection hang the notebook
        )
        response.raise_for_status()
        poster_path = response.json()["poster_path"]
        # NOTE(review): rows already in the dataset appear to carry a double
        # slash after "w500" (poster_path seems to start with "/"); the
        # concatenation is kept as-is so refreshed rows stay consistent.
        return "https://image.tmdb.org/t/p/w500/" + poster_path
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Best effort: HTTP/network errors, a malformed payload, or a null
        # poster_path all fall back to the "NULL" placeholder.
        return "NULL"


def df_update(df, movie_id, update_dict, tmdb_api_key=None):
    """Update selected columns of one existing row, similar to SQL's UPDATE.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to modify. It is modified in place and also returned.
    movie_id : index label
        Label of the row to update; it must already exist in ``df.index``.
    update_dict : dict
        Maps column name -> new value. Names that are not columns of ``df``
        are skipped with a printed warning.
    tmdb_api_key : str, optional
        TMDB API key used when ``tmdb_id`` is updated. When omitted, the
        ``TMDB_API_KEY`` environment variable is tried, then the legacy
        built-in key.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the updates.

    Raises
    ------
    KeyError
        If ``movie_id`` is not in ``df.index``. Without this check, ``.loc``
        assignment would silently append a brand-new row.

    Notes
    -----
    Updating ``imdb_id`` also rewrites ``imdb_link``. Updating ``tmdb_id``
    also refreshes ``tmdb_poster_link`` through a TMDB request, storing
    "NULL" when the lookup fails.
    """
    import os  # local import so the block does not depend on the import cell

    if movie_id not in df.index:
        raise KeyError(
            "movie_id {!r} is not in the dataframe index; nothing was updated.".format(
                movie_id
            )
        )

    for jcol, new_value in update_dict.items():
        if jcol not in df.columns:
            print("WARNING:" + jcol + " is not a valid column name.")
            continue

        # make the update
        df.loc[movie_id, jcol] = new_value

        # Derived columns are built from the supplied value rather than read
        # back from the frame, so column dtype coercion (e.g. an int id stored
        # in a float column) cannot leak into the generated URLs.
        if jcol == "imdb_id":
            df.loc[movie_id, "imdb_link"] = (
                "https://www.imdb.com/title/" + new_value + "/"
            )

        if jcol == "tmdb_id":
            # NOTE(review): the hard-coded fallback key is kept only for
            # backward compatibility; it should be rotated and removed in
            # favour of the TMDB_API_KEY environment variable.
            api_key = (
                tmdb_api_key
                or os.environ.get("TMDB_API_KEY")
                or "075d83b3063def6fdd12763959a9086e"
            )
            df.loc[movie_id, "tmdb_poster_link"] = _fetch_tmdb_poster_link(
                new_value, api_key
            )

    return df


def locate_movie_id(df, col_name, col_value):
    """Return the rows of ``df`` whose ``col_name`` column equals ``col_value``.

    The filter is evaluated with ``DataFrame.query``. The result keeps the
    original index labels, which the rest of the notebook uses as ``movie_id``.
    """
    # `@col_value` tells query() to read the local variable of that name.
    expression = "{} == @col_value".format(col_name)
    return df.query(expression)

### Load in current script dataset 

In [5]:
# Load the cleaned script dataset. The first CSV column becomes the index,
# whose labels the rest of the notebook uses as `movie_id`.
# A forward slash is used as the path separator so the notebook runs on
# Windows, Linux and macOS alike (a backslash only works on Windows).
df = pd.read_csv(
    'data_cleaning_and_synthesis_out/springfield_movie_scripts_2023_01_13_clean.csv',
    index_col = [0],
)
col_names = df.columns.tolist()
print('Valid Column names to update:')
print(col_names)

Valid Column names to update:
['movie_title', 'movie_year', 'springfield_link', 'script_text', 'imdb_id', 'imdb_link', 'tmdb_poster_link', 'tmdb_id']


### Locate movie_id by movie name, imdb_id, or tmdb_id 

In [9]:
# Method 1: look a film up by its title.
# `col_name` may be any column of df, e.g. "movie_title", "imdb_id" or "tmdb_id".
col_name = "movie_title"
col_value = "Elf"
locate_movie_id(df, col_name, col_value)


Unnamed: 0,movie_title,movie_year,springfield_link,script_text,imdb_id,imdb_link,tmdb_poster_link,tmdb_id
9005,Elf,2003,https://www.springfieldspringfield.co.uk/movie...,"Oh, hello. Youre, uh, youre probably here Abo...",tt0319343,https://www.imdb.com/title/tt0319343/,https://image.tmdb.org/t/p/w500//oOleziEempUPu...,10719


### Perform update

In [7]:
# Row to change (an index label of df) and the new column values to write.
movie_id = 33385
update_dict = dict(movie_title="WALL-E", imdb_id="tt0910970", tmdb_id=10681)

# Show the record before the update; the long script text is left out.
print("Original dataframe:")
print(df.loc[movie_id].drop("script_text").to_markdown(tablefmt="grid"), end="\n\n")

# Apply the update; df_update returns the dataframe it was given.
df = df_update(df, movie_id, update_dict)

# Show the same record after the update.
print("Updated dataframe:")
print(df.loc[movie_id].drop("script_text").to_markdown(tablefmt="grid"))

Original dataframe:
+------------------+------------------------------------------------------------------------+
|                  | 33385                                                                  |
| movie_title      | WALL-E                                                                 |
+------------------+------------------------------------------------------------------------+
| movie_year       | 2008                                                                   |
+------------------+------------------------------------------------------------------------+
| springfield_link | https://www.springfieldspringfield.co.uk/movie_script.php?movie=wall-e |
+------------------+------------------------------------------------------------------------+
| imdb_id          | nan                                                                    |
+------------------+------------------------------------------------------------------------+
| imdb_link        | nan                

### Push changes to the csv file

In [8]:
# Write the updated dataset back over the source CSV.
# A forward slash is used as the path separator so this works on Windows,
# Linux and macOS alike (a backslash only works on Windows).
df.to_csv(
    "data_cleaning_and_synthesis_out/springfield_movie_scripts_2023_01_13_clean.csv"
)