## Clean film script dataset and find matching IMDb_id

Description: load the raw dataset of film scripts obtained from web scrapers, clean any inconsistencies or mistakes, and assign each film script an IMDb_id, either by matching on title or by manual entry. Functions to update the dataset at later times (similar to SQL's UPDATE command) are planned for the end of the notebook.

In [4]:
import pandas as pd
import numpy as np
import gradio as gr
from matplotlib import pyplot as plt
import pickle
import time

In [7]:
# import film script dataset
# The first CSV column is used as the row index (index_col=[0]); the row labels are
# referenced directly by the manual fixes further down.
path_to_csv = '../database_film_scripts/scraped_scripts/springfield_movie_scripts_2023_01_13.csv' 
df_orig = pd.read_csv(path_to_csv, index_col=[0])

# preview the first rows of the raw frame
df_orig.head()

Unnamed: 0,Movie Title,Movie Script Link,Script Text
0,A 2nd Hand Lover (2015),https://www.springfieldspringfield.co.uk/movie...,Wow! Amazing! What a beautiful place. Im meet...
1,A Aa (2016),https://www.springfieldspringfield.co.uk/movie...,"3 My every birthday, Mom hopes for something...."
2,A Baby at Any Cost (2022),https://www.springfieldspringfield.co.uk/movie...,"0 Jen? Jen? Jen? Oops... sorry! Jason? Oh, sh..."
3,A Bad Idea Gone Wrong (2017),https://www.springfieldspringfield.co.uk/movie...,"1 - The tropical places, you know, thats the ..."
4,A Bad Moms Christmas (2017),https://www.springfieldspringfield.co.uk/movie...,"1 My name is Amy Mitchell, and this year Ive ..."


In [8]:
# Create the Movie Year feature from the Movie Title column.
# The slices assume every scraped title ends in " (YYYY)"; titles that break that
# pattern yield garbage years and are repaired by hand below. The vectorised .str
# accessor produces the same slices as a per-element lambda but propagates a missing
# title as NaN instead of raising a TypeError.
raw_titles = df_orig["Movie Title"]
df_orig["Movie Year"] = raw_titles.str[-5:-1]
df_orig["Movie Title"] = raw_titles.str[:-6]

# manually inspect and fix data errors such as movie being released in year 0000 or 0147
print(df_orig["Movie Year"].unique())
print()
print(df_orig.loc[df_orig["Movie Year"] == "0147", ["Movie Title", "Movie Year"]])

# manually go in and fix the spotted errors. Use IMDb to find correct information
# NOTE: the row labels below are specific to the 2023_01_13 scrape loaded above.
df_orig.loc[12856, "Movie Title"] = "Hollows Grove"
df_orig.loc[12224, "Movie Title"] = "Alien Battlefield Earth"
df_orig.loc[8151, "Movie Title"] = "Disappearance"

# NOTE: these replacements are keyed on the bad value, so every row carrying that
# value receives the same corrected year.
df_orig.loc[df_orig["Movie Year"] == "0000", "Movie Year"] = "1986"
df_orig.loc[df_orig["Movie Year"] == "2050", "Movie Year"] = "2017"
df_orig.loc[df_orig["Movie Year"] == "0147", "Movie Year"] = "2014"
df_orig.loc[df_orig["Movie Year"] == "arth", "Movie Year"] = "2021"

# reorder and rename columns (explicit mapping instead of positional assignment)
df_orig = df_orig[["Movie Title", "Movie Year", "Movie Script Link", "Script Text"]].rename(
    columns={
        "Movie Title": "movie_title",
        "Movie Year": "movie_year",
        "Movie Script Link": "springfield_link",
        "Script Text": "script_text",
    }
)

# Fail here, next to the manual fixes, rather than later when the year is cast to int.
invalid_years = df_orig.loc[
    ~df_orig["movie_year"].str.match(r"\d{4}$", na=False), ["movie_title", "movie_year"]
]
assert invalid_years.empty, "unfixed movie_year values:\n{}".format(invalid_years)

df_orig.head()

['2015' '2016' '2022' '2017' '2011' '2012' '2021' '2004' '2019' '2010'
 '2014' '2020' '1966' '1932' '2013' '1968' '2018' '2009' '1969' '1991'
 '1998' '2008' '2003' '1967' '1965' '1984' '1999' '1983' '1940' '1987'
 '1971' '1949' '2006' '1956' '1937' '1948' '1928' '1997' '1973' '2000'
 '1989' '1957' '1993' '1935' '1992' '1931' '1945' '1952' '1990' '1943'
 '1986' '1955' '1995' '1962' '2001' '1950' '1979' '2005' '1994' '1941'
 '1946' '1982' '2007' '1913' '1958' '1985' '1978' '1961' '1972' '1974'
 '1938' '1976' '1963' '1960' '1951' '1933' '2002' '1975' '1936' '1988'
 '1942' '1977' '1953' '1954' '1930' '1981' '1980' '1964' '1970' '1939'
 '1996' '1944' '1959' '1910' '2023' '1947' '1934' '1915' '1926' '1922'
 '1929' '1914' '1916' '1920' '0000' '2050' '1921' '1924' '1927' '1925'
 'arth' '0147' '1923' '1912' '1917' '1919' '1918']

           Movie Title Movie Year
12856  Hollows Grove (       0147


Unnamed: 0,Movie Title,Movie Year,Movie Script Link,Script Text
0,A 2nd Hand Lover,2015,https://www.springfieldspringfield.co.uk/movie...,Wow! Amazing! What a beautiful place. Im meet...
1,A Aa,2016,https://www.springfieldspringfield.co.uk/movie...,"3 My every birthday, Mom hopes for something...."
2,A Baby at Any Cost,2022,https://www.springfieldspringfield.co.uk/movie...,"0 Jen? Jen? Jen? Oops... sorry! Jason? Oh, sh..."
3,A Bad Idea Gone Wrong,2017,https://www.springfieldspringfield.co.uk/movie...,"1 - The tropical places, you know, thats the ..."
4,A Bad Moms Christmas,2017,https://www.springfieldspringfield.co.uk/movie...,"1 My name is Amy Mitchell, and this year Ive ..."


In [10]:
# Tidy the titles: trim surrounding whitespace, then turn 'Title, The' into 'The Title'.
def _article_first(title):
    """Move a trailing ', The' to the front of the title; other titles pass through."""
    trailing_article = ', The'
    if not title.endswith(trailing_article):
        return title
    return 'The ' + title[:-len(trailing_article)]


df_orig['movie_title'] = df_orig['movie_title'].str.strip().map(_article_first)

In [11]:
# List every (title, year) pair that occurs more than once and print its rows so the
# candidates can be checked by eye before anything is dropped.
title_year_counts = df_orig[["movie_title", "movie_year"]].value_counts()
duplicates = title_year_counts[title_year_counts > 1]
print(len(duplicates))
for dup_title, dup_year in duplicates.index:
    same_title_year = df_orig.query("movie_title == @dup_title and movie_year == @dup_year")
    print(same_title_year)
    print()

60
                          movie_title movie_year  \
24535  The Seeker: The Dark Is Rising       2007   
30621  The Seeker: The Dark Is Rising       2007   

                                        springfield_link  \
24535  https://www.springfieldspringfield.co.uk/movie...   
30621  https://www.springfieldspringfield.co.uk/movie...   

                                             script_text  
24535   - We go, let us leave of here. - The school f...  
30621   - We go, let us leave of here. - The school f...  

            movie_title movie_year  \
13161  The House of Yes       1997   
29096  The House of Yes       1997   

                                        springfield_link  \
13161  https://www.springfieldspringfield.co.uk/movie...   
29096  https://www.springfieldspringfield.co.uk/movie...   

                                             script_text  
13161   Ill always remember that day. Marty and I had...  
29096   Ill always remember that day. Marty and I had...  

      m

In [12]:
# "The Door" and "The Avengers" are distinct films that happen to share a title and
# year, so they are kept. For every other duplicated (title, year) pair only the first
# row is kept and ALL later copies are dropped, so a pair occurring three or more times
# leaves no stray duplicates behind.
titles_to_keep = {'the door', 'the avengers'}
jdrop = []
for dup_title, dup_year in duplicates.index:
    if dup_title.lower() in titles_to_keep:
        print(dup_title)
        continue
    same_title_year = df_orig.query('movie_title == @dup_title and movie_year == @dup_year')
    jdrop.extend(same_title_year.index[1:])

# drop the surplus rows and renumber so the row labels are contiguous again
df_orig = df_orig.drop(jdrop, axis=0)
df_orig = df_orig.reset_index(drop=True)

The Avengers
The Door


## Synthesize film script dataset with IMDb dataset by assigning each movie in film script dataset an IMDb_id

### method a) match tables based on title name

In [None]:
# Build the matching copy of the script table: lower-cased, trimmed titles and integer
# years. df_orig itself is left untouched.
df = df_orig.assign(
    movie_title=df_orig['movie_title'].str.lower().str.strip(),
    movie_year=df_orig['movie_year'].astype(int),
)

In [None]:
import csv

# load in IMDB database
# NOTE(review): absolute, machine-specific path; anyone else must edit it before
# running this cell. Consider moving it to a configurable data directory.
path_to_tsv = 'C:\\Users\\Nick\\Documents\\DataScience\\databases\\IMDB_database\\2023_02_12_IMDbDataFiles\\titleBasics.tsv'

# The IMDb TSV exports are not quoted and mark missing values with '\N':
#  - quoting=csv.QUOTE_NONE stops a stray double quote inside a title from being read
#    as the start of a quoted field, which would swallow the following rows;
#  - keep_default_na=False keeps titles such as "NA" or "None" as text instead of
#    silently turning them into NaN.
df_imdb = pd.read_csv(
    path_to_tsv,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

# preprocess imdb data titles
df_imdb['primaryTitle'] = df_imdb['primaryTitle'].str.lower()
df_imdb['originalTitle'] = df_imdb['originalTitle'].str.lower()
# '\N' (unknown year) becomes 0 so the column can be compared as integers
df_imdb['startYear'] = df_imdb['startYear'].replace('\\N','0').astype(int)

In [None]:
# Assign an IMDb title id (tconst) to every script by exact title match.
#
# Changes compared with the first version of this cell:
#  - it reads the renamed columns 'movie_title' / 'movie_year' (the old names
#    'Movie Title' / 'Movie Year' no longer exist on df and raised a KeyError);
#  - the duplicated primaryTitle / originalTitle branches share one helper;
#  - title -> row-position lookups are built once, instead of scanning all of df_imdb
#    for every script (that full-scan version took 272 minutes to run).
#
# Outputs (one entry per row of df, in row order):
#   tt_all   - tconst, a list of tconsts when the match is ambiguous, or 'NULL'
#   year_all - IMDb startYear of the chosen entry, or 'NULL'
#   jredo    - positions of the scripts that could not be matched


def _pick_imdb_match(candidates, year):
    """Choose the IMDb entry for one script among the rows that share its title.

    Parameters
    ----------
    candidates : pandas.DataFrame
        Rows of df_imdb whose title equals the script title (at least one row).
    year : int
        Release year recorded for the script.

    Returns
    -------
    tuple or None
        (tconst, startYear) for the selected entry. tconst is a list of ids when
        several candidates cannot be told apart. None when more than one row shares
        the title and none of them passes the type/year filter.
    """
    if len(candidates) == 1:
        # a unique title is accepted as-is, without checking type or year
        return candidates['tconst'].values[0], candidates['startYear'].values[0]

    # several titles: keep film-like entries released within one year of the script
    film_like = candidates['titleType'].isin(['movie', 'tvMovie', 'video'])
    near_year = candidates['startYear'].between(year - 1, year + 1)
    near = candidates[film_like & near_year]
    if len(near) == 1:
        return near['tconst'].values[0], near['startYear'].values[0]
    if len(near) > 1:
        # prefer a single theatrical / TV movie over direct-to-video entries
        films = near[near['titleType'].isin(['movie', 'tvMovie'])]
        if len(films) == 1:
            return films['tconst'].values[0], films['startYear'].values[0]
        # still ambiguous: keep every candidate id for a later manual decision
        return list(near['tconst'].values), near['startYear'].values[0]
    return None


# title -> integer row positions in df_imdb, built once
primary_positions = df_imdb.groupby('primaryTitle', sort=False).indices
original_positions = df_imdb.groupby('originalTitle', sort=False).indices

tt_all = []
year_all = []
jredo = []
for j, (title, year) in enumerate(zip(df['movie_title'], df['movie_year'])):
    # originalTitle is consulted only when no primaryTitle equals the script title
    positions = primary_positions.get(title)
    if positions is None:
        positions = original_positions.get(title)

    match = None
    if positions is not None:
        match = _pick_imdb_match(df_imdb.iloc[positions], year)

    if match is None:
        jredo.append(j)
        tt_all.append('NULL')
        year_all.append('NULL')
    else:
        tt_all.append(match[0])
        year_all.append(match[1])

assert len(tt_all) == len(df), "every script must receive exactly one entry"

In [None]:
def _first_tconst(value):
    """Reduce one matcher result to a single IMDb id.

    Ambiguous matches arrive as a real list of ids, or as the text form of such a
    list ("['tt1', 'tt2']") when the results were round-tripped through a file. In
    both cases the first id is kept; any other value is returned unchanged.
    """
    if isinstance(value, (list, tuple, np.ndarray)):
        return value[0]
    if isinstance(value, str) and "['tt" in value:
        return value.replace('[','').replace(']','').replace("'",'').split(', ')[0]
    return value


# add tconst to dataframe; for multiple tconsts, keep only the first one in the list
# (a substring test on a real list never matched, so lists used to reach the link
# concatenation below and break it)
df_orig['imdb_id'] = [_first_tconst(jk) for jk in tt_all]

# create IMDb link to movie based on imdb_id; unmatched rows ('NULL') get no link
# instead of a bogus '.../title/NULL/' URL
has_imdb_id = df_orig['imdb_id'].notna() & (df_orig['imdb_id'] != 'NULL')
df_orig['imdb_link'] = ('https://www.imdb.com/title/' + df_orig['imdb_id'] + '/').where(has_imdb_id)

# save updated dataset that has been cleaned and contains IMDb_id
df_orig.to_csv('data_cleaning_and_synthesis_out\\springfield_movie_scripts_2023_01_13_cleaned_with_imdb_id.csv')

### method b) load in tconsts from existing database and manually add new values

In [14]:
# Reload the table saved by an earlier run, which already carries the IMDb ids.
saved_ids_csv = 'data_cleaning_and_synthesis_out\\springfield_movie_scripts_2023_01_13_cleaned_with_imdb_id.csv'
df_out = pd.read_csv(saved_ids_csv, index_col = [0])
print(len(df_orig), len(df_out))

# Left join on the row index: rows of df_orig without a counterpart in df_out keep
# empty imdb_id / imdb_link values, to be filled in manually.
saved_id_columns = df_out[['imdb_id','imdb_link']]
df_orig = df_orig.join(saved_id_columns, how='left' )
print(len(df_orig))

In [16]:
# insert/fix imdb_id and update imdb_link
# Row label 35537 is hard-coded for this snapshot of the data.
# NOTE(review): presumably the one row missing from the saved file (35538 vs 35537
# rows printed by the join cell) - confirm before reusing on another scrape.
df_orig.loc[35537,'imdb_id'] = 'tt1517268'
# rebuild every link from the id column so the manual change is reflected
df_orig['imdb_link'] = 'https://www.imdb.com/title/' + df_orig['imdb_id'] + '/'

35538 35537


In [31]:
# save changes
# Overwrites the same CSV that method b) loaded, so the manual fixes persist.
df_orig.to_csv('data_cleaning_and_synthesis_out\\springfield_movie_scripts_2023_01_13_cleaned_with_imdb_id.csv')

### method c) load in tconsts from existing database and manually fix mistakes

In [33]:
# TO DO: add functions to update database entries, similar to SQL UPDATE command

## Synthesize film script dataset with tmdb dataset by assigning each movie in film scripts dataset the corresponding tmdb_id. Use tmdb API

In [2]:
# NOTE(review): the execution counter restarts at this cell, so it was presumably run
# in a fresh kernel - hence the repeated pandas import. Confirm, or move the imports
# to the first cell of the notebook.
import pandas as pd
import requests
# reload the cleaned dataset (with imdb_id / imdb_link) written by the previous section
df_orig = pd.read_csv('data_cleaning_and_synthesis_out\\springfield_movie_scripts_2023_01_13_cleaned_with_imdb_id.csv', index_col = [0])

In [4]:
# display the reloaded frame (long frames are rendered truncated, first and last rows)
df_orig

Unnamed: 0,movie_title,movie_year,springfield_link,script_text,imdb_id,imdb_link
0,A 2nd Hand Lover,2015,https://www.springfieldspringfield.co.uk/movie...,Wow! Amazing! What a beautiful place. Im meet...,tt10919164,https://www.imdb.com/title/tt10919164/
1,A Aa,2016,https://www.springfieldspringfield.co.uk/movie...,"3 My every birthday, Mom hopes for something....",tt5684466,https://www.imdb.com/title/tt5684466/
2,A Baby at Any Cost,2022,https://www.springfieldspringfield.co.uk/movie...,"0 Jen? Jen? Jen? Oops... sorry! Jason? Oh, sh...",tt15331880,https://www.imdb.com/title/tt15331880/
3,A Bad Idea Gone Wrong,2017,https://www.springfieldspringfield.co.uk/movie...,"1 - The tropical places, you know, thats the ...",tt5212918,https://www.imdb.com/title/tt5212918/
4,A Bad Moms Christmas,2017,https://www.springfieldspringfield.co.uk/movie...,"1 My name is Amy Mitchell, and this year Ive ...",tt6359956,https://www.imdb.com/title/tt6359956/
...,...,...,...,...,...,...
35533,99,2019,https://www.springfieldspringfield.co.uk/movie...,O destination where are you? Where am I headi...,tt10329084,https://www.imdb.com/title/tt10329084/
35534,99 Homes,2014,https://www.springfieldspringfield.co.uk/movie...,"Yeah. Oh. Oh, Jesus, no. This isnt a good tim...",tt2891174,https://www.imdb.com/title/tt2891174/
35535,[REC],2007,https://www.springfieldspringfield.co.uk/movie...,"(^^)SIMPLYKOKE(^^) Enjoy the original film, n...",tt1038988,https://www.imdb.com/title/tt1038988/
35536,[REC] 2,2009,https://www.springfieldspringfield.co.uk/movie...,"Record it all, God damn it! Rosso, record! Ok...",,


### method a) call tmdb api using imdb_id

In [None]:
import os

# Look up every film on TMDB through its IMDb id.
#
# Fixes compared with the first version of this cell:
#  - iterates over 'imdb_id' (there is no 'imdb_tconst' column in df_orig);
#  - tmdb_id now holds the TMDB id ("id" in the response) rather than the poster URL;
#    the poster URL is kept separately in tmdb_poster_url;
#  - the image URL no longer contains a double slash (poster_path starts with '/');
#  - only request/decoding/missing-key errors are turned into "NULL" - a bare except
#    also hid programming errors and KeyboardInterrupt;
#  - the API key is read from the environment instead of being stored in the notebook.
#    NOTE(review): the key that was committed here should be rotated.
TMDB_API_KEY = os.environ["TMDB_API_KEY"]
TMDB_MOVIE_URL = "https://api.themoviedb.org/3/movie/{}"
TMDB_IMAGE_BASE = "https://image.tmdb.org/t/p/w500"

tmdb_id = []
tmdb_poster_url = []
for jtt in df_orig['imdb_id']:
    # rows without a usable IMDb id (NaN or 'NULL') cannot be looked up; skip the call
    if not isinstance(jtt, str) or not jtt.startswith('tt'):
        tmdb_id.append("NULL")
        tmdb_poster_url.append("NULL")
        continue
    try:
        response = requests.get(
            TMDB_MOVIE_URL.format(jtt),
            params={"api_key": TMDB_API_KEY, "language": "en-US"},
            timeout=10,
        )
        response.raise_for_status()
        data = response.json()
        movie_id = data["id"]
    except (requests.RequestException, ValueError, KeyError):
        # best effort: an unknown id or a failed request is recorded, not fatal
        tmdb_id.append("NULL")
        tmdb_poster_url.append("NULL")
        continue

    tmdb_id.append(movie_id)
    # poster_path is null for films without artwork
    poster_path = data.get("poster_path")
    tmdb_poster_url.append(TMDB_IMAGE_BASE + poster_path if poster_path else "NULL")


### method b) call tmdb (search) api using movie name

In [50]:
import os

# Search TMDB by title for a single film (the second-to-last row of the dataset).
#
# Fixes compared with the first version of this cell:
#  - HTTPS instead of plain HTTP, so the API key is not sent in clear text;
#  - the title is passed through `params`, which URL-encodes it properly (replacing
#    spaces with '+' left characters such as '&' or '#' able to corrupt the query);
#  - the API key is read from the environment instead of being stored in the notebook.
#    NOTE(review): the key that was committed here should be rotated.
movie_title = df_orig["movie_title"].iloc[-2]
response = requests.get(
    "https://api.themoviedb.org/3/search/movie",
    params={"api_key": os.environ["TMDB_API_KEY"], "query": movie_title},
    timeout=10,
)
response.raise_for_status()
data = response.json()


# show the raw search response
data

{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/svZPLmBwYHjlwDwHkZz4EotpXqx.jpg',
   'genre_ids': [53, 27],
   'id': 10664,
   'original_language': 'es',
   'original_title': '[REC]²',
   'overview': 'The action continues from [REC], with the medical officer and a SWAT team outfitted with video cameras are sent into the sealed off apartment to control the situation.',
   'popularity': 23.872,
   'poster_path': '/gWmIqmhb6B6bmL6EPX4sVx0jO02.jpg',
   'release_date': '2009-09-15',
   'title': '[REC]²',
   'video': False,
   'vote_average': 6.6,
   'vote_count': 1793},
  {'adult': False,
   'backdrop_path': '/mHMwoMPLkWHsMeNtFZSNSUOOJ1r.jpg',
   'genre_ids': [27, 9648, 878],
   'id': 59115,
   'original_language': 'en',
   'original_title': 'Quarantine 2: Terminal',
   'overview': 'A plane is taken over by a mysterious virus. When the plane lands it is placed under quarantine. Now a group of survivors must band together to survive the quarantine.',
   'popularity': 10.306,


In [38]:
# Summarise the search response and keep the fields of the top-ranked result.
if(data['total_pages'] > 1):
    print('more than one page of search results')
if(data['total_results'] > 1):
    print('more than one result was returned by query')

search_results = data['results']
if search_results:
    top_result = search_results[0]
    tmdb_id = top_result['id']
    tmdb_title = top_result['original_title']
    # release_date can be empty or absent for unreleased films
    tmdb_year = (top_result.get('release_date') or '')[:4]
    # poster_path already starts with '/' and is null when there is no artwork
    poster_path = top_result.get('poster_path')
    tmdb_poster_path = "https://image.tmdb.org/t/p/w500" + poster_path if poster_path else None
else:
    # an empty result list used to raise an IndexError here
    print('no result was returned by query')
    tmdb_id = tmdb_title = tmdb_year = tmdb_poster_path = None

In [46]:
# display the poster location taken from the first search result
# NOTE(review): the recorded output below is a bare path without the image base URL,
# so it looks like it came from an earlier execution - re-run to refresh.
tmdb_poster_path

'/7iIFOMG6IR3ZJmCfnM9RdJTykRi.jpg'