In [137]:
from transformers import pipeline
import pandas as pd
import string
from collections import Counter
import numpy as np
from scipy import stats
import textstat
from google.cloud import bigquery
import os
from pathlib import Path
import re
import csv

In [2]:
csv_file_name = "../raw_data/springfield_script_oceans_eleven_2001.csv"
scripts_df = pd.read_csv(csv_file_name)
scripts_df

Unnamed: 0,Title,Year,Script
0,Ocean's Eleven (2001),2001,One con under escort.\n Open gate one.\n Man w...


In [3]:
script = scripts_df['Script'][0]

In [4]:
for punctuation in string.punctuation:
    script = script.replace(punctuation, "")
script_split = script.split(" ")
print(len(script_split))

8240


In [5]:
print(script_split[50:100])

['to', 'break', 'the', 'law', 'again\n', 'This', 'was', 'your', 'first', 'conviction\n', 'but', 'youve', 'been', 'implicated\n', 'in', 'a', 'dozen', 'other', 'schemes\n', 'and', 'frauds\n', 'What', 'can', 'you', 'tell', 'us', 'about', 'this\n', 'As', 'you', 'say', 'maam\n', 'I', 'was', 'never', 'charged\n', 'Were', 'trying', 'to', 'find', 'out\n', 'if', 'there', 'was', 'a', 'reason\n', 'for', 'committing', 'this', 'crime\n']


In [6]:
# any(c.isalpha() for c in string_1)
script_words = [word for word in script_split if any(c.isalpha() for c in word)]
len(script_words)

7657

In [7]:
# for punctuation in string.punctuation:
#     text = text.replace(punctuation, '')
words_stripped = [word.strip() for word in script_words]
words_clean = [word.lower() for word in words_stripped]

In [8]:
word_counts = Counter(words_clean)

In [9]:
hl = 0
for word in word_counts.keys():
    if word_counts[word] == 1:
        hl += 1
print(hl)

912


In [10]:
def clean_words(script):
    '''
    Function that takes raw script and cleans it.
    Returns list of individual words.
    Example: 
    Input: 'Hello, my... name is!'
    Output: ['hello','my','name','is']
    '''
    # Remove punctuation.
    for punctuation in string.punctuation:
        script = script.replace(punctuation, "")
    # Split on whitespace to isolate words
    script_split = script.split(" ")
    # Removing "words" that are just numbers, i.e. have no letters
    script_words = [word for word in script_split if any(c.isalpha() for c in word)]
    # Remove new lines, \n isn't removed by punctuation above.
    words_stripped = [word.strip() for word in script_words]
    # Lowercase in order to count occurances of same word.
    words_clean = [word.lower() for word in words_stripped]
    return words_clean

In [11]:
def count_hapax(script):
    ''' 
    Function to count number of hapax legomenon, i.e.
    words that appear once in a corpus/text.
    '''
    
    words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    # Hapax Legomenon counter
    hell = 0
    for word in word_counts.keys():
        if word_counts[word] == 1:
            hell += 1
    return hell

In [12]:
print(count_hapax(script))

912


In [13]:
scripts_df_10 = pd.read_csv("../raw_data/springfield_10_scripts.csv")

In [14]:
scripts_df_10['Title']

0                        Barbarosa (1982)
1                         Chestnut (2023)
2                  Contractor, The (2007)
3    George Michael: Freedom Uncut (2022)
4                Objective, Burma! (1945)
5                    Out on a Limb (1992)
6                          Piranha (2010)
7                            Scoop (2006)
8              Secret of the Incas (1954)
9        Slumber Party Massacre II (1987)
Name: Title, dtype: object

In [15]:
for scr in scripts_df_10['Script']:
    print(count_hapax(scr))

482
453
536
1090
801
634
564
1047
714
457


In [16]:
my_movies = ["springfield_script_aftersun_2022.csv",
             "springfield_script_bourne_identity_the_2002.csv",
             "springfield_script_grown_ups_2010.csv",
             "springfield_script_oceans_eleven_2001.csv",
             "springfield_script_pulp_fiction_1994.csv",
             "springfield_script_silence_of_the_lambs_the_1991.csv"
            ]

for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]
    print(f'Title: {title}: #Hapax: {count_hapax(script)}')

Title: Aftersun (2022): #Hapax: 365
Title: Bourne Identity, The (2002): #Hapax: 724
Title: Grown Ups (2010): #Hapax: 919
Title: Ocean's Eleven (2001): #Hapax: 912
Title: Pulp Fiction (1994): #Hapax: 1119
Title: Silence of the Lambs, The (1991): #Hapax: 979


In [17]:
def mean_word_length(script):
    ''' 
    Function to find the mean word length in a script.
    '''
    
    words_clean = clean_words(script)
    word_lengths = np.array([len(word) for word in words_clean],dtype='int')
    return np.mean(word_lengths)

In [18]:
print(mean_word_length(script))

4.134212567882079


In [19]:
for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]
    print(f'Title: {title}: Mean word length: {mean_word_length(script)}')

Title: Aftersun (2022): Mean word length: 3.8761695101816183
Title: Bourne Identity, The (2002): Mean word length: 4.11635565312843
Title: Grown Ups (2010): Mean word length: 3.891901795529498
Title: Ocean's Eleven (2001): Mean word length: 4.023377301815333
Title: Pulp Fiction (1994): Mean word length: 3.965002797289737
Title: Silence of the Lambs, The (1991): Mean word length: 4.134212567882079


In [20]:
def mode_word_length(script):
    ''' 
    Function to find the mode word length in a script.
    '''
    
    words_clean = clean_words(script)
    word_lengths = np.array([len(word) for word in words_clean],dtype='int')
    
    return stats.mode(word_lengths)[0]

In [21]:
for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]
    print(f'Title: {title}: Mode word length: {mode_word_length(script)}')

Title: Aftersun (2022): Mode word length: 4
Title: Bourne Identity, The (2002): Mode word length: 4
Title: Grown Ups (2010): Mode word length: 3
Title: Ocean's Eleven (2001): Mode word length: 4
Title: Pulp Fiction (1994): Mode word length: 4
Title: Silence of the Lambs, The (1991): Mode word length: 4


In [22]:
def readability_metrics(script):
    '''
    Function that calculates the readability of a script.
    '''
    
    # Cleaning is done differently here so that the input to the textstat
    # metric functions is correct. Essentially it wants to keep punctuation.
    
    # Split on whitespace to isolate words
    script_split = script.split(" ")
    # Removing "words" that are just numbers, i.e. have no letters
    script_words = [word for word in script_split if any(c.isalpha() for c in word)]
    # Remove new lines
    words_stripped = [word.strip() for word in script_words]
    words_clean = words_stripped
    
    text = " ".join(words_clean)
    # Flesch-Kincaid Grade Level - measures US Grade level required to read text.
    fkgl = textstat.flesch_kincaid_grade(text)
    # Flesch Reading Ease - overall score
    fre = textstat.flesch_reading_ease(text)
    # SMOG Test - better for jargon/technical text
    smog = textstat.smog_index(text)
    # Gunning Fog Index - complexity of sentence structure and vocab
    fog = textstat.gunning_fog(text)

    return (fkgl, fre, smog, fog)

In [23]:
for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]

    (fkgl, fre, smog, fog) = readability_metrics(script)
    print(f'Title: {title}')
    print(f"Flesch-Kincaid Grade Level: {fkgl}")
    print(f"Flesch Reading Ease: {fre}")
    print(f"Gunning Fog Index: {fog}")
    print(f"SMOG Index: {smog}")
    print(" ")

Title: Aftersun (2022)
Flesch-Kincaid Grade Level: 2.4
Flesch Reading Ease: 89.95
Gunning Fog Index: 3.39
SMOG Index: 6.4
 
Title: Bourne Identity, The (2002)
Flesch-Kincaid Grade Level: 2.4
Flesch Reading Ease: 89.95
Gunning Fog Index: 3.85
SMOG Index: 6.7
 
Title: Grown Ups (2010)
Flesch-Kincaid Grade Level: 2.2
Flesch Reading Ease: 90.46
Gunning Fog Index: 3.46
SMOG Index: 6.3
 
Title: Ocean's Eleven (2001)
Flesch-Kincaid Grade Level: 2.2
Flesch Reading Ease: 90.36
Gunning Fog Index: 3.75
SMOG Index: 6.6
 
Title: Pulp Fiction (1994)
Flesch-Kincaid Grade Level: 2.8
Flesch Reading Ease: 88.94
Gunning Fog Index: 3.89
SMOG Index: 6.5
 
Title: Silence of the Lambs, The (1991)
Flesch-Kincaid Grade Level: 2.6
Flesch Reading Ease: 89.34
Gunning Fog Index: 4.28
SMOG Index: 7.1
 


In [24]:
def vocab_size(script):
    ''' 
    Function to count number of unique words.
    '''
    
    words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    
    return len(word_counts)

def type_token_ratio(script):
    ''' 
    Function to calculate the type token ratio.
    TTR = (# unique words)/(total # words)
    '''
    
    words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    
    return len(word_counts)/len(words_clean)

def script_length(script):
    ''' 
    Function to calculate the script length.
    '''
    
    words_clean = clean_words(script)
    
    return len(words_clean)

In [25]:
for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]

    pstr = f'''Title: {title}
    script length: {script_length(script)}
    vocab_size: {round(vocab_size(script),2)}
    TTR: {round(type_token_ratio(script),2)}
    iTTR: {round(vocab_size(script)/type_token_ratio(script),2)}
    '''
    print(pstr)

Title: Aftersun (2022)
    script length: 3634
    vocab_size: 764
    TTR: 0.21
    iTTR: 3634.0
    
Title: Bourne Identity, The (2002)
    script length: 7288
    vocab_size: 1343
    TTR: 0.18
    iTTR: 7288.0
    
Title: Grown Ups (2010)
    script length: 8187
    vocab_size: 1624
    TTR: 0.2
    iTTR: 8187.0
    
Title: Ocean's Eleven (2001)
    script length: 7657
    vocab_size: 1628
    TTR: 0.21
    iTTR: 7657.0
    
Title: Pulp Fiction (1994)
    script length: 16087
    vocab_size: 2294
    TTR: 0.14
    iTTR: 16087.0
    
Title: Silence of the Lambs, The (1991)
    script length: 7734
    vocab_size: 1712
    TTR: 0.22
    iTTR: 7734.0
    


In [62]:
PROJECT_ID = os.getenv("GCP_PROJECT")
DATASET_ID = os.getenv("BQ_DATASET")
TABLE_ID = os.getenv("BQ_TABLE")

client = bigquery.Client(project=PROJECT_ID)

query = f"""
    SELECT tmdbid, imdb_id, title, original_title, release_date, runtime, genre_1, genre_2, genre_3
    FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`
"""

LOCAL_DATA_PATH = os.path.join(os.path.expanduser('~'), "code", "oliverramsaygray", "film_wizard", "raw_data")
cache_path = Path(LOCAL_DATA_PATH).joinpath(f"movie_lens_title_duration_genres.csv")

if not cache_path.is_file():
    movie_lens_df = client.query(query).to_dataframe()
    movie_lens_df.to_csv('../raw_data/movie_lens_title_duration_genres.csv', index=False)
else:
    movie_lens_df = pd.read_csv(cache_path)

movie_lens_df.head()



Unnamed: 0,tmdbid,imdb_id,title,original_title,release_date,runtime,genre_1,genre_2,genre_3
0,2,tt0094675,Ariel,Ariel,1988-10-21,73.0,Comedy,Drama,Romance
1,3,tt0092149,Shadows in Paradise,Varjoja paratiisissa,1986-10-17,74.0,Comedy,Drama,Romance
2,5,tt0113101,Four Rooms,Four Rooms,1995-12-09,98.0,Comedy,,
3,6,tt0107286,Judgment Night,Judgment Night,1993-10-15,109.0,Action,Crime,Thriller
4,11,tt0076759,Star Wars,Star Wars,1977-05-25,121.0,Adventure,Action,Science Fiction


In [63]:
movie_lens_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147209 entries, 0 to 147208
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tmdbid          147209 non-null  Int64  
 1   imdb_id         86364 non-null   object 
 2   title           147209 non-null  object 
 3   original_title  147209 non-null  object 
 4   release_date    86340 non-null   dbdate 
 5   runtime         86383 non-null   float64
 6   genre_1         85346 non-null   object 
 7   genre_2         55972 non-null   object 
 8   genre_3         26257 non-null   object 
dtypes: Int64(1), dbdate(1), float64(1), object(6)
memory usage: 10.2+ MB


In [64]:
movie_lens_df.columns

Index(['tmdbid', 'imdb_id', 'title', 'original_title', 'release_date',
       'runtime', 'genre_1', 'genre_2', 'genre_3'],
      dtype='object')

In [116]:
titles = np.array(movie_lens_df['title'])

In [117]:
len(titles)

147209

In [118]:
release_dates = np.array(movie_lens_df['release_date'])

In [119]:
release_years = [_.year for _ in release_dates]

In [120]:
release_years[0:10]

[1988, 1986, 1995, 1993, 1977, 2003, 1994, 1999, 1941, 2000]

In [121]:
titles_years = [title.lower() + f' ({year})' for (title, year) in zip(titles,release_years)]

In [122]:
type(titles_years)

list

In [123]:
script_titles_df = pd.read_csv(f'../raw_data/springfield_40k_movie_titles.csv')
script_titles_df

Unnamed: 0,Title,Year,URL
0,#AMFAD: All My Friends Are Dead (2024),2024.0,https://www.springfieldspringfield.co.uk/movie...
1,#Followme (2019),2019.0,https://www.springfieldspringfield.co.uk/movie...
2,#Horror (2015),2015.0,https://www.springfieldspringfield.co.uk/movie...
3,#IMomSoHard Live (2019),2019.0,https://www.springfieldspringfield.co.uk/movie...
4,#MenToo (2023),2023.0,https://www.springfieldspringfield.co.uk/movie...
...,...,...,...
38158,implanted (2013),2013.0,https://www.springfieldspringfield.co.uk/movie...
38159,uwantme2killhim? (2013),2013.0,https://www.springfieldspringfield.co.uk/movie...
38160,xXx (2002),2002.0,https://www.springfieldspringfield.co.uk/movie...
38161,xXx: Return of Xander Cage (2017),2017.0,https://www.springfieldspringfield.co.uk/movie...


In [124]:
script_titles = script_titles_df['Title']

In [125]:
len(script_titles)

38163

In [126]:
# thr = scripts_titles[0][:-7]
# script_titles_no_year = [_[:-7] for _ in script_titles]
script_titles_clean = [_.lower() for _ in script_titles]

In [127]:
type(script_titles_clean)

list

In [128]:
script_titles_hash = Counter(script_titles_clean)

In [129]:
len(script_titles_hash.keys())

38163

In [141]:
common_elements = list(set(script_titles_clean) & set(titles_years))

print(len(common_elements))

19624


In [142]:
common_elements

['finding you (2021)',
 'bogus (1996)',
 'riverworld (2003)',
 'beyond (2014)',
 'doctor dolittle (1998)',
 'behind the sightings (2021)',
 'countdown (2019)',
 'deepwater horizon (2016)',
 'night of the hunter (1991)',
 'braid (2019)',
 'hick (2011)',
 'kalev (2022)',
 'leprechaun (1993)',
 '47 meters down: uncaged (2019)',
 'sarpatta parambarai (2021)',
 'wattstax (1973)',
 'becky (2020)',
 'rewind (2013)',
 'woman at war (2018)',
 'monsters at large (2018)',
 'gook (2017)',
 'knuckleball (2018)',
 "eddie macon's run (1983)",
 'parallels (2015)',
 'leprechaun: origins (2014)',
 'a boy called christmas (2021)',
 'all of my heart: inn love (2017)',
 'attack of the 50 foot woman (1958)',
 'keeping up with the joneses (2016)',
 'the appearance (2018)',
 'inherent vice (2014)',
 'blood brothers: malcolm x & muhammad ali (2021)',
 "sinners' holiday (1930)",
 'mujhse shaadi karogi (2004)',
 'female prisoner scorpion: beast stable (1973)',
 'dirty dancing (1987)',
 'santoalla (2016)',
 'airp

In [143]:
i = 0
for row in common_elements:
    if i < 10:
        print(row)
    else:
        break
    i += 1

finding you (2021)
bogus (1996)
riverworld (2003)
beyond (2014)
doctor dolittle (1998)
behind the sightings (2021)
countdown (2019)
deepwater horizon (2016)
night of the hunter (1991)
braid (2019)


In [151]:
with open("../raw_data/movie_lens_scripts_titles.csv",'w',newline='') as csvfile:
    for row in common_elements:
        csvfile.write(f'{row}\n')