In [33]:
from transformers import pipeline
import pandas as pd
import string
from collections import Counter
import numpy as np
from scipy import stats
import textstat
from google.cloud import bigquery
import os
from pathlib import Path
import re
import csv
import tensorflow as tf

In [34]:
csv_file_name = "../raw_data/springfield_script_oceans_eleven_2001.csv"
scripts_df = pd.read_csv(csv_file_name)
scripts_df

Unnamed: 0,Title,Year,Script
0,Ocean's Eleven (2001),2001,One con under escort.\n Open gate one.\n Man w...


# Development of functions to extract linguistic information from scripts.

In [35]:
script = scripts_df['Script'][0]

In [36]:
for punctuation in string.punctuation:
    script = script.replace(punctuation, "")
script_split = script.split(" ")
print(len(script_split))

8240


In [37]:
print(script_split[50:100])

['to', 'break', 'the', 'law', 'again\n', 'This', 'was', 'your', 'first', 'conviction\n', 'but', 'youve', 'been', 'implicated\n', 'in', 'a', 'dozen', 'other', 'schemes\n', 'and', 'frauds\n', 'What', 'can', 'you', 'tell', 'us', 'about', 'this\n', 'As', 'you', 'say', 'maam\n', 'I', 'was', 'never', 'charged\n', 'Were', 'trying', 'to', 'find', 'out\n', 'if', 'there', 'was', 'a', 'reason\n', 'for', 'committing', 'this', 'crime\n']


In [38]:
# any(c.isalpha() for c in string_1)
script_words = [word for word in script_split if any(c.isalpha() for c in word)]
len(script_words)

7657

In [39]:
# for punctuation in string.punctuation:
#     text = text.replace(punctuation, '')
words_stripped = [word.strip() for word in script_words]
words_clean = [word.lower() for word in words_stripped]

In [40]:
word_counts = Counter(words_clean)

In [41]:
hl = 0
for word in word_counts.keys():
    if word_counts[word] == 1:
        hl += 1
print(hl)

912


In [42]:
def clean_words(script):
    '''
    Function that takes raw script and cleans it.
    Returns list of individual words.
    Example: 
    Input: 'Hello, my... name is!'
    Output: ['hello','my','name','is']
    '''
    # Remove punctuation.
    for punctuation in string.punctuation:
        script = script.replace(punctuation, "")
    # Split on whitespace to isolate words
    script_split = script.split(" ")
    # Removing "words" that are just numbers, i.e. have no letters
    script_words = [word for word in script_split if any(c.isalpha() for c in word)]
    # Remove new lines, \n isn't removed by punctuation above.
    words_stripped = [word.strip() for word in script_words]
    # Lowercase in order to count occurances of same word.
    words_clean = [word.lower() for word in words_stripped]
    return words_clean

In [43]:
def count_hapax(script):
    ''' 
    Function to count number of hapax legomenon, i.e.
    words that appear once in a corpus/text.
    '''
    
    words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    # Hapax Legomenon counter
    hell = 0
    for word in word_counts.keys():
        if word_counts[word] == 1:
            hell += 1
    return hell

In [44]:
print(count_hapax(script))

912


In [45]:
scripts_df_10 = pd.read_csv("../raw_data/springfield_10_scripts.csv")

In [46]:
scripts_df_10['Title']

0                        Barbarosa (1982)
1                         Chestnut (2023)
2                  Contractor, The (2007)
3    George Michael: Freedom Uncut (2022)
4                Objective, Burma! (1945)
5                    Out on a Limb (1992)
6                          Piranha (2010)
7                            Scoop (2006)
8              Secret of the Incas (1954)
9        Slumber Party Massacre II (1987)
Name: Title, dtype: object

In [47]:
for scr in scripts_df_10['Script']:
    print(count_hapax(scr))

482
453
536
1090
801
634
564
1047
714
457


In [48]:
my_movies = ["springfield_script_aftersun_2022.csv",
             "springfield_script_bourne_identity_the_2002.csv",
             "springfield_script_grown_ups_2010.csv",
             "springfield_script_oceans_eleven_2001.csv",
             "springfield_script_pulp_fiction_1994.csv",
             "springfield_script_silence_of_the_lambs_the_1991.csv"
            ]

for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]
    print(f'Title: {title}: #Hapax: {count_hapax(script)}')

Title: Aftersun (2022): #Hapax: 365
Title: Bourne Identity, The (2002): #Hapax: 724
Title: Grown Ups (2010): #Hapax: 919
Title: Ocean's Eleven (2001): #Hapax: 912
Title: Pulp Fiction (1994): #Hapax: 1119
Title: Silence of the Lambs, The (1991): #Hapax: 979


In [49]:
def mean_word_length(script):
    ''' 
    Function to find the mean word length in a script.
    '''
    
    words_clean = clean_words(script)
    word_lengths = np.array([len(word) for word in words_clean],dtype='int')
    return np.mean(word_lengths)

In [50]:
print(mean_word_length(script))

4.134212567882079


In [51]:
for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]
    print(f'Title: {title}: Mean word length: {mean_word_length(script)}')

Title: Aftersun (2022): Mean word length: 3.8761695101816183
Title: Bourne Identity, The (2002): Mean word length: 4.11635565312843
Title: Grown Ups (2010): Mean word length: 3.891901795529498
Title: Ocean's Eleven (2001): Mean word length: 4.023377301815333
Title: Pulp Fiction (1994): Mean word length: 3.965002797289737
Title: Silence of the Lambs, The (1991): Mean word length: 4.134212567882079


In [52]:
def mode_word_length(script):
    ''' 
    Function to find the mode word length in a script.
    '''
    
    words_clean = clean_words(script)
    word_lengths = np.array([len(word) for word in words_clean],dtype='int')
    
    return stats.mode(word_lengths)[0]

In [53]:
for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]
    print(f'Title: {title}: Mode word length: {mode_word_length(script)}')

Title: Aftersun (2022): Mode word length: 4
Title: Bourne Identity, The (2002): Mode word length: 4
Title: Grown Ups (2010): Mode word length: 3
Title: Ocean's Eleven (2001): Mode word length: 4
Title: Pulp Fiction (1994): Mode word length: 4
Title: Silence of the Lambs, The (1991): Mode word length: 4


In [54]:
def readability_metrics(script):
    '''
    Function that calculates the readability of a script.
    '''
    
    # Cleaning is done differently here so that the input to the textstat
    # metric functions is correct. Essentially it wants to keep punctuation.
    
    # Split on whitespace to isolate words
    script_split = script.split(" ")
    # Removing "words" that are just numbers, i.e. have no letters
    script_words = [word for word in script_split if any(c.isalpha() for c in word)]
    # Remove new lines
    words_stripped = [word.strip() for word in script_words]
    words_clean = words_stripped
    
    text = " ".join(words_clean)
    # Flesch-Kincaid Grade Level - measures US Grade level required to read text.
    fkgl = textstat.flesch_kincaid_grade(text)
    # Flesch Reading Ease - overall score
    fre = textstat.flesch_reading_ease(text)
    # SMOG Test - better for jargon/technical text
    smog = textstat.smog_index(text)
    # Gunning Fog Index - complexity of sentence structure and vocab
    fog = textstat.gunning_fog(text)

    return (fkgl, fre, smog, fog)

In [55]:
for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]

    (fkgl, fre, smog, fog) = readability_metrics(script)
    print(f'Title: {title}')
    print(f"Flesch-Kincaid Grade Level: {fkgl}")
    print(f"Flesch Reading Ease: {fre}")
    print(f"Gunning Fog Index: {fog}")
    print(f"SMOG Index: {smog}")
    print(" ")

Title: Aftersun (2022)
Flesch-Kincaid Grade Level: 2.4
Flesch Reading Ease: 89.95
Gunning Fog Index: 3.39
SMOG Index: 6.4
 
Title: Bourne Identity, The (2002)
Flesch-Kincaid Grade Level: 2.4
Flesch Reading Ease: 89.95
Gunning Fog Index: 3.85
SMOG Index: 6.7
 
Title: Grown Ups (2010)
Flesch-Kincaid Grade Level: 2.2
Flesch Reading Ease: 90.46
Gunning Fog Index: 3.46
SMOG Index: 6.3
 
Title: Ocean's Eleven (2001)
Flesch-Kincaid Grade Level: 2.2
Flesch Reading Ease: 90.36
Gunning Fog Index: 3.75
SMOG Index: 6.6
 
Title: Pulp Fiction (1994)
Flesch-Kincaid Grade Level: 2.8
Flesch Reading Ease: 88.94
Gunning Fog Index: 3.89
SMOG Index: 6.5
 
Title: Silence of the Lambs, The (1991)
Flesch-Kincaid Grade Level: 2.6
Flesch Reading Ease: 89.34
Gunning Fog Index: 4.28
SMOG Index: 7.1
 


In [56]:
def vocab_size(script):
    ''' 
    Function to count number of unique words.
    '''
    
    words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    
    return len(word_counts)

def type_token_ratio(script):
    ''' 
    Function to calculate the type token ratio.
    TTR = (# unique words)/(total # words)
    '''
    
    words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    
    return len(word_counts)/len(words_clean)

def script_length(script):
    ''' 
    Function to calculate the script length.
    '''
    
    words_clean = clean_words(script)
    
    return len(words_clean)

In [57]:
for filename in my_movies:
    scripts_df = pd.read_csv(f'../raw_data/{filename}')
    script = scripts_df['Script'][0]
    title = scripts_df['Title'][0]

    pstr = f'''Title: {title}
    script length: {script_length(script)}
    vocab_size: {round(vocab_size(script),2)}
    TTR: {round(type_token_ratio(script),2)}
    iTTR: {round(vocab_size(script)/type_token_ratio(script),2)}
    '''
    print(pstr)

Title: Aftersun (2022)
    script length: 3634
    vocab_size: 764
    TTR: 0.21
    iTTR: 3634.0
    
Title: Bourne Identity, The (2002)
    script length: 7288
    vocab_size: 1343
    TTR: 0.18
    iTTR: 7288.0
    
Title: Grown Ups (2010)
    script length: 8187
    vocab_size: 1624
    TTR: 0.2
    iTTR: 8187.0
    
Title: Ocean's Eleven (2001)
    script length: 7657
    vocab_size: 1628
    TTR: 0.21
    iTTR: 7657.0
    
Title: Pulp Fiction (1994)
    script length: 16087
    vocab_size: 2294
    TTR: 0.14
    iTTR: 16087.0
    
Title: Silence of the Lambs, The (1991)
    script length: 7734
    vocab_size: 1712
    TTR: 0.22
    iTTR: 7734.0
    


# Finding overlap between Springfiled 40K dataset and Movielens 115K dataset of movie reviews.

In [61]:
PROJECT_ID = "film-wizard-453315" #os.getenv("GCP_PROJECT")
DATASET_ID = "tmdb_metadata" #os.getenv("BQ_DATASET")
TABLE_ID = "movie_details" #os.getenv("BQ_TABLE")

client = bigquery.Client(project=PROJECT_ID)

query = f"""
    SELECT tmdbid, imdb_id, title, original_title, release_date, runtime, genre_1, genre_2, genre_3
    FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`
"""

LOCAL_DATA_PATH = os.path.join(os.path.expanduser('~'), "code", "oliverramsaygray", "film_wizard", "raw_data")
cache_path = Path(LOCAL_DATA_PATH).joinpath(f"movie_lens_title_duration_genres.csv")

if cache_path.is_file():
    movie_lens_df = client.query(query).to_dataframe()
    movie_lens_df.to_csv('../raw_data/movie_lens_title_duration_genres.csv', index=False)
# else:
#     movie_lens_df = pd.read_csv(cache_path)

movie_lens_df.head()



Unnamed: 0,tmdbid,imdb_id,title,original_title,release_date,runtime,genre_1,genre_2,genre_3
0,2,tt0094675,Ariel,Ariel,1988-10-21,73.0,Comedy,Drama,Romance
1,3,tt0092149,Shadows in Paradise,Varjoja paratiisissa,1986-10-17,74.0,Comedy,Drama,Romance
2,5,tt0113101,Four Rooms,Four Rooms,1995-12-09,98.0,Comedy,,
3,6,tt0107286,Judgment Night,Judgment Night,1993-10-15,109.0,Action,Crime,Thriller
4,11,tt0076759,Star Wars,Star Wars,1977-05-25,121.0,Adventure,Action,Science Fiction


In [62]:
movie_lens_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147209 entries, 0 to 147208
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tmdbid          147209 non-null  Int64  
 1   imdb_id         86364 non-null   object 
 2   title           147209 non-null  object 
 3   original_title  147209 non-null  object 
 4   release_date    86340 non-null   dbdate 
 5   runtime         86383 non-null   float64
 6   genre_1         85346 non-null   object 
 7   genre_2         55972 non-null   object 
 8   genre_3         26257 non-null   object 
dtypes: Int64(1), dbdate(1), float64(1), object(6)
memory usage: 10.2+ MB


In [63]:
movie_lens_df.columns

Index(['tmdbid', 'imdb_id', 'title', 'original_title', 'release_date',
       'runtime', 'genre_1', 'genre_2', 'genre_3'],
      dtype='object')

In [64]:
titles = np.array(movie_lens_df['title'])

In [65]:
len(titles)

147209

In [66]:
release_dates = np.array(movie_lens_df['release_date'])

In [67]:
release_years = [_.year for _ in release_dates]

In [68]:
release_years[0:10]

[1988, 1986, 1995, 1993, 1977, 2003, 1994, 1999, 1941, 2000]

In [69]:
titles_years = [title.lower() + f' ({year})' for (title, year) in zip(titles,release_years)]

In [70]:
type(titles_years)

list

In [71]:
script_titles_df = pd.read_csv(f'../raw_data/springfield_40k_movie_titles.csv')
script_titles_df

Unnamed: 0,Title,Year,URL
0,#AMFAD: All My Friends Are Dead (2024),2024.0,https://www.springfieldspringfield.co.uk/movie...
1,#Followme (2019),2019.0,https://www.springfieldspringfield.co.uk/movie...
2,#Horror (2015),2015.0,https://www.springfieldspringfield.co.uk/movie...
3,#IMomSoHard Live (2019),2019.0,https://www.springfieldspringfield.co.uk/movie...
4,#MenToo (2023),2023.0,https://www.springfieldspringfield.co.uk/movie...
...,...,...,...
38158,implanted (2013),2013.0,https://www.springfieldspringfield.co.uk/movie...
38159,uwantme2killhim? (2013),2013.0,https://www.springfieldspringfield.co.uk/movie...
38160,xXx (2002),2002.0,https://www.springfieldspringfield.co.uk/movie...
38161,xXx: Return of Xander Cage (2017),2017.0,https://www.springfieldspringfield.co.uk/movie...


In [72]:
script_titles = script_titles_df['Title']

In [73]:
len(script_titles)

38163

In [74]:
# thr = scripts_titles[0][:-7]
# script_titles_no_year = [_[:-7] for _ in script_titles]
script_titles_clean = [_.lower() for _ in script_titles]

In [75]:
type(script_titles_clean)

list

In [76]:
script_titles_hash = Counter(script_titles_clean)

In [77]:
len(script_titles_hash.keys())

38163

In [78]:
common_elements = list(set(script_titles_clean) & set(titles_years))

print(len(common_elements))

19624


In [79]:
common_elements

['smashed (2012)',
 'the gatekeepers (2012)',
 'couples retreat (2009)',
 'love is all you need? (2016)',
 'drive angry (2011)',
 "get rich or die tryin' (2005)",
 'booty call (1997)',
 '8 x 10 tasveer (2009)',
 'escape from zahrain (1962)',
 'the air up there (1994)',
 'indictment: the mcmartin trial (1995)',
 'radhe shyam (2022)',
 'catwoman (2004)',
 'chronicle (2012)',
 'sibyl (2019)',
 'green lantern: beware my power (2022)',
 'if these walls could talk 2 (2000)',
 'hairbrained (2013)',
 'the canyon (2009)',
 'sometimes aunt martha does dreadful things (1971)',
 'hot bot (2016)',
 "my best friend's exorcism (2022)",
 'arena (2011)',
 'jassy (1947)',
 'house on haunted hill (1999)',
 'vision quest (1985)',
 'iron doors (2010)',
 'yonkers joe (2008)',
 'scissors (1991)',
 'dhaakad (2022)',
 'andron (2015)',
 'how to please a woman (2022)',
 'tarzan and his mate (1934)',
 'rattlesnake (2019)',
 'kannathil muthamittal (2002)',
 'flutter (2014)',
 'once upon a time in mumbai dobaara! (

In [80]:
i = 0
for row in common_elements:
    if i < 10:
        print(row)
    else:
        break
    i += 1

smashed (2012)
the gatekeepers (2012)
couples retreat (2009)
love is all you need? (2016)
drive angry (2011)
get rich or die tryin' (2005)
booty call (1997)
8 x 10 tasveer (2009)
escape from zahrain (1962)
the air up there (1994)


In [81]:
with open("../raw_data/movie_lens_scripts_titles.csv",'w',newline='') as csvfile:
    for row in common_elements:
        csvfile.write(f'{row}\n')

# Developing a linear regression model to recommend

In [82]:
PROJECT_ID = os.getenv("GCP_PROJECT")
DATASET_ID = os.getenv("BQ_DATASET")
TABLE_ID = os.getenv("BQ_TABLE")

client = bigquery.Client(project=PROJECT_ID)

query = f"""
    SELECT *
    FROM `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`
"""

# LOCAL_DATA_PATH = os.path.join(os.path.expanduser('~'), "code", "oliverramsaygray", "film_wizard", "raw_data")
# cache_path = Path(LOCAL_DATA_PATH).joinpath(f"movie_lens_title_duration_genres.csv")

# if not cache_path.is_file():
scripts_feats_df = client.query(query).to_dataframe()
#     movie_lens_df.to_csv('../raw_data/movie_lens_title_duration_genres.csv', index=False)
# else:
#     movie_lens_df = pd.read_csv(cache_path)

scripts_feats_df.head()



Unnamed: 0,lower_title,Title,URL,Script,Clean_Script,joy,neutral,sadness,surprise,anger,...,sentiment_std,hapax,fkgl,fre,smog,fog,word count,TTR,script_length,mean_word_length
0,a pink christmas (1978),A Pink Christmas (1978),/movie_script.php?movie=a-pink-christmas,1\n [PINK PANTHERTHEME PLAYING]\n [EXPLOSION]\...,"[pink, panthertheme, playing, explosion, child...",1.0,,,,,...,0.0,145,235.3,-532.46,0.0,238.94,242,0.406723,595,5.163025
1,anima (2019),Anima (2019),/movie_script.php?movie=anima,Who are these people?\n I'm in black treacle\n...,"[who, are, these, people, im, in, black, treac...",,,0.666667,,,...,0.5,91,62.3,-60.65,0.0,65.77,159,0.486239,327,3.902141
2,bao (2018),Bao (2018),/movie_script.php?movie=bao,1\n (BOWL BEING SET DOWN)\n (RHYTHMIC THUMPING...,"[bowl, being, set, down, rhythmic, thumping, i...",0.5,0.5,,,,...,0.0,32,22.8,4.48,0.0,20.3,49,0.597561,82,6.402439
3,all this and rabbit stew (1941),All This and Rabbit Stew (1941),/movie_script.php?movie=all-this-and-rabbit-stew,I's gonna get me a rabbit\n Gonna catch me a r...,"[is, gonna, get, me, a, rabbit, gonna, catch, ...",,0.333333,0.333333,0.333333,,...,0.0,65,0.0,106.87,4.8,3.24,113,0.491304,230,3.552174
4,apocalypto (2006),Apocalypto (2006),/movie_script.php?movie=apocalypto,"""A great civilization is not conquererd from w...","[a, great, civilization, is, not, conquererd, ...",,0.052632,0.263158,0.157895,0.052632,...,1.025899,336,1.0,99.02,4.9,2.87,585,0.286484,2042,3.770813


# Checking access to local GPU

In [86]:
print("TensorFlow Version:", tf.__version__)

TensorFlow Version: 2.18.0


In [87]:
# List available devices
print("Available devices:")
for device in tf.config.list_physical_devices():
    print(device)

# Check for GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print("GPU is available: ", gpus)
else:
    print("No GPU found")

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

Available devices:
PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
GPU is available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14915229009621110660
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3760193536
locality {
  bus_id: 1
  links {
  }
}
incarnation: 11335826503799047549
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


I0000 00:00:1741966091.479743   28836 gpu_device.cc:2022] Created device /device:GPU:0 with 3586 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


# Fitting a k-means cluster

In [88]:
scripts_feats_df = scripts_feats_df.fillna(0)

In [92]:
sfd = scripts_feats_df
sfd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   lower_title        4000 non-null   object 
 1   Title              4000 non-null   object 
 2   URL                4000 non-null   object 
 3   Script             4000 non-null   object 
 4   Clean_Script       4000 non-null   object 
 5   joy                4000 non-null   float64
 6   neutral            4000 non-null   float64
 7   sadness            4000 non-null   float64
 8   surprise           4000 non-null   float64
 9   anger              4000 non-null   float64
 10  disgust            4000 non-null   float64
 11  fear               4000 non-null   float64
 12  sentiment_entropy  4000 non-null   float64
 13  sentiment_std      4000 non-null   float64
 14  hapax              4000 non-null   Int64  
 15  fkgl               4000 non-null   float64
 16  fre                4000 