In [127]:
from transformers import pipeline
import pandas as pd
import string
from collections import Counter
import numpy as np
from scipy import stats
import textstat

In [91]:
csv_file_name = "../raw_data/springfield_script_oceans_eleven_2001.csv"
df = pd.read_csv(csv_file_name)
df

Unnamed: 0,Title,Year,Script
0,Ocean's Eleven (2001),2001,One con under escort.\n Open gate one.\n Man w...


In [92]:
script = df['Script'][0]

In [93]:
for punctuation in string.punctuation:
    script = script.replace(punctuation, "")
script_split = script.split(" ")
print(len(script_split))

8240


In [94]:
print(script_split[50:100])

['to', 'break', 'the', 'law', 'again\n', 'This', 'was', 'your', 'first', 'conviction\n', 'but', 'youve', 'been', 'implicated\n', 'in', 'a', 'dozen', 'other', 'schemes\n', 'and', 'frauds\n', 'What', 'can', 'you', 'tell', 'us', 'about', 'this\n', 'As', 'you', 'say', 'maam\n', 'I', 'was', 'never', 'charged\n', 'Were', 'trying', 'to', 'find', 'out\n', 'if', 'there', 'was', 'a', 'reason\n', 'for', 'committing', 'this', 'crime\n']


In [95]:
# any(c.isalpha() for c in string_1)
script_words = [word for word in script_split if any(c.isalpha() for c in word)]
len(script_words)

7657

In [96]:
# for punctuation in string.punctuation:
#     text = text.replace(punctuation, '')
words_stripped = [word.strip() for word in script_words]
words_clean = [word.lower() for word in words_stripped]

In [97]:
word_counts = Counter(words_clean)

In [98]:
hl = 0
for word in word_counts.keys():
    if word_counts[word] == 1:
        hl += 1
print(hl)

912


In [99]:
def clean_words(script):
    '''
    Function that takes raw script and cleans it.
    Returns list of individual words.
    Example: 
    Input: 'Hello, my... name is!'
    Output: ['hello','my','name','is']
    '''
    # Remove punctuation.
    for punctuation in string.punctuation:
        script = script.replace(punctuation, "")
    # Split on whitespace to isolate words
    script_split = script.split(" ")
    # Removing "words" that are just numbers, i.e. have no letters
    script_words = [word for word in script_split if any(c.isalpha() for c in word)]
    # Remove new lines, \n isn't removed by punctuation above.
    words_stripped = [word.strip() for word in script_words]
    # Lowercase in order to count occurances of same word.
    words_clean = [word.lower() for word in words_stripped]
    return words_clean

In [100]:
def count_hapax(script):
    ''' 
    Function to count number of hapax legomenon, i.e.
    words that appear once in a corpus/text.
    '''
    
    words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    # Hapax Legomenon counter
    hell = 0
    for word in word_counts.keys():
        if word_counts[word] == 1:
            hell += 1
    return hell

In [101]:
print(count_hapax(script))

912


In [102]:
df_10 = pd.read_csv("../raw_data/springfield_10_scripts.csv")

In [103]:
df_10['Title']

0                        Barbarosa (1982)
1                         Chestnut (2023)
2                  Contractor, The (2007)
3    George Michael: Freedom Uncut (2022)
4                Objective, Burma! (1945)
5                    Out on a Limb (1992)
6                          Piranha (2010)
7                            Scoop (2006)
8              Secret of the Incas (1954)
9        Slumber Party Massacre II (1987)
Name: Title, dtype: object

In [104]:
for scr in df_10['Script']:
    print(count_hapax(scr))

482
453
536
1090
801
634
564
1047
714
457


In [105]:
my_movies = ["springfield_script_aftersun_2022.csv",
             "springfield_script_bourne_identity_the_2002.csv",
             "springfield_script_grown_ups_2010.csv",
             "springfield_script_oceans_eleven_2001.csv",
             "springfield_script_pulp_fiction_1994.csv",
             "springfield_script_silence_of_the_lambs_the_1991.csv"
            ]

for filename in my_movies:
    df = pd.read_csv(f'../raw_data/{filename}')
    script = df['Script'][0]
    title = df['Title'][0]
    print(f'Title: {title}: #Hapax: {count_hapax(script)}')

Title: Aftersun (2022): #Hapax: 365
Title: Bourne Identity, The (2002): #Hapax: 724
Title: Grown Ups (2010): #Hapax: 919
Title: Ocean's Eleven (2001): #Hapax: 912
Title: Pulp Fiction (1994): #Hapax: 1119
Title: Silence of the Lambs, The (1991): #Hapax: 979


In [119]:
def mean_word_length(script):
    ''' 
    Function to find the mean word length in a script.
    '''
    
    words_clean = clean_words(script)
    word_lengths = np.array([len(word) for word in words_clean],dtype='int')
    return np.mean(word_lengths)

In [120]:
print(mean_word_length(script))

4.134212567882079


In [121]:
for filename in my_movies:
    df = pd.read_csv(f'../raw_data/{filename}')
    script = df['Script'][0]
    title = df['Title'][0]
    print(f'Title: {title}: Mean word length: {mean_word_length(script)}')

Title: Aftersun (2022): Mean word length: 3.8761695101816183
Title: Bourne Identity, The (2002): Mean word length: 4.11635565312843
Title: Grown Ups (2010): Mean word length: 3.891901795529498
Title: Ocean's Eleven (2001): Mean word length: 4.023377301815333
Title: Pulp Fiction (1994): Mean word length: 3.965002797289737
Title: Silence of the Lambs, The (1991): Mean word length: 4.134212567882079


In [125]:
def mode_word_length(script):
    ''' 
    Function to find the mode word length in a script.
    '''
    
    words_clean = clean_words(script)
    word_lengths = np.array([len(word) for word in words_clean],dtype='int')
    
    return stats.mode(word_lengths)[0]

In [126]:
for filename in my_movies:
    df = pd.read_csv(f'../raw_data/{filename}')
    script = df['Script'][0]
    title = df['Title'][0]
    print(f'Title: {title}: Mode word length: {mode_word_length(script)}')

Title: Aftersun (2022): Mode word length: 4
Title: Bourne Identity, The (2002): Mode word length: 4
Title: Grown Ups (2010): Mode word length: 3
Title: Ocean's Eleven (2001): Mode word length: 4
Title: Pulp Fiction (1994): Mode word length: 4
Title: Silence of the Lambs, The (1991): Mode word length: 4


In [132]:
def readability_metrics(script):
    '''
    Function that calculates the readability of a script.
    '''
    
    # Cleaning is done differently here so that the input to the textstat
    # metric functions is correct. Essentially it wants to keep punctuation.
    
    # Split on whitespace to isolate words
    script_split = script.split(" ")
    # Removing "words" that are just numbers, i.e. have no letters
    script_words = [word for word in script_split if any(c.isalpha() for c in word)]
    # Remove new lines
    words_stripped = [word.strip() for word in script_words]
    words_clean = words_stripped
    
    text = " ".join(words_clean)
    # Flesch-Kincaid Grade Level - measures US Grade level required to read text.
    fkgl = textstat.flesch_kincaid_grade(text)
    # Flesch Reading Ease - overall score
    fre = textstat.flesch_reading_ease(text)
    # SMOG Test - better for jargon/technical text
    smog = textstat.smog_index(text)
    # Gunning Fog Index - complexity of sentence structure and vocab
    fog = textstat.gunning_fog(text)

    return (fkgl, fre, smog, fog)

In [133]:
for filename in my_movies:
    df = pd.read_csv(f'../raw_data/{filename}')
    script = df['Script'][0]
    title = df['Title'][0]

    (fkgl, fre, smog, fog) = readability_metrics(script)
    print(f'Title: {title}')
    print(f"Flesch-Kincaid Grade Level: {fkgl}")
    print(f"Flesch Reading Ease: {fre}")
    print(f"Gunning Fog Index: {fog}")
    print(f"SMOG Index: {smog}")
    print(" ")

Title: Aftersun (2022)
Flesch-Kincaid Grade Level: 2.4
Flesch Reading Ease: 89.95
Gunning Fog Index: 3.39
SMOG Index: 6.4
 
Title: Bourne Identity, The (2002)
Flesch-Kincaid Grade Level: 2.4
Flesch Reading Ease: 89.95
Gunning Fog Index: 3.85
SMOG Index: 6.7
 
Title: Grown Ups (2010)
Flesch-Kincaid Grade Level: 2.2
Flesch Reading Ease: 90.46
Gunning Fog Index: 3.46
SMOG Index: 6.3
 
Title: Ocean's Eleven (2001)
Flesch-Kincaid Grade Level: 2.2
Flesch Reading Ease: 90.36
Gunning Fog Index: 3.75
SMOG Index: 6.6
 
Title: Pulp Fiction (1994)
Flesch-Kincaid Grade Level: 2.8
Flesch Reading Ease: 88.94
Gunning Fog Index: 3.89
SMOG Index: 6.5
 
Title: Silence of the Lambs, The (1991)
Flesch-Kincaid Grade Level: 2.6
Flesch Reading Ease: 89.34
Gunning Fog Index: 4.28
SMOG Index: 7.1
 
