In [1]:
# !pip install transformers torch pytesseract
# !pip install sentencepiece sacremoses
# !brew install xz
#import _lzma
from transformers import pipeline
import pandas as pd
import os
import string
from collections import Counter
import numpy as np
from scipy import stats
import textstat

In [2]:
from google.cloud import storage

# Correct bucket name and file path
bucket_name = "springfield_40k"  # Ensure this matches your actual bucket name
css_file_path = "springfield_10_scripts.csv"  # Use only the relative path, not the full URL
css_file_path_full = "springfield_40k_movie_scripts.csv"  # Use only the relative path, not the full URL

# Initialize GCS client
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(css_file_path)

# Read the CSS content (or CSV in this case)
df = pd.read_csv(blob.open('r'))
df

Unnamed: 0,Title,Year,Script
0,Barbarosa (1982),1982,"DAMN, WHERE ARE WE?\n WHEW, WE GOTTA FIND SOME..."
1,Chestnut (2023),2023,1\n [ Birds chirping in distance ]\n [ Phone r...
2,"Contractor, The (2007)",2007,"Ah, James.\n Ali Mahmud Jahar.\n Remember him?..."
3,George Michael: Freedom Uncut (2022),2022,"On Christmas Day, 2016,\n we heard with shock\..."
4,"Objective, Burma! (1945)",1945,This is Burma...\n the toughest battleground i...
5,Out on a Limb (1992),1992,"MISS CLAYTON: Okay,\n find a seat. Sit down, p..."
6,Piranha (2010),2010,Oh... boy...\n That's a fast fish.\n C'mon... ...
7,Scoop (2006),2006,Don't mourn for Joe Strombel.\n Joe Strombel l...
8,Secret of the Incas (1954),1954,1\n (dramatic music)\n (flute music)\n (singer...
9,Slumber Party Massacre II (1987),1987,Dr. Weiss says\n that it's perfectly normal\n ...


# Josh Preprocessor

In [65]:
def clean_words(script):
    '''
    Function that takes raw script and cleans it.
    Returns list of individual words.
    Example: 
    Input: 'Hello, my... name is!'
    Output: ['hello','my','name','is']
    '''
    # Remove punctuation.
    for punctuation in string.punctuation:
        script = script.replace(punctuation, "")
    # Split on whitespace to isolate words
    script_split = script.split(" ")
    # Removing "words" that are just numbers, i.e. have no letters
    script_words = [word for word in script_split if any(c.isalpha() for c in word)]
    # Remove new lines, \n isn't removed by punctuation above.
    words_stripped = [word.strip() for word in script_words]
    # Lowercase in order to count occurances of same word.
    words_clean = [word.lower() for word in words_stripped]
    return words_clean
    
def count_hapax(words_clean):
    ''' 
    Function to count number of hapax legomenon, i.e.
    words that appear once in a corpus/text.
    '''
    #words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    # Hapax Legomenon counter
    hell = 0
    for word in word_counts.keys():
        if word_counts[word] == 1:
            hell += 1
    return {'hapax': hell}

def readability_metrics(script):
    '''
    Function that calculates the readability of a script.
    '''
    # Cleaning is done differently here so that the input to the textstat
    # metric functions is correct. Essentially it wants to keep punctuation.
    
    # Split on whitespace to isolate words
    script_split = script.split(" ")
    # Removing "words" that are just numbers, i.e. have no letters
    script_words = [word for word in script_split if any(c.isalpha() for c in word)]
    # Remove new lines
    words_stripped = [word.strip() for word in script_words]
    words_clean = words_stripped
    
    text = " ".join(words_clean)
    # Flesch-Kincaid Grade Level - measures US Grade level required to read text.
    fkgl = textstat.flesch_kincaid_grade(text)
    # Flesch Reading Ease - overall score
    fre = textstat.flesch_reading_ease(text)
    # SMOG Test - better for jargon/technical text
    smog = textstat.smog_index(text)
    # Gunning Fog Index - complexity of sentence structure and vocab
    fog = textstat.gunning_fog(text)

    return {'fkgl': fkgl, 'fre': fre, 'smog': smog, 'fog': fog}

def vocab_size(words_clean):
    ''' 
    Function to count number of unique words.
    '''
    
    #words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    
    return {'word count': len(word_counts)}

def type_token_ratio(words_clean):
    ''' 
    Function to calculate the type token ratio.
    TTR = (# unique words)/(total # words)
    '''
    
    #words_clean = clean_words(script)
    word_counts = Counter(words_clean)
    
    return {'TTR': len(word_counts)/len(words_clean)}

def script_length(words_clean):
    ''' 
    Function to calculate the script length.
    '''
    
    #words_clean = clean_words(script)
    
    return {'script_length': len(words_clean)}

def mean_word_length(words_clean):
    ''' 
    Function to find the mean word length in a script.
    '''
    
    #words_clean = clean_words(script)
    word_lengths = np.array([len(word) for word in words_clean],dtype='int')
    return {'mean_word_length': np.mean(word_lengths)}

In [66]:
df["Clean Script"] = df["Script"].apply(clean_words)
df

Unnamed: 0,Title,Year,Script,Clean Script
0,Barbarosa (1982),1982,"DAMN, WHERE ARE WE?\n WHEW, WE GOTTA FIND SOME...","[damn, where, are, we, whew, we, gotta, find, ..."
1,Chestnut (2023),2023,1\n [ Birds chirping in distance ]\n [ Phone r...,"[birds, chirping, in, distance, phone, ringing..."
2,"Contractor, The (2007)",2007,"Ah, James.\n Ali Mahmud Jahar.\n Remember him?...","[ah, james, ali, mahmud, jahar, remember, him,..."
3,George Michael: Freedom Uncut (2022),2022,"On Christmas Day, 2016,\n we heard with shock\...","[on, christmas, day, we, heard, with, shock, a..."
4,"Objective, Burma! (1945)",1945,This is Burma...\n the toughest battleground i...,"[this, is, burma, the, toughest, battleground,..."
5,Out on a Limb (1992),1992,"MISS CLAYTON: Okay,\n find a seat. Sit down, p...","[miss, clayton, okay, find, a, seat, sit, down..."
6,Piranha (2010),2010,Oh... boy...\n That's a fast fish.\n C'mon... ...,"[oh, boy, thats, a, fast, fish, cmon, cmon, cm..."
7,Scoop (2006),2006,Don't mourn for Joe Strombel.\n Joe Strombel l...,"[dont, mourn, for, joe, strombel, joe, strombe..."
8,Secret of the Incas (1954),1954,1\n (dramatic music)\n (flute music)\n (singer...,"[dramatic, music, flute, music, singer, vocali..."
9,Slumber Party Massacre II (1987),1987,Dr. Weiss says\n that it's perfectly normal\n ...,"[dr, weiss, says, that, its, perfectly, normal..."


In [67]:
sentiment_pipe = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-sentiment')
emotion_pipe = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Device set to use 0
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at j-hartmann/emotion-english-distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
Device set to use 0


In [68]:
def split_script(script, max_chars=600):
    chunks = []
    start = 0

    while start < len(script):
        # Find the nearest newline after 400 characters
        end = start + max_chars
        if end < len(script):
            newline_pos = script.rfind("\n", start, end)
            if newline_pos != -1:
                end = newline_pos + 1  # Include the newline
        chunks.append(script[start:end].strip().replace('\n',' '))  # Add chunk, remove leading/trailing spaces
        start = end  # Move to the next chunk

    return chunks


def sentiment_params(script):
    sentiments = []
    chunks = split_script(script)

    # Define mapping dictionary
    sentiment_mapping = {
        '0': -1,  # Negative
        '1': 0,   # Neutral
        '2': 1    # Positive
    }

    for chunk in chunks:
        sentiment_label = sentiment_pipe(chunk)[0]['label']
        sentiment = sentiment_mapping.get(sentiment_label[-1], 0)  # Default to 0 if not found
        sentiments.append(sentiment)

    #sentiments ready

    #2 Entropy
    count_pos = sentiments.count(1)
    count_neg = sentiments.count(-1)
    count3_neut = sentiments.count(0)
    #print(count_pos, count_neg, count3_neut)
    prob_pos, prob_neg, prob_neut= count_pos/len(sentiments), count_neg/len(sentiments), count3_neut/len(sentiments)

    sentiment_entropy = -(prob_pos * np.log2(prob_pos) + prob_neg * np.log2(prob_neg) + prob_neut * np.log2(prob_neut))

    #3 Std of sentiment
    sentiment_diff = np.array(sentiments[1:]) - np.array(sentiments[:-1])
    sentiment_std = np.std(sentiment_diff)

    return {'sentiment_entropy': sentiment_entropy, 'sentiment_std': sentiment_std}

In [69]:
def emotion_frequencies(script):
    emotions = []
    chunks = split_script(script)
    for chunk in chunks:
        emot = emotion_pipe(chunk)[0]['label']
        emotions.append(emot)

    # Count occurrences
    counts = Counter(emotions)

    # Total number of occurrences (for normalization)
    total_count = sum(counts.values())

    # Normalize frequencies
    normalized_frequencies = {emotion: freq / total_count for emotion, freq in counts.items()}

    return normalized_frequencies

In [73]:
def embedder(df): 
    #clean script for processing
    df["Clean_Script"] = df["Script"].apply(clean_words)
    #emotions
    df_emotions = df["Script"].apply(emotion_frequencies).apply(pd.Series)
    #sentiments
    df_sentiments = df["Script"].apply(sentiment_params).apply(pd.Series)
    #complexity
    df_hapax = df["Clean_Script"].apply(count_hapax).apply(pd.Series)
    df_readability = df["Script"].apply(readability_metrics).apply(pd.Series)
    df_voc = df["Clean_Script"].apply(vocab_size).apply(pd.Series)
    df_ttr = df["Clean_Script"].apply(type_token_ratio).apply(pd.Series)
    df_scr = df["Clean_Script"].apply(script_length).apply(pd.Series)
    df_mwl = df["Clean_Script"].apply(mean_word_length).apply(pd.Series)
    # Merge new features into original DataFrame
    
    df_embedded = pd.concat([df, 
                             df_emotions, 
                             df_sentiments,
                             df_hapax,
                             df_readability,
                             df_voc,
                             df_ttr,
                             df_scr,
                             df_mwl
                            ],
                            axis=1)

    # Show result
    return df_embedded


In [75]:
%%time
embedder(df)

CPU times: user 11min 31s, sys: 1min 26s, total: 12min 58s
Wall time: 3min 37s


Unnamed: 0,Title,Year,Script,Clean Script,Clean_Script,surprise,neutral,fear,anger,joy,...,sentiment_std,hapax,fkgl,fre,smog,fog,word count,TTR,script_length,mean_word_length
0,Barbarosa (1982),1982,"DAMN, WHERE ARE WE?\n WHEW, WE GOTTA FIND SOME...","[damn, where, are, we, whew, we, gotta, find, ...","[damn, where, are, we, whew, we, gotta, find, ...",0.444444,0.066667,0.133333,0.2,0.022222,...,0.722642,482,1.3,98.21,5.6,3.31,949,0.203125,4672,3.858305
1,Chestnut (2023),2023,1\n [ Birds chirping in distance ]\n [ Phone r...,"[birds, chirping, in, distance, phone, ringing...","[birds, chirping, in, distance, phone, ringing...",0.227273,0.136364,0.060606,0.060606,0.287879,...,0.936315,453,2.9,88.63,6.7,3.75,988,0.149245,6620,3.939275
2,"Contractor, The (2007)",2007,"Ah, James.\n Ali Mahmud Jahar.\n Remember him?...","[ah, james, ali, mahmud, jahar, remember, him,...","[ah, james, ali, mahmud, jahar, remember, him,...",,0.365854,0.414634,,,...,0.74162,536,3.6,81.49,7.6,4.41,952,0.235294,4046,4.260257
3,George Michael: Freedom Uncut (2022),2022,"On Christmas Day, 2016,\n we heard with shock\...","[on, christmas, day, we, heard, with, shock, a...","[on, christmas, day, we, heard, with, shock, a...",0.142857,0.111111,0.150794,0.047619,0.253968,...,0.995992,1090,5.5,81.93,8.6,6.96,2043,0.15158,13478,4.016471
4,"Objective, Burma! (1945)",1945,This is Burma...\n the toughest battleground i...,"[this, is, burma, the, toughest, battleground,...","[this, is, burma, the, toughest, battleground,...",0.059524,0.607143,0.214286,0.02381,0.035714,...,0.806599,801,2.4,89.85,6.2,3.5,1579,0.182543,8650,3.998613
5,Out on a Limb (1992),1992,"MISS CLAYTON: Okay,\n find a seat. Sit down, p...","[miss, clayton, okay, find, a, seat, sit, down...","[miss, clayton, okay, find, a, seat, sit, down...",0.15873,0.206349,0.142857,0.301587,0.079365,...,0.782881,634,2.6,89.55,6.4,3.67,1251,0.196112,6379,3.99467
6,Piranha (2010),2010,Oh... boy...\n That's a fast fish.\n C'mon... ...,"[oh, boy, thats, a, fast, fish, cmon, cmon, cm...","[oh, boy, thats, a, fast, fish, cmon, cmon, cm...",0.5,0.02,0.08,0.16,0.18,...,1.160577,564,2.2,90.36,5.9,3.46,1041,0.203281,5121,3.93263
7,Scoop (2006),2006,Don't mourn for Joe Strombel.\n Joe Strombel l...,"[dont, mourn, for, joe, strombel, joe, strombe...","[dont, mourn, for, joe, strombel, joe, strombe...",0.264463,0.173554,0.280992,0.008264,0.165289,...,0.908257,1047,2.7,89.14,7.1,4.23,1935,0.153219,12629,3.99129
8,Secret of the Incas (1954),1954,1\n (dramatic music)\n (flute music)\n (singer...,"[dramatic, music, flute, music, singer, vocali...","[dramatic, music, flute, music, singer, vocali...",0.15,0.3375,0.2875,0.0375,0.1375,...,0.746299,714,2.6,89.34,6.7,3.8,1405,0.175933,7986,4.086026
9,Slumber Party Massacre II (1987),1987,Dr. Weiss says\n that it's perfectly normal\n ...,"[dr, weiss, says, that, its, perfectly, normal...","[dr, weiss, says, that, its, perfectly, normal...",0.387755,0.020408,0.244898,0.020408,0.183673,...,1.136515,457,2.2,95.98,5.8,4.13,954,0.181853,5246,3.773351


In [44]:
import time


ValueError: Length of values (1) does not match length of index (10)