In [1]:
from transformers import pipeline
import nltk
import torch
import os
from glob import glob
import re
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the Punkt tokenizer models.
# `nltk.download('punkt')` fetches the Punkt sentence tokenizer data from NLTK's repository.
# It enables splitting text into sentences (sentence boundary detection), which this notebook
# does later through `nltk.sent_tokenize`.
# NOTE(review): newer NLTK releases may look up a 'punkt_tab' resource instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Model


In [13]:

# Hugging Face model identifier passed to the zero-shot classification pipeline.
model_id = 'facebook/bart-large-mnli'


In [14]:
# Use the first CUDA device (index 0) when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = 'cpu'

print(f'Using device: {device}')

Using device: cpu


In [15]:
def load_model(device):
    """Create the zero-shot classification pipeline used for theme scoring.

    Args:
        device: Device identifier handed to ``pipeline`` unchanged.

    Returns:
        The object returned by ``pipeline`` for the
        'zero-shot-classification' task, using the module-level ``model_id``.
    """
    pipeline_kwargs = {
        'task': 'zero-shot-classification',
        'model': model_id,
        'device': device,
    }
    return pipeline(**pipeline_kwargs)


In [32]:
# Candidate labels handed to the zero-shot classifier.
list_themes = [
    'love',
    'hope',
    'fear',
    'anger',
    'joy',
    'sadness',
    'betrayal',
    'courage',
    'guilt',
    'redemption',
]

In [30]:
# Build the classification pipeline once; the cells below reuse this object.
theme_classifier = load_model(device)



In [None]:
# Quick sanity check of the classifier on a single hand-written sentence.
sample_sentence = 'He was climbing up the mountain to conquer the world'
theme_classifier(sample_sentence, list_themes, multi_label=True)

# Load Dataset

In [41]:
# Collect every .srt file one folder below ../Subtitles.
# glob does not guarantee any ordering, and later cells index into this list
# (files[0], files[33]), so sort it to make those lookups reproducible.
files = sorted(glob('../Subtitles/*/*.srt'))

In [42]:
# Preview the first 15 matched subtitle paths.
files[:15]

['../Subtitles\\Breaking.Bad_.S01\\Breaking.Bad.S01E01.720p.BluRay.x264-REWARD.srt',
 '../Subtitles\\Breaking.Bad_.S01\\Breaking.Bad.S01E02.720p.BluRay.x264-REWARD.srt',
 '../Subtitles\\Breaking.Bad_.S01\\Breaking.Bad.S01E03.720p.BluRay.x264-REWARD.srt',
 '../Subtitles\\Breaking.Bad_.S01\\Breaking.Bad.S01E04.720p.BluRay.x264-REWARD.srt',
 '../Subtitles\\Breaking.Bad_.S01\\Breaking.Bad.S01E05.720p.BluRay.x264-REWARD.srt',
 '../Subtitles\\Breaking.Bad_.S01\\Breaking.Bad.S01E06.720p.BluRay.x264-REWARD.srt',
 '../Subtitles\\Breaking.Bad_.S01\\Breaking.Bad.S01E07.720p.BluRay.x264-REWARD.srt',
 '../Subtitles\\Breaking.Bad_.S02\\Breaking Bad s02ep1 720p brrip.sujaidr.EN.srt',
 '../Subtitles\\Breaking.Bad_.S02\\Breaking Bad s02ep10 720p brrip.sujaidr.EN.srt',
 '../Subtitles\\Breaking.Bad_.S02\\Breaking Bad s02ep11 720p brrip.sujaidr.EN.srt',
 '../Subtitles\\Breaking.Bad_.S02\\Breaking Bad s02ep12 720p brrip.sujaidr.EN.srt',
 '../Subtitles\\Breaking.Bad_.S02\\Breaking Bad s02ep13 720p brrip.suj

In [47]:
# Read the first subtitle file with an explicit encoding instead of the platform
# default, using the same utf-8 / errors='replace' settings as load_subtitles.
with open(files[0], 'r', encoding='utf-8', errors='replace') as file:
    text = file.readlines()

# Keep non-empty lines that are neither purely numeric (cue numbers) nor timestamp
# lines ('-->'). The first kept entry is the BOM-prefixed cue number ('\ufeff1'),
# which isdigit() does not recognise, so [1:] discards it.
dialogues = [line.strip() for line in text if not line.strip().isdigit() and '-->' not in line and line.strip()][1:]
    

In [23]:
# Inspect the file after stripping each line and dropping purely numeric lines and
# timestamp ('-->') lines; blank lines are still kept at this point.
[line.strip() for line in text if not line.strip().isdigit() and '-->' not in line]

['\ufeff1',
 'Oh, my God. Christ!',
 '',
 'Shit.',
 '',
 'Oh, God. Oh, my God.',
 '',
 'Oh, my God. Oh, my God.',
 'Think, think, think.',
 '',
 'Oh, my gosh.',
 '',
 'Okay. Come on,',
 'come on, come on.',
 '',
 'Come on.',
 '',
 'My name is',
 'Walter Hartwell White.',
 '',
 'I live at 308 Negra Arroyo Lane,',
 '',
 'Albuquerque, New Mexico, 87104.',
 '',
 'To all law-enforcement entities,',
 '',
 'this is not an admission',
 'of guilt.',
 '',
 'I am speaking to my family now.',
 '',
 'Skyler.',
 '',
 'You are the love of my life.',
 '',
 'I hope you know that.',
 '',
 'Walter Jr.',
 '',
 "You're my big man.",
 '',
 'There are... There are going',
 'to be some things...',
 '',
 "Things that you'll",
 'come to learn about me',
 '',
 'in the next few days.',
 '',
 'I just want you to know',
 'that no matter how it may look',
 '',
 'I only had you in my heart.',
 '',
 'Goodbye.',
 '',
 'Money before the panel earlier this year.',
 '',
 'Happy birthday.',
 '',
 'Oh.',
 '',
 'Look at that

In [40]:
# Keep only non-empty lines that are neither purely numeric nor timestamp ('-->') lines.
dialogues = [line.strip() for line in text if not line.strip().isdigit() and '-->' not in line and line.strip()]

In [None]:
# Remove the first kept entry from the list.
# NOTE(review): pop() mutates `dialogues` in place, so every re-run of this cell
# removes one more entry — re-create `dialogues` before running it again.
dialogues.pop(0)

In [48]:
# Display the cleaned dialogue lines.
dialogues

['Oh, my God. Christ!',
 'Shit.',
 'Oh, God. Oh, my God.',
 'Oh, my God. Oh, my God.',
 'Think, think, think.',
 'Oh, my gosh.',
 'Okay. Come on,',
 'come on, come on.',
 'Come on.',
 'My name is',
 'Walter Hartwell White.',
 'I live at 308 Negra Arroyo Lane,',
 'Albuquerque, New Mexico, 87104.',
 'To all law-enforcement entities,',
 'this is not an admission',
 'of guilt.',
 'I am speaking to my family now.',
 'Skyler.',
 'You are the love of my life.',
 'I hope you know that.',
 'Walter Jr.',
 "You're my big man.",
 'There are... There are going',
 'to be some things...',
 "Things that you'll",
 'come to learn about me',
 'in the next few days.',
 'I just want you to know',
 'that no matter how it may look',
 'I only had you in my heart.',
 'Goodbye.',
 'Money before the panel earlier this year.',
 'Happy birthday.',
 'Oh.',
 'Look at that.',
 'That is veggie bacon.',
 'Believe it or not.',
 'Zero cholesterol',
 "and you won't even",
 'taste the difference.',
 'Mm.',
 'What time do y

In [52]:
# Join the first 30 dialogue lines with spaces to preview the flattened script text.
' '.join(dialogues[:30])

"Oh, my God. Christ! Shit. Oh, God. Oh, my God. Oh, my God. Oh, my God. Think, think, think. Oh, my gosh. Okay. Come on, come on, come on. Come on. My name is Walter Hartwell White. I live at 308 Negra Arroyo Lane, Albuquerque, New Mexico, 87104. To all law-enforcement entities, this is not an admission of guilt. I am speaking to my family now. Skyler. You are the love of my life. I hope you know that. Walter Jr. You're my big man. There are... There are going to be some things... Things that you'll come to learn about me in the next few days. I just want you to know that no matter how it may look I only had you in my heart."

In [None]:
filename = files[33]

# Season/episode tags show up as "S01E01" in some file names and as "s02ep1" in
# others, so allow an optional "p" after the "e" and any number of digits.
# Only the file name is searched, not the folders leading up to it.
match = re.search(r'[sS](\d+)[eE][pP]?(\d+)', os.path.basename(filename))

if match:
    season_number = int(match.group(1))
    episode_number = int(match.group(2))
    print(f"Season: {season_number}, Episode: {episode_number}")
else:
    print("No season and episode information found.")

In [9]:
def load_subtitles(folder_path):
    """Load every ``.srt`` file one folder below ``folder_path`` into a DataFrame.

    Each file is reduced to its dialogue text: purely numeric lines (cue
    numbers), timestamp lines (containing ``-->``) and blank lines are
    dropped, and the remaining lines are joined with single spaces.

    The season and episode numbers are parsed from the file name, accepting
    both ``S01E01`` and ``s02ep1`` style tags. When a file name carries no
    such tag, its Season and Episode are recorded as missing instead of
    silently reusing the numbers of the previously processed file.

    Args:
        folder_path: Folder whose immediate sub-folders contain the ``.srt`` files.

    Returns:
        pandas.DataFrame with the columns ``Season``, ``Episode`` and
        ``Script``, one row per subtitle file, ordered by sorted file path.
    """
    # glob gives no ordering guarantee; sort so the row order is reproducible.
    subtitles_path = sorted(glob(folder_path + '/*/*.srt'))

    # Matches "S01E01" as well as "s02ep10".
    episode_tag = re.compile(r'[sS](\d+)[eE][pP]?(\d+)')

    scripts = []
    episode_number_list = []
    season_number_list = []

    for file in subtitles_path:
        print(f'Processing {file}')
        # utf-8-sig strips a leading BOM, so the first cue number is a plain digit
        # line and gets filtered like every other one. No positional slicing is
        # needed, which previously discarded a real dialogue line in BOM-less files.
        with open(file, 'r', encoding='utf-8-sig', errors='replace') as f:
            lines = f.readlines()

        dialogues = []
        for line in lines:
            stripped = line.strip()
            if stripped and not stripped.isdigit() and '-->' not in line:
                dialogues.append(stripped)

        # Search only the file name so folder names cannot produce a match.
        match = episode_tag.search(os.path.basename(file))
        if match:
            season_number = int(match.group(1))
            episode_number = int(match.group(2))
        else:
            # Reset explicitly: otherwise the values from the previous iteration
            # would be appended (or a NameError raised on the very first file).
            season_number = None
            episode_number = None
            print(f'No season and episode information found in {file}')

        scripts.append(' '.join(dialogues))
        season_number_list.append(season_number)
        episode_number_list.append(episode_number)

    print('Subtitles loaded successfully. Appending the data to a DataFrame.')
    df = pd.DataFrame.from_dict({"Season": season_number_list,
                                 "Episode": episode_number_list,
                                 "Script": scripts
                                 })
    return df
            

In [10]:
# Root folder that load_subtitles scans for subtitle files (a relative path).
data_path = '../Subtitles'

In [11]:
# Load the subtitle files under data_path into a DataFrame with the columns
# Season, Episode and Script.
df = load_subtitles(data_path)

Processing ../Subtitles\Breaking.Bad_.S01\Breaking.Bad.S01E01.720p.BluRay.x264-REWARD.srt
Processing ../Subtitles\Breaking.Bad_.S01\Breaking.Bad.S01E02.720p.BluRay.x264-REWARD.srt
Processing ../Subtitles\Breaking.Bad_.S01\Breaking.Bad.S01E03.720p.BluRay.x264-REWARD.srt
Processing ../Subtitles\Breaking.Bad_.S01\Breaking.Bad.S01E04.720p.BluRay.x264-REWARD.srt
Processing ../Subtitles\Breaking.Bad_.S01\Breaking.Bad.S01E05.720p.BluRay.x264-REWARD.srt
Processing ../Subtitles\Breaking.Bad_.S01\Breaking.Bad.S01E06.720p.BluRay.x264-REWARD.srt
Processing ../Subtitles\Breaking.Bad_.S01\Breaking.Bad.S01E07.720p.BluRay.x264-REWARD.srt
Processing ../Subtitles\Breaking.Bad_.S02\Breaking Bad s02ep1 720p brrip.sujaidr.EN.srt
Processing ../Subtitles\Breaking.Bad_.S02\Breaking Bad s02ep10 720p brrip.sujaidr.EN.srt
Processing ../Subtitles\Breaking.Bad_.S02\Breaking Bad s02ep11 720p brrip.sujaidr.EN.srt
Processing ../Subtitles\Breaking.Bad_.S02\Breaking Bad s02ep12 720p brrip.sujaidr.EN.srt
Processing ../S

In [12]:
# Preview the first rows of the loaded subtitles.
df.head()

Unnamed: 0,Season,Episode,Script
0,1,1,"Oh, my God. Christ! Shit. Oh, God. Oh, my God...."
1,1,2,"Are you okay? - You are a lifesaver. - Yeah, m..."
2,1,3,Let's break it down. Hydrogen. What does that ...
3,1,4,Operation Icebreaker. How we liking that? We n...
4,1,5,"Here is, uh... Here's my résumé. I mean, techn..."


# Run Model 

In [19]:
# Take the script text of the first row as a working example.
script = df['Script'].iloc[0]

In [20]:
# Display the example script text.
script

'Oh, my God. Christ! Shit. Oh, God. Oh, my God. Oh, my God. Oh, my God. Think, think, think. Oh, my gosh. Okay. Come on, come on, come on. Come on. My name is Walter Hartwell White. I live at 308 Negra Arroyo Lane, Albuquerque, New Mexico, 87104. To all law-enforcement entities, this is not an admission of guilt. I am speaking to my family now. Skyler. You are the love of my life. I hope you know that. Walter Jr. You\'re my big man. There are... There are going to be some things... Things that you\'ll come to learn about me in the next few days. I just want you to know that no matter how it may look I only had you in my heart. Goodbye. Money before the panel earlier this year. Happy birthday. Oh. Look at that. That is veggie bacon. Believe it or not. Zero cholesterol and you won\'t even taste the difference. Mm. What time do you think you\'ll be home? Same time. I don\'t want him dicking you around tonight. You get paid till 5, you work till 5. No later. Aha. Hey. Hey, happy birthday. 

In [22]:
# Split the example script into sentences and show how many there are.
script_sentences = nltk.sent_tokenize(script)
len(script_sentences)

796

In [27]:
# Group consecutive sentences into chunks of `sentence_batch_size` and join each
# chunk into one string, so the classifier receives longer passages.
sentence_batch_size = 20
batch_starts = range(0, len(script_sentences), sentence_batch_size)
script_batches = [
    " ".join(script_sentences[start:start + sentence_batch_size])
    for start in batch_starts
]

In [28]:
# Preview the first two sentence batches.
script_batches[:2]

["Oh, my God. Christ! Shit. Oh, God. Oh, my God. Oh, my God. Oh, my God. Think, think, think. Oh, my gosh. Okay. Come on, come on, come on. Come on. My name is Walter Hartwell White. I live at 308 Negra Arroyo Lane, Albuquerque, New Mexico, 87104. To all law-enforcement entities, this is not an admission of guilt. I am speaking to my family now. Skyler. You are the love of my life. I hope you know that. Walter Jr. You're my big man.",
 "There are... There are going to be some things... Things that you'll come to learn about me in the next few days. I just want you to know that no matter how it may look I only had you in my heart. Goodbye. Money before the panel earlier this year. Happy birthday. Oh. Look at that. That is veggie bacon. Believe it or not. Zero cholesterol and you won't even taste the difference. Mm. What time do you think you'll be home? Same time. I don't want him dicking you around tonight. You get paid till 5, you work till 5. No later. Aha. Hey."]

In [33]:
# Classify only the first two batches to keep this exploratory run short.
first_two_batches = script_batches[:2]
theme_output = theme_classifier(first_two_batches, list_themes, multi_label=True)

In [34]:
# Display the raw classifier output: one entry per batch, each holding the
# 'sequence', 'labels' and 'scores' keys.
theme_output

[{'sequence': "Oh, my God. Christ! Shit. Oh, God. Oh, my God. Oh, my God. Oh, my God. Think, think, think. Oh, my gosh. Okay. Come on, come on, come on. Come on. My name is Walter Hartwell White. I live at 308 Negra Arroyo Lane, Albuquerque, New Mexico, 87104. To all law-enforcement entities, this is not an admission of guilt. I am speaking to my family now. Skyler. You are the love of my life. I hope you know that. Walter Jr. You're my big man.",
  'labels': ['love',
   'fear',
   'anger',
   'hope',
   'courage',
   'sadness',
   'betrayal',
   'redemption',
   'joy',
   'guilt'],
  'scores': [0.9370834231376648,
   0.6135249733924866,
   0.5785509347915649,
   0.5447491407394409,
   0.4263762831687927,
   0.3936326801776886,
   0.1748252809047699,
   0.14358466863632202,
   0.03915875777602196,
   0.02741333283483982]},
 {'sequence': "There are... There are going to be some things... Things that you'll come to learn about me in the next few days. I just want you to know that no matt

In [39]:
# Wrangle the output: collect the scores per label across all classified batches,
# e.g. love: [0.2498367875814438, 0.13750796020030975]

themes = {}
for batch_result in theme_output:
    label_score_pairs = zip(batch_result['labels'], batch_result['scores'])
    for theme_name, theme_score in label_score_pairs:
        themes.setdefault(theme_name, []).append(theme_score)

In [42]:

# Replace each label's list of scores with its arithmetic mean.
# `themes` is overwritten here, so this cell expects the per-label lists as input.
mean_by_theme = {}
for theme_name, theme_scores in themes.items():
    mean_by_theme[theme_name] = sum(theme_scores) / len(theme_scores)
themes = mean_by_theme
themes

{'love': 0.6557595431804657,
 'fear': 0.34927841275930405,
 'anger': 0.4154204726219177,
 'hope': 0.39729296416044235,
 'courage': 0.4084272235631943,
 'sadness': 0.26458321511745453,
 'betrayal': 0.2388106733560562,
 'redemption': 0.3121117055416107,
 'joy': 0.08833335898816586,
 'guilt': 0.11912131030112505}

In [43]:
def get_theme_scores(script, sentence_batch_size=20, max_batches=2):
    """Score a script against ``list_themes`` with the zero-shot classifier.

    The script is split into sentences, consecutive sentences are joined into
    batches of ``sentence_batch_size``, the batches are classified with the
    module-level ``theme_classifier`` (``multi_label=True``), and the scores
    of each label are averaged over the classified batches.

    Args:
        script: Full script text.
        sentence_batch_size: Number of sentences joined into one batch.
        max_batches: Maximum number of leading batches to classify. The
            default of 2 keeps the previous behaviour, in which only the
            first two batches of the script are scored; pass ``None`` to
            classify the whole script.

    Returns:
        dict mapping each label to its mean score over the classified
        batches. An empty dict is returned when the script yields no
        sentences.

    Raises:
        ValueError: If ``sentence_batch_size`` is smaller than 1.
    """
    if sentence_batch_size < 1:
        raise ValueError('sentence_batch_size must be at least 1')

    script_sentences = nltk.sent_tokenize(script)

    script_batches = [
        " ".join(script_sentences[start:start + sentence_batch_size])
        for start in range(0, len(script_sentences), sentence_batch_size)
    ]
    if max_batches is not None:
        script_batches = script_batches[:max_batches]

    # Nothing to classify: avoid calling the model with an empty list.
    if not script_batches:
        return {}

    # run model
    theme_output = theme_classifier(
        script_batches,
        list_themes,
        multi_label=True
    )

    # NOTE(review): some transformers versions appear to return a bare dict rather
    # than a one-element list for a single sequence — normalise so the loop below
    # always iterates over per-batch results.
    if isinstance(theme_output, dict):
        theme_output = [theme_output]

    scores_by_theme = {}
    for output in theme_output:
        for label, score in zip(output['labels'], output['scores']):
            scores_by_theme.setdefault(label, []).append(score)

    return {label: sum(scores) / len(scores) for label, scores in scores_by_theme.items()}

In [44]:
# Restrict the run to the first two episodes. Take an explicit copy so that the
# columns added to `df` further down are written to an independent frame rather
# than to a slice of the full one (which pandas may flag as SettingWithCopy).
df = df.head(2).copy()

In [45]:
# Display the reduced frame.
df

Unnamed: 0,Season,Episode,Script
0,1,1,"Oh, my God. Christ! Shit. Oh, God. Oh, my God...."
1,1,2,"Are you okay? - You are a lifesaver. - Yeah, m..."


In [46]:
# Run get_theme_scores on every script; each element of the result is the dict
# of label -> mean score returned for that row.
output_themes = df['Script'].apply(get_theme_scores)

In [47]:
# Display the per-row score dictionaries.
output_themes

0    {'love': 0.6557595431804657, 'fear': 0.3492784...
1    {'anger': 0.8691986203193665, 'courage': 0.629...
Name: Script, dtype: object

In [48]:
# Expand the per-row score dictionaries into one column per label.
theme_records = output_themes.tolist()
theme_df = pd.DataFrame(theme_records)
theme_df

Unnamed: 0,love,fear,anger,hope,courage,sadness,betrayal,redemption,joy,guilt
0,0.65576,0.349278,0.41542,0.397293,0.408427,0.264583,0.238811,0.312112,0.088333,0.119121
1,0.19585,0.58675,0.869199,0.407178,0.629492,0.320397,0.640223,0.218755,0.093517,0.562398


In [49]:
# Add one column per label to df, filled from theme_df.
# NOTE(review): assigning a DataFrame presumably aligns rows by index label —
# confirm df and theme_df share the same index before relying on this.
df[theme_df.columns] = theme_df
df.head()

Unnamed: 0,Season,Episode,Script,love,fear,anger,hope,courage,sadness,betrayal,redemption,joy,guilt
0,1,1,"Oh, my God. Christ! Shit. Oh, God. Oh, my God....",0.65576,0.349278,0.41542,0.397293,0.408427,0.264583,0.238811,0.312112,0.088333,0.119121
1,1,2,"Are you okay? - You are a lifesaver. - Yeah, m...",0.19585,0.58675,0.869199,0.407178,0.629492,0.320397,0.640223,0.218755,0.093517,0.562398
