In [1]:
from transformers import pipeline
from nltk import sent_tokenize
import nltk
import torch
from glob import glob
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\myria\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [148]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\myria\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\myria\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\myria\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\myria\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\myria\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       tagge

True

# Load models

In [2]:
model_name = 'facebook/bart-large-mnli'
device = 0 if torch.cuda.is_available() else 'cpu'

In [8]:
device

0

In [3]:
def load_model(device):
    theme_classifier = pipeline(
        'zero-shot-classification',
        model = model_name,
        device= device
    )

    return theme_classifier

In [10]:
theme_classifier = load_model(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
theme_list = ['friendship', 'hope', 'sacrifice', 'battle', 'self development', 'betrayal', 'love', 'dialogue']

In [16]:
theme_classifier(
    'I punch him right in the face then left madly',
    theme_list,
    multi_label = True
)

{'sequence': 'I punch him right in the face then left madly',
 'labels': ['battle',
  'self development',
  'betrayal',
  'sacrifice',
  'love',
  'hope',
  'dialogue',
  'friendship'],
 'scores': [0.8379108905792236,
  0.19707369804382324,
  0.1469639241695404,
  0.084993377327919,
  0.017587676644325256,
  0.003336874069646001,
  0.0023824165109544992,
  7.160699169617146e-05]}

# Load data

In [4]:
data_files = glob('../data/Subtitles/*.ass')

In [5]:
with open(data_files[0], 'r') as file:
    lines = file.readlines()
    lines = lines[27:]
    lines = [','.join(line.replace('\\N', ' ').strip().split(',')[9:]) for line in lines ]
    

In [6]:
int(data_files[0].split('-')[-1].split('.')[0].strip())

1

In [7]:
#load subtitles and process text and episode number
def load_subtitles(subtitles_files_path):
    paths = glob(subtitles_files_path + '/*.ass')
    
    scripts = []
    episodes_num = []

    for path in paths:
        with open(path, 'r', encoding="utf8") as file:
            lines = file.readlines()
            lines = lines[27:]
            lines = [','.join(line.replace('\\N', ' ').strip().split(',')[9:]) for line in lines ]

        script = ' '.join(lines)
        
        episode_num = int(path.split('-')[-1].split('.')[0].strip())

        scripts.append(script)
        episodes_num.append(episode_num)

    df = pd.DataFrame.from_dict({'Episode' : episodes_num, 'Script' : scripts})

    return df

In [8]:
df = load_subtitles('../data/Subtitles')

In [9]:
df.tail(10)

Unnamed: 0,Episode,Script
208,211,"Fly into the wavy and twisted sky, into your h..."
209,212,"Fly into the wavy and twisted sky, into your h..."
210,213,"Fly into the wavy and twisted sky, into your h..."
211,214,"Fly into the wavy and twisted sky, into your h..."
212,215,"Fly into the wavy and twisted sky, into your h..."
213,216,"Summoning Jutsu! Oh, long time no see. We don’..."
214,217,Gaara. Kankuro. Gaara. Primary Lotus! Damn it!...
215,218,To think the Leaf’s reinforcements will be you...
216,219,"Fly into the wavy and twisted sky, into your h..."
217,220,So this is the Shukaku? It’s the first time I’...


In [154]:
script = df.iloc[0]['Script'] # .iloc make it possible to access data using index, in the example we access the column Script of the first row
script

'A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails, it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can\'t let it get any closer to our village! One great Ninja was able to imprison the monster, but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You\'re really gonna get it this time! I don\'t care! You know your problem? You can\'t do the things I do! Only I can do this! I\'m better than all of you! Believe it! There\'s a problem, sir! Lord Hokage! What is it? Did that Naruto do something again? Yes. He climbed onto the Mountainside Images… And he vandalized and graffitied all over them! Wait! Ha ha… Why should I? Hey, Naruto! How did you suddenly get here, lruka Sensei? The question is what are you doing here when you should be in class now? Now listen, Naruto. You failed the last

In [155]:
script_sentences = sent_tokenize(script)


In [157]:
#batch sentences
sentences_batch_size = 20
scripts_batches = []

for i in range(0, len(script_sentences), sentences_batch_size):
    sent = ' '.join(script_sentences[i : i+sentences_batch_size])
    scripts_batches.append(sent)

In [None]:
scripts_batches[0:3] #each bacth has 20 sentences

["A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails, it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster, but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
 'Yes. He climbed onto the Mountainside Images… And he vandalized and graffitied all over them! Wait! Ha ha… Why should I? Hey, Naruto! How did you suddenly get here, lruka Sensei? The question is what are you doing here when you should be in class now? Now listen, Naruto. You failed the last 

In [162]:
output_classification = theme_classifier(
    scripts_batches[0:3],
    theme_list,
    multi_label=True
)

In [163]:
output_classification


[{'sequence': "A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails, it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster, but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
  'labels': ['dialogue',
   'betrayal',
   'battle',
   'sacrifice',
   'self development',
   'hope',
   'friendship',
   'love'],
  'scores': [0.9790682792663574,
   0.9497532248497009,
   0.8569772839546204,
   0.7666406035423279,
   0.7487459778785706,
   0.19316503405570984,


In [167]:
themes = {}
for output in output_classification:
    for label, score in zip(output['labels'], output['scores']):
        if label not in themes:
            themes[label]=[]
        
        themes[label].append(score)

In [168]:
themes

{'dialogue': [0.9790682792663574, 0.9152395725250244, 0.9280455708503723],
 'betrayal': [0.9497532248497009, 0.6924763917922974, 0.6690558791160583],
 'battle': [0.8569772839546204, 0.6468244194984436, 0.6658564209938049],
 'sacrifice': [0.7666406035423279, 0.5885695219039917, 0.351496160030365],
 'self development': [0.7487459778785706,
  0.8458867073059082,
  0.7924256920814514],
 'hope': [0.19316503405570984, 0.1360895335674286, 0.2414838969707489],
 'friendship': [0.06438494473695755, 0.05948065593838692, 0.14436203241348267],
 'love': [0.04339267313480377, 0.01833723671734333, 0.050931643694639206]}

In [186]:
def get_theme_classification(script):
  #script tokenization
  script_sentences = sent_tokenize(script)
  
  #script sentences batching
  sentences_batch_size = 20
  scripts_batches = []
  for i in range(0, len(script_sentences), sentences_batch_size):
    sent = ' '.join(script_sentences[i : i+sentences_batch_size])
    scripts_batches.append(sent)

  #script sentences theme classification
  theme_classification_output = theme_classifier(
    scripts_batches,
    theme_list,
    multi_label=True
  )
  
  #wrangling output : clean, transform data into a structured format
  themes = {}
  for output in theme_classification_output:
    for label, score in zip(output['labels'], output['scores']):
        if label not in themes:
            themes[label]=[]
        themes[label].append(score)
  
  themes = {key: np.mean(values) for key, values in themes.items()}
  
  return themes
  


In [175]:
themes = get_theme_classification(df.iloc[5]['Script'])
themes = {key: np.mean(values) for key, values in themes.items()}

In [214]:
themes = sorted(themes.items(), key=lambda x: x[1], reverse=True)

AttributeError: 'list' object has no attribute 'items'

In [212]:
df_themess= pd.DataFrame([dict(themes)])
df_themess

Unnamed: 0,dialogue,self development,sacrifice,battle,betrayal,hope,friendship,love
0,0.8992,0.705038,0.696785,0.631374,0.404202,0.38491,0.352712,0.185532


{'dialogue': 0.8992000073194504,
 'self development': 0.705038278674086,
 'sacrifice': 0.6967847421765327,
 'battle': 0.6313737630844116,
 'betrayal': 0.40420166961848736,
 'hope': 0.38490963509927195,
 'friendship': 0.3527124511698882,
 'love': 0.18553190561942756}

# Visualize output

In [None]:
df = df.drop('dialogue', axis=1)

In [None]:
theme_output = df.drop(['Episode', 'Script'], axis=1).sum().reset_index()
theme_output.columns = ['Theme', 'Score']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sns.barplot(data= theme_output, x='Theme', y='Score')
plt.xticks(rotation=45)
plt.show()