# **Trabajo de Fin de Máster**

# **1. Imports**

In [46]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PreTrainedTokenizerFast

from datasets import load_dataset

# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os

# Plotly based imports for visualization
from plotly import tools
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy import displacy

# **2. Definición de funciones**

## **2.1. Carga del texto de entrada**

In [2]:
def load_corpus(data_files, ext_type='csv'):
    '''
    '''
    return load_dataset(ext_type, data_files=data_files)

In [3]:
def filter_corpus(corpus, element, filter_with):
    '''
    '''
    return corpus.filter(lambda filtered_corpus: filtered_corpus[element].startswith(filter_with))

In [4]:
def get_script_episode(corpus, season, episode):
    '''
    '''
    data_season = filter_corpus(corpus,'Season', season)
    return filter_corpus(data_season,'Episode', episode)

In [5]:
def get_full_text(input_text):
    return ''.join(input_text)

## **2.2. Procesamiento del texto de entrada**

In [44]:
def get_summary(input_text, model_name='sshleifer/distilbart-cnn-12-6'):
    '''
    '''
    #tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # encode input context
    input_ids = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt").input_ids
    # generate summary
    outputs = model.generate(input_ids=input_ids)
    # decode summary 
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [7]:
def show_entities(input_text):
    displacy.render(nlp(input_text), style="ent")

In [8]:
def spacy_tokenizer(sentence):
    parser = English()
    mytokens = parser(sentence)
    mytokens = [word.lower_ for word in mytokens]
    #mytokens = [word.lemma_.lower().strip() if word.lemma_ != "PRON" else word.lower_ for word in mytokens]
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

# **3. Procesamiento de un script de ejemplo de _Game of Thrones_**

*Source: https://www.kaggle.com/thebrownviking20/topic-modelling-with-spacy-and-scikit-learn/notebook?select=winemag-data_first150k.csv*

In [42]:
# Loading data
corpus = load_corpus("corpus/Game_Of_Thrones_Script.csv")['train']
s1_ep2 = get_full_text(get_script_episode(corpus, 'Season 1', 'Episode 2')['Sentence'])
s1_ep2_names = ''
for row in get_script_episode(corpus, 'Season 1', 'Episode 2'):
    line = f"{row['Name'].capitalize()}: {row['Sentence']}"
    s1_ep2_names = s1_ep2_names + " " + line
corpus_reviews = load_corpus("corpus/GameOfThrones_Reviews.csv")['train']

Using custom data configuration default-f9fcd536122e6c60
Reusing dataset csv (/Users/andreea/.cache/huggingface/datasets/csv/default-f9fcd536122e6c60/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23)
Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-f9fcd536122e6c60/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-53d31aea8cf28f09.arrow
Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-f9fcd536122e6c60/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-078e18027ebb6eda.arrow
Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-f9fcd536122e6c60/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c821476d20d5304287ce12bbc23/cache-53d31aea8cf28f09.arrow
Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-f9fcd536122e6c60/0.0.0/e138af468cb14e747fb46a19c787ffcfa5170c82

In [10]:
corpus_reviews['Episode recap'][0]

'The death of Jon Arryn (John Standing), who serves as the Hand of the King, brings King Robert Baratheon (Mark Addy) and the Lannister clan north to visit the Starks at Winterfell. It doesn’t take long for things to sour once the royal entourage arrives. From the moment Robert demands to pay his respects to Ned Stark’s (Sean Bean) late sister, Lyanna Stark (Aisling Franciosi), in the crypts, it’s clear there are some unresolved issues between the three families — something that becomes even more obvious when Jaime Lannister (Nikolaj Coster-Waldau) pushes Bran Stark (Isaac Hempstead Wright) out of a window for walking in on him with his twin sister, Cersei Lannister (Lena Headey).Meanwhile, the White Walkers make their presence known beyond the Wall, while across the Narrow Sea, Daenerys Targaryen (Emilia Clarke) is given three dragon eggs at her wedding to Dothraki warlord Khal Drogo (Jason Momoa).'

In [11]:
corpus

Dataset({
    features: ['Release Date', 'Season', 'Episode', 'Episode Title', 'Name', 'Sentence'],
    num_rows: 23911
})

In [12]:
nlp = spacy.load("en_core_web_sm")

In [13]:
# Creating a spaCy object
nlp = spacy.load('en_core_web_lg')

In [14]:
doc = nlp(corpus_reviews['Episode recap'][1])
spacy.displacy.render(doc, style='ent',jupyter=True)

In [15]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

In [16]:
review = str(" ".join([i.lemma_ for i in doc]))

In [17]:
doc = nlp(review)
spacy.displacy.render(doc, style='ent',jupyter=True)

In [28]:
# POS tagging
for i in nlp(review):
    print(i,"=>",i.pos_)

what => PRON
do => AUX
you => PRON
expect => VERB
? => PUNCT
they => PRON
be => VERB
savage => ADJ
. => PUNCT
one => NUM
lot => NOUN
steal => VERB
a => DET
goat => NOUN
from => ADP
another => DET
lot => NOUN
and => CCONJ
before => ADP
you => PRON
know => VERB
it => PRON
, => PUNCT
they => PRON
be => AUX
rip => VERB
each => DET
other => ADJ
to => ADP
piece => NOUN
. => PUNCT
I => PRON
've => AUX
never => ADV
see => VERB
wildling => NOUN
do => VERB
a => DET
thing => NOUN
like => ADP
this => DET
. => PUNCT
I => PRON
' => AUX
ve => AUX
never => ADV
see => VERB
a => DET
thing => NOUN
like => ADP
this => DET
, => PUNCT
not => PART
ever => ADV
in => ADP
my => PRON
life => NOUN
. => PUNCT
how => ADV
close => ADV
do => AUX
you => PRON
get?Close => VERB
as => ADP
any => DET
man => NOUN
would => AUX
. => PUNCT
we => PRON
should => AUX
head => VERB
back => ADV
to => ADP
the => DET
wall => NOUN
. => PUNCT
do => AUX
the => DET
dead => ADJ
frighten => VERB
you?Our => PROPN
order => NOUN
be => VERB
to

In [18]:
tqdm.pandas()
tokens = spacy_tokenizer(corpus_reviews['Episode recap'][1])

In [19]:
tokens

'jon snow kit harington know illegitimate son ned random unknown woman introduced pilot storyline pick second episode gifting half sister arya maisie williams small sword dubs “ needle ” jon leaves wall 700 foot tall 300 foot thick barrier ice spans 300 miles westeros ’ northern border uncle benjen joseph mawle tyrion lannister peter dinklage \n\n jon intends join night watch order men guard wall protect seven kingdoms invaders makes unsuccessful attempt question ned identity mother ahead ned departure king landing traveling south father arya sansa stark sophie turner find midst altercation prince joffrey baratheon jack gleeson leads deaths arya friend mycah rhodri hosking butcher boy sansa direwolf lady luckily arya able set direwolf nymeria loose wild harm come daenerys finds way connect new husband bran wakes coma month'

In [20]:
# Creating a vectorizer
vectorizer = CountVectorizer(stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform([tokens])

In [21]:
NUM_TOPICS = 10

In [22]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [23]:
# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(data_vectorized) 

In [24]:
# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(data_vectorized)

In [25]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [26]:
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('order', 0.31539932856814423), ('woman', 0.3026206680271259), ('son', 0.3022918417469849), ('dubs', 0.30213940552397894), ('spans', 0.30209152968402836), ('protect', 0.3018513879594847), ('jon', 0.30139035397687086), ('mycah', 0.30005396811112117), ('makes', 0.2991600176422903), ('know', 0.2978952157138768)]
Topic 1:
[('arya', 3.5091985566654236), ('jon', 2.67888161554692), ('ned', 2.64845325294653), ('direwolf', 1.8856002901533806), ('foot', 1.8385792950523412), ('wall', 1.821334021132725), ('sansa', 1.8193937470271377), ('new', 1.1185862684448569), ('williams', 1.1181446350071496), ('coma', 1.1079916773327037)]
Topic 2:
[('miles', 0.33432203208979916), ('tall', 0.32984445468434903), ('tyrion', 0.3234188064872985), ('question', 0.3220438345648224), ('mawle', 0.316430972240304), ('wall', 0.31609660940833384), ('lady', 0.3134596977302499), ('baratheon', 0.31117463816319557), ('sword', 0.3090353339856813), ('gifting', 0.3087637614998201)]
Topic 3:
[('wall', 0.355530

In [27]:
# Keywords for topics clustered by Latent Semantic Indexing
print("NMF Model:")
selected_topics(nmf, vectorizer)

NMF Model:
Topic 0:
[('arya', 1.7439407915243479), ('jon', 1.2056037252834033), ('ned', 1.0976603196890549), ('foot', 0.7380911175604185), ('direwolf', 0.719313073915503), ('sansa', 0.6992749703127791), ('wall', 0.6830815769033672), ('sister', 0.3577415714425179), ('joffrey', 0.3568181675858964), ('woman', 0.35184507413395216)]
Topic 1:
[('south', 0.706015311943722), ('second', 0.6753539979788332), ('illegitimate', 0.6060700479576989), ('gleeson', 0.6023048792187152), ('gifting', 0.5844342000196365), ('unsuccessful', 0.5607045972255406), ('harm', 0.5606657773072516), ('small', 0.5523737170707443), ('mawle', 0.5471249422257012), ('jon', 0.5208481614616439)]
Topic 2:
[('way', 0.6777224992520446), ('stark', 0.6739095283417773), ('sansa', 0.6649729697100206), ('spans', 0.6537403702500609), ('introduced', 0.6412101809238522), ('departure', 0.6399170354656832), ('needle', 0.6005330778888678), ('sword', 0.5921228775241025), ('coma', 0.5789269726430901), ('hosking', 0.5788330920905606)]
Topic 

In [28]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("LSI Model:")
selected_topics(lsi, vectorizer)

LSI Model:
Topic 0:
[('arya', 0.3244428422615251), ('ned', 0.2433321316961438), ('jon', 0.2433321316961438), ('wall', 0.16222142113076254), ('foot', 0.16222142113076254), ('direwolf', 0.16222142113076254), ('sansa', 0.16222142113076254), ('kit', 0.08111071056538127), ('illegitimate', 0.08111071056538127), ('half', 0.08111071056538127)]


In [51]:
# Transforming an individual sentence
text = spacy_tokenizer("Arya Stark.")
x = lda.transform(vectorizer.transform([text]))[0]
print(x)

[0.03333455 0.69998978 0.0333348  0.03333428 0.03333453 0.03333445
 0.03333414 0.03333445 0.03333463 0.03333439]


In [50]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [33]:
get_summary(tokens)

['jon snow kit harington know illegitimate son ned  ned random unknown woman introduced pilot storyline pick second episode gifting half sister arya maisie williams small sword dubs “ needle ” jon leaves wall 700 foot tall 300 foot thick barrier ice spans 300 miles.']

In [34]:
corpus_reviews['Episode recap'][1]

'Although Jon Snow (Kit Harington), who we know as the illegitimate son of Ned and a random, unknown woman, was introduced in the pilot, his storyline doesn’t really pick up until the second episode. After gifting his half-sister Arya (Maisie Williams) with a small sword that she dubs “Needle,” Jon leaves for the Wall, a 700-foot tall, 300-foot thick barrier of ice that spans all 300 miles of Westeros’ northern border, with his Uncle Benjen (Joseph Mawle) and Tyrion Lannister (Peter Dinklage).\n\nJon intends to join the Night’s Watch, an order of men who guard the Wall and protect the Seven Kingdoms from invaders. But first, he makes one last (unsuccessful) attempt to question Ned about the identity of his mother ahead of Ned’s departure for King’s Landing.While traveling south with their father, Arya and Sansa Stark (Sophie Turner) find themselves in the midst of an altercation with Prince Joffrey Baratheon (Jack Gleeson) that leads to the deaths of both Arya’s friend Mycah (Rhodri Ho

In [35]:
get_summary(corpus_reviews['Episode recap'][1])

[' Jon Snow (Kit Harington) is the illegitimate son of Ned and a random, unknown woman. Jon intends to join the Night’s Watch, an order of men who guard the Wall and protect the Seven Kingdoms. Elsewhere, Daenerys finds a way to connect with her new husband and Bran wakes up.']

In [36]:
corpus_reviews['Episode recap'][2]

'In the early days of her arranged marriage, Daenerys is repeatedly raped by her husband (a storyline that led to much controversy). But as she and Drogo develop mutual respect and admiration for one another, Dany begins to come into her own as Khaleesi — a development that upsets her older brother Viserys (Harry Lloyd) — and discovers that she is pregnant with Drogo’s son.Following Ned’s first Small Council meeting as Hand of the King, he learns that Catelyn has snuck into King’s Landing to show him the dagger that the catspaw assassin tried to use on Bran — a reveal that leads Master of Coin Petyr “Littlefinger” Baelish (Aidan Gillen) to claim he recognizes the blade as one he lost to Tyrion in a bet.As for the Stark kids, Jon starts to understand the harsh reality of serving in the Night’s Watch when he realizes that, unlike him, most of the recruits are untrained criminals who were forced into service, while Arya, who has no desire to become a “proper” lady, begins training with ma

In [49]:
corpus_reviews['Episode recap'][71]



In [37]:
get_summary(corpus_reviews['Episode recap'][2])

[' Dany begins to come into her own as Khaleesi. Bran struggles to come to terms with being paralyzed from the waist down. Jon starts to understand the harsh reality of serving in the Night\'s Watch. Arya, who has no desire to become a "proper" lady, begins training with a swordsman.']

In [38]:
get_summary(corpus_reviews['Episode recap'][2], "sshleifer/distilbart-xsum-12-3")

[" Dany, Ned Ned, Catelyn and Arya, have returned to King's Landing for the first time in more than a decade."]

In [39]:
get_summary(corpus_reviews['Episode recap'][2], "google/pegasus-xsum")

['In the latest episode of Game of Thrones, Daenerys Targaryen (Emilia Clarke) and Drogo Stark ( Kit Harington) are married.']

### Usando script sin nombres 

In [40]:
get_summary(s1_ep2)

[" Khaleesi: I'll stay with Drogo until he fulfils his end of the bargain and I have my crown. I'll be more comfortable in Pentos, your Grace. I have no interest in hospitality or comfort. I just want to stand on top of the Wall and piss off the edge of the world. I hope the boy does wake. I'd be very interested to hear what he has to say."]

### Usando script con nombres

In [41]:
get_summary(s1_ep2_names)

[" Jorah mormont: We're still not far from Pentos, your Grace. Magister Illyrio has extended his hospitality. You'd be more comfortable there. Viserys Targaryen: I have no interest in hospitality or comfort. I'll stay with Drogo until he fulfils his end of the bargain and I have my crown."]

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [48]:
get_summary(corpus_reviews['Episode recap'][3], "hyunwoongko/ctrlsum-cnndm")

[' After arriving in the Vale with Tyrion as her prisoner, Catelyn is concerned to']