# **Trabajo de Fin de Máster**

# **1. Imports**

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel

from datasets import load_dataset

import torch 

# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os

# Plotly based imports for visualization
from plotly import tools
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy import displacy

# **2. Definición de funciones**

## **2.1. Carga del texto de entrada**

In [2]:
def load_corpus(data_files, ext_type='csv'):
    '''
    '''
    return load_dataset(ext_type, data_files=data_files)

In [3]:
def filter_corpus(corpus, element, filter_with):
    '''
    '''
    return corpus.filter(lambda filtered_corpus: filtered_corpus[element].startswith(filter_with))

In [4]:
def get_script_episode(corpus, season, episode):
    '''
    '''
    data_season = filter_corpus(corpus,'Season', season)
    return filter_corpus(data_season,'Episode', episode)

In [5]:
def get_full_text(input_text):
    return ''.join(input_text)

## **2.2. Procesamiento del texto de entrada**

In [6]:
def get_summary(input_text, model_name='sshleifer/distilbart-cnn-12-6'):
    '''
    Models: "sshleifer/distilbart-xsum-12-3", "google/pegasus-xsum"
    '''
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # encode input context
    input_ids = tokenizer(input_text, truncation=True, padding=True, return_tensors="pt").input_ids
    # generate summary
    outputs = model.generate(input_ids=input_ids)
    # decode summary 
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [7]:
def show_entities(input_text):
    displacy.render(nlp(input_text), style="ent")

In [8]:
def spacy_tokenizer(sentence):
    parser = English()
    mytokens = parser(sentence)
    mytokens = [word.lower_ for word in mytokens]
    #mytokens = [word.lemma_.lower().strip() if word.lemma_ != "PRON" else word.lower_ for word in mytokens]
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

# **3. Procesamiento de un script de ejemplo de _Game of Thrones_**

*Source: https://www.kaggle.com/thebrownviking20/topic-modelling-with-spacy-and-scikit-learn/notebook?select=winemag-data_first150k.csv*

In [9]:
# Loading data
corpus = load_corpus("qubartCode/corpus/Game_Of_Thrones_Script.csv")['train']
s1_ep2 = get_full_text(get_script_episode(corpus, 'Season 1', 'Episode 2')['Sentence'])
s1_ep2_names = ''
for row in get_script_episode(corpus, 'Season 1', 'Episode 2'):
    line = f"{row['Name'].capitalize()}: {row['Sentence']}"
    s1_ep2_names = s1_ep2_names + " " + line
corpus_reviews = load_corpus("qubartCode/corpus/GameOfThrones_Reviews.csv")['train']

Using custom data configuration default-6267e762714f0fd0
Reusing dataset csv (/Users/andreea/.cache/huggingface/datasets/csv/default-6267e762714f0fd0/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)
Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-6267e762714f0fd0/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-05842c71d6ab9f3e.arrow
Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-6267e762714f0fd0/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-80a1cce4dbd0b731.arrow
Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-6267e762714f0fd0/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-05842c71d6ab9f3e.arrow
Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-6267e762714f0fd0/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304

In [10]:
corpus_reviews['Episode recap'][0]

'The death of Jon Arryn (John Standing), who serves as the Hand of the King, brings King Robert Baratheon (Mark Addy) and the Lannister clan north to visit the Starks at Winterfell. It doesn’t take long for things to sour once the royal entourage arrives. From the moment Robert demands to pay his respects to Ned Stark’s (Sean Bean) late sister, Lyanna Stark (Aisling Franciosi), in the crypts, it’s clear there are some unresolved issues between the three families — something that becomes even more obvious when Jaime Lannister (Nikolaj Coster-Waldau) pushes Bran Stark (Isaac Hempstead Wright) out of a window for walking in on him with his twin sister, Cersei Lannister (Lena Headey).Meanwhile, the White Walkers make their presence known beyond the Wall, while across the Narrow Sea, Daenerys Targaryen (Emilia Clarke) is given three dragon eggs at her wedding to Dothraki warlord Khal Drogo (Jason Momoa).'

In [11]:
corpus

Dataset({
    features: ['Release Date', 'Season', 'Episode', 'Episode Title', 'Name', 'Sentence'],
    num_rows: 23911
})

In [12]:
nlp = spacy.load("en_core_web_sm")

In [13]:
# Creating a spaCy object
nlp = spacy.load('en_core_web_lg')

In [14]:
doc = nlp(corpus_reviews['Episode recap'][1])
spacy.displacy.render(doc, style='ent',jupyter=True)

In [15]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

In [16]:
review = str(" ".join([i.lemma_ for i in doc]))

In [17]:
doc = nlp(review)
spacy.displacy.render(doc, style='ent',jupyter=True)

In [18]:
# POS tagging
for i in nlp(review):
    print(i,"=>",i.pos_)

although => SCONJ
Jon => PROPN
Snow => PROPN
( => PUNCT
Kit => PROPN
Harington => PROPN
) => PUNCT
, => PUNCT
who => PRON
we => PRON
know => VERB
as => ADP
the => DET
illegitimate => ADJ
son => NOUN
of => ADP
Ned => PROPN
and => CCONJ
a => DET
random => ADJ
, => PUNCT
unknown => ADJ
woman => NOUN
, => PUNCT
be => VERB
introduce => VERB
in => ADP
the => DET
pilot => NOUN
, => PUNCT
his => PRON
storyline => NOUN
do => AUX
n’t => AUX
really => ADV
pick => VERB
up => ADP
until => ADP
the => DET
second => ADJ
episode => NOUN
. => PUNCT
after => ADP
gift => NOUN
his => PRON
half => ADJ
- => PUNCT
sister => NOUN
Arya => PROPN
( => PUNCT
Maisie => PROPN
Williams => PROPN
) => PUNCT
with => ADP
a => DET
small => ADJ
sword => NOUN
that => DET
she => PRON
dub => VERB
" => PUNCT
Needle => PROPN
, => PUNCT
" => PUNCT
Jon => PROPN
leave => VERB
for => ADP
the => DET
Wall => PROPN
, => PUNCT
a => DET
700 => NUM
- => PUNCT
foot => NOUN
tall => ADJ
, => PUNCT
300 => NUM
- => PUNCT
foot => NOUN
thick =>

In [19]:
tqdm.pandas()
tokens = spacy_tokenizer(corpus_reviews['Episode recap'][1])

In [20]:
tokens

'jon snow kit harington know illegitimate son ned random unknown woman introduced pilot storyline pick second episode gifting half sister arya maisie williams small sword dubs “ needle ” jon leaves wall 700 foot tall 300 foot thick barrier ice spans 300 miles westeros ’ northern border uncle benjen joseph mawle tyrion lannister peter dinklage \n\n jon intends join night watch order men guard wall protect seven kingdoms invaders makes unsuccessful attempt question ned identity mother ahead ned departure king landing traveling south father arya sansa stark sophie turner find midst altercation prince joffrey baratheon jack gleeson leads deaths arya friend mycah rhodri hosking butcher boy sansa direwolf lady luckily arya able set direwolf nymeria loose wild harm come daenerys finds way connect new husband bran wakes coma month'

In [21]:
# Creating a vectorizer
vectorizer = CountVectorizer(stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform([tokens])

In [22]:
NUM_TOPICS = 10

In [23]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [24]:
# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(data_vectorized) 

In [25]:
# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(data_vectorized)

In [26]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [27]:
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('dubs', 0.35243646958326713), ('joseph', 0.34118528035160367), ('pilot', 0.3389413958883106), ('arya', 0.337935391605536), ('ned', 0.33700901679747475), ('seven', 0.33619547139533124), ('order', 0.335546434927683), ('wild', 0.3331101423822483), ('harm', 0.33305492883338955), ('lannister', 0.3330426080164012)]
Topic 1:
[('know', 0.33258105165473206), ('prince', 0.32250036862166404), ('joffrey', 0.31739302286333654), ('storyline', 0.3168585421439219), ('question', 0.31482352754880255), ('finds', 0.31382248073148167), ('arya', 0.31382117408929455), ('woman', 0.313567558228462), ('month', 0.3129854828571788), ('nymeria', 0.3111487515517562)]
Topic 2:
[('bran', 0.33631117732120575), ('direwolf', 0.32566864011324115), ('mycah', 0.3166186997306659), ('williams', 0.3164839472289912), ('introduced', 0.3149717491049374), ('watch', 0.3135904266558861), ('half', 0.3130208766289102), ('nymeria', 0.3127658091644657), ('son', 0.3111707299307302), ('random', 0.3095243676044276)]


In [28]:
# Keywords for topics clustered by Latent Semantic Indexing
print("NMF Model:")
selected_topics(nmf, vectorizer)

NMF Model:
Topic 0:
[('arya', 2.518747608965112), ('jon', 1.9962862188663408), ('ned', 1.9686756567495611), ('sansa', 1.035112193098718), ('direwolf', 1.0184996937887716), ('wall', 0.9952689295742763), ('foot', 0.7966557114902468), ('harm', 0.5576239898414795), ('joffrey', 0.5494661158219921), ('leaves', 0.49381280668278554)]
Topic 1:
[('arya', 0.8021633991608227), ('butcher', 0.6643772414078438), ('dinklage', 0.6285117391035357), ('snow', 0.6259204470234161), ('pilot', 0.6093278460498023), ('guard', 0.5389054237275617), ('connect', 0.5366431327807308), ('pick', 0.5210806783358004), ('foot', 0.5006505342026639), ('westeros', 0.49919839003724625)]
Topic 2:
[('sword', 1.0086900503028946), ('maisie', 0.7442191124581287), ('direwolf', 0.6963072299384968), ('deaths', 0.6781495730139353), ('introduced', 0.6419099998213958), ('tall', 0.5926758520699748), ('second', 0.5854454515604035), ('sansa', 0.5813531933098401), ('jon', 0.5660799782331426), ('benjen', 0.5632364596288081)]
Topic 3:
[('ahea

In [29]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("LSI Model:")
selected_topics(lsi, vectorizer)

LSI Model:
Topic 0:
[('arya', 0.3244428422615251), ('ned', 0.2433321316961438), ('jon', 0.2433321316961438), ('wall', 0.16222142113076254), ('foot', 0.16222142113076254), ('direwolf', 0.16222142113076254), ('sansa', 0.16222142113076254), ('kit', 0.08111071056538127), ('illegitimate', 0.08111071056538127), ('half', 0.08111071056538127)]


In [30]:
# Transforming an individual sentence
text = spacy_tokenizer("Arya Stark.")
x = lda.transform(vectorizer.transform([text]))[0]
print(x)

[0.03333501 0.03333475 0.03333418 0.03333434 0.69998853 0.03333511
 0.03333455 0.0333347  0.03333448 0.03333434]


In [31]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
#dash

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [32]:
corpus_reviews['Episode recap'][1]

'Although Jon Snow (Kit Harington), who we know as the illegitimate son of Ned and a random, unknown woman, was introduced in the pilot, his storyline doesn’t really pick up until the second episode. After gifting his half-sister Arya (Maisie Williams) with a small sword that she dubs “Needle,” Jon leaves for the Wall, a 700-foot tall, 300-foot thick barrier of ice that spans all 300 miles of Westeros’ northern border, with his Uncle Benjen (Joseph Mawle) and Tyrion Lannister (Peter Dinklage).\n\nJon intends to join the Night’s Watch, an order of men who guard the Wall and protect the Seven Kingdoms from invaders. But first, he makes one last (unsuccessful) attempt to question Ned about the identity of his mother ahead of Ned’s departure for King’s Landing.While traveling south with their father, Arya and Sansa Stark (Sophie Turner) find themselves in the midst of an altercation with Prince Joffrey Baratheon (Jack Gleeson) that leads to the deaths of both Arya’s friend Mycah (Rhodri Ho

In [33]:
!pip install sentencepiece



In [34]:
#get_summary(corpus_reviews['Episode recap'][1])

In [35]:
#corpus_reviews['Episode recap'][2]

In [36]:
#corpus_reviews['Episode recap'][71]

In [37]:
#get_summary(corpus_reviews['Episode recap'][2])

In [38]:
#get_summary(corpus_reviews['Episode recap'][2], "sshleifer/distilbart-xsum-12-3")

In [39]:
#get_summary(corpus_reviews['Episode recap'][2], "google/pegasus-xsum")

### Usando script sin nombres 

In [40]:
#get_summary(s1_ep2)

### Usando script con nombres

In [41]:
#get_summary(s1_ep2_names)

In [42]:
#get_summary(corpus_reviews['Episode recap'][3], "hyunwoongko/ctrlsum-cnndm")

## **Pruebas con el sentence similarity**
https://towardsdatascience.com/bert-for-measuring-text-similarity-eec91c6bf9e1

In [185]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings, or the last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def get_sentence_embeddings(input_data, model_name='sentence-transformers/paraphrase-mpnet-base-v2'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    encoded_input = tokenizer(input_data, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
        
    # Perform pooling. In this case, max pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    return sentence_embeddings.detach().numpy()

def get_similar_sentences(sentence, other_sentences, similarity_threshold=0.7):
    sims = cosine_similarity(sentence, other_sentences)
    return np.where(sims.reshape(-1) >= similarity_threshold)[0]

In [186]:
# convert from PyTorch tensor to numpy array
sentences = get_script_episode(corpus, 'Season 1', 'Episode 2')['Sentence']
mean_pooled = get_sentence_embeddings(sentences)

Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-6267e762714f0fd0/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-05842c71d6ab9f3e.arrow
Loading cached processed dataset at /Users/andreea/.cache/huggingface/datasets/csv/default-6267e762714f0fd0/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-80a1cce4dbd0b731.arrow


In [191]:
from sklearn.metrics.pairwise import cosine_similarity
# calculate
idx_sentence = 26
similar_sentences = get_similar_sentences([mean_pooled[idx_sentence]], mean_pooled)

In [192]:
for idx in similar_sentences:
    print(sentences[idx])

Is Bran going to die?
You've said goodbye to Bran? He's not going to die. I know it.
What if he's wrong? Bran needs me.
