In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import multiprocessing
from gensim.models import Word2Vec

In [22]:
import os
import gdown
if os.access('imdb_dataset.csv', os.F_OK) is False:
  url = 'https://drive.google.com/u/0/uc?id=15shf1NaGJEh96rctvI1imW-m4KwvNFL-&export=download'
  output = 'imdb_dataset.csv'
  gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")

Downloading...
From: https://drive.google.com/u/0/uc?id=15shf1NaGJEh96rctvI1imW-m4KwvNFL-&export=download
To: /content/imdb_dataset.csv
100%|██████████| 66.2M/66.2M [00:00<00:00, 177MB/s]


### Dataset

In [23]:
dataset = pd.read_csv('imdb_dataset.csv')

In [24]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [25]:
dataset.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [26]:
print("Cantidad de documentos:", dataset.shape[0])

Cantidad de documentos: 50000


### Preprocesamiento

In [27]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
special_char = re.compile('[^A-Za-z0-9]+')
spaces = re.compile('\s+')

sentence_tokens = []
for row in dataset.review:
  newRow = re.sub(special_char,' ', row)
  newRow = re.sub(spaces, ' ', newRow)
  text_tokens = nltk.word_tokenize(newRow)
  text_tokens = [word for word in text_tokens if not word in stopwords.words('english')]
  tokens = [word for word in text_tokens if word != 'br']
  sentence_tokens.append(tokens)


In [29]:
sentence_tokens[1][:10]

['A',
 'wonderful',
 'little',
 'production',
 'The',
 'filming',
 'technique',
 'unassuming',
 'old',
 'time']

### Modelo

In [30]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [31]:
w2v_model = Word2Vec(min_count=5,    # frecuencia mínima de palabra para incluirla en el vocabulario
                     window=2,       # cant de palabras antes y desp de la predicha
                     size=300,       # dimensionalidad de los vectores 
                     negative=20,    # cantidad de negative samples... 0 es no se usa
                     workers=1,      # si tienen más cores pueden cambiar este valor
                     sg=1)           # modelo 0:CBOW  1:skipgram

In [32]:
w2v_model.build_vocab(sentence_tokens)

In [33]:
print("Cantidad de docs en el corpus:", w2v_model.corpus_count)

Cantidad de docs en el corpus: 50000


In [34]:
print("Cantidad de words distintas en el corpus:", len(w2v_model.wv.vocab))

Cantidad de words distintas en el corpus: 46894


In [35]:
w2v_model.train(sentence_tokens,
                 total_examples=w2v_model.corpus_count,
                 epochs=5,
                 compute_loss = True,
                 callbacks=[callback()]
                 )

Loss after epoch 0: 54247544.0
Loss after epoch 1: 20511584.0
Loss after epoch 2: 9938704.0
Loss after epoch 3: 9377984.0
Loss after epoch 4: 8761688.0


(30150742, 32796000)

In [58]:
w2v_model.most_similar(positive=['fun', 'violence'], topn=1)


Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).



[('campiness', 0.692103385925293)]

In [60]:
w2v_model.most_similar(positive=['violence'], negative=['campiness'], topn=1)


Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).



[('rape', 0.11471780389547348)]

In [62]:
w2v_model.most_similar(positive=['violence'], negative=['fun'], topn=1)


Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).



[('cruelty', 0.33642101287841797)]

Campiness: absurdly exaggerated, artificial, or affected in a usually humorous way. <br />
Dada la definición de "campiness", parece que ser que la violencia se asocia a la diversion cuando cae en  el obsurdo. Esto es muy claro en peliculas como Duro de Matar o Rapidos y Furiosos. Sin embargo, la violencia tiene otro tono cuando se le quita este componente y se torna dramática. Una violación parece ser lo que más fuertemente genera esta incomodidad. En general cualquier forma de crueldad incomoda al espectador.

In [72]:
w2v_model.wv.most_similar('boring', topn=10)

[('dull', 0.7621692419052124),
 ('uneventful', 0.7111982703208923),
 ('Boring', 0.7046566009521484),
 ('uninspiring', 0.7013168334960938),
 ('tedious', 0.7012977600097656),
 ('unengaging', 0.7004855275154114),
 ('aggravating', 0.699084997177124),
 ('DULL', 0.6955145597457886),
 ('turgid', 0.6949741840362549),
 ('repetitious', 0.6926065683364868)]

In [76]:
w2v_model.wv.most_similar(positive=['love', 'boring'], topn=1)

[('unsexy', 0.7038358449935913)]

In [77]:
w2v_model.wv.most_similar(positive=['love'], negative=['boring'], topn=1)

[('loyalty', 0.24865007400512695)]

In [78]:
w2v_model.wv.most_similar(positive=['love', 'dull'], topn=1)

[('unsexy', 0.6965025663375854)]

In [75]:
w2v_model.wv.most_similar(positive=['love', 'violence'], topn=1)

[('immorality', 0.6597450971603394)]

In [80]:
w2v_model.wv.most_similar(positive=['love'], negative=['fun'], topn=1)

[('lust', 0.2855861485004425)]

In [82]:
w2v_model.wv.most_similar(positive=['love'], negative=['unsexy'], topn=1)

[('loves', 0.10633078962564468)]

Parece que las películas de amor aburren cuando el componente sexual no se desarrolla. Pero los ultimos resultado parecen contradecir esto y además nos indican que no podemos esperar mcuho de nuestro modelo.  

### Visualizacion

In [40]:
from sklearn.decomposition import IncrementalPCA    
from sklearn.manifold import TSNE                   
import numpy as np                                  

def reduce_dimensions(model):
    num_dimensions = 2  

    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index2word)  

    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels

In [41]:
import plotly.graph_objects as go
import plotly.express as px

x_vals, y_vals, labels = reduce_dimensions(w2v_model)

MAX_WORDS=200
fig = px.scatter(x=x_vals[:MAX_WORDS], y=y_vals[:MAX_WORDS], text=labels[:MAX_WORDS])
fig.show(renderer="colab") # esto para plotly en colab