In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import multiprocessing
from gensim.models import Word2Vec

In [None]:
import os
import gdown
# Download the IMDB reviews CSV from Google Drive unless a local copy already exists.
# NOTE: only existence is checked; if an earlier run saved a bad file (e.g. the
# Drive HTML preview page) under this name, delete it so the download runs again.
if not os.path.exists('imdb_dataset.csv'):
    # Direct-download form of the Drive link. The ".../file/d/<id>/view" sharing
    # URL points at the preview page rather than the file itself, which can
    # leave a non-CSV file on disk that pandas then fails to parse.
    url = 'https://drive.google.com/uc?id=15shf1NaGJEh96rctvI1imW-m4KwvNFL-'
    output = 'imdb_dataset.csv'
    gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")

El dataset ya se encuentra descargado


### Dataset

In [None]:
# Load the downloaded CSV into a DataFrame; later cells read its `review` column.
dataset = pd.read_csv('imdb_dataset.csv')

ParserError: ignored

### Exploración del dataset

In [None]:
# Preview the first rows of the loaded DataFrame.
dataset.head()

In [None]:
# Show the raw text of the review at index 0 (note the embedded "<br />" HTML tags).
dataset.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [None]:
# Report how many rows (one review per row) the dataset holds.
print("Cantidad de documentos:", dataset.shape[0])

Cantidad de documentos: 50000


### Preprocesamiento

In [None]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch NLTK resources; "punkt" is presumably what nltk.word_tokenize relies on below.
# NOTE(review): "wordnet" and the `stopwords` import are not used by any cell
# visible in this notebook — confirm whether they are still needed.
nltk.download("punkt")
nltk.download("wordnet")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Turn every review into a list of word tokens (one list per review) for Word2Vec.
#
# The raw reviews contain "<br />" HTML line breaks. Without removing them first,
# stripping punctuation leaves a bare "br" token that pollutes the vocabulary.
html_break = re.compile(r'<br\s*/?>', re.IGNORECASE)
special_char = re.compile(r'[^A-Za-z0-9]+')   # anything that is not an ASCII letter or digit
spaces = re.compile(r'\s+')                   # raw string avoids the invalid "\s" escape warning
sentence_tokens = []
for row in dataset.review:
    newRow = html_break.sub(' ', row)         # drop HTML line breaks before anything else
    newRow = special_char.sub(' ', newRow)    # replace punctuation/symbol runs with one space
    newRow = spaces.sub(' ', newRow)          # collapse any remaining whitespace runs
    sentence_tokens.append(nltk.word_tokenize(newRow))

In [None]:
# Inspect the token lists produced for the first two reviews.
sentence_tokens[:2]

[['One',
  'of',
  'the',
  'other',
  'reviewers',
  'has',
  'mentioned',
  'that',
  'after',
  'watching',
  'just',
  '1',
  'Oz',
  'episode',
  'you',
  'll',
  'be',
  'hooked',
  'They',
  'are',
  'right',
  'as',
  'this',
  'is',
  'exactly',
  'what',
  'happened',
  'with',
  'me',
  'br',
  'br',
  'The',
  'first',
  'thing',
  'that',
  'struck',
  'me',
  'about',
  'Oz',
  'was',
  'its',
  'brutality',
  'and',
  'unflinching',
  'scenes',
  'of',
  'violence',
  'which',
  'set',
  'in',
  'right',
  'from',
  'the',
  'word',
  'GO',
  'Trust',
  'me',
  'this',
  'is',
  'not',
  'a',
  'show',
  'for',
  'the',
  'faint',
  'hearted',
  'or',
  'timid',
  'This',
  'show',
  'pulls',
  'no',
  'punches',
  'with',
  'regards',
  'to',
  'drugs',
  'sex',
  'or',
  'violence',
  'Its',
  'is',
  'hardcore',
  'in',
  'the',
  'classic',
  'use',
  'of',
  'the',
  'word',
  'br',
  'br',
  'It',
  'is',
  'called',
  'OZ',
  'as',
  'that',
  'is',
  'the',
  'ni

### Modelo

In [None]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    """
    Callback that prints the training loss after each epoch.

    The value read from ``model.get_latest_training_loss()`` is treated as a
    running total: what gets printed is the difference between the current
    reading and the reading stored at the end of the previous epoch.
    """
    def __init__(self):
        # Index of the epoch that is about to finish (starts at 0).
        self.epoch = 0
        # Loss reading stored at the end of the previous epoch. Initialised here
        # (instead of lazily inside on_epoch_end) so the attribute always exists;
        # starting at 0 makes the first printed value the raw first reading.
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        """Print the loss gained during the epoch that just ended and store the new reading."""
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [None]:
# Hyperparameters for the Word2Vec model, collected in one place.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm against the installed gensim version.
w2v_hyperparams = dict(
    sg=1,          # architecture: 0 = CBOW, 1 = skip-gram
    size=300,      # dimensionality of the word vectors
    window=2,      # number of words before and after the predicted word
    min_count=5,   # minimum word frequency required to enter the vocabulary
    negative=20,   # number of negative samples (0 disables negative sampling)
    workers=1,     # raise this when more CPU cores are available
)
w2v_model = Word2Vec(**w2v_hyperparams)

In [None]:
# Build the model's vocabulary from the tokenized reviews before training.
w2v_model.build_vocab(sentence_tokens)

In [None]:
# Print the corpus_count value recorded on the model.
print("Cantidad de docs en el corpus:", w2v_model.corpus_count)

Cantidad de docs en el corpus: 50000


In [None]:
# Print how many entries the model's vocabulary holds.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 replaced it with
# `wv.key_to_index` — confirm against the installed gensim version.
print("Cantidad de words distintas en el corpus:", len(w2v_model.wv.vocab))

Cantidad de words distintas en el corpus: 47047


In [None]:
# Train for 5 epochs over the tokenized reviews with loss tracking enabled;
# the `callback` instance prints the loss after every epoch.
epoch_loss_printer = callback()
w2v_model.train(
    sentence_tokens,
    total_examples=w2v_model.corpus_count,
    callbacks=[epoch_loss_printer],
    compute_loss=True,
    epochs=5,
)

Loss after epoch 0: 69473056.0
Loss after epoch 1: 17745800.0
Loss after epoch 2: 17051384.0
Loss after epoch 3: 16084360.0
Loss after epoch 4: 13863128.0


(45229301, 59916375)

In [None]:
# Ten words the trained model ranks as most similar to "boring".
w2v_model.wv.most_similar(positive=["boring"], topn=10)

[('dull', 0.7855067849159241),
 ('tedious', 0.7477730512619019),
 ('ridicules', 0.7065709233283997),
 ('redundant', 0.7020717859268188),
 ('uneventful', 0.6981176733970642),
 ('uninspiring', 0.6918354630470276),
 ('unengaging', 0.6903815865516663),
 ('monotonous', 0.6898486018180847),
 ('uninteresting', 0.6886431574821472),
 ('tiresome', 0.6849414110183716)]

In [None]:
# Ten words the trained model ranks as most similar to "cry".
w2v_model.wv.most_similar(positive=["cry"], topn=10)

[('wince', 0.670398473739624),
 ('giggle', 0.6588656902313232),
 ('weep', 0.6582844257354736),
 ('snicker', 0.6560351848602295),
 ('groan', 0.640190601348877),
 ('howl', 0.6313836574554443),
 ('laugh', 0.6282950639724731),
 ('goosebumps', 0.6040447950363159),
 ('squirm', 0.5948758721351624),
 ('puke', 0.5868561267852783)]

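Las personas pueden encontrar aburrida una película cuando resulta tediosa, monótona, redundante o carente de acontecimientos: esas son las palabras más cercanas a "boring" en el embedding (dull, tedious, redundant, uneventful, monotonous).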

### Visualización

In [None]:
from sklearn.decomposition import IncrementalPCA    
from sklearn.manifold import TSNE                   
import numpy as np                                  

def reduce_dimensions(model, max_words=None):
    """Project the model's word vectors to 2-D with t-SNE.

    Parameters
    ----------
    model
        Object exposing ``model.wv.vectors`` and ``model.wv.index2word``.
    max_words : int or None, optional
        When given, only the first ``max_words`` rows of ``model.wv.vectors``
        (and the matching labels) are projected, which avoids running t-SNE
        on words that will never be plotted. ``None`` (the default) projects
        every vector, as before.
        NOTE(review): t-SNE may reject very small inputs depending on its
        perplexity setting — confirm against the installed scikit-learn.

    Returns
    -------
    tuple
        ``(x_vals, y_vals, labels)``: two lists with the first and second
        t-SNE coordinates, and the array of labels in the same order.
    """
    num_dimensions = 2  # the return value exposes exactly two coordinates

    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index2word)

    if max_words is not None:
        # Truncate before the projection so the expensive step only sees what is needed.
        vectors = vectors[:max_words]
        labels = labels[:max_words]

    # Fixed random_state keeps the projection repeatable between runs.
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    embedded = tsne.fit_transform(vectors)

    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels

In [None]:
import plotly.graph_objects as go
import plotly.express as px

# Project the embeddings to 2-D, then draw a labelled scatter of the first MAX_WORDS points.
MAX_WORDS=200
x_vals, y_vals, labels = reduce_dimensions(w2v_model)

fig = px.scatter(
    x=x_vals[:MAX_WORDS],
    y=y_vals[:MAX_WORDS],
    text=labels[:MAX_WORDS],
)
fig.show(renderer="colab")  # this renderer is for plotly running inside Colab