In [1]:
import pandas as pd

# Read the review dataset, preview it, then discard the sentiment label so
# that only the review text is carried forward.
df = pd.read_csv("MovieReview.csv")
display(df.head())
print(df.shape)

df = df.drop(columns=["sentiment"])

Unnamed: 0,sentiment,review
0,Positive,With all this stuff going down at the moment w...
1,Positive,'The Classic War of the Worlds' by Timothy Hin...
2,Negative,The film starts with a manager (Nicholas Bell)...
3,Negative,It must be assumed that those who praised this...
4,Positive,Superbly trashy and wondrously unpretentious 8...


(25000, 2)


In [2]:
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# One-time setup: uncomment the next line to fetch NLTK data if the lookups
# below fail because resources are missing.
# NOTE(review): presumably the "stopwords" corpus and the tokenizer models used
# by word_tokenize are what is needed here — confirm and download only those.
# nltk.download()
# English stopword list; preprocess_sentence filters tokens against it.
stop_words = stopwords.words("english")


def unicode_to_ascii(s):
    """Remove combining marks (accents) from a string.

    The text is decomposed with NFD normalisation so that accented letters
    split into a base character plus combining marks; every character whose
    Unicode category is "Mn" is then discarded. Characters that have no such
    decomposition are returned unchanged, so the result is not guaranteed to
    be pure ASCII.

    Args:
        s: Input text.

    Returns:
        The text with all "Mn" characters removed.
    """
    base_chars = []
    for ch in unicodedata.normalize("NFD", s):
        if unicodedata.category(ch) == "Mn":
            continue
        base_chars.append(ch)
    return "".join(base_chars)


def preprocess_sentence(w):
    """Normalise one raw review into a lowercase, stopword-free string.

    Steps, in order:
      1. lowercase, strip, and remove accents via ``unicode_to_ascii``;
      2. surround ``? . ! , ¿`` with spaces;
      3. collapse runs of spaces / double quotes into one space;
      4. replace every run of characters other than ASCII letters and
         ``? . !`` with a single space;
      5. delete words of one or two characters;
      6. tokenize with ``word_tokenize`` and drop tokens present in the
         module-level ``stop_words``.

    Args:
        w: Raw text of a single review (``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    w = unicode_to_ascii(w.lower().strip())
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!");
    # the "," and "¿" that were padded above are therefore removed here
    w = re.sub(r"[^a-zA-Z?.!]+", " ", w)
    # drop words that are at most two characters long
    w = re.sub(r"\b\w{0,2}\b", "", w)

    # remove stopwords; a set gives constant-time membership tests instead of
    # scanning the whole stopword list once per token
    stop_set = set(stop_words)
    mots = [mot for mot in word_tokenize(w.strip()) if mot not in stop_set]
    return " ".join(mots).strip()


# Clean every review in place (same column, preprocessed text) and preview.
df["review"] = df["review"].apply(preprocess_sentence)
df.head()

Unnamed: 0,review
0,stuff going moment started listening music wat...
1,classic war worlds timothy hines entertaining ...
2,film starts manager nicholas bell giving welco...
3,must assumed praised film greatest filmed oper...
4,superbly trashy wondrously unpretentious explo...


In [11]:
import tensorflow as tf

# Build the word index from the cleaned reviews; num_words=10000 is the
# vocabulary budget later reused as the model's vocab_size.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10_000)
tokenizer.fit_on_texts(df["review"])

In [12]:
vocab_size = tokenizer.num_words

# tokenizer.word_index / index_word list every word seen during fitting, but
# texts_to_sequences only emits ids strictly below num_words. Keep just that
# range so the lookup tables match the ids (and embedding rows) the model
# actually uses. When num_words is unset, nothing is filtered.
word2idx = {
    word: idx
    for word, idx in tokenizer.word_index.items()
    if not vocab_size or idx < vocab_size
}
idx2word = {
    idx: word
    for idx, word in tokenizer.index_word.items()
    if not vocab_size or idx < vocab_size
}

print(vocab_size)

10000


In [13]:
import pickle

# Persist both lookup tables, one pickle file per mapping.
for pickle_path, mapping in (
    ("word2idx.pkl", word2idx),
    ("idx2word.pkl", idx2word),
):
    with open(pickle_path, "wb") as handle:
        pickle.dump(mapping, handle)

In [9]:
import numpy as np


def sentenceToData(tokens, WINDOW_SIZE):
    """Pair each interior token with the tokens surrounding it.

    A token is used as a centre only when WINDOW_SIZE neighbours exist on
    both of its sides; tokens too close to either end are skipped.

    Args:
        tokens: Indexable sequence of tokens for one sentence.
        WINDOW_SIZE: Number of neighbours taken on each side of the centre.

    Returns:
        A tuple ``(X, Y)``. ``X`` lists the qualifying centre tokens and
        ``Y[k]`` lists the neighbours of ``X[k]``: right-hand neighbours
        from farthest to nearest, then left-hand neighbours from nearest to
        farthest.
    """
    # Offsets are subtracted from the centre position, so the negative ones
    # reach to the right of the centre and the positive ones to the left.
    offsets = [step for step in range(-WINDOW_SIZE, WINDOW_SIZE + 1) if step != 0]
    last_position = len(tokens) - 1

    centres, contexts = [], []
    for position, centre in enumerate(tokens):
        if position - WINDOW_SIZE < 0 or position + WINDOW_SIZE > last_position:
            continue  # not enough neighbours on one side
        centres.append(centre)
        contexts.append([tokens[position - step] for step in offsets])
    return centres, contexts


# Full window width: the centre word plus WINDOW_SIZE // 2 neighbours per side.
WINDOW_SIZE = 5
half_window = WINDOW_SIZE // 2

# X collects the context-word ids (model input) and Y the centre-word id
# (prediction target). sentenceToData returns (centres, contexts), which is
# why its two results are routed to Y and X respectively.
X, Y = [], []
for review_text in df.review:
    for sentence_text in review_text.split("."):
        token_ids = tokenizer.texts_to_sequences([sentence_text])[0]
        if len(token_ids) < WINDOW_SIZE:
            continue  # sentence shorter than one full window
        centres, contexts = sentenceToData(token_ids, half_window)
        X += contexts
        Y += centres

X = np.array(X).astype(int)
y = np.array(Y).astype(int).reshape([-1, 1])

In [11]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

embedding_dim = 300

# Embed each input id, average the embeddings over the sequence axis, then
# score every vocabulary entry with a softmax layer.
model = Sequential(
    [
        Embedding(vocab_size, embedding_dim),
        GlobalAveragePooling1D(),
        Dense(vocab_size, activation='softmax'),
    ]
)

In [12]:
# Targets in y are integer ids, hence the sparse categorical cross-entropy.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.fit(x=X, y=y, epochs=50, batch_size=128)

Epoch 1/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 14ms/step - accuracy: 0.0313 - loss: 7.6524
Epoch 2/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 15ms/step - accuracy: 0.0582 - loss: 6.9517
Epoch 3/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 19ms/step - accuracy: 0.0753 - loss: 6.5189
Epoch 4/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 15ms/step - accuracy: 0.0888 - loss: 6.1886
Epoch 5/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 15ms/step - accuracy: 0.1003 - loss: 5.9189
Epoch 6/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 15ms/step - accuracy: 0.1109 - loss: 5.6966
Epoch 7/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 15ms/step - accuracy: 0.1209 - loss: 5.5140
Epoch 8/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 15ms/step - accuracy: 0.1303

<keras.src.callbacks.history.History at 0x1fedd9a6410>

In [13]:
# Write the trained model to word2vec.h5 in the working directory.
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format —
# confirm this is the intended on-disk format for downstream loaders.
model.save("word2vec.h5")

