In [1]:
import pandas as pd

# Load the labelled reviews, preview the first rows and report the table size.
df = pd.read_csv("MovieReview.csv")
display(df.head())
print(df.shape)

# The label column is dropped here; only the review text is used afterwards.
df = df.drop(columns='sentiment')
print(df.shape)



Unnamed: 0,sentiment,review
0,Positive,With all this stuff going down at the moment w...
1,Positive,'The Classic War of the Worlds' by Timothy Hin...
2,Negative,The film starts with a manager (Nicholas Bell)...
3,Negative,It must be assumed that those who praised this...
4,Positive,Superbly trashy and wondrously unpretentious 8...


(25000, 2)
(25000, 1)


In [2]:
import nltk
# Download the tokenizer data and the stopword corpus used by the preprocessing below.
# quiet=False shows the download log; switch to True for a silent run.
for pkg in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(pkg, quiet=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PERCHECMorgan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PERCHECMorgan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PERCHECMorgan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# English stopword list from NLTK, wrapped in a set.
stop_words = set(stopwords.words("english"))


In [4]:
import re
import unicodedata

# Keep the stopwords in a set: preprocess_sentence tests every token of every
# review against this collection, and membership in a plain list is a linear scan.
stop_words = set(stopwords.words('english'))

def unicode_to_ascii(s):
    """Strip combining marks (accents) from ``s``.

    The string is decomposed with NFD normalisation so that accented letters
    become a base letter followed by combining marks, then every character in
    the Unicode category 'Mn' is dropped. Characters that do not decompose
    are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
    """Normalise one review and return it as a space-separated string.

    Steps, in order:
      1. lower-case, trim and strip accents (via unicode_to_ascii);
      2. put spaces around ``? . ! , ¿`` so punctuation is detached from words,
         e.g. "he is a boy." => "he is a boy ."
      3. collapse runs of spaces and double quotes into a single space;
      4. replace everything except letters and ``? . !`` with a space
         (commas and digits are removed at this point);
      5. delete words of at most two characters;
      6. tokenise with NLTK and drop the tokens found in ``stop_words``.
    """
    text = unicode_to_ascii(w.lower().strip())

    # (pattern, replacement) pairs, applied one after the other.
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!]+", " "),
        (r'\b\w{0,2}\b', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Stopword removal.
    kept = []
    for token in word_tokenize(text.strip()):
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept).strip()

# Clean every review; the bare df.head() displays a sample of the cleaned text.
df["review"] = df["review"].apply(preprocess_sentence)
df.head()

Unnamed: 0,review
0,stuff going moment started listening music wat...
1,classic war worlds timothy hines entertaining ...
2,film starts manager nicholas bell giving welco...
3,must assumed praised film greatest filmed oper...
4,superbly trashy wondrously unpretentious explo...


In [5]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp313-cp313-win_amd64.whl.metadata (4.6 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-3.1

In [6]:
import tensorflow as tf
# Fit a Keras tokenizer on the cleaned reviews; num_words=10000 is the vocabulary cap given to it.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df["review"])

In [7]:
# Word <-> index lookup tables produced by the fitted tokenizer.
# NOTE(review): Keras' word_index / index_word presumably cover every word seen
# while fitting, not only the num_words most frequent ones — confirm before using
# these indices to address the embedding matrix.
word2idx = tokenizer.word_index
idx2word = tokenizer.index_word
# Taken from the tokenizer's num_words setting (the value given to its constructor).
vocab_size = tokenizer.num_words

In [8]:
import numpy as np


def sentenceToData(tokens, WINDOW_SIZE):
    """Build (centre word, context words) pairs from one token sequence.

    Only positions that have ``WINDOW_SIZE`` tokens on both sides are used.

    Args:
        tokens: sequence of tokens (word ids in this notebook).
        WINDOW_SIZE: number of neighbours taken on each side of the centre.

    Returns:
        A tuple ``(X, Y)`` where ``X[k]`` is a centre token and ``Y[k]`` is the
        list of its ``2 * WINDOW_SIZE`` neighbours, ordered from the farthest
        right neighbour down to the farthest left one.
    """
    # Offsets -W..-1 and 1..W; the centre itself (offset 0) is left out.
    offsets = [*range(-WINDOW_SIZE, 0), *range(1, WINDOW_SIZE + 1)]
    centres, contexts = [], []
    for position, centre in enumerate(tokens):
        # Skip positions whose window would run past either end of the sequence.
        if position - WINDOW_SIZE < 0 or position + WINDOW_SIZE > len(tokens) - 1:
            continue
        centres.append(centre)
        contexts.append([tokens[position - offset] for offset in offsets])
    return centres, contexts


# Total window length: the centre word plus WINDOW_SIZE // 2 neighbours on each side.
WINDOW_SIZE = 5

X, Y = [], []
half_window = WINDOW_SIZE // 2
for review in df.review:
    for sentence in review.split("."):
        word_list = tokenizer.texts_to_sequences([sentence])[0]
        # Sentences shorter than one full window yield no training pair.
        if len(word_list) < WINDOW_SIZE:
            continue
        # sentenceToData returns (centres, contexts): the contexts are the model
        # inputs (X) and the centre words are the targets (Y).
        centres, contexts = sentenceToData(word_list, half_window)
        X.extend(contexts)
        Y.extend(centres)

X = np.array(X).astype(int)
# One target id per row, stored as a column.
y = np.array(Y).astype(int).reshape([-1, 1])

In [10]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

embedding_dim = 300

# Context word ids in, centre word out: embed the context words, average the
# embeddings, then score every vocabulary entry with a softmax layer.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(vocab_size, activation='softmax'),
])

In [11]:
# The targets are integer word ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, batch_size=128, epochs=50)

Epoch 1/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 15ms/step - accuracy: 0.0314 - loss: 7.6517
Epoch 2/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 19ms/step - accuracy: 0.0582 - loss: 6.9525
Epoch 3/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 19ms/step - accuracy: 0.0754 - loss: 6.5192
Epoch 4/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 19ms/step - accuracy: 0.0888 - loss: 6.1879
Epoch 5/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 19ms/step - accuracy: 0.1006 - loss: 5.9178
Epoch 6/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 18ms/step - accuracy: 0.1114 - loss: 5.6949
Epoch 7/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 20ms/step - accuracy: 0.1210 - loss: 5.5124
Epoch 8/50
[1m12163/12163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 20ms/step - accuracy: 0.1303

<keras.src.callbacks.history.History at 0x1c8d3a78050>

In [12]:
# Persist the trained model to an HDF5 file; a later cell reads this file name back.
model.save("word2vec.h5") 



In [None]:
import streamlit as st
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

from tensorflow.keras import Input

st.title("Modèle Word2Vec")

# Rebuild the architecture used for training before loading the saved weights.
# The explicit Input gives the model a known input shape so Keras creates the
# layer variables right away; a Sequential model without one has no weights
# yet, and load_weights() would have nothing to load into.
# The Input entry is not listed in model.layers, so model.layers[0] is still
# the Embedding layer.
# NOTE: vocab_size comes from the tokenizer cell earlier in the notebook.
embedding_dim = 300
model = Sequential()
model.add(Input(shape=(None,), dtype="int32"))
model.add(Embedding(vocab_size, embedding_dim))
model.add(GlobalAveragePooling1D())
model.add(Dense(vocab_size, activation='softmax'))

model.load_weights("word2vec.h5")

In [None]:
# First trainable weight array of the model's first layer (the Embedding layer), as a NumPy array.
# NOTE(review): presumably one row per word index, shape (vocab_size, embedding_dim) — confirm.
vectors = model.layers[0].trainable_weights[0].numpy()
import numpy as np
from sklearn.preprocessing import Normalizer

def dot_product(vec1, vec2):
    """Return the sum of the element-wise products of ``vec1`` and ``vec2``."""
    elementwise = vec1 * vec2
    return np.sum(elementwise)

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Computed as ``vec1 . vec2 / sqrt((vec1 . vec1) * (vec2 . vec2))``.
    No special handling is done for all-zero vectors.
    """
    numerator = dot_product(vec1, vec2)
    squared_norms = dot_product(vec1, vec1) * dot_product(vec2, vec2)
    return numerator / np.sqrt(squared_norms)

def find_closest(word_index, vectors, number_closest):
    """Rank the rows of ``vectors`` by cosine similarity to row ``word_index``.

    Rows equal to the query row (the query itself, and any exact duplicate of
    it) are left out.

    Returns:
        An array with at most ``number_closest`` rows ``[similarity, row index]``,
        most similar first.
    """
    query_vector = vectors[word_index]
    scored = [
        [cosine_similarity(candidate, query_vector), position]
        for position, candidate in enumerate(vectors)
        if not np.array_equal(candidate, query_vector)
    ]
    scored.sort(reverse=True)
    return np.asarray(scored[:number_closest])

def compare(index_word1, index_word2, index_word3, vectors, number_closest):
    """Rank the rows of ``vectors`` against the analogy ``word1 - word2 + word3``.

    The query is ``vectors[index_word1] - vectors[index_word2] + vectors[index_word3]``,
    scaled to unit L2 norm. The three input rows are excluded from the ranking:
    the original filter compared raw rows with the normalised query, which does
    not match the inputs, so they were never filtered out.

    Args:
        index_word1, index_word2, index_word3: row indices of the three words.
        vectors: 2-D array of embeddings, one row per word index.
        number_closest: maximum number of rows to return.

    Returns:
        An array with at most ``number_closest`` rows ``[similarity, row index]``,
        most similar first.
    """
    query_vector = vectors[index_word1] - vectors[index_word2] + vectors[index_word3]
    # The norm is a constructor option of Normalizer; the second positional
    # argument of fit_transform is the (ignored) target y, not the norm.
    query_vector = Normalizer(norm='l2').fit_transform([query_vector])[0]

    excluded = {index_word1, index_word2, index_word3}
    scored = []
    for index, vector in enumerate(vectors):
        if index in excluded:
            continue
        scored.append([cosine_similarity(vector, query_vector), index])
    return np.asarray(sorted(scored, reverse=True)[:number_closest])

def print_closest(word, number=10):
    """Print the ``number`` nearest neighbours of ``word`` with their cosine similarity.

    Args:
        word: the query word; must be a key of ``word2idx``.
        number: how many neighbours to print.

    Raises:
        KeyError: ``word`` is not in ``word2idx``.
        IndexError: the index of ``word`` is beyond the rows of ``vectors``
            (``word2idx`` can hold more words than there are embedding rows).
    """
    if word not in word2idx:
        raise KeyError(f"'{word}' is not in the tokenizer vocabulary")
    word_index = word2idx[word]
    if word_index >= len(vectors):
        raise IndexError(
            f"'{word}' has index {word_index}, but only {len(vectors)} embeddings are available"
        )

    for similarity, neighbour in find_closest(word_index, vectors, number):
        # find_closest returns a float array, so the row index comes back as a float.
        neighbour = int(neighbour)
        # Some rows have no word attached (Keras reserves index 0); print a
        # placeholder instead of failing on the lookup.
        label = idx2word.get(neighbour, f"<no word for index {neighbour}>")
        print(label, " -- ", similarity)

In [None]:
# Example usage of the print_closest function
print_closest('zombie')