In [75]:
import pandas as pd
import keras
import os
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re
import random
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from scipy.sparse import lil_matrix
import matplotlib.pyplot as plt 
import plotly.graph_objects as go
from tokenizer import CustomTokenizer
from wordcloud import WordCloud
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import log_loss
from utils import plot_wordclouds_by_label, plot_review_length_distribution_plotly, plot_text_length_histograms, describe_text_data,longest_and_shortest_reviews

-> Pre training word vectors (embeddings) using unlabeled data = unsupervised manner.
Model : similar to Word2Vec to learn distributed representations of words (allow to capture semantic relationships between words and synonyms in similar contexts). 
-> Supervised learning using labeled data : word embeddings previously learnt as input features for training sentiment classifier, which was a logistic regression (permits the classifier to learn word embedding to map reviews to sentiment labels). Step for fine-tuning.

### Objectif de l'article : 
 Apprendre des word vectors non pas via des méthodes non supervisées classiques (comme Word2Vec), mais en les supervisant directement via une tâche de classification de sentiment.

 Mise en place modèle :
 - 1 : Tenter approche non supervisée classique Word2Vec classique et observer résultats (moyens?)
 - 2 : Approche de l'article 
 - 3 : Trouver d'autres méthodes état de l'art et tester pour comparer ? 

 Dataset : 
 - phrases et sous-phrases extraites de critiques de film (Rotten Tomatoes)
 - chaque sous-phrase annotée avec score sentiment (fine-grained ou binaire)
 

 Approche principale : 
 Utilisation modèle récursif (RNN) structuré selon grammaire phrases
 Chaque mot représenté par vecteur
 Vecteurs combinés récursivement selon structure syntaxique (parse tree) pour produire représentation de phrase.
 Supervision à chaque noeud de l'arbre = permet ajuster vecteurs de mot selon contribution au sentiment 


A répliquer : 
- Prétraitement : tokenisation, parsing syntaxique (parser de constituants type Stanford Parser), extraction de toutes sous phrases (phrases, clauses, etc.)
- Modèle : Implémenter RNN sur arbres syntaxiques (matrice embedding, fonction de composition f(W[v1;v2] + b), classificateur au-dessus des représentations des noeuds pour prédire le sentiment)
- Entraînement : loss supervisée à chaque noeud (cross-entropy), propagation gradient à travers recherche recursive 

implémentation : https://github.com/stanfordnlp/treelstm

Comparer la perf avec approches modernes : LSTM, BERT, etc.

The Stanford Sentiment Treebank is a corpus with fully labeled parse trees that allows for a complete analysis of the compositional effects of sentiment in language. The corpus is based on the dataset introduced by Pang and Lee (2005) and consists of 11,855 single sentences extracted from movie reviews. It was parsed with the Stanford parser and includes a total of 215,154 unique phrases from those parse trees, each annotated by 3 human judges.

Each phrase is labelled as either negative, somewhat negative, neutral, somewhat positive or positive. The corpus with all 5 labels is referred to as SST-5 or SST fine-grained. Binary classification experiments on full sentences (negative or somewhat negative vs somewhat positive or positive with neutral sentences discarded) refer to the dataset as SST-2 or SST binary.

IMDb (Large Movie Review Dataset)
- 25,000 labeled reviews for training
(Highly polar — i.e., very positive or very negative)
- 25,000 labeled reviews for testing
- 50,000 unlabeled reviews for unsupervised pre-training
- Raw text and bag-of-words format included
- Binary classification only (positive vs. negative)

In [None]:
from keras.datasets import imdb  # preprocessed version


In [None]:
# Load the SST corpus with `load_dataset`; the result is kept in `sst_dataset`.
from datasets import load_dataset

sst_dataset = load_dataset("sst", "default")  # for the fine-grained labels
# Or: load_dataset("sst2") for the binary version


In [None]:
# Load the data with a dictionnary (text is the movie review 
# and label is the positive or negative label)
def load_imdb_data(path, split="train"):
    """Read the labelled reviews of one split into a list of records.

    The function reads ``<path>/<split>/pos`` and ``<path>/<split>/neg``,
    treating every regular file in those folders as one UTF-8 encoded review.

    Args:
        path: Root directory of the dataset.
        split: Name of the sub-directory to read (``"train"`` by default).

    Returns:
        A list of ``{'text': str, 'label': int}`` dicts, where ``label`` is 1
        for files found under ``pos`` and 0 for files found under ``neg``.
        All ``pos`` records come first; inside a folder, records follow the
        sorted file names.

    Raises:
        FileNotFoundError: If one of the two label folders does not exist.
    """
    data = []
    for label in ['pos', 'neg']:
        folder = os.path.join(path, split, label)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # record order (and anything indexed from it) stable between runs.
        for filename in sorted(os.listdir(folder)):
            file_path = os.path.join(folder, filename)
            if not os.path.isfile(file_path):
                continue  # skip stray sub-directories instead of failing in open()
            with open(file_path, encoding='utf-8') as f:
                data.append({
                    'text': f.read(),
                    'label': 1 if label == 'pos' else 0
                })
    return data

# Read both labelled splits from the local "data_movie" directory and report
# how many reviews each one contains.
train_data = load_imdb_data("data_movie", split="train")
test_data = load_imdb_data("data_movie", split="test")
print(f"Loaded {len(train_data)} train reviews.")
print(f"Loaded {len(test_data)} test reviews.") 

In [None]:
# Peek at one raw record, then print the combined size of both labelled splits.
print(train_data[2])
print(len(train_data)+len(test_data))

## **Descriptive analysis** 

First, let us carry a descriptive analysis of the train and test set (and maybe one of the unsupervised set as well since it is used next). 
The goal is to see basic statistics (number of samples, distribution of labels, average text length in tokens), but also textual insights (the most frequent tokens, vocabulary size, longest/shortest reviews and possible outliers), and finally some global visualizations (histogram of text length, bar plot of label distribution).

In [None]:
# Split the records into parallel text / label lists, one pair per labelled set.
train_texts = [sample['text'] for sample in train_data]
train_labels = [sample['label'] for sample in train_data]
test_texts = [sample['text'] for sample in test_data]
test_labels = [sample['label'] for sample in test_data]

In [None]:
# Tokenization: first approach, same as in the paper, using a simple tokenizer
# based on splitting words.
# `describe_text_data` is imported from the project's `utils` module; its
# result is displayed as the cell output.
desc_df = describe_text_data(train_data, test_data, tokenizer=CustomTokenizer())
desc_df

In [None]:

# One vocabulary built over both labelled splits, used for the statistics below.
tokenizer = CustomTokenizer()
tokenizer.build_vocab(train_texts + test_texts)

# --- Most frequent tokens ---
top_tokens = tokenizer.token_freqs.most_common(50)
print("\nTop 50 Most Common Tokens:")
for token, freq in top_tokens:
    print(f"{token:>10} : {freq}")

# --- Vocabulary overlap between the two splits ---
train_vocab = {tok for review in train_texts for tok in tokenizer.tokenize(review)}
test_vocab = {tok for review in test_texts for tok in tokenizer.tokenize(review)}
shared_vocab = train_vocab.intersection(test_vocab)

print(f"\nVocab Overlap: {len(shared_vocab)} shared tokens")
print(f"Train-only tokens: {len(train_vocab.difference(test_vocab))}")
print(f"Test-only tokens:  {len(test_vocab.difference(train_vocab))}\n")


In [None]:
# --- Longest/Shortest Reviews ---
# Helper imported from the project's `utils` module, called on the training
# records with a fresh CustomTokenizer.
longest_and_shortest_reviews(train_data,tokenizer = CustomTokenizer())

In [None]:
# Plot the distribution of the length of the reviews for the train and test
# records (helper imported from the project's `utils` module).
plot_text_length_histograms(train_data,test_data,tokenizer=CustomTokenizer())

In [None]:
# First test of wordcloud: call without an explicit stop_words argument.
plot_wordclouds_by_label(train_texts, train_labels,tokenizer=CustomTokenizer())

# And if we want to add more stop words to see a bit more of the polarity.
# NOTE(review): "br" is presumably a leftover of HTML line breaks in the raw
# reviews — confirm against the data.
plot_wordclouds_by_label(train_texts, train_labels,tokenizer=CustomTokenizer(),stop_words={"br", "the", "and", "is", "it", "to", "of","movie","film","one","character"})

The wordcloud is strange since there is no huge difference between the negative reviews (label 0) and the positive ones (label 1)... That might explain the poor results of the model later. 

In [None]:
# Plot the link between the length of the review and the label
# (training records only; helper imported from the project's `utils` module).
plot_review_length_distribution_plotly(train_data, tokenizer=CustomTokenizer())

### **Unsupervised part**

I use the unsupervised part of the train set in order to learn word embeddings. I will compare the results of known embeddings such as Word2Vec, GloVe, and FastText, Collobert, BERT (contextuel). (Faire visualisations avec T-SNE cf TP2).
Evaluer embeddings avec datasets pré-établis et annotés manuellement comme dans le TP pour justifier du choix d'embedding. 


In [None]:
# Load the unsupervised data 

def load_unsupervised_data(path):
    """Read every unlabelled review stored in ``path``.

    Args:
        path: Directory in which each regular file is one UTF-8 encoded review.

    Returns:
        A list of review texts (plain strings, no labels), ordered by sorted
        file name.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting makes the corpus
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            continue  # skip stray sub-directories instead of failing in open()
        with open(file_path, encoding='utf-8') as f:
            data.append(f.read())  # store only text since they are unlabeled
    return data

# Read the unlabelled reviews stored under the training split and report their number.
unsupervised_data = load_unsupervised_data("data_movie/train/unsup")
print(f"Loaded {len(unsupervised_data)} unsupervised reviews.")

### Tokenization :
 - The paper used a simple tokenizer because BERT did not exist at the time so my first approach will be to replicate this. They build a fixed dictionary of the 5.000 most frequent tokens, but ignore the 50 most frequent terms from the original full vocabulary. They do not stem or remove stop words such as punctuation '!',':-)' since they induce sentiment. 
 - Second approach : Using BERT tokenizer (or another) which has good results, because it gives contextualized embeddings (so richer), is pretrained on massive data, and has a better accuracy. 

 !!! Limit to 30 reviews per movie !!!

**First approach : building a custom tokenizer**
For the first approach, I use regex in order to make punctuation be a token because I could get things like : 
- "wonderful!!!" and if I don't split the punctuation, the token would be ["wonderful!!!"] while I would like it to be ["wonderful","!!!"]
- then I just split by spaces to get the tokens.

In [None]:
# Tokenize from training data to prevent data leakage from the test data and 
# to allow better generalization

# train_texts is recomputed from train_data (same expression as in the
# descriptive-analysis section).
train_texts = [sample['text'] for sample in train_data]
tokenizer = CustomTokenizer()

# Build the vocabulary on the training texts only; the return value of
# build_vocab is kept in vocab_dict.
vocab_dict = tokenizer.build_vocab(train_texts)


# The slice [2:52] skips the first two vocabulary entries (the special tokens,
# according to the message printed here).
# NOTE(review): the enumerate index `i` is never used in the loop body.
print("\n--- Vocabulary Preview (first 50 words after special tokens) ---")
for i, (word, idx) in enumerate(list(tokenizer.vocab.items())[2:52], start=1):
    print(f"{idx:4} : {word}")

In [None]:
# Unsupervised data : tokenization  
# Each unlabelled review becomes a list of tokens produced by the tokenizer
# defined above; this list of lists is later passed to Word2Vec as `sentences`.

unsup_tokenized = [tokenizer.tokenize(text) for text in unsupervised_data]

In [None]:
# Inspect the tokens of the first unlabelled review.
unsup_tokenized[0]

Tokenization seems to work well, and we still have the information of ['don't'] and not ['do','n't'] which is questionable for sentiment analysis. 

The tokenization step is now done so we can encode the dataset to convert each text review into a sequence of integers (token ids) :

In [None]:
# Encoding the data: every review becomes a record holding the tokenizer's
# encoding of its text together with the original label.

encoded_train = [
    {'input_ids': tokenizer.encode(record['text']), 'label': record['label']}
    for record in train_data
]

encoded_test = [
    {'input_ids': tokenizer.encode(record['text']), 'label': record['label']}
    for record in test_data
]

In [None]:
# Inspect one encoded training record (its 'input_ids' and 'label').
print(encoded_train[1])

There is a vocab file from the dataset but I do not use it for now since its size is way larger than 5000 (it must come from more reviews) and I will see first if my vocabulary is enough.

Now that the data is encoded, the next step is to apply some padding in order to have the same length as input for the models.

In [None]:
# Padding based on the largest sentence (token wise): the target length is the
# longest encoded review found in the training set.

max_length = max(len(sample['input_ids']) for sample in encoded_train)
print(f"Maximum sequence length in the training data: {max_length}")


def pad_sequences(sequences, max_length, pad_token_id=0):
    """Bring every sequence to exactly ``max_length`` items.

    Shorter sequences are completed on the right with ``pad_token_id``; longer
    ones are cut after ``max_length`` items. A new list is built for each
    sequence, so the inputs are left untouched.

    Args:
        sequences: Iterable of lists of token ids.
        max_length: Length every returned list will have.
        pad_token_id: Value appended to sequences that are too short.

    Returns:
        A list with one padded or truncated list per input sequence.
    """
    def _fit_to_length(tokens):
        shortfall = max_length - len(tokens)
        if shortfall > 0:
            return tokens + [pad_token_id] * shortfall  # pad to the right
        return tokens[:max_length]

    return [_fit_to_length(tokens) for tokens in sequences]


# Apply padding to the encoded data
# (both splits use the training maximum, so longer test sequences are truncated
# by pad_sequences)
padded_encoded_train = pad_sequences([sample['input_ids'] for sample in encoded_train], max_length)
padded_encoded_test = pad_sequences([sample['input_ids'] for sample in encoded_test], max_length)

# Re-attach each label to its padded sequence; zip pairs them in their original order.
padded_train_data = [{'input_ids': seq, 'label': sample['label']} for seq, sample in zip(padded_encoded_train, encoded_train)]
padded_test_data = [{'input_ids': seq, 'label': sample['label']} for seq, sample in zip(padded_encoded_test, encoded_test)]


In [None]:
# Show the first padded record of each split as a sanity check.
# NOTE(review): each record is max_length ids long, so this output can be very large.
print(f"Example padded train data: {padded_train_data[0]}")
print(f"Example padded test data: {padded_test_data[0]}")

Now that dataset imdb is loaded, we can try to tokenize the text and build a vocab

Let us now train our own embedding in a first approach, instead of using pre-trained embeddings such as GloVe, Collobert, BERT. I use Word2Vec to do so, on the unsupervised data. Then, I will use BERT which will likely result in better performance. 
We want 50-dimensional vectors (embeddings) in conformity with the article.

In [None]:
# Creation of the unsupervised embedding using Word2Vec

# I set vector_size = 50 same as the dimensional vectors of the article 
#and min_count = 5 because they filter words that appear less than 5 times 

# NOTE(review): no `seed` is passed and workers=4, so the learned vectors
# presumably differ from one run to the next — confirm against the gensim
# documentation if reproducible numbers are needed.
model = Word2Vec(sentences=unsup_tokenized,vector_size=50,window=5,min_count=5,workers=4) 
# Persist the trained model; it is reloaded later with Word2Vec.load("word2vec_model").
model.save("word2vec_model")

In [None]:
# Test of the embedding: look up and print the learned vector of a single word.
word_vector = model.wv['good']
print(f"Embedding for 'good': {word_vector}")

Now, let us test the modelled embedding thanks to cosine similarity and some query words : 

In [None]:
# Cosine similarity function 

def cosine_similarity(word1, word2, model):
    """Return the cosine similarity between the embeddings of two words.

    Note: this definition rebinds the name ``cosine_similarity`` that the
    import cell takes from ``sklearn.metrics.pairwise``; cells that need the
    pairwise sklearn function have to import it again.

    Args:
        word1: First word; looked up in ``model.wv``.
        word2: Second word; looked up in ``model.wv``.
        model: Object exposing the word vectors through ``model.wv[word]``.

    Returns:
        ``1 - cosine(v1, v2)``, where ``cosine`` is scipy's cosine distance.
    """
    # scipy's `cosine` is a distance, so it is turned back into a similarity.
    return 1 - cosine(model.wv[word1], model.wv[word2])

In [None]:
# Test with a query word such as 'romantic' like in the article 

query_words = ["sadness","witty","dull","romantic"]
top_n = 5  # number of neighbours printed per query word

# Most similar words
# (most_similar is called on the trained model's vectors; each result is
# unpacked as a (word, similarity) pair)
for w in query_words: 
    similar_words = model.wv.most_similar(w, topn=top_n)

    print(f"\nMost similar words to '{w}':")
    for word, similarity in similar_words:
        print(f"{word}: {similarity:.4f}")


The Word2Vec trained embedding seems to be quite good at finding similar words to the query words in entry !
It captures word semantics based on co-occurrence statistics. However, it does not explicitly capture sentiment information. We will now refine the embeddings to predict sentiment labels (positive or negative polarity) thanks to the labels (supervised part).

## **Other word representations**

- LSA : Another approach consists in Latent Semantic Analysis (LSA). Here, is applied a SVD to a tf.idf weighted, cosine normalized count matrix. 
- LDA : Latent Dirichlet Allocation. 

LSA uses tf-idf ie a heuristic weighting scheme based on frequency and inverse document frequency so not probabilistic contrary to LDA that uses raw term counts to assume documents are mixtures of topics, and each topic a distribution of words.

L'idée est d'abord de créer une matrice de termes (tf-idf), puis de la réduire via SVD (Singular Value Decomposition) pour capturer les structures sémantiques latentes, tout en réduisant la dimensionnalité.

In [None]:
# Build the TF-IDF matrix
tokenizer = CustomTokenizer()
vectorizer = TfidfVectorizer(tokenizer=tokenizer.tokenize)

X = vectorizer.fit_transform(train_texts)  # TF-IDF matrix

#norm_X = X / np.linalg.norm(X, axis=1, keepdims=True)
# crashes here because X is very likely a sparse matrix

# so instead we normalise with sklearn
# Normalisation

norm_X = normalize(X, norm='l2', axis=1)

# SVD decomposition

n_components = 100  # (100-300 is common for LSA)
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_svd = svd.fit_transform(norm_X)

# Extract the word vectors
# Vocabulary (word -> index) as seen by the TfidfVectorizer
vocab = vectorizer.get_feature_names_out()
word_to_index_lsa = {word: idx for idx, word in enumerate(vocab)}

# Transpose to obtain a "word-vector" representation (LSA)
# X_svd is document × components → we want term × components
term_vectors_lsa = svd.components_.T  # shape: (n_terms, n_components)


In [None]:
# Test on query words
# The notebook's own cosine_similarity(word1, word2, model) helper rebinds the
# name imported from sklearn, so calling it with two matrices would raise a
# TypeError. The pairwise sklearn function is therefore imported under an
# alias for this cell.
from sklearn.metrics.pairwise import cosine_similarity as pairwise_cosine_similarity

query_words = ["sadness", "witty", "dull", "romantic"]
top_n = 5

for query_word in query_words:
    if query_word not in word_to_index_lsa:
        print(f"'{query_word}' n'est pas dans le vocabulaire.")
        continue

    query_idx = word_to_index_lsa[query_word]
    query_vector = term_vectors_lsa[query_idx].reshape(1, -1)

    # Cosine similarity between this word and every term vector
    similarities = pairwise_cosine_similarity(query_vector, term_vectors_lsa)[0]

    # Indices of the most similar words, best first
    similar_indices = similarities.argsort()[::-1][1:top_n + 1]  # skip the word itself

    print(f"\nMost similar words to '{query_word}':")
    for idx in similar_indices:
        print(f"{vocab[idx]}: {similarities[idx]:.4f}")


#### **LDA** 
The last test I wanted to implement is the LDA approach (Latent Dirichlet Allocation). It works almost like LSA, but while LSA produces a continuous vector space, LDA produces probability distributions over topics (discrete and interpretable).

In [65]:
# Fresh tokenizer whose vocabulary is built on the training texts
tokenizer = CustomTokenizer()
tokenizer.build_vocab(train_texts)


# Raw term counts, limited to 5000 features through max_features
vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize, max_features=5000)
X_counts = vectorizer.fit_transform(train_texts)

vocab = vectorizer.get_feature_names_out()
word_to_index_lda = {word: idx for idx, word in enumerate(vocab)}

n_topics = 50  # number of topics, following the paper
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X_counts)

# Shape: (n_topics, n_words)
topic_word_distrib_lda = lda.components_  # number of times a word is generated by a topic

# Normalise to obtain a probability distribution (sum = 1) for each word's row
word_topic_vectors_lda = topic_word_distrib_lda.T / topic_word_distrib_lda.T.sum(axis=1, keepdims=True)



The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [None]:
# Test 
# The pairwise sklearn function is imported again here: the name was rebound by
# the cosine_similarity(word1, word2, model) helper defined in the Word2Vec part.
from sklearn.metrics.pairwise import cosine_similarity

query_words = ["sadness", "witty", "dull", "romantic"]
top_n = 5

for query_word in query_words:
    # Query words missing from the vectorizer's feature names are reported and skipped.
    if query_word not in word_to_index_lda:
        print(f"'{query_word}' n'est pas dans le vocabulaire.")
        continue

    query_idx = word_to_index_lda[query_word]
    query_vector = word_topic_vectors_lda[query_idx].reshape(1, -1)

    # Similarity of the query word to every word vector; [::-1] sorts best
    # first and the slice starts at 1 to skip the first (best) entry.
    similarities = cosine_similarity(query_vector, word_topic_vectors_lda)[0]
    similar_indices = similarities.argsort()[::-1][1:top_n+1]

    print(f"\nMost similar words to '{query_word}' (LDA-based):")
    for idx in similar_indices:
        print(f"{vocab[idx]}: {similarities[idx]:.4f}")


Some of the words seem strange or even nonexistent, which is because the closest words are taken from the built vocabulary. 

Now, in order to incorporate the sentiment part, we should modify the learnt word vectors to be sensitive to sentiment. 

So the next step is to train a supervised sentiment classifier on the training set and evaluate it on the test set. We will then fine-tune the initial embeddings from the unsupervised methods (Word2Vec, LSA, LDA). 

## **Sentiment features from word embeddings** 

- First, let us average the word embeddings for each review (with Word2Vec, LSA and LDA, depending on what gives the best results) to obtain a sentence-level feature vector.
- Then, we will try more advanced methods not mentioned in the paper (sentence transformers or tokenizers such as BERT, ...) to obtain better sentence representations.  

In [None]:
# Tokenize both labelled splits with a fresh tokenizer (no vocabulary is built
# here; only tokenize() is used).
tokenizer = CustomTokenizer()
train_tokenized = [tokenizer.tokenize(text) for text in train_texts]
test_tokenized = [tokenizer.tokenize(text) for text in test_texts]

# Targets for the classifiers: the label lists extracted earlier.
y_train = train_labels
y_test = test_labels

### Word2Vec

In [None]:
# For the Word2Vec model 
# Reload the model that the training cell saved under "word2vec_model".
word2vec_model = Word2Vec.load("word2vec_model")

def get_sentence_embedding(sentence, model):
    """Average the embeddings of the known words of a tokenized sentence.

    Args:
        sentence: Iterable of tokens.
        model: Object exposing word vectors through ``model.wv`` (membership
            test and item lookup) and their size through ``model.vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is known.
    """
    # Tokens missing from the embedding vocabulary are simply ignored.
    known_vectors = [model.wv[token] for token in sentence if token in model.wv]

    if not known_vectors:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

# One averaged vector per review, stacked into one array per split.
X_train_word2vec = np.array([get_sentence_embedding(sen, word2vec_model) for sen in train_tokenized])
X_test_word2vec = np.array([get_sentence_embedding(sen, word2vec_model) for sen in test_tokenized])

# Display the training feature matrix as the cell output.
X_train_word2vec

In [None]:
def is_zero_vector(vec):
    """Tell whether every component of ``vec`` compares equal to zero."""
    matches_zero = (vec == 0)
    return np.all(matches_zero)

# Count the training reviews whose averaged embedding is entirely zero, i.e.
# the vector get_sentence_embedding returns when no token was found in the model.
num_zero_vectors = sum(is_zero_vector(vec) for vec in X_train_word2vec)

total_reviews = len(X_train_word2vec)

# Percentage of zero vectors (to check that vocabulary is rich enough)
percentage_zero = (num_zero_vectors / total_reviews) * 100
print(f"Number of zero vectors: {num_zero_vectors}/{total_reviews} ({percentage_zero:.2f}%)")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Use logistic regression bc vectors are dense and continuous, 
# so work well on very dense, low-dim input 

# Train a logistic regression model
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_word2vec, y_train)

# Evaluate the classifier on the held-out test set. The predictions come from
# X_test_word2vec, so they must be scored against y_test (they were previously
# compared with y_train, the labels of a different set of reviews).
y_pred = classifier.predict(X_test_word2vec)
accuracy = accuracy_score(y_test, y_pred)

print(f"Test accuracy with Word2Vec embeddings: {accuracy:.4f}")

We get a really good score with the logistic regression and word2vec model !

### LSA

In [73]:
# Build the TF-IDF matrix
# The TF-IDF vectorizer gets its own name: the LDA feature cell further down
# calls `vectorizer.transform(test_texts)` and needs the CountVectorizer that
# `lda` was fitted with. Rebinding `vectorizer` here would break that cell when
# the notebook is run from top to bottom.
tokenizer = CustomTokenizer()
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenizer.tokenize)

X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)  # TF-IDF matrix

norm_X = normalize(X_train_tfidf, norm='l2', axis=1)

# SVD decomposition

n_components = 100  # (100-300 is common for LSA)
lsa_model = TruncatedSVD(n_components=n_components, random_state=42)
X_train_lsa = lsa_model.fit_transform(norm_X)

# Here I use .transform(), and not .fit_transform() because the model is already fitted
X_test_tfidf = tfidf_vectorizer.transform(test_texts)
norm_X_test = normalize(X_test_tfidf, norm='l2', axis=1)
X_test_lsa = lsa_model.transform(norm_X_test)

#Shape (25.000,100) so logical with 25.000 reviews compressed into a 100-dimensional space 
X_train_lsa.shape


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



(25000, 100)

In [79]:
# Train a logistic regression classifier using the LSA features
classifier_lsa = LogisticRegression(max_iter=1000, C=1.0) # C is a regularization adjustment
classifier_lsa.fit(X_train_lsa, y_train)


# Score the predictions made on the test features against the test labels.
y_pred_lsa = classifier_lsa.predict(X_test_lsa)
accuracy_lsa = accuracy_score(y_test, y_pred_lsa)

print(f"Test accuracy with LSA embeddings: {accuracy_lsa:.4f}")

Test accuracy with LSA embeddings: 0.8428


In [81]:
# Cross-validate the LSA classifier with cv=5, using the training features and
# labels only; the array of scores is displayed as the cell output.
from sklearn.model_selection import cross_val_score
cross_val_score(classifier_lsa, X_train_lsa, y_train, cv=5)

array([0.8568, 0.8456, 0.8434, 0.8368, 0.8414])

Test accuracy and cross-validation scores seem encouraging for LSA as well ! 

### LDA 

In [66]:
from sklearn.decomposition import LatentDirichletAllocation

# Since I had already fit the LDA model, let us do the transform directly 
# (`lda`, `X_counts` and `vectorizer` come from the LDA cell of the "Other word
# representations" part; `vectorizer` has to be that cell's CountVectorizer so
# that the test features line up with the ones `lda` was fitted on.)

# Topic distributions on the training and test set 
X_train_lda = lda.transform(X_counts)
X_test_counts = vectorizer.transform(test_texts) 
X_test_lda = lda.transform(X_test_counts)

In [67]:
# Train a logistic regression classifier using the LDA features
classifier_lda = LogisticRegression(max_iter=1000,C = 1.)
classifier_lda.fit(X_train_lda, y_train)

# Evaluate the classifier on the test set: the predictions are made from
# X_test_lda and scored against y_test, so this is a test accuracy.
y_pred_lda = classifier_lda.predict(X_test_lda)
test_accuracy_lda = accuracy_score(y_test, y_pred_lda)
# Kept under its former (misleading) name in case another cell still reads it.
train_accuracy_lda = test_accuracy_lda

print(f"Test accuracy with LDA embeddings: {test_accuracy_lda:.4f}")

Training accuracy with LDA embeddings: 0.8074


In [82]:
# Let us compute the cross-entropy losses for the three models, which is better 
#suited to classification 
# For each model, column 1 of predict_proba is passed to log_loss together with
# the test labels. NOTE(review): with labels 0/1 that column is presumably the
# probability of the positive class — confirm via classifier.classes_.

# Word2Vec
y_test_prob_word2vec = classifier.predict_proba(X_test_word2vec)[:, 1]
cross_entropy_loss_word2vec = log_loss(y_test, y_test_prob_word2vec)
print(f"Cross-Entropy Loss for Word2Vec model: {cross_entropy_loss_word2vec:.4f}")

# LSA
y_test_prob_lsa = classifier_lsa.predict_proba(X_test_lsa)[:, 1]
cross_entropy_loss_lsa = log_loss(y_test, y_test_prob_lsa)
print(f"Cross-Entropy Loss for LSA model: {cross_entropy_loss_lsa:.4f}")

# LDA 
y_test_prob_lda = classifier_lda.predict_proba(X_test_lda)[:, 1]
cross_entropy_loss_lda = log_loss(y_test, y_test_prob_lda)
print(f"Cross-Entropy Loss for LDA model: {cross_entropy_loss_lda:.4f}")


Cross-Entropy Loss for Word2Vec model: 0.4034
Cross-Entropy Loss for LSA model: 0.3757
Cross-Entropy Loss for LDA model: 0.4329


In [86]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix


def print_classification_metrics(model_name, y_true, y_pred):
    """Print precision, recall, F1-score and the confusion matrix of one model.

    Args:
        model_name: Label inserted in every printed line.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    print(f"Precision for {model_name}: {precision_score(y_true, y_pred)}")
    print(f"Recall for {model_name}: {recall_score(y_true, y_pred)}")
    print(f"F1-Score for {model_name}: {f1_score(y_true, y_pred)}")
    print(f"Confusion Matrix for {model_name}:\n {confusion_matrix(y_true, y_pred)}\n")


# The same report is produced for the three feature sets; every classifier is
# evaluated on the test features against the test labels.

# Word2Vec
y_pred_word2vec = classifier.predict(X_test_word2vec)
print_classification_metrics("Word2Vec", y_test, y_pred_word2vec)

# LSA
y_pred_lsa = classifier_lsa.predict(X_test_lsa)
print_classification_metrics("LSA", y_test, y_pred_lsa)

# LDA
y_pred_lda = classifier_lda.predict(X_test_lda)
print_classification_metrics("LDA", y_test, y_pred_lda)

Precision for Word2Vec: 0.8240402564726889
Recall for Word2Vec: 0.81224
F1-Score for Word2Vec: 0.8180975786632287
Confusion Matrix for Word2Vec:
 [[10332  2168]
 [ 2347 10153]]

Precision for LSA: 0.8361967521769829
Recall for LSA: 0.85272
F1-Score for LSA: 0.8443775498078979
Confusion Matrix for LSA:
 [[10412  2088]
 [ 1841 10659]]

Precision for LDA: 0.8024720516454101
Recall for LDA: 0.81544
F1-Score for LDA: 0.8089040552337116
Confusion Matrix for LDA:
 [[ 9991  2509]
 [ 2307 10193]]



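On these metrics the LSA features actually come out slightly ahead (cross-entropy 0.3757, F1 0.844), with Word2Vec close behind (0.4034, 0.818) and LDA last (0.4329, 0.809). Word2Vec remains a strong option, since its embeddings are trained on real reviews and not only on probability distributions that might not be enough given the size of the data.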

## **Sentiment classifier** 

Now, let us train a classifier (logistic regression, SVM or NN) that takes the sentence-level embeddings as input and outputs a binary sentiment prediction. The classifier’s objective is to minimize the binary cross-entropy loss (for classification) and potentially a regularization term that prevents overfitting.

## **Fine-tuning Word Vectors**

- During the training process, backpropagate the error from the sentiment classifier through the word embeddings to fine-tune them.
- This way, the word embeddings are updated not only based on word co-occurrence (from unsupervised training) but also based on their effectiveness in predicting sentiment.

## Document Vector Construction

Each document has a latent vector d and each word a latent vector w. In the paper, they model P(w|d), the probability of a word given the document. After training, they use the inferred document vector d for classification (which is not a sum or average of word embeddings). For inference, they use Bayesian inference (MAP estimation) to infer these document vectors. 

2 options : 
- Replication using Bayesian unsupervised model 
- Use a modern equivalent like Doc2Vec (DM mode) : it trains document embeddings just like the paper but more easily 

In [None]:
# Prepare embeddings for model

def prepare_embeddings(data, word2vec_model, max_sequence_length=100, embedding_dim=50,
                       batch_size=1000, id_to_token=None):
    """Turn encoded reviews into a dense array of word vectors.

    Args:
        data: Sequence of dicts, each holding the review tokens under
            ``'input_ids'``.
        word2vec_model: Model exposing its vectors through ``.wv`` (with a
            ``key_to_index`` mapping and item lookup).
        max_sequence_length: Number of positions kept per review. Longer
            reviews are truncated, shorter ones are completed with zero vectors.
        embedding_dim: Size of each word vector.
        batch_size: Kept for backward compatibility only. The result never
            depended on it and it is now ignored.
        id_to_token: Optional mapping from the items of ``'input_ids'`` to the
            keys of the embedding vocabulary. It is needed when ``'input_ids'``
            holds integer ids, because the vocabulary is looked up by key:
            without the translation no id is ever found and every vector stays
            at zero. When ``None``, the items are used as keys directly.

    Returns:
        A float array of shape ``(len(data), max_sequence_length,
        embedding_dim)``. Positions holding an unknown item (or padding) are
        zero vectors.
    """
    vectors = word2vec_model.wv
    # Preallocating the result avoids holding a list of per-review arrays next
    # to the final array; unknown and padded positions simply stay at zero.
    embeddings = np.zeros((len(data), max_sequence_length, embedding_dim))

    for row, sentence in enumerate(data):
        # Only the first max_sequence_length items can end up in the output,
        # so the rest of the sequence is not looked up at all.
        kept_items = sentence['input_ids'][:max_sequence_length]
        for position, item in enumerate(kept_items):
            key = item if id_to_token is None else id_to_token.get(item)
            if key in vectors.key_to_index:
                embeddings[row, position] = vectors[key]

    return embeddings


# The padded reviews hold the ids produced by the tokenizer fitted on the
# training texts, whereas the Word2Vec model was trained on token strings.
# The vocabulary is rebuilt and inverted so the ids can be translated back
# before the lookup.
# NOTE(review): assumes CustomTokenizer.build_vocab(train_texts) is
# deterministic and that `vocab` maps each token to the id used by encode() —
# confirm against tokenizer.py.
embedding_tokenizer = CustomTokenizer()
embedding_tokenizer.build_vocab(train_texts)
id_to_token = {idx: word for word, idx in embedding_tokenizer.vocab.items()}

train_embedded = prepare_embeddings(padded_train_data, model, id_to_token=id_to_token)
test_embedded = prepare_embeddings(padded_test_data, model, id_to_token=id_to_token)


1. Preprocessing
   ⤷ Tokenize reviews
   ⤷ Build vocab

2. Word Vector Initialization
   ⤷ Random or GloVe init
   ⤷ Embeddings will be fine-tuned via supervision

3. Context Window or Full Review
   ⤷ Either train on word windows (as in the paper)
   ⤷ Or process whole review as a sequence

4. Classifier
   ⤷ Logistic regression (window-level)
   ⤷ OR CNN/LSTM over full review

5. Training
   ⤷ Cross-entropy loss
   ⤷ SGD or Adam
