In [2]:
import pandas as pd
import keras
import os
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re
import random
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

-> Pre training word vectors (embeddings) using unlabeled data = unsupervised manner.
Model : similar to Word2Vec to learn distributed representations of words (allow to capture semantic relationships between words and synonyms in similar contexts). 
-> Supervised learning using labeled data : word embeddings previously learnt as input features for training sentiment classifier, which was a logistic regression (permits the classifier to learn word embedding to map reviews to sentiment labels). Step for fine-tuning.

### Objectif de l'article : 
 Apprendre des word vectors non pas via des méthodes non supervisées classiques (comme Word2Vec), mais en les supervisant directement via une tâche de classification de sentiment.

 Mise en place modèle :
 - 1 : Tenter approche non supervisée classique Word2Vec classique et observer résultats (moyens?)
 - 2 : Approche de l'article 
 - 3 : Trouver d'autres méthodes état de l'art et tester pour comparer ? 

 Dataset : 
 - phrases et sous-phrases extraites de critiques de film (Rotten Tomatoes)
 - chaque sous-phrase annotée avec score sentiment (fine-grained ou binaire)
 

 Approche principale : 
 Utilisation modèle récursif (RNN) structuré selon grammaire phrases
 Chaque mot représenté par vecteur
 Vecteurs combinés récursivement selon structure syntaxique (parse tree) pour produire représentation de phrase.
 Supervision à chaque noeud de l'arbre = permet ajuster vecteurs de mot selon contribution au sentiment 


A répliquer : 
- Prétraitement : tokenisation, parsing syntaxique (parser de constituants type Stanford Parser), extraction de toutes sous phrases (phrases, clauses, etc.)
- Modèle : Implémenter RNN sur arbres syntaxiques (matrice embedding, fonction de composition f(W[v1;v2] +b), classificateur au dessus représnetations des noeuds pour prédire sentiment)
- Entraînement : loss supervisée à chaque noeud (cross-entropy), propagation gradient à travers recherche recursive 

implémentation : https://github.com/stanfordnlp/treelstm

Comparer la perf avec approches modernes : LSTM, BERT, etc.

The Stanford Sentiment Treebank is a corpus with fully labeled parse trees that allows for a complete analysis of the compositional effects of sentiment in language. The corpus is based on the dataset introduced by Pang and Lee (2005) and consists of 11,855 single sentences extracted from movie reviews. It was parsed with the Stanford parser and includes a total of 215,154 unique phrases from those parse trees, each annotated by 3 human judges.

Each phrase is labelled as either negative, somewhat negative, neutral, somewhat positive or positive. The corpus with all 5 labels is referred to as SST-5 or SST fine-grained. Binary classification experiments on full sentences (negative or somewhat negative vs somewhat positive or positive with neutral sentences discarded) refer to the dataset as SST-2 or SST binary.

lMD 
- 25,000 labeled reviews for training
(Highly polar — i.e., very positive or very negative)
- 25,000 labeled reviews for testing
- 50,000 unlabeled reviews for unsupervised pre-training
- Raw text and bag-of-words format included
- Binary classification only (positive vs. negative)

In [None]:
from keras.datasets import imdb  # preprocessed version


In [None]:
from datasets import load_dataset

sst_dataset = load_dataset("sst", "default")  # pour fine-grained
# Ou : load_dataset("sst2") pour version binaire


### **Unsupervised part**

I use the unsupervised part of the train set in order to learn word embeddings. I will compare the results of known embeddings such as Word2Vec, GloVe, and FastText, Collobert, BERT (contextuel). (Faire visualisations avec T-SNE cf TP2).
Evaluer embeddings avec datasets pré-établis et annotés manuellement comme dans le TP pour justifier du choix d'embedding. 


In [3]:
import os

# Load the data with a dictionnary (text is the movie review 
# and label is the positive or negative label)
def load_imdb_data(path, split="train"):
    data = []
    for label in ['pos', 'neg']:
        folder = os.path.join(path, split, label)
        for filename in os.listdir(folder):
            with open(os.path.join(folder, filename), encoding='utf-8') as f:
                text = f.read()
                data.append({
                    'text': text,
                    'label': 1 if label == 'pos' else 0
                })
    return data

train_data = load_imdb_data("data_movie", split="train")
test_data = load_imdb_data("data_movie", split="test")

In [4]:
# Load the unsupervised data 

def load_unsupervised_data(path):
    data = []
    for filename in os.listdir(path):
        with open(os.path.join(path, filename), encoding='utf-8') as f:
            text = f.read()
            data.append(text)  # store only text since they are unlabeled
    return data

unsupervised_data = load_unsupervised_data("data_movie/train/unsup")
print(f"Loaded {len(unsupervised_data)} unsupervised reviews.")

Loaded 50000 unsupervised reviews.


In [5]:
print(train_data[2])
print(len(train_data)+len(test_data))

{'text': 'A solid, if unremarkable film. Matthau, as Einstein, was wonderful. My favorite part, and the only thing that would make me go out of my way to see this again, was the wonderful scene with the physicists playing badmitton, I loved the sweaters and the conversation while they waited for Robbins to retrieve the birdie.', 'label': 1}
50000


Some movies receive substantially more reviews than others so limit to 30 reviews from any movie in the collection. Initial paper did not use any tokenizer since BERT did not exist at the time so tokenized simply => can try to use BERT tokenizer now ? 


### Tokenization :
 - The paper used a simple tokenizer because BERT did not exist at the time so my first approach will be to replicate this. They build a fixed dictionary of the 5.000 most frequent tokens, but ignore the 50 most frequent terms from the original full vocabulary. They do not stem or remove stop words such as punctuation '!',':-)' since they induce sentiment. 
 - Second approach : Using BERT tokenizer (or another) which has good results, because it gives contextualized embeddings (so richer), is pretrained on massive data, and has a better accuracy. 

For the first approach, I use regex in order to make punctuation be a token because I could get things like : 
- "wonderful!!!" and if I don't split the punctuation, the token would be ["wonderful!!!"] while I would like it to be ["wonderful","!!!"]
- then I just split by spaces to get the tokens.

In [6]:
# Tokenization : first approach same as in the paper, using a simple tokenizer
# based on splitting words 

import re
from collections import Counter, defaultdict

class CustomTokenizer:
    # the size of the vocabulary is 5000, and we skip the 50 most frequent tokens 
    def __init__(self, max_vocab_size=5000, skip_top=50):
        self.max_vocab_size = max_vocab_size
        self.skip_top = skip_top
        self.vocab = {}
        self.token_freqs = Counter() # for frequency
        self.special_tokens = {"<PAD>": 0, "<UNK>": 1}

    def tokenize(self, text):
        text = text.lower()
        text = re.sub(r'([!?.,:;()"])', r' \1 ', text)  # isolate punctuation 
        text = re.sub(r"\s+", " ", text)  # clean multiple spaces
        return text.strip().split()

    def build_vocab(self, texts):
        # count all tokens
        for text in texts:
            tokens = self.tokenize(text)
            self.token_freqs.update(tokens)

        # build vocab
        most_common = self.token_freqs.most_common(self.skip_top + self.max_vocab_size)
        filtered_tokens = most_common[self.skip_top:]  # to skip the skip_top first tokens 

        # start vocab with special tokens for padding and unknown
        self.vocab = dict(self.special_tokens)
        for idx, (token, _) in enumerate(filtered_tokens, start=len(self.vocab)):
            self.vocab[token] = idx 

    def encode(self, text):
        tokens = self.tokenize(text)
        return [self.vocab.get(t, self.vocab["<UNK>"]) for t in tokens]

In [7]:
# Tokenize from training data to prevent data leakage from the test data and 
# to allow better generalization

train_texts = [sample['text'] for sample in train_data]
tokenizer = CustomTokenizer()

tokenizer.build_vocab(train_texts)

print("\n--- Vocabulary Preview (first 50 words after special tokens) ---")
for i, (word, idx) in enumerate(list(tokenizer.vocab.items())[2:52], start=1):
    print(f"{idx:4} : {word}")


--- Vocabulary Preview (first 50 words after special tokens) ---
   2 : it's
   3 : ?
   4 : if
   5 : some
   6 : there
   7 : what
   8 : good
   9 : more
  10 : very
  11 : when
  12 : she
  13 : even
  14 : up
  15 : no
  16 : time
  17 : would
  18 : my
  19 : which
  20 : only
  21 : really
  22 : story
  23 : their
  24 : had
  25 : see
  26 : can
  27 : were
  28 : me
  29 : :
  30 : than
  31 : we
  32 : much
  33 : -
  34 : well
  35 : been
  36 : get
  37 : will
  38 : into
  39 : bad
  40 : people
  41 : other
  42 : because
  43 : do
  44 : also
  45 : great
  46 : him
  47 : how
  48 : first
  49 : most
  50 : don't
  51 : made


In [15]:
# Unsupervised data : tokenization  

unsup_tokenized = [tokenizer.tokenize(text) for text in unsupervised_data]

Tokenization seems to work well, and we still have the information of ['don't'] and not ['do','n't'] which is questionable for sentiment analysis. 

The tokenization step is now done so we can encode the dataset to convert each text review into a sequence of integers (token ids) :

In [9]:
# Encoding the data 

encoded_train = []
for sample in train_data:
    encoded = tokenizer.encode(sample['text'])
    encoded_train.append({
        'input_ids': encoded,
        'label': sample['label']
    })

encoded_test = []
for sample in test_data:
    encoded = tokenizer.encode(sample['text'])
    encoded_test.append({
        'input_ids': encoded,
        'label': sample['label']
    })

In [10]:
print(encoded_train[1])

{'input_ids': [1100, 160, 1, 1101, 1, 747, 1511, 1, 2517, 1, 1, 1, 1, 264, 1, 1, 1, 1, 1285, 1, 1, 1, 1, 142, 1, 592, 1, 2241, 1, 1, 1, 1753, 1, 1, 1, 1, 973, 1, 1, 3273, 1, 1, 1, 1, 1, 1, 566, 1, 1, 98, 1, 1, 1, 1, 10, 34, 1931, 1, 1, 1249, 188, 1, 361, 1, 1, 1, 1, 1163, 1, 1, 4950, 1, 1, 11, 1, 1050, 38, 1, 902, 4186, 4951, 1, 1, 1, 1, 1, 1973, 2242, 1, 1, 380, 1800, 1, 1, 141, 21, 333, 1, 1, 1, 4684, 1, 539, 1, 1, 3705, 1, 1, 1, 1, 331, 1, 2009, 150, 1, 1, 1, 1, 1, 1, 131, 1, 1, 1, 1, 1, 45, 208, 1, 1, 1, 521, 1229, 1, 1, 1, 1, 705, 1, 1, 1, 1, 1, 871, 1, 1, 1, 1, 1, 1, 1, 7, 1, 2114, 1, 423, 1, 1, 1, 1, 4078, 1, 1, 1, 1, 1, 1, 1, 1, 3602, 1, 345, 1, 1, 777, 90, 34, 1, 136, 454, 2243, 1, 1, 1, 1, 1, 1, 235, 1, 2626, 1, 1, 1, 2115], 'label': 1}


There is a vocab file from the dataset but I do not use it for now since its size is way larger than 5000 (it must come from more reviews) and I will see first if my vocabulary is enough.

Now that the data is encoded, the next step is to apply some padding in order to have the same length as input for the models.

In [11]:
# Padding based on the largest sentence (token wise)

max_length = max(len(sample['input_ids']) for sample in encoded_train)
print(f"Maximum sequence length in the training data: {max_length}")


def pad_sequences(sequences, max_length, pad_token_id=0):
    padded_sequences = []
    for seq in sequences:
        padding = [pad_token_id] * (max_length - len(seq))  # Pad to the right
        padded_sequences.append(seq + padding if len(seq) < max_length else seq[:max_length])
    return padded_sequences


# Apply padding to the encoded data
padded_encoded_train = pad_sequences([sample['input_ids'] for sample in encoded_train], max_length)
padded_encoded_test = pad_sequences([sample['input_ids'] for sample in encoded_test], max_length)

padded_train_data = [{'input_ids': seq, 'label': sample['label']} for seq, sample in zip(padded_encoded_train, encoded_train)]
padded_test_data = [{'input_ids': seq, 'label': sample['label']} for seq, sample in zip(padded_encoded_test, encoded_test)]


Maximum sequence length in the training data: 2719


In [12]:
print(f"Example padded train data: {padded_train_data[0]}")
print(f"Example padded test data: {padded_test_data[0]}")

Example padded train data: {'input_ids': [1, 1, 1, 1, 171, 15, 1129, 6, 209, 1, 1, 131, 1, 858, 4297, 3472, 1, 1, 1440, 1, 793, 1, 1, 78, 870, 1, 1, 121, 122, 1, 1, 1, 1, 1, 95, 1, 1, 1, 1, 69, 1, 1, 1486, 1978, 1, 65, 1, 1558, 1, 1, 1, 1, 1, 1712, 1, 1, 1, 521, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Now that dataset imdb is loaded, we can try to tokenize the text and build a vocab

Let us now train our own embedding in a first approach, instead of using pre trained embeddings such as GLoVE, Collobert, BERT. I use Word2Vec to do so, on the unsupervised data. Then, I will use BERT which will likely result in better performance. 
We want 50-dimensional vectors (embeddings) in conformity with the article.

In [16]:
# Creation of the unsupervised embedding using Word2Vec

# I set vector_size = 50 same as the dimensional vectors of the article
model = Word2Vec(sentences=unsup_tokenized,vector_size=50,window=5,min_count=1,workers=4) 
model.save("word2vec_model")

In [13]:

# Test of the embedding 
word_vector = model.wv['good']
print(f"Embedding for 'good': {word_vector}")

Embedding for 'good': [-0.02668276  1.6304002   3.0908344   2.0695004  -2.6603909   5.886345
 -0.95599335 -3.1640143   0.51996505 -0.92471564 -1.0598339   0.092731
  0.2507879   0.1548105  -2.5982487   1.6632124   1.1573884   0.46780157
  0.41147286 -1.9868916  -0.9447748  -1.7200239   1.1644139  -0.01681888
 -3.199935   -0.80628073  2.8439891   3.559361   -2.7016542   1.7777083
  1.4225734   3.1251025  -2.216038   -3.990622    2.8474953  -0.966506
 -3.1862013   5.037959   -0.9162124  -2.260319    1.9980825  -1.8151331
 -3.8939815  -1.5061668   2.380033   -6.5707107  -2.4918954  -4.733701
  0.6118964  -3.5265934 ]


Now, let us test the modelled embedding thanks to cosine similarity and some query words : 

In [17]:
# Cosine similarity function 

def cosine_similarity(word1, word2, model):
    # Get word vectors for word1 and word2
    vector1 = model.wv[word1]
    vector2 = model.wv[word2]
    
    # Compute cosine similarity using scipy
    return 1 - cosine(vector1, vector2)

In [18]:
# Test with a query word such as 'romantic' like in the article 

query_words = ["melancholy","ghastly","lackluster","romantic"]
top_n = 5

# Most similar words
for w in query_words: 
    similar_words = model.wv.most_similar(w, topn=top_n)

    print(f"\nMost similar words to '{w}':")
    for word, similarity in similar_words:
        print(f"{word}: {similarity:.4f}")



Most similar words to 'melancholy':
dreamy: 0.9010
melancholic: 0.8834
whimsical: 0.8388
playful: 0.8354
heartfelt: 0.8329

Most similar words to 'ghastly':
abundant: 0.7680
noisy: 0.7659
hairless: 0.7619
cartoony: 0.7575
rancid: 0.7555

Most similar words to 'lackluster':
sub-par: 0.8523
unimaginative: 0.8485
lacklustre: 0.8460
leaden: 0.8353
threadbare: 0.8342

Most similar words to 'romantic':
quirky: 0.7992
romance: 0.7885
erotic: 0.7753
tender: 0.7512
light-hearted: 0.7381


The Word2Vec trained embedding seems to be quite good at finding similar words to the query words in entry !
It captures word semantics based on co-occurrence statistics. However, it does not explicitly capture sentiment information. We will know refine embeddings to predict sentiment labels (positive or negative polarity) thanks to the labels (supervised part).

In [19]:
# I will use a logistic regression in order to have as an output the probability 
# of a word being associated with a positive sentiment. 

# First, let us embed the encoded train and test data 

# 1st try : crash of the kernel .... :/

def prepare_embeddings(data, word2vec_model, max_sequence_length=max_length,embedding_dim=50):
    """
    Convert the tokenized sentences into word embeddings.
    """
    embeddings = []
    
    for sentence in data:
        sentence_embedding = []
        for token_id in sentence['input_ids']:
            # Get word embedding for token
            if token_id in word2vec_model.wv:
                sentence_embedding.append(word2vec_model.wv[token_id])
            else:
                sentence_embedding.append(np.zeros(embedding_dim))  # OOV token (zero vector)
        
        embeddings.append(np.array(sentence_embedding))
    
    return np.array(embeddings)

train_embedded = prepare_embeddings(padded_train_data, model)
test_embedded = prepare_embeddings(padded_test_data, model)

KeyboardInterrupt: 

In [20]:
# 2nd try : proceed in batches : WORKS !!! 

def prepare_embeddings(data, word2vec_model, max_sequence_length=100, embedding_dim=50, batch_size=1000):
    """
    Convert the tokenized sentences into word embeddings.
    """
    embeddings = []
    
    # Process in batches to avoid memory overload
    for start_idx in range(0, len(data), batch_size):
        end_idx = min(start_idx + batch_size, len(data))
        batch = data[start_idx:end_idx]
        
        batch_embeddings = []
        
        for sentence in batch:
            sentence_embedding = []
            
            for token_id in sentence['input_ids']:
                # Check if the token exists in the Word2Vec model
                if token_id in word2vec_model.wv.key_to_index:
                    sentence_embedding.append(word2vec_model.wv[token_id])
                else:
                    sentence_embedding.append(np.zeros(embedding_dim))  # OOV token (zero vector)
            
            # Ensure sequence length matches max_sequence_length
            padding_length = max_sequence_length - len(sentence_embedding)
            if padding_length > 0:
                sentence_embedding.extend([np.zeros(embedding_dim)] * padding_length)
            else:
                sentence_embedding = sentence_embedding[:max_sequence_length]
            
            batch_embeddings.append(np.array(sentence_embedding))
        
        embeddings.extend(batch_embeddings)
    
    return np.array(embeddings)


train_embedded = prepare_embeddings(padded_train_data, model)
test_embedded = prepare_embeddings(padded_test_data, model)


LSA to try ! Based only on semantic analysis !!! and at the end compare the three approaches. Maybe change parameters in Word2Vec and see if get better ? 

In [None]:
import numpy as np

def encode_text_with_embeddings(text, model, tokenizer):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    # Get the corresponding embeddings for each token
    embeddings = [model.wv[token] if token in model.wv else np.zeros(model.vector_size) for token in tokens]
    return embeddings

# Encode the training and testing data
encoded_train_data = [encode_text_with_embeddings(text, model, tokenizer) for text in train_data]
encoded_test_data = [encode_text_with_embeddings(text, model, tokenizer) for text in test_data]

from tensorflow.keras.preprocessing.sequence import pad_sequences

def pad_sequences_to_max_length(sequences, max_len):
    # Pad the sequences to a fixed length (max_len)
    return pad_sequences(sequences, maxlen=max_len, dtype="float32", padding="post", value=0)

# Choose a max length based on your data (e.g., the longest sentence)
max_len = max(len(seq) for seq in encoded_train_data)

# Pad the sequences
padded_train_data = pad_sequences_to_max_length(encoded_train_data, max_len)
padded_test_data = pad_sequences_to_max_length(encoded_test_data, max_len)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Create an LSTM model for sentiment analysis
model = Sequential([
    LSTM(128, input_shape=(max_len, model.vector_size), return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on your padded data
model.fit(padded_train_data, np.array(train_labels), epochs=5, batch_size=32, validation_data=(padded_test_data, np.array(test_labels)))

In [None]:
# Tokenization bis: 

class IMDbDataset(Dataset):
    def __init__(self, directory, vocab=None, max_vocab_size=20000, min_freq=2):
        self.samples = []  # List of (tokens, label)
        self.vocab = vocab
        self.pad_token = "<PAD>" # padding token : maked all sequences in a batch the same length so they can be stacked into tensors = add padding (so extra zeros to shorter sequences)
        self.unk_token = "<UNK>" # unknown token : handes any word that is not in the vocab (bc our vocab is max 20.000 words)

        for label, label_val in [("pos", 1), ("neg", 0)]:
            folder = os.path.join(directory, label)
            for filename in os.listdir(folder):
                with open(os.path.join(folder, filename), encoding="utf-8") as f:
                    text = f.read()
                    tokens = self.tokenize(text)
                    self.samples.append((tokens, label_val))

        if self.vocab is None:
            all_tokens = [token for tokens, _ in self.samples for token in tokens]
            freqs = Counter(all_tokens)
            most_common = [
                (word, freq) for word, freq in freqs.items() if freq >= min_freq
            ]
            most_common = sorted(most_common, key=lambda x: -x[1])[:max_vocab_size]
            self.vocab = {
                word: idx + 2 for idx, (word, _) in enumerate(most_common)
            }  # Reserve 0, 1
            self.vocab[self.pad_token] = 0
            self.vocab[self.unk_token] = 1

        self.pad_idx = self.vocab[self.pad_token]
        self.unk_idx = self.vocab[self.unk_token]

    def tokenize(self, text):
        text = text.lower()
        text = re.sub(r"<.*?>", "", text)
        return re.findall(r"\b\w+\b", text)

    def encode(self, tokens):
        return [self.vocab.get(token, self.unk_idx) for token in tokens]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        tokens, label = self.samples[idx]
        encoded = self.encode(tokens)
        return torch.tensor(encoded), torch.tensor(label)

def collate_batch(batch):
    sequences, labels = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    max_len = max(lengths)
    padded = [torch.cat([seq, torch.zeros(max_len - len(seq))]) for seq in sequences]
    return torch.stack(padded).long(), torch.tensor(labels)

In [None]:
# Step : model - average of word embeddings and logistic regression 

import torch.nn as nn

class AvgEmbeddingsClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.classifier = nn.Linear(embedding_dim, 1)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)  # (batch, seq_len, emb_dim)
        mask = (input_ids != 0).unsqueeze(-1)  # padding mask
        summed = torch.sum(embedded * mask, dim=1)
        counts = torch.sum(mask, dim=1)
        avg = summed / counts
        logits = self.classifier(avg).squeeze(1)
        return logits

In [None]:
# Accuracy + evaluation 

def binary_accuracy(preds, labels):
    preds = torch.round(torch.sigmoid(preds))
    correct = (preds == labels).float()
    return correct.sum() / len(correct)

def evaluate_model(model, loader, device="cpu"):
    model.eval()
    total_acc = 0
    total_loss = 0
    criterion = nn.BCEWithLogitsLoss()

    with torch.no_grad():
        for batch in loader:
            inputs, labels = [x.to(device) for x in batch]
            outputs = model(inputs)
            loss = criterion(outputs, labels.float())
            acc = binary_accuracy(outputs, labels)

            total_loss += loss.item()
            total_acc += acc.item()

    print(f" → Val Loss: {total_loss / len(loader):.4f} | Acc: {total_acc / len(loader):.4f}")



In [None]:
# Training function 

def train_model(model, train_loader, val_loader, epochs=5, lr=1e-3, device="cpu"):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        epoch_acc = 0

        for batch in train_loader:
            inputs, labels = [x.to(device) for x in batch]
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.float())
            acc = binary_accuracy(outputs, labels)

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()

        avg_loss = epoch_loss / len(train_loader)
        avg_acc = epoch_acc / len(train_loader)

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_loss:.4f} | Acc: {avg_acc:.4f}")

        evaluate_model(model, val_loader, device)


In [None]:
# Wrap up 

# Paths
train_path = "data_movie/train"
test_path = "data_movie/test"

# Load data
train_dataset = IMDbDataset(train_path)
test_dataset = IMDbDataset(test_path, vocab=train_dataset.vocab)  # use same vocab

# Split train/val
random.shuffle(train_dataset.samples)
split_idx = int(0.9 * len(train_dataset))
train_data = torch.utils.data.Subset(train_dataset, range(split_idx))
val_data = torch.utils.data.Subset(train_dataset, range(split_idx, len(train_dataset)))

# Dataloaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=32, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_batch)

# Model
vocab_size = len(train_dataset.vocab)
model = AvgEmbeddingsClassifier(vocab_size, embedding_dim=100)

# Train
train_model(model, train_loader, val_loader, epochs=5, lr=1e-3, device="cuda" if torch.cuda.is_available() else "cpu")

# Evaluate 

evaluate_model(model, test_loader)


1. Preprocessing
   ⤷ Tokenize reviews
   ⤷ Build vocab

2. Word Vector Initialization
   ⤷ Random or GloVe init
   ⤷ Embeddings will be fine-tuned via supervision

3. Context Window or Full Review
   ⤷ Either train on word windows (as in the paper)
   ⤷ Or process whole review as a sequence

4. Classifier
   ⤷ Logistic regression (window-level)
   ⤷ OR CNN/LSTM over full review

5. Training
   ⤷ Cross-entropy loss
   ⤷ SGD or Adam
