In [1]:
import os
import re
import numpy as np
import pickle
from typing import List, Tuple
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import logging

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
logger = logging.getLogger(__name__)


In [2]:
class ReviewTokenizer:
    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Basic text cleaning and tokenization"""
        return re.findall(r'\b\w+\b', text.lower())


class Word2VecEmbedder:
    def __init__(self, vector_size: int=100, window:int =5, min_count:int =2, sg:int =1, workers:int =4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.sg = sg
        self.workers = workers
        self.model = None

    def train(self, tokenized_reviews: List[str]) -> None:
        """Train the Word2Vec model on tokenized reviews
        Args 
            tokenized_reviews: reviews cleaned and tokenized with ReviewTokenizer
        """
        logger.info("Training Word2Vec model...")
        self.model = Word2Vec(
            sentences=tokenized_reviews,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            sg=self.sg,
            workers=self.workers
        )
        logger.info("Word2Vec model trained successfully")

    def embed_reviews(self, tokenized_reviews: List[str]) -> np.ndarray:
        """Compute embeddings by averaging word vectors for each review
        Args 
            tokenized_reviews: List of reviews, each review is a list of tokenized words
        Returns
            np.ndarray: 2D array containing embeddings for each review
        """
        embeddings = []
        for tokens in tokenized_reviews:
            vectors = [self.model.wv[token] for token in tokens if token in self.model.wv]
            if vectors:
                embed = np.mean(vectors, axis=0)
            else:
                embed = np.zeros(self.vector_size)
            embeddings.append(embed)
        return np.array(embeddings)

    def save_embeddings(self, embeddings: np.ndarray, filepath: str) -> None:
        """Saves the embeddings obtained with word2vec to a given filepath
        Args 
            embeddings: Array containing the embeddings.
            filepath: Path where embeddings will be saved
        """
        with open(filepath, 'wb') as f:
            pickle.dump(embeddings, f)
        logger.info(f"Embeddings saved successfully to {filepath}")

    def load_embeddings(self, filepath: str) -> np.ndarray:
        """Retrieves embeddings obtained with word2vec
        Args
            filepath: Path from where embeddings will be loaded
        Returns 
            np.ndarray: Loaded embeddings array
        """
        with open(filepath, 'rb') as f:
            embeddings = pickle.load(f)
        return embeddings


class SentimentClassifier:
    def __init__(self, classifier=None):
        self.classifier = classifier if classifier else LogisticRegression(max_iter=500)

    def train(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit the model to train data
        Args
            X: Feature 
            y: True labels 
        """
        self.classifier.fit(X, y)
        logger.info("Classifier trained successfully.")

    def evaluate(self, X: np.ndarray, y: np.ndarray) -> Tuple[float, str]:
        """Evaluate the model on the provided dataset
        Args
            X: Feature 
            y: True labels 
        
        Returns
            Tuple[float, str]: Accuracy score and detailed classification report
        """
        predictions = self.classifier.predict(X)
        accuracy = accuracy_score(y, predictions)
        report = classification_report(y, predictions)
        return accuracy, report

In [None]:
df_train = pd.read_csv('aclImdb/df_train.csv')
df_test = pd.read_csv('aclImdb/df_test.csv')

# tokenisation
tokenized_reviews_train = [ReviewTokenizer.tokenize(text) for text in df_train['comment']]

# train embeddings
embedder = Word2VecEmbedder()
embedder.train(tokenized_reviews_train)
X_embeddings = embedder.embed_reviews(tokenized_reviews_train)
embedder.save_embeddings(X_embeddings, 'aclImdb/embeddings/X_train_word2vec_embeddings.pkl')

# train/val split
X_train, X_val, y_train, y_val = train_test_split(
    X_embeddings, df_train['sentiment'], test_size=0.2, random_state=42
)

# train and evaluate classifier
clf = SentimentClassifier(classifier=LinearSVC())
clf.train(X_embeddings, df_train['sentiment'])

train_accuracy, train_report = clf.evaluate(X_embeddings, df_train['sentiment'])




In [None]:
#Check performance on test set 
tokenized_reviews_test = [ReviewTokenizer.tokenize(text) for text in df_test['comment']]

X_test_embeddings = embedder.embed_reviews(tokenized_reviews_test)
test_accuracy, test_report = clf.evaluate(X_test_embeddings, df_test['sentiment'])

print("======== Accuracy on train set for Word2vec and SVC ========\n", train_accuracy)
print("======== Accuracy on test set for Word2vec and SVC ========\n", test_accuracy)


 0.86836
 0.85576
