In [17]:
# basic imports
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
# display matplotlib graphics in notebook
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud


# silence every warning category raised by third-party libraries
import warnings
warnings.filterwarnings("ignore")

# root logging setup: INFO and above, prefixed with a clock time
# (%I is the 12-hour clock and no AM/PM marker is included)
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S',
)
logger = logging.getLogger(__name__)

In [74]:
#for embedding and classification
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score


In [9]:
#Expected rate for each word in the vocabulary

def load_embeddings(rates_path='aclImdb/imdbEr.txt', vocab_path='aclImdb/imdb.vocab'):
    """Load the per-word expected rates together with the vocabulary.

    Despite its name, this function does not load embedding vectors: it
    reads one numeric value per line from ``rates_path`` and one word per
    line from ``vocab_path``. The two files are assumed to be aligned line
    by line; only their lengths are actually checked.

    Args:
        rates_path: Text file with one numeric rate per line.
        vocab_path: UTF-8 text file with one word per line.

    Returns:
        tuple: ``(rates, words)`` where ``rates`` is a 1-D numpy array and
        ``words`` is a list of stripped strings of the same length.

    Raises:
        ValueError: If the number of words differs from the number of rates.
    """
    # Load rates
    logger.info("Loading embedding vectors...")
    # genfromtxt collapses a single-line file into a 0-d array, which would
    # make ``rates.shape[0]`` fail below; force at least one dimension.
    rates = np.atleast_1d(np.genfromtxt(rates_path))
    logger.info(f"Loaded vectors with shape: {rates.shape}")

    # Load associated words
    logger.info("Loading words...")
    with open(vocab_path, 'r', encoding='utf-8') as f:
        # iterate the file lazily instead of materialising readlines()
        words = [line.strip() for line in f]
    logger.info(f"Loaded {len(words)} words")

    if len(words) != rates.shape[0]:
        raise ValueError(f"Mismatch between number of words ({len(words)}) and vectors ({rates.shape[0]})")

    logger.info("Verification complete - sizes match!")

    return rates, words

rates, words = load_embeddings()

# One print call, one line per field; sep="\n" reproduces the layout of
# separate print statements.
print(
    "\nDataset information:",
    f"Number of words (vocab): {len(words)}",
    f"Number of vectors (rates): {rates.shape[0]}",
    f"\nFirst few words and their expected rates {list(zip(words[:5], rates[:5]))}",
    sep="\n",
)


10:17:30 INFO:Loading embedding vectors...
10:17:30 INFO:Loaded vectors with shape: (89527,)
10:17:30 INFO:Loading words...
10:17:30 INFO:Loaded 89527 words
10:17:30 INFO:Verification complete - sizes match!



Dataset information:
Number of words (vocab): 89527
Number of vectors (rates): 89527

First few words and their expected rates [('the', 0.0490972013402), ('and', 0.201363575849), ('a', 0.0333946807184), ('of', 0.099837669572), ('to', -0.0790210365788)]


In [None]:
# Read the training table, then pull out the 'comment' column (inputs)
# and the 'sentiment' column (targets).
df_train = pd.read_csv('aclImdb/df_train')
X, y = df_train['comment'], df_train['sentiment']

In [78]:
# Pick the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu


In [None]:
# Pretrained 'bert-base-uncased' tokenizer and encoder; the encoder is moved
# to the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)
# Put the module in evaluation mode; as the last expression of the cell this
# also echoes the model.
model.eval()

In [None]:
def generate_bert_embeddings(reviews, batch_size=32):
    """Encode reviews as the last-layer hidden state of their first token.

    Reviews are tokenized and run through the module-level ``tokenizer`` and
    ``model`` in batches of ``batch_size``, on the module-level ``device``,
    with gradients disabled. Inputs longer than 512 tokens are truncated.

    Args:
        reviews: A single string or an iterable of strings.
        batch_size: Number of reviews per forward pass; must be >= 1.

    Returns:
        numpy.ndarray: One row per review. For empty input an array of
        shape ``(0, model.config.hidden_size)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() with a zero step raises an unrelated error, and a negative
        # step silently yields no batches at all.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if isinstance(reviews, str):
        # a bare string would otherwise be sliced into character chunks
        reviews = [reviews]
    else:
        # materialise so slicing is positional and each batch is a plain list
        reviews = list(reviews)

    if not reviews:
        # np.vstack([]) raises, so return an explicit empty matrix instead
        # (float32 assumed to match the model's output dtype - TODO confirm)
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    for start in tqdm(range(0, len(reviews), batch_size)):
        batch = reviews[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        # embedding of the [CLS] token (first token of every sequence)
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(cls_embeddings)

    return np.vstack(all_embeddings)

# Make sure the output directory exists before the expensive forward pass:
# np.save does not create missing parent directories, and failing at the very
# end would throw the computed embeddings away.
os.makedirs('aclImdb/embeddings', exist_ok=True)

# generate embeddings
reviews = X.astype(str).tolist()
embeddings = generate_bert_embeddings(reviews, batch_size=32)

np.save('aclImdb/embeddings/X_train_embeddings.npy', embeddings)


100%|██████████| 1/1 [00:21<00:00, 21.57s/it]


In [77]:
# Reload the saved embedding matrix; from here on X refers to this array.
X = np.load('aclImdb/embeddings/X_train_embeddings.npy')
print("Embedding dimensions :" + str(X.shape))


Embedding dimensions :(10, 768)


In [64]:
# Keep as many labels as there are embedding rows instead of a hard-coded 10,
# so the split still works when the embeddings file is regenerated with a
# different number of reviews.
# NOTE(review): assumes the embeddings were computed from the first len(X)
# rows of df_train, in order - confirm.
X_train, X_val, y_train, y_val = train_test_split(X, y[:len(X)], test_size=0.2, random_state=42)

In [68]:
# Standardise features using statistics estimated on the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Logistic-regression classifier on the scaled embeddings
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predictions on the validation split
y_pred = clf.predict(X_val)

# Evaluation
print('Accuracy : {:.4f}'.format(accuracy_score(y_val, y_pred)))
print(classification_report(y_val, y_pred))


Accuracy : 0.5000
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



In [75]:
# Linear SVM on the same scaled embeddings. The seed is fixed so repeated runs
# give the same fit, matching the random_state=42 used for train_test_split.
svm_clf = LinearSVC(random_state=42)
svm_clf.fit(X_train, y_train)

# Predictions and evaluation
y_pred_svm = svm_clf.predict(X_val)

print(f'Accuracy (SVM linéaire): {accuracy_score(y_val, y_pred_svm):.4f}')
print(classification_report(y_val, y_pred_svm))


Accuracy (SVM linéaire): 0.5000
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



In [None]:
# TODO: word2vec