In [2]:
from roberta import RobertaTokenizer_imdb, RobertaModel_imdb
# Data helper built around the "roberta-base" tokenizer.
dataset_manager = RobertaTokenizer_imdb("roberta-base")

# Small sanity-check run: with max_length=1000 the tokenisation progress bar
# reports 1000 examples.
df_train = dataset_manager.load_data('aclImdb/df_train', max_length=1000)
tokenize_dataset = dataset_manager.prepare_dataset(df_train)

# Split into 'train' / 'test' parts and wrap each part in a dataloader
# (the test loader is created first).
split_dataset = dataset_manager.split_dataset(tokenize_dataset)
test_dataset, train_dataset = split_dataset['test'], split_dataset['train']
test_loader = dataset_manager.create_dataloader(test_dataset)
train_loader = dataset_manager.create_dataloader(train_dataset)

# Load the previously saved weights from roberta_model.pt, then score both parts.
model = RobertaModel_imdb("roberta-base", num_labels=2)
model.load_model(filepath="roberta_model.pt")
model.evaluate(test_loader)  # logged under the default "Validation" label
model.evaluate(train_loader, split_name='train')

Map: 100%|██████████| 1000/1000 [00:01<00:00, 805.91 examples/s]


Using device: mps


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating on Validation: 100%|██████████| 25/25 [00:09<00:00,  2.61it/s]


Validation Accuracy: 0.9900


Evaluating on train: 100%|██████████| 100/100 [00:38<00:00,  2.59it/s]

train Accuracy: 0.9775





{'accuracy': 0.9775}

In [3]:
import torch

# Load the saved state dicts. They are only inspected in this notebook section
# (sizes and key names), so map every tensor to the CPU: the cell then also
# runs on machines that lack the device the checkpoints were saved from.
bert_weights = torch.load("best_sentiment_model.pt", map_location="cpu")
roberta_weights = torch.load("roberta_model.pt", map_location="cpu")

# Count the total number of scalar entries across all tensors of each checkpoint.
bert_size = sum(p.numel() for p in bert_weights.values())
roberta_size = sum(p.numel() for p in roberta_weights.values())

print(f"BERT weight size: {bert_size:,} parameters")
print(f"RoBERTa weight size: {roberta_size:,} parameters")


BERT weight size: 656,897 parameters
RoBERTa weight size: 124,647,170 parameters


In [4]:
# List the parameter names stored in each checkpoint, one checkpoint per line.
print(bert_weights.keys(), roberta_weights.keys(), sep='\n')


odict_keys(['model.0.weight', 'model.0.bias', 'model.2.weight', 'model.2.bias', 'model.4.weight', 'model.4.bias'])
odict_keys(['roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bia

In [8]:
from transformers import RobertaForSequenceClassification
# Instantiate the sequence-classification model from the "roberta-base" checkpoint.
# NOTE(review): the warning printed by this cell says the classifier.* weights
# are newly initialized, so the saved file presumably holds an untrained head.
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

# Keep only the tensors (name -> weight mapping), not the model object itself.
state_dict = model.state_dict()

# Persist that mapping as a .pt file.
torch.save(obj=state_dict, f="roberta_model_clean.pt")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Reload the state dict written to roberta_model_clean.pt.
roberta_weights = torch.load("roberta_model_clean.pt")

# Total number of scalar entries across every stored tensor.
roberta_size = sum(tensor.numel() for tensor in roberta_weights.values())

print(f"RoBERTa weight size: {roberta_size:,} parameters")
print(roberta_weights.keys())

RoBERTa weight size: 124,647,170 parameters
odict_keys(['roberta.embeddings.word_embeddings.weight', 'roberta.embeddings.position_embeddings.weight', 'roberta.embeddings.token_type_embeddings.weight', 'roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.0.attention.self.query.bias', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.0.attention.self.key.bias', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.attention.self.value.bias', 'roberta.encoder.layer.0.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.output.dense.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.0.intermediate.dense.bias', 'roberta.encoder.layer.0.output.dense.weight', 'roberta.encoder.lay

In [8]:
import os
import pandas as pd
import re

def clean_review(text):
    """
    Replace every HTML line-break tag in ``text`` with a single space.

    Matches ``<br>``, ``<br/>`` and ``<br />`` (any amount of whitespace
    before the optional slash). The match is case-sensitive, so upper-case
    ``<BR>`` tags are left untouched.
    """
    return re.sub(r'<br\s*/?>', ' ', text)

def process_movie_reviews(directory_path: str) -> pd.DataFrame:
    """
    Load the review text files of one directory into a single DataFrame.

    Only files named ``<id>_<rate>.txt`` are used: the integer before the
    underscore becomes ``id`` and the integer after it becomes ``rate``.
    Every other entry (different extension, a number of ``_``-separated
    parts other than two, non-integer id or rate) is skipped.

    Args:
        directory_path (str): Path to the directory containing the text files.

    Returns:
        pd.DataFrame: Columns ``id``, ``rate`` and ``comment``, sorted by
        ``id``. Rows sharing an ``id`` keep file-name order, so the result
        does not depend on the order in which the OS lists the directory.
        ``id`` and ``rate`` are int64 even when no file matched.
    """
    movie_ids = []
    rates = []
    comments = []

    # os.listdir() gives entries in arbitrary order; sort them so the output
    # is reproducible from one run / machine to the next.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):
            continue

        # The id and the rate are encoded in the file name.
        parts = filename.split('_')
        if len(parts) != 2:
            continue

        try:
            movie_id = int(parts[0])
            rate = int(parts[1].split('.')[0])
        except ValueError:
            # A name such as "a_b.txt" is not a review file: skip it like the
            # other non-matching names instead of aborting the whole load.
            continue

        with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as f:
            comment = clean_review(f.read().strip())

        movie_ids.append(movie_id)
        rates.append(rate)
        comments.append(comment)

    df = pd.DataFrame({
        'id': movie_ids,
        'rate': rates,
        'comment': comments
    })

    # Empty lists would otherwise produce object-dtype columns.
    df = df.astype({'id': 'int64', 'rate': 'int64'})

    # Sort by movie id; a stable sort keeps file-name order among equal ids.
    return df.sort_values('id', kind='stable')

# Root of the extracted IMDB test split. The default is the original location;
# set the IMDB_TEST_DIR environment variable to run the notebook elsewhere
# without editing this cell.
imdb_test_dir = os.environ.get('IMDB_TEST_DIR', '/Users/suzie/Downloads/aclImdb/test')

# Load each polarity folder and label it as it is loaded
# (1 = positive, 0 = negative). assign() returns a new frame, so re-running
# the cell never mutates a previously loaded frame.
pos_df = process_movie_reviews(os.path.join(imdb_test_dir, 'pos')).assign(sentiment=1)
neg_df = process_movie_reviews(os.path.join(imdb_test_dir, 'neg')).assign(sentiment=0)

# Stack both classes, then shuffle all rows with a fixed seed and renumber them.
df = pd.concat([pos_df, neg_df], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
from roberta import RobertaTokenizer_imdb, RobertaModel_imdb
# Tokenise the whole labelled frame `df` and wrap it in a single dataloader
# (no train/test split in this cell).
dataset_manager = RobertaTokenizer_imdb("roberta-base")
tokenize_dataset = dataset_manager.prepare_dataset(df)
test_loader = dataset_manager.create_dataloader(tokenize_dataset)

# Load the previously saved weights from roberta_model.pt and score the frame.
# evaluate() is called without split_name, so the result is logged under the
# default "Validation" label.
model = RobertaModel_imdb("roberta-base", num_labels=2)
model.load_model(filepath="roberta_model.pt")
model.evaluate(test_loader)

Map: 100%|██████████| 25000/25000 [00:23<00:00, 1046.48 examples/s]


Using device: mps


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating on Validation: 100%|██████████| 3125/3125 [20:05<00:00,  2.59it/s]

Validation Accuracy: 0.9467





{'accuracy': 0.94668}

Entraînement : on teste plusieurs learning rates (1e-5, 5e-5 et 1e-4), soit environ 9 h d'exécution au total.

In [1]:
from roberta import RobertaTokenizer_imdb, RobertaModel_imdb
# Data helper built around the "roberta-base" tokenizer.
dataset_manager = RobertaTokenizer_imdb("roberta-base")

# Load the training file without a max_length cap and tokenise it.
df_train = dataset_manager.load_data('aclImdb/df_train')
tokenize_dataset = dataset_manager.prepare_dataset(df_train)

# Split into 'train' / 'test' parts and wrap each part in a dataloader
# (the test loader is created first).
split_dataset = dataset_manager.split_dataset(tokenize_dataset)
test_dataset, train_dataset = split_dataset['test'], split_dataset['train']
test_loader = dataset_manager.create_dataloader(test_dataset)
train_loader = dataset_manager.create_dataloader(train_dataset)

# Run with learning rate 1e-5: 3 epochs, gradient accumulation over 4 steps.
model = RobertaModel_imdb("roberta-base", num_labels=2)
model.train(
    train_loader,
    num_epochs=3,
    learning_rate=1e-5,
    gradient_accumulation_steps=4,
    model_filepath="roberta_imdb_1e-5.pt",
    loss_filepath="roberta_training_losses_1e-5.pkl",
)

# Score the held-out part (default "Validation" label), then the training part.
model.evaluate(test_loader)
model.evaluate(train_loader, split_name='train')

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 25000/25000 [00:24<00:00, 1006.98 examples/s]


Using device: mps


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


100%|██████████| 2500/2500 [46:23<00:00,  1.11s/it, loss=0.103]  


Model weights saved to roberta_imdb_1e-5.pt
Epoch 2/3


100%|██████████| 2500/2500 [46:17<00:00,  1.11s/it, loss=0.0775] 


Model weights saved to roberta_imdb_1e-5.pt
Epoch 3/3


100%|██████████| 2500/2500 [46:12<00:00,  1.11s/it, loss=0.0093] 


Model weights saved to roberta_imdb_1e-5.pt
Training complete


Evaluating on Validation: 100%|██████████| 625/625 [03:58<00:00,  2.62it/s]


Validation Accuracy: 0.9500


Evaluating on train: 100%|██████████| 2500/2500 [15:53<00:00,  2.62it/s]

train Accuracy: 0.9846





{'accuracy': 0.98455}

In [2]:
# Run with learning rate 5e-5: 3 epochs, gradient accumulation over 4 steps.
# Reuses train_loader / test_loader created in an earlier cell.
model_5e5 = RobertaModel_imdb("roberta-base", num_labels=2)
model_5e5.train(
    train_loader,
    num_epochs=3,
    learning_rate=5e-5,
    gradient_accumulation_steps=4,
    model_filepath="roberta_imdb_5e-5.pt",
    loss_filepath="roberta_training_losses_5e-5.pkl",
)

# Score the held-out part (default "Validation" label), then the training part.
model_5e5.evaluate(test_loader)
model_5e5.evaluate(train_loader, split_name='train')

Using device: mps


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


100%|██████████| 2500/2500 [45:52<00:00,  1.10s/it, loss=0.158]  


Model weights saved to roberta_imdb_5e-5.pt
Epoch 2/3


100%|██████████| 2500/2500 [45:53<00:00,  1.10s/it, loss=0.0707] 


Model weights saved to roberta_imdb_5e-5.pt
Epoch 3/3


100%|██████████| 2500/2500 [45:52<00:00,  1.10s/it, loss=0.00633]


Model weights saved to roberta_imdb_5e-5.pt
Training complete


Evaluating on Validation: 100%|██████████| 625/625 [03:58<00:00,  2.62it/s]


Validation Accuracy: 0.9426


Evaluating on train: 100%|██████████| 2500/2500 [15:53<00:00,  2.62it/s]

train Accuracy: 0.9886





{'accuracy': 0.9886}

In [3]:
# Run with learning rate 1e-4: 3 epochs, gradient accumulation over 4 steps.
# Reuses train_loader / test_loader created in an earlier cell.
model_1e4 = RobertaModel_imdb("roberta-base", num_labels=2)
model_1e4.train(
    train_loader,
    num_epochs=3,
    learning_rate=1e-4,
    gradient_accumulation_steps=4,
    model_filepath="roberta_imdb_1e-4.pt",
    loss_filepath="roberta_training_losses_1e-4.pkl",
)

# Score the held-out part (default "Validation" label), then the training part.
model_1e4.evaluate(test_loader)
model_1e4.evaluate(train_loader, split_name='train')

Using device: mps


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


100%|██████████| 2500/2500 [46:38<00:00,  1.12s/it, loss=0.118]  


Model weights saved to roberta_imdb_1e-4.pt
Epoch 2/3


100%|██████████| 2500/2500 [46:45<00:00,  1.12s/it, loss=0.0669] 


Model weights saved to roberta_imdb_1e-4.pt
Epoch 3/3


100%|██████████| 2500/2500 [46:38<00:00,  1.12s/it, loss=0.0465] 


Model weights saved to roberta_imdb_1e-4.pt
Training complete


Evaluating on Validation: 100%|██████████| 625/625 [03:58<00:00,  2.62it/s]


Validation Accuracy: 0.9228


Evaluating on train: 100%|██████████| 2500/2500 [15:53<00:00,  2.62it/s]

train Accuracy: 0.9655





{'accuracy': 0.96545}