In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, AutoTokenizer, AutoModelForMaskedLM, AutoConfig, AutoModel, AdamW, get_scheduler
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, CamembertForSequenceClassification, AutoModelForSeq2SeqLM
from transformers.utils import ModelOutput
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from datasets import load_metric

In [5]:
torch.cuda.is_available()

True

## Load data

In [6]:
df = pd.read_csv("../../tweets_labelled.csv") # load labelled dataset

In [8]:
df = df[df['A supprimer'] != True]

In [9]:
# remove incomplete data
df = df.dropna(subset=['Index', 'Sujet']).reset_index(drop=True)

In [10]:
# transform categories to digits
labels = df['Sujet']
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

print("Encoded labels:", encoded_labels)

decoded_labels = label_encoder.inverse_transform(encoded_labels)
print("Decoded labels:", decoded_labels)

Encoded labels: [0 0 0 ... 0 0 0]
Decoded labels: ['Loin du VE' 'Loin du VE' 'Loin du VE' ... 'Loin du VE' 'Loin du VE'
 'Loin du VE']


## Hyperparameters and split sets

In [11]:
nb_epoch = 10
batch_size = 32
max_seq_len = 128
train_size = 0.6
random_seed = 42

In [13]:
# split sets
X_train, X_test, y_train, y_test = train_test_split(df['Texte'], encoded_labels, train_size=train_size, random_state=random_seed)

In [14]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=random_seed) 

## Tokenization and Dataloaders

In [15]:
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")

In [16]:
def tokenize_function(texts):
    return tokenizer(
        texts, 
        padding='max_length', 
        truncation=True, 
        max_length=max_seq_len, 
        return_tensors='pt'
    )

train_encodings = tokenize_function(X_train.tolist())
val_encodings = tokenize_function(X_val.tolist())
test_encodings = tokenize_function(X_test.tolist())

train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)
val_labels = torch.tensor(y_val)

In [17]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Create the train and test datasets
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)
val_dataset = TextDataset(val_encodings, val_labels)

In [18]:
#shuffle=True,
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size
)
eval_dataloader = DataLoader(
    val_dataset, batch_size=batch_size
)
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size
)

## Model

In [19]:
# load model and adapt bertweetfr-base to our 3-classes classification problem by changing the model head
class CustomModel(nn.Module):
    def __init__(self,checkpoint,num_labels): 
        super(CustomModel,self).__init__() 
        self.num_labels = num_labels 
        
        # Load Model at checkpoint
        self.model = model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
        self.dropout = nn.Dropout(0.1) 
        self.classifier = nn.Linear(768,num_labels) # load and initialize weights
    
    def forward(self, input_ids=None, attention_mask=None,labels=None):
        # Extract outputs from the body
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        
        # custom layers
        sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state
        
        logits = self.classifier(sequence_output[:,0,:].view(-1,768)) # compute loss
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return ModelOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)


    def save_model(self, name):
        return self.model.save_model(name)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomModel(checkpoint="Yanzhu/bertweetfr-base",num_labels=3).to(device) 

Some weights of CamembertModel were not initialized from the model checkpoint at Yanzhu/bertweetfr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = nb_epoch
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

400




In [22]:
metric = load_metric("f1")

  metric = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [23]:
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))


for epoch in range(num_epochs):
    print("Epoch", epoch+1)
    
    model.train()
    train_loss = []
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        train_loss.append(loss.item())
        progress_bar_train.update(1)
    
    print("Training", np.mean(train_loss))
    
    model.eval()
    val_loss = []
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
    
        loss = outputs.loss
        val_loss.append(loss.item())
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
        progress_bar_eval.update(1)
    
    print("Validation", np.mean(val_loss))
    print(metric.compute(average='weighted'))

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/140 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 5.67 GiB of which 8.81 MiB is free. Process 9129 has 2.06 GiB memory in use. Including non-PyTorch memory, this process has 3.57 GiB memory in use. Of the allocated memory 3.42 GiB is allocated by PyTorch, and 45.29 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Evaluation 

In [1]:
model.eval()
preds = []
true = []

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    preds.append(predictions)
    true.append(batch["labels"])
    metric.add_batch(predictions=predictions, references=batch["labels"])

res_metric = metric.compute(average='weighted')

print('f1 :', round(res_metric['f1'], 4))

In [2]:
#save model
#model.model.save_pretrained("bertweetfr-retrained")