In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, AutoTokenizer, AutoModelForMaskedLM, AutoConfig, AutoModel, AdamW, get_scheduler
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, CamembertForSequenceClassification, AutoModelForSeq2SeqLM
from transformers.utils import ModelOutput
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
# from datasets import load_metric

In [4]:
torch.cuda.is_available()

False

## Load data

In [6]:
df = pd.read_csv("tweets_labelled.csv") # load labelled dataset

In [7]:
df = df[df['A supprimer'] != True]

In [8]:
# remove incomplete data
df = df.dropna(subset=['Index', 'Sujet']).reset_index(drop=True)

In [9]:
# transform categories to digits
labels = df['Sujet']
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

print("Encoded labels:", encoded_labels)

decoded_labels = label_encoder.inverse_transform(encoded_labels)
print("Decoded labels:", decoded_labels)

Encoded labels: [0 0 0 ... 0 0 0]
Decoded labels: ['Loin du VE' 'Loin du VE' 'Loin du VE' ... 'Loin du VE' 'Loin du VE'
 'Loin du VE']


## Hyperparameters and split sets

In [10]:
nb_epoch = 10
batch_size = 32
max_seq_len = 128
train_size = 0.6
random_seed = 42

In [11]:
# split sets
X_train, X_test, y_train, y_test = train_test_split(df['Texte'], encoded_labels, train_size=train_size, random_state=random_seed)

In [12]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=random_seed) 

## Tokenization and Dataloaders

In [13]:
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")



In [14]:
def tokenize_function(texts):
    return tokenizer(
        texts, 
        padding='max_length', 
        truncation=True, 
        max_length=max_seq_len, 
        return_tensors='pt'
    )

train_encodings = tokenize_function(X_train.tolist())
val_encodings = tokenize_function(X_val.tolist())
test_encodings = tokenize_function(X_test.tolist())

train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)
val_labels = torch.tensor(y_val)

In [15]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Create the train and test datasets
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)
val_dataset = TextDataset(val_encodings, val_labels)

In [16]:
#shuffle=True,
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size
)
eval_dataloader = DataLoader(
    val_dataset, batch_size=batch_size
)
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size
)

## Model

In [17]:
# load model and adapt bertweetfr-base to our 3-classes classification problem by changing the model head
class CustomModel(nn.Module):
    def __init__(self,checkpoint,num_labels): 
        super(CustomModel,self).__init__() 
        self.num_labels = num_labels 
        
        # Load Model at checkpoint
        self.model = model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
        self.dropout = nn.Dropout(0.1) 
        self.classifier = nn.Linear(768,num_labels) # load and initialize weights
    
    def forward(self, input_ids=None, attention_mask=None,labels=None):
        # Extract outputs from the body
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        
        # custom layers
        sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state
        
        logits = self.classifier(sequence_output[:,0,:].view(-1,768)) # compute loss
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return ModelOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)


    def save_model(self, name):
        return self.model.save_model(name)

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomModel(checkpoint="Yanzhu/bertweetfr-base",num_labels=3).to(device) 

Some weights of the model checkpoint at Yanzhu/bertweetfr-base were not used when initializing CamembertModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertModel were not initialized from the model checkpoint at Yanzhu/bertweetfr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

In [19]:
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = nb_epoch
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

400




In [21]:
pip install datasets 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [22]:
from datasets import load_metric

metric = load_metric("f1")

ImportError: cannot import name 'load_metric' from 'datasets' (/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/datasets/__init__.py)

In [20]:
metric = load_metric("f1")

NameError: name 'load_metric' is not defined

In [23]:
import evaluate

metric = evaluate.load("f1")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<00:00, 3.07MB/s]


In [24]:
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))


for epoch in range(num_epochs):
    print("Epoch", epoch+1)
    
    model.train()
    train_loss = []
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        train_loss.append(loss.item())
        progress_bar_train.update(1)
    
    print("Training", np.mean(train_loss))
    
    model.eval()
    val_loss = []
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
    
        loss = outputs.loss
        val_loss.append(loss.item())
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
        progress_bar_eval.update(1)
    
    print("Validation", np.mean(val_loss))
    print(metric.compute(average='weighted'))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1


 10%|█         | 40/400 [26:08<3:39:51, 36.64s/it]

Training 0.9123261943459511


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Validation 0.6753475538321904
{'f1': 0.7507684419707973}
Epoch 2


 20%|██        | 80/400 [52:24<2:17:41, 25.82s/it]

Training 0.5714998878538609


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Validation 0.5219900480338505
{'f1': 0.8253704653901107}
Epoch 3


 30%|███       | 120/400 [1:12:43<1:58:43, 25.44s/it]

Training 0.4041881822049618


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Validation 0.4921376534870693
{'f1': 0.8362965650701499}
Epoch 4


 40%|████      | 160/400 [14:50:14<1:51:19, 27.83s/it]    

Training 0.30014551877975465


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Validation 0.4916624405554363
{'f1': 0.8212939869732343}
Epoch 5


 41%|████      | 164/400 [14:54:03<2:38:15, 40.23s/it]

KeyboardInterrupt: 

## Evaluation 

In [1]:
model.eval()
preds = []
true = []

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    preds.append(predictions)
    true.append(batch["labels"])
    metric.add_batch(predictions=predictions, references=batch["labels"])

res_metric = metric.compute(average='weighted')

print('f1 :', round(res_metric['f1'], 4))

In [2]:
#save model
#model.model.save_pretrained("bertweetfr-retrained")