# Fine-tuning for Multilingual BERT (mBERT)

## Imports

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset, RandomSampler


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# Verifying that we are using osx-arm64 arch
import platform
platform.platform(), platform.mac_ver()

('macOS-13.2.1-arm64-i386-64bit', ('13.2.1', ('', '', ''), 'arm64'))

In [3]:
torch.has_mps

True

In [4]:
device = torch.device('mps')

## Functions

In [5]:
# Definir una clase para el conjunto de datos
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        label = self.labels[index]
        encoding = self.tokenizer(text, 
                                  truncation=True, 
                                  max_length=self.max_len, 
                                  padding='max_length', 
                                  return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(), 
                'attention_mask': encoding['attention_mask'].squeeze(), 
                'label': torch.tensor(label, dtype=torch.long)}

In [6]:
# Definir una función de entrenamiento
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        label = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, 
                        attention_mask=attention_mask, 
                        labels=label)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(dataloader)


## Importing model


In [7]:
# Cargar el modelo pre-entrenado multilingue de BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', 
                                                      num_labels=2)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

## Data

In [8]:
import pandas as pd, numpy as np

In [9]:
data_training_en_path = '../Author profiling/Datasets/PAN19-Author-Profiling-20200229/CSV/Clean/data_training_en.csv'
data_training_es_path = '../Author profiling/Datasets/PAN19-Author-Profiling-20200229/CSV/Clean/data_training_es.csv'
data_test_en_path = '../Author profiling/Datasets/PAN19-Author-Profiling-20200229/CSV/Clean/data_test_en.csv'
data_test_es_path = '../Author profiling/Datasets/PAN19-Author-Profiling-20200229/CSV/Clean/data_test_es.csv'

In [10]:
data_training_en = pd.read_csv(data_training_en_path)
data_test_en = pd.read_csv(data_test_en_path)

In [11]:
data_training_es = pd.read_csv(data_training_es_path)
data_test_es = pd.read_csv(data_test_es_path)

In [12]:
dataframes = [data_training_en, data_test_en, data_training_es, data_test_es]
dataframes_names = ['data_training_en', 'data_test_en', 'data_training_es', 'data_test_es']
columns = ['author', 'gender']
dictionary_list = [{'human': 0,
                    'bot':1},
                    {'male': 0,
                    'female':1,
                    'bot':2}]

In [13]:
data_training_en.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,tweet,author,gender
0,0,0,1008c35dc72c34ead679c539a0ed7c24,i can hear the black bull jukey calling,0,0
1,1,1,1008c35dc72c34ead679c539a0ed7c24,or the agincourt salute,0,0
2,2,2,1008c35dc72c34ead679c539a0ed7c24,insight into how challenging touring is for a ...,0,0
3,3,3,1008c35dc72c34ead679c539a0ed7c24,flight of the rat classic purps boogie on down,0,0
4,4,4,1008c35dc72c34ead679c539a0ed7c24,he switched his twitter aff give him a poke on...,0,0


In [14]:
# Data training to lists
x_en = data_training_en.tweet.to_list()
y_en = data_training_en.author.to_list()
x_es = data_training_es.tweet.to_list()
y_es = data_training_es.author.to_list()

In [15]:
# Data test to lists
xt_en = data_test_en.tweet.to_list()
yt_en = data_test_en.author.to_list()
xt_es = data_test_es.tweet.to_list()
yt_es = data_test_es.author.to_list()

In [16]:
X_train = x_en + x_es
y_train = y_en + y_es
X_test = xt_en + xt_es
y_test = yt_en + yt_es

In [17]:
len(X_train), len(y_train), len(X_test), len(y_test)

(123286, 123286, 68289, 68289)

## Hyperparams

In [18]:
batch_size = 64
learning_rate = 5e-5
num_epochs = 5
max_len = 128 # TODO: change it for real average of tweets
train_dataset = TextDataset(X_train, 
                            y_train, 
                            tokenizer, 
                            max_len)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, 
                              sampler=train_sampler, 
                              batch_size=batch_size)


In [19]:
optimizer = AdamW(model.parameters(), 
                  lr=learning_rate)
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

## Train model


In [4]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [21]:
for epoch in range(num_epochs):
    loss = train(model, 
                 train_dataloader, 
                 optimizer, 
                 device)
    print(f'Epoch: {epoch+1}, Loss: {loss}')

RuntimeError: MPS backend out of memory (MPS allocated: 2.74 GB, other allocations: 6.80 GB, max allowed: 9.07 GB). Tried to allocate 350.24 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

## Test model

In [None]:
from  torch.utils.data import SequentialSampler
test_dataset = TextDataset(X_test, 
                           y_test, 
                           tokenizer, 
                           max_len)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, 
                             sampler=test_sampler, 
                             batch_size=batch_size)

In [None]:
from sklearn.metrics import classification_report
# Obtener las etiquetas verdaderas y predichas
y_true = y_train
model.eval()
y_pred = []
for batch in test_dataloader:
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'token_type_ids': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    preds = np.argmax(logits, axis=1)
    y_pred.extend(preds)

# Generar el reporte de clasificación
target_names = ['human', 'bot']
print(classification_report(y_true, 
                            y_pred, 
                            target_names=target_names))