<a href="https://colab.research.google.com/github/mailaucq/Natural-Language-Processing/blob/master/C%C3%B3pia_de_xml_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py

#!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev


In [2]:
#!pip install transformers

In [3]:
#!pip install sentencepiece

In [4]:
import torch_xla
import torch_xla.core.xla_model as xm

In [5]:
import pandas as pd
import numpy as np
import os
import gc

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(555)

from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score

import transformers
from transformers import BertTokenizer, BertForSequenceClassification 
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW

import warnings
warnings.filterwarnings("ignore")


print(torch.__version__)

1.6.0a0+bf2bbd9


In [6]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

MODEL_TYPE = 'xlm-roberta-base'

tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)

In [7]:
tokenizer.vocab_size

250002

In [8]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'cls_token': '<s>',
 'eos_token': '</s>',
 'mask_token': '<mask>',
 'pad_token': '<pad>',
 'sep_token': '</s>',
 'unk_token': '<unk>'}

In [9]:
print('bos_token_id <s>:', tokenizer.bos_token_id)
print('eos_token_id </s>:', tokenizer.eos_token_id)
print('sep_token_id </s>:', tokenizer.sep_token_id)
print('pad_token_id <pad>:', tokenizer.pad_token_id)

bos_token_id <s>: 0
eos_token_id </s>: 2
sep_token_id </s>: 2
pad_token_id <pad>: 1


In [10]:
## Mount Drive into Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
dataset_path = "/content/drive/My Drive/Colab Notebooks/projeto-reconhecimento-autoria/dataset/"

In [12]:
data = pd.read_csv(dataset_path + "books_authorship_english.csv") 

In [13]:
data.shape

(78, 7)

In [14]:
min_size = 300

In [15]:
label_to_ix = {}
for label in data.label:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
label_to_ix

{'alan_poe': 3,
 'arthur_doyle': 9,
 'bram_stoker': 4,
 'charles_darwin': 8,
 'charles_dickens': 6,
 'daniel_defoe': 2,
 'george_eliot': 10,
 'hector_hugh': 0,
 'jane_austen': 11,
 'joseph_conrad': 12,
 'mark_twain': 5,
 'pelham_grenville': 7,
 'thomas_hardy': 1}

In [16]:
random_flag = True

In [17]:
df_train = pd.DataFrame(columns = ['text', 'label'])
corpus = [i.split() for i in data["words"]]
segmented_corpus = []

for book in corpus:
  partitions = int(round(len(book)/min_size,2) + 0.5)
  segments = [book[int(round(min_size * i)): int(round(min_size * (i + 1)))] for i in range(partitions)]
  segmented_corpus.append(segments)

for label, partitions in zip(data["label"], segmented_corpus):
  if random_flag:
    random_index = random.randint(0, len(partitions) - 1)
    text = " ".join(partitions[random_index])
    intent = label
    df_train = df_train.append({'text': text, 'label': intent}, ignore_index=True)
  else:
    for p in partitions:
      text = " ".join(p)
      intent = label
      df_train = df_train.append({'text': text, 'label': intent}, ignore_index=True)
df_train.tail()

Unnamed: 0,text,label
73,the criminal no doubt he wa selfish too but hi...,joseph_conrad
74,she ever have it have be drench in a ugly a lo...,joseph_conrad
75,his head even attempt have be make treacherous...,joseph_conrad
76,fresh track of wheel run under it the gate loo...,joseph_conrad
77,there in a trance do you hear we must he look ...,joseph_conrad


In [18]:
df_train.shape

(78, 2)

In [19]:
from sklearn.model_selection import KFold, StratifiedKFold

# shuffle
df = shuffle(df_train)

# initialize kfold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1024)

# for stratification
y = df['label']

# Note:
# Each fold is a tuple ([train_index_values], [val_index_values])
# fold_0, fold_1, fold_2, fold_3, fold_5 = kf.split(df, y)

# Put the folds into a list. This is a list of tuples.
fold_list = list(kf.split(df, y))

train_df_list = []
val_df_list = []

for i, fold in enumerate(fold_list):

    # map the train and val index values to dataframe rows
    df_train = df[df.index.isin(fold[0])]
    df_val = df[df.index.isin(fold[1])]
    
    train_df_list.append(df_train)
    val_df_list.append(df_val)
    
    

print(len(train_df_list))
print(len(val_df_list))


5
5


In [20]:
MODEL_TYPE = 'xlm-roberta-base'


L_RATE = 1e-5
MAX_LEN = 256 #256

NUM_EPOCHS = 3
BATCH_SIZE = 8 #32
NUM_CORES = os.cpu_count()

NUM_CORES

2

In [21]:
device = xm.xla_device()

print(device)

xla:1


In [22]:
df_train = train_df_list[0]

df_train.head()

Unnamed: 0,text,label
22,but in the corridor that follow the suite ther...,alan_poe
4,behind his return main guard for the purpose o...,hector_hugh
62,for he ll be a cunning a satan before long and...,george_eliot
63,i like to differ from everybody i think it be ...,george_eliot
72,lie open on heyst s knee slip suddenly and he ...,joseph_conrad


In [23]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# xlm-roberta-large
print('Loading XLMRoberta tokenizer...')
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)


Loading XLMRoberta tokenizer...


In [24]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [25]:
class BookDataset(Dataset):

    def __init__(self, df):
        self.df_data = df



    def __getitem__(self, index):

        # get the sentence from the dataframe
        sentence1 = self.df_data.loc[index, 'text']

        # Process the sentence
        # ---------------------

        encoded_dict = tokenizer.encode_plus(
                    sentence1,           # Sentences to encode.
                    add_special_tokens = True,      # Add the special tokens.
                    max_length = MAX_LEN,           # Pad & truncate all sentences.
                    pad_to_max_length = True,
                    truncation=True,
                    return_attention_mask = True,   # Construct attn. masks.
                    return_tensors = 'pt',          # Return pytorch tensors.
               )
        
        # These are torch tensors.
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        
        # Convert the target to a torch tensor
        target = torch.tensor(label_to_ix[self.df_data.loc[index, 'label']])

        sample = (padded_token_list, att_mask, target)


        return sample


    def __len__(self):
        return len(self.df_data)

In [26]:
train_data = BookDataset(df_train)
val_data = BookDataset(df_val)

train_dataloader = torch.utils.data.DataLoader(train_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

val_dataloader = torch.utils.data.DataLoader(val_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)




print(len(train_dataloader))
print(len(val_dataloader))


8
2


In [27]:
from transformers import XLMRobertaForSequenceClassification

model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_TYPE, 
    num_labels = len(list(label_to_ix.values())), # The number of output labels. 2 for binary classification.
)

# Send the model to the device.
model.to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [28]:
# Create a batch of train samples
# We will set a small batch size of 8 so that the model's output can be easily displayed.

train_dataloader = torch.utils.data.DataLoader(train_data,
                                        batch_size=8,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

b_input_ids, b_input_mask, b_labels = next(iter(train_dataloader))

print(b_input_ids.shape)
print(b_input_mask.shape)
print(b_labels.shape)

torch.Size([8, 256])
torch.Size([8, 256])
torch.Size([8])


In [29]:
# Pass a batch of train samples to the model.

batch = next(iter(train_dataloader))

# Send the data to the device
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)

# Run the model
outputs = model(b_input_ids, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)

# The ouput is a tuple (loss, preds).
outputs

SequenceClassifierOutput([('loss',
                           tensor(2.5946, device='xla:1', grad_fn=<NllLossBackward>)),
                          ('logits',
                           tensor([[ 0.1276, -0.2763, -0.1773,  0.1942, -0.1199, -0.2912, -0.2340, -0.0090,
                                    -0.1208, -0.0334, -0.5123,  0.2834,  0.1726],
                                   [ 0.1231, -0.2743, -0.1803,  0.1989, -0.1155, -0.2862, -0.2266, -0.0029,
                                    -0.1298, -0.0419, -0.5036,  0.2869,  0.1681],
                                   [ 0.1258, -0.2694, -0.1784,  0.1953, -0.1166, -0.2913, -0.2307, -0.0081,
                                    -0.1235, -0.0403, -0.5092,  0.2818,  0.1736],
                                   [ 0.1281, -0.2758, -0.1790,  0.1936, -0.1154, -0.2907, -0.2294, -0.0048,
                                    -0.1234, -0.0412, -0.5030,  0.2861,  0.1718],
                                   [ 0.1328, -0.2708, -0.1761,  0.1926, -0.1153, 

In [30]:
preds = outputs[1].detach().cpu().numpy()

y_true = b_labels.detach().cpu().numpy()
y_pred = np.argmax(preds, axis=1)

y_pred

array([11, 11, 11, 11, 11, 11, 11, 11])

In [31]:
# This is the accuracy without fine tuning.

val_acc = accuracy_score(y_true, y_pred)

val_acc

0.25

In [32]:
# Define the optimizer
optimizer = AdamW(model.parameters(),
              lr = L_RATE, 
              eps = 1e-8 
            )

In [33]:
# Create the dataloaders.

train_data = BookDataset(df_train)
val_data = BookDataset(df_val)

train_dataloader = torch.utils.data.DataLoader(train_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)

val_dataloader = torch.utils.data.DataLoader(val_data,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True,
                                       num_workers=NUM_CORES)




print(len(train_dataloader))
print(len(val_dataloader))


8
2


In [34]:
%%time


# Set the seed.
seed_val = 101

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []


# For each epoch...
for epoch in range(0, NUM_EPOCHS):
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))
    

    stacked_val_labels = []
    targets_list = []

    # ========================================
    #               Training
    # ========================================
    
    print('Training...')
    
    # put the model into train mode
    model.train()
    
    # This turns gradient calculations on and off.
    torch.set_grad_enabled(True)


    # Reset the total loss for this epoch.
    total_train_loss = 0

    for i, batch in enumerate(train_dataloader):
        
        train_status = 'Batch ' + str(i) + ' of ' + str(len(train_dataloader))
        
        print(train_status, end='\r')


        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        


        outputs = model(b_input_ids, 
                    attention_mask=b_input_mask,
                    labels=b_labels)
        
        # Get the loss from the outputs tuple: (loss, logits)
        loss = outputs[0]
        
        # Convert the loss from a torch tensor to a number.
        # Calculate the total loss.
        total_train_loss = total_train_loss + loss.item()
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        
        
        # Use the optimizer to update the weights.
        
        # Optimizer for GPU
        # optimizer.step() 
        
        # Optimizer for TPU
        # https://pytorch.org/xla/
        xm.optimizer_step(optimizer, barrier=True)

    
    print('Train loss:' ,total_train_loss)


    # ========================================
    #               Validation
    # ========================================
    
    print('\nValidation...')

    # Put the model in evaluation mode.
    model.eval()

    # Turn off the gradient calculations.
    # This tells the model not to compute or store gradients.
    # This step saves memory and speeds up validation.
    torch.set_grad_enabled(False)
    
    
    # Reset the total loss for this epoch.
    total_val_loss = 0
    

    for j, batch in enumerate(val_dataloader):
        
        val_status = 'Batch ' + str(j) + ' of ' + str(len(val_dataloader))
        
        print(val_status, end='\r')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)      


        outputs = model(b_input_ids, 
                attention_mask=b_input_mask, 
                labels=b_labels)
        
        # Get the loss from the outputs tuple: (loss, logits)
        loss = outputs[0]
        
        # Convert the loss from a torch tensor to a number.
        # Calculate the total loss.
        total_val_loss = total_val_loss + loss.item()
        

        # Get the preds
        preds = outputs[1]


        # Move preds to the CPU
        val_preds = preds.detach().cpu().numpy()
        
        # Move the labels to the cpu
        targets_np = b_labels.to('cpu').numpy()

        # Append the labels to a numpy list
        targets_list.extend(targets_np)

        if j == 0:  # first batch
            stacked_val_preds = val_preds

        else:
            stacked_val_preds = np.vstack((stacked_val_preds, val_preds))

    
    # Calculate the validation accuracy
    y_true = targets_list
    y_pred = np.argmax(stacked_val_preds, axis=1)
    
    val_acc = accuracy_score(y_true, y_pred)
    
    
    print('Val loss:' ,total_val_loss)
    print('Val acc: ', val_acc)


    # Save the Model
    torch.save(model.state_dict(), 'model.pt')
    
    # Use the garbage collector to save memory.
    gc.collect()


Training...
Train loss: 20.681824445724487

Validation...
Val loss: 5.224736928939819
Val acc:  0.13333333333333333

Training...
Train loss: 20.55401062965393

Validation...
Val loss: 5.217558860778809
Val acc:  0.13333333333333333

Training...
Train loss: 20.808268785476685

Validation...
Val loss: 5.200766324996948
Val acc:  0.13333333333333333
CPU times: user 34.9 s, sys: 19.2 s, total: 54.1 s
Wall time: 3min 32s
