In [1]:
#Importing Libraries
import torch
import numpy as np
import os
from os import path
import pandas as pd
from numpy import array
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from itertools import combinations

import transformers

import re, string, timeit
import random

In [2]:
# Example dataframe with text in spanish
data = {'col_1': [0, 2, 1], 
        'col_2': ['hola como estan', 'alumnos queridos', 'vamos a hablar de matematicas']}

data = pd.DataFrame.from_dict(data)

In [3]:
tokenizer = transformers.BertTokenizerFast.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", do_lower_case=True)

In [13]:
encoded_dict = tokenizer.batch_encode_plus(
                        list(utterances_train),                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 120,
                        truncation = True,           # Pad & truncate all utterances.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

encoded_dict['labels'] = torch.tensor(labels_train, dtype=torch.long)

  encoded_dict['labels'] = torch.tensor(labels_train, dtype=torch.long)


In [4]:
# Tokenize all of the utterances and map the tokens to thier word IDs.
utterances_train = data.col_2.values
labels_train = data.col_1.values

input_ids_train = []
attention_masks_train = []

# For every sentence...
for utterance in utterances_train:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        utterance,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 120,
                        truncation = True,           # Pad & truncate all utterances.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids_train.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks_train.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids_train = torch.cat(input_ids_train, dim=0)
attention_masks_train = torch.cat(attention_masks_train, dim=0)
labels_train = torch.tensor(labels_train, dtype=torch.long).detach().clone()

# Print sentence 0, now as a list of IDs.
print('Original: ', utterances_train[0])
print('Token IDs:', input_ids_train[0])

Original:  hola como estan
Token IDs: tensor([   4, 1734, 1151, 5471,    5,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1])


  encoded_dict['labels'] = torch.tensor(labels_train, dtype=torch.long)


In [5]:
from torch.utils.data import TensorDataset

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

In [6]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(dataset_train,  # The training samples.
                              sampler = RandomSampler(dataset_train), # Select batches randomly
                              batch_size = batch_size) # Trains with this batch size.

In [14]:
class CorpusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [15]:
dataset = CorpusDataset(encoded_dict)

In [16]:
encoded_dict

{'input_ids': tensor([[    4,  1734,  1151,  5471,     5,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    4,  7484, 13046,     5, 

In [21]:
model = transformers.BertForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", 
                                                                   num_labels = data.col_2.nunique(), # The number of output labels--2 for binary classification.
                                                                   output_attentions = False, # Whether the model returns attentions weights.
                                                                   output_hidden_states = False, # Whether the model returns all hidden-states.
                                                                   ) 

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

In [17]:
training_args = transformers.TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=3,              # total # of training epochs
        per_device_train_batch_size=8,  # batch size per device during training
        per_device_eval_batch_size=8,   # batch size for evaluation 
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',
        load_best_model_at_end=True,
        evaluation_strategy="epoch",
        save_strategy = "epoch")

In [18]:
early_stopping = transformers.EarlyStoppingCallback(early_stopping_patience=5)

In [22]:
trainer = transformers.Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=dataset,               # training dataset
        eval_dataset=dataset,                # evaluation dataset
        callbacks = [early_stopping])

In [23]:
trainer.train()

***** Running training *****
  Num examples = 3
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,No log,1.252605
2,No log,1.250983
3,No log,1.2478


***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-1
Configuration saved in ./results/checkpoint-1/config.json
Model weights saved in ./results/checkpoint-1/pytorch_model.bin
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-2
Configuration saved in ./results/checkpoint-2/config.json
Model weights saved in ./results/checkpoint-2/pytorch_model.bin
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-3
Configuration saved in ./results/checkpoint-3/config.json
Model weights saved in ./results/checkpoint-3/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpo

TrainOutput(global_step=3, training_loss=1.201616684595744, metrics={'train_runtime': 39.8834, 'train_samples_per_second': 0.226, 'train_steps_per_second': 0.075, 'total_flos': 555004865520.0, 'train_loss': 1.201616684595744, 'epoch': 3.0})

In [27]:
logits = model(encoded_dict['input_ids'])['logits']

In [34]:
torch.nn.Softmax()(logits)

  torch.nn.Softmax()(logits)


tensor([[0.3205, 0.3388, 0.3407],
        [0.3432, 0.3195, 0.3373],
        [0.3516, 0.3106, 0.3378]], grad_fn=<SoftmaxBackward>)