<a href="https://colab.research.google.com/github/miataigeli/capstone_FHIS/blob/darya/bert_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## BERT

Based on tutorial here: https://www.youtube.com/watch?v=mw7ay38--ak

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 9.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 46.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 48.9MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25

In [2]:
import transformers

In [178]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, BertModel
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [4]:
#specify GPU
device = torch.device("cuda")

In [5]:
#connect to my drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load Dataset

In [6]:
# Read in all json files into one dataframe
import os

corpus_dir = "/content/drive/MyDrive/capstone/corpus"
corpus_df = pd.DataFrame([], columns = ['content', 'level'])

for filename in os.listdir(corpus_dir):
    if filename.endswith(".json"): 
         file_path = os.path.join(corpus_dir, filename)
         df = pd.read_json(file_path)
         df = df.drop(columns=['source', 'author', 'title'])
         corpus_df = pd.concat([corpus_df, df])
    else:
        continue

print(corpus_df.describe())

                                                  content level
count                                                 308   308
unique                                                308     5
top     Señor, porque sé que habréis placer de la gran...    A1
freq                                                    1    94


In [7]:
corpus_df['level'].value_counts(normalize = True)

A1    0.305195
B     0.288961
A2    0.201299
B1    0.136364
B2    0.068182
Name: level, dtype: float64

In [55]:
labels = corpus_df['level'].unique()
print(labels)

['A1' 'A2' 'B1' 'B2' 'B']


### Split into train, validation and test sets

In [143]:
train_text, test_text, train_levels, test_levels = train_test_split(list(df['content']), list(df['level']),
                                                                    random_state = 2021,
                                                                    test_size = 0.3) #did not include stratify

# split test into validation and test
val_text, test_text, val_levels, test_levels = train_test_split(test_text, test_levels,
                                                                random_state = 2021,
                                                                test_size=0.5)

End-to-end BERT Classification

In [119]:
model_path = 'dccuchile/bert-base-spanish-wwm-uncased'
# tokenizer from pre-trained BERT model
tokenizer = BertTokenizerFast.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', return_tensors='pt')
# Define label to number dictionary
lab2ind = {'A1': 1,
           'A2': 2,
           'B1': 3,
           'B2': 4,
           'B': 5}

In [169]:
# Prepare data
def prepare_data(text, levels, max_len = 32):

  # Tokenize text
  tokenized_texts = tokenizer.batch_encode_plus(list(text), padding=True, return_token_type_ids=False, return_tensors='pt')
  print ("Tokenize the first sentence:\n",tokenized_texts[0])
  print(tokenized_texts)
  print(tokenized_texts['input_ids'][0].shape)
  print(tokenized_texts['attention_mask'][0].shape)

  # Create label tensor
  labels = [lab2ind[i] for i in levels]
  labels = torch.tensor(labels)
  print("Labels:\n", labels)

  # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
  input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
  print ("Index numbers of the first sentence:\n",input_ids[0])

  # Pad our input seqeunce to the fixed length (i.e., max_len) with index of [PAD] token
  #pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
  #input_ids = pad_sequences(list(input_ids), maxlen=max_len+2, dtype="long", truncating="post", padding="post", value=pad_ind)
  input_ids = tokenized_texts['input_ids']
  print ("Index numbers of the first sentence after padding:\n",input_ids[0])

  # Create attention masks
  #attention_masks = []

  # Create a mask of 1s for each token followed by 0s for pad tokens
  #for seq in input_ids:
  #    seq_mask = [float(i>1) for i in seq]
  #    attention_masks.append(seq_mask)
  attention_masks = tokenized_texts['attention_mask']

  # Convert all of our data into torch tensors, the required datatype for our model
  inputs = torch.tensor(input_ids)
  masks = torch.tensor(attention_masks)

  return inputs, labels, masks

In [170]:
# Training data
train_inputs, train_labels, train_masks = prepare_data(train_text, train_levels)
print(train_inputs.shape)
print(train_labels.shape)
print(train_masks.shape)

Tokenize the first sentence:
 Encoding(num_tokens=2648, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
{'input_ids': tensor([[   4, 5390, 1306,  ...,    1,    1,    1],
        [   4, 5390, 1129,  ...,    1,    1,    1],
        [   4, 5390, 1098,  ...,    1,    1,    1],
        ...,
        [   4, 5390, 1098,  ...,    1,    1,    1],
        [   4, 5390,  997,  ...,    1,    1,    1],
        [   4, 5390, 2116,  ...,    1,    1,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([2648])
torch.Size([2648])
Labels:
 tensor([1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2])
Index numbers of the first sentence:
 3
Index numbers of the first sentence after padding:
 tens



In [172]:
# Validation data
valid_inputs, valid_labels, valid_masks = prepare_data(val_text, val_levels)
print(valid_inputs[0])
print(valid_labels)
print(valid_masks)

Tokenize the first sentence:
 Encoding(num_tokens=2292, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
{'input_ids': tensor([[   4, 5390,  999,  ...,    1,    1,    1],
        [   4, 5390, 1001,  ...,    1,    1,    1],
        [   4, 5390, 1002,  ...,    1,    1,    1],
        ...,
        [   4, 5390, 1098,  ...,    1,    1,    1],
        [   4, 5390, 2116,  ...,    1,    1,    1],
        [   4, 5390, 1413,  ...,    1,    1,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([2292])
torch.Size([2292])
Labels:
 tensor([1, 1, 1, 2, 1, 2, 1, 1])
Index numbers of the first sentence:
 3
Index numbers of the first sentence after padding:
 tensor([   4, 5390,  999,  ...,    1,    1,    1])
tensor([   4, 5390,  999,  ...,    1,    1,    1



In [173]:
# Test data
test_inputs, test_labels, test_masks = prepare_data(test_text, test_levels)
print(test_inputs[0])
print(test_labels)
print(test_masks)

Tokenize the first sentence:
 Encoding(num_tokens=2097, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
{'input_ids': tensor([[   4, 5390, 2296,  ...,    1,    1,    1],
        [   4, 5390, 1673,  ...,    1,    1,    1],
        [   4, 5390, 1129,  ...,    1,    1,    1],
        ...,
        [   4, 5390, 1001,  ...,    1,    1,    1],
        [   4, 5390, 1129,  ...,    1,    1,    1],
        [   4, 5390, 2104,  ...,    1,    1,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
torch.Size([2097])
torch.Size([2097])
Labels:
 tensor([1, 1, 1, 1, 1, 1, 1, 2])
Index numbers of the first sentence:
 3
Index numbers of the first sentence after padding:
 tensor([   4, 5390, 2296,  ...,    1,    1,    1])
tensor([   4, 5390, 2296,  ...,    1,    1,    1



In [174]:
# Create an iterator for our data
batch_size = 32
# We'll take training samples in random order in each epoch. 
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, 
                              sampler = RandomSampler(train_data), # Select batches randomly
                              batch_size=batch_size)

# We'll just read validation set sequentially.
validation_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
validation_dataloader = DataLoader(validation_data, 
                                   sampler = SequentialSampler(validation_data), # Pull out batches sequentially.
                                   batch_size=batch_size)

In [179]:
model_path = "dccuchile/bert-base-spanish-wwm-uncased"

bert_model = BertModel.from_pretrained(model_path, output_hidden_states=True, output_attentions=True).to(device)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

### BERT

In [42]:
# import BERT-based pretrained model
#bert = AutoModel.from_pretrained('bert-base-uncased')

bert_model = AutoModel.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased').to(device)#, output_hidden_states=True, output_attentions=True).to(device)


Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

In [45]:
#sample data
text = ['Me llamo Darya', 'vamos a probar un modelo de red neuronal.']

#encode text
tokenized_texts = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False, return_tensors='pt').to(device)
print(tokenized_texts)

{'input_ids': tensor([[    4,  1094,  5592,  1785,  1742,     5,     1,     1,     1,     1,
             1,     1],
        [    4,  1441,  1012,  6909,  1044,  4209,  1009,  2946, 12212,  7592,
          1008,     5]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [46]:
#input_ids = [tokenizer.convert_tokens_to_ids(x, return_tensors='pt') for x in tokenized_texts]
#print(input_ids)
outputs = bert_model(tokenized_texts['input_ids'])
print(outputs.keys())

odict_keys(['last_hidden_state', 'pooler_output'])


In [47]:
last_hidden_state = outputs['last_hidden_state']
pooler_output = outputs['pooler_output']

In [48]:
last_hidden_state.shape

torch.Size([2, 12, 768])

In [49]:
pooler_output.shape

torch.Size([2, 768])

We use `pooler_output` as context representation and pass it to a fully connected layer which outputs the prediction probabilities across all labels.

Two new feed-forward layers for classification are added on top of BERT. Each input is a context representation (`pooler_output`) that is a 768-dimensional vector, and the output is the probability distribution across all labels that is a 5-dimensional vector.

In [67]:
dense = nn.Linear(768, 768).to(device)
dropout = nn.Dropout(0.1).to(device)
fc = nn.Linear(768, 5).to(device)

In [84]:
dense_output = dense(pooler_output).to(device)
drop_output = dropout(dense_output).to(device)
print(drop_output)
fc_output = fc(drop_output).to(device)
print(fc_output)

tensor([[ 0.0000, -0.0498,  0.1908,  ...,  0.1056, -0.1257, -0.0713],
        [-0.1448,  0.0772,  0.0095,  ...,  0.3866,  0.1082, -0.3076]],
       device='cuda:0', grad_fn=<FusedDropoutBackward>)
tensor([[ 0.0202,  0.1299,  0.0990,  0.1591,  0.0502],
        [ 0.0592, -0.0887,  0.1519, -0.0142, -0.1652]], device='cuda:0',
       grad_fn=<AddmmBackward>)


In [None]:
# We use nn.CrossEntropyLoss() as our loss function. 
#fc_output = torch.cuda.FloatTensor(fc_output)
#print(fc_output)
#fc_output = fc_output.long()
# criterion = nn.CrossEntropyLoss()
# labels = [1, 2, 3, 4, 5]
# criterion(fc_output, torch.Tensor(labels))

In [11]:
# FROM HUGGINGFACE BETO TUTORIAL

text = "[CLS] Para solucionar los [MASK] de Chile, el presidente debe [MASK] de inmediato. [SEP]"
masked_indxs = (4,11)

tokens = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])

predictions = bert(tokens_tensor)[0]

for i,midx in enumerate(masked_indxs):
    idxs = torch.argsort(predictions[0,midx], descending=True)
    predicted_token = tokenizer.convert_ids_to_tokens(idxs[:5])
    print('MASK',i,':',predicted_token)

MASK 0 : ['[unused400]', '[unused244]', '[unused678]', '[unused282]', '[unused599]']
MASK 1 : ['[unused282]', '[unused504]', '[unused145]', '[unused546]', '[unused749]']
