<a href="https://colab.research.google.com/github/miataigeli/capstone_FHIS/blob/darya/bert_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## BERT

Based on tutorial here: https://www.youtube.com/watch?v=mw7ay38--ak

Imports and Installations

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 14.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 48.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 42.3MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b2

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, BertModel
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
#specify GPU
device = torch.device("cuda")

In [4]:
#connect to my drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load Dataset

In [5]:
# Read in all json files into one pandas dataframe
import os

corpus_dir = "/content/drive/MyDrive/capstone/corpus"
corpus_df = pd.DataFrame([], columns = ['content', 'level'])

for filename in os.listdir(corpus_dir):
    if filename.endswith("aventura.json"): 
         file_path = os.path.join(corpus_dir, filename)
         df = pd.read_json(file_path)
         df = df.drop(columns=['source', 'author', 'title'])
         corpus_df = pd.concat([corpus_df, df])
    else:
        continue

print(corpus_df.describe())

                                                  content level
count                                                  53    53
unique                                                 53     2
top     CApÍtULO �\n\nLas actuaciones en El Dorado se ...    A1
freq                                                    1    42


In [6]:
corpus_df['level'].value_counts(normalize = True)

A1    0.792453
A2    0.207547
Name: level, dtype: float64

In [7]:
labels = corpus_df['level'].unique()
print(labels)

['A1' 'A2']


In [8]:
# Make sure the texts have <512 words each
content_length = df.content.astype(str).map(len)
print(len(df.loc[content_length.argmax(), 'content']))
print(len(df.loc[content_length.argmax(), 'content'].split(" ")))

10787
1695


### Split into train, validation and test sets

In [9]:
train_text, test_text, train_levels, test_levels = train_test_split(list(df['content']), list(df['level']),
                                                                    random_state = 2021,
                                                                    test_size = 0.3) #did not include stratify

# split test into validation and test
val_text, test_text, val_levels, test_levels = train_test_split(test_text, test_levels,
                                                                random_state = 2021,
                                                                test_size=0.5)

End-to-end BERT Classification

In [10]:
model_path = 'dccuchile/bert-base-spanish-wwm-uncased'
# tokenizer from pre-trained BERT model
tokenizer = BertTokenizerFast.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', return_tensors='pt')
# Define label to number dictionary
lab2ind = {'A1': 1,
           'A2': 2,
           'B1': 3,
           'B2': 4,
           'B': 5}

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=247723.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=486125.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=134.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=310.0, style=ProgressStyle(description_…




In [11]:
# Prepare data
def prepare_data(text, levels, max_len = 32):
  ''' Preprocesses the data for classification. '''
  
  # Tokenize text
  tokenized_texts = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False, return_tensors='pt')

  # Create label tensor
  labels = [lab2ind[i] for i in levels]
  labels = torch.tensor(labels)

  # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
  input_ids = tokenized_texts['input_ids'][:, :512] #TODO: change length properly!!
  attention_masks = tokenized_texts['attention_mask'][:, :512] #TODO: change length properly!!

  # Convert all of our data into torch tensors, the required datatype for our model
  inputs = torch.tensor(input_ids)
  masks = torch.tensor(attention_masks)

  return inputs, labels, masks

In [12]:
# Training data
train_inputs, train_labels, train_masks = prepare_data(train_text, train_levels)
print(train_inputs.shape)
print(train_labels.shape)
print(train_masks.shape)

Token indices sequence length is longer than the specified maximum sequence length for this model (2648 > 512). Running this sequence through the model will result in indexing errors


torch.Size([37, 512])
torch.Size([37])
torch.Size([37, 512])




In [13]:
# Validation data
valid_inputs, valid_labels, valid_masks = prepare_data(val_text, val_levels)
print(valid_inputs.shape)
print(valid_labels.shape)
print(valid_masks.shape)

torch.Size([8, 512])
torch.Size([8])
torch.Size([8, 512])




In [14]:
# Test data
test_inputs, test_labels, test_masks = prepare_data(test_text, test_levels)
print(test_inputs.shape)
print(test_labels.shape)
print(test_masks.shape)

torch.Size([8, 512])
torch.Size([8])
torch.Size([8, 512])




In [15]:
# Create an iterator for our data
batch_size = 32
# We'll take training samples in random order in each epoch. 
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, 
                              sampler = RandomSampler(train_data), # Select batches randomly
                              batch_size=batch_size)

# We'll just read validation set sequentially.
validation_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
validation_dataloader = DataLoader(validation_data, 
                                   sampler = SequentialSampler(validation_data), # Pull out batches sequentially.
                                   batch_size=batch_size)

In [16]:
model_path = "dccuchile/bert-base-spanish-wwm-uncased"

bert_model = BertModel.from_pretrained(model_path, output_hidden_states=True, output_attentions=True).to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=650.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=439621341.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

In [17]:
dataiter = iter(train_dataloader)
batch = dataiter.next()
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from our dataloader
input_ids, input_mask, labels = batch

In [18]:
print(input_ids.shape)
print(input_mask.shape)
print(labels.shape)

torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32])


In [19]:
outputs = bert_model(input_ids[:, :200], attention_mask = input_mask[:, :200])
print(outputs.keys())

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'attentions'])


In [20]:
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.5697, -0.3645,  0.9580,  ..., -0.0688, -0.3572, -1.1510],
         [-0.0755,  0.1158,  0.2016,  ...,  0.0295, -0.3942,  0.0056],
         [ 0.1865,  0.2098, -0.0156,  ...,  0.0825, -0.0615,  0.3884],
         ...,
         [-0.4533,  0.2608,  1.4901,  ...,  1.3285,  0.0113, -1.2284],
         [-0.6981,  0.4360,  1.2151,  ...,  0.2270, -0.2741, -1.3811],
         [-0.8998,  0.1951,  0.3485,  ...,  0.1530, -0.1090, -1.3883]],

        [[-0.2045,  0.0950,  0.3325,  ..., -0.7340, -0.4500, -1.1641],
         [ 0.3480,  0.1524,  0.4008,  ..., -0.4962, -0.4067, -0.0084],
         [ 0.3889,  0.1877,  0.2272,  ..., -0.5355, -0.6424,  0.2204],
         ...,
         [-0.3769, -0.1463,  0.3928,  ..., -0.2709, -0.6695, -0.3102],
         [ 0.1826,  0.4567, -0.1664,  ...,  0.1556, -0.3797, -0.0920],
         [ 0.0990, -0.4717, -0.1198,  ..., -0.2943, -0.5954, -0.1691]],

        [[-0.3194, -0.0187,  0.3545,  ..., -0.4020, -

In [21]:
last_hidden_state = outputs["last_hidden_state"]
pooler_output = outputs["pooler_output"]
#hidden_states = outputs["hidden_states"]
#attentions = outputs["attentions"]
print(last_hidden_state.shape)

torch.Size([32, 200, 768])


In [36]:
dense = nn.Linear(768, 768).to(device)
dropout = nn.Dropout(0.1).to(device)
fc = nn.Linear(768, 2).to(device)
softmax = nn.Softmax(dim=1)

In [38]:
dense_output = dense(pooler_output)
drop_output = dropout(dense_output)
fc_output = fc(drop_output)
fc_softmax_output = softmax(fc_output)

print(fc_softmax_output)

tensor([[0.5363, 0.4637],
        [0.5390, 0.4610],
        [0.5472, 0.4528],
        [0.5343, 0.4657],
        [0.5731, 0.4269],
        [0.5586, 0.4414],
        [0.5649, 0.4351],
        [0.5441, 0.4559],
        [0.5504, 0.4496],
        [0.5798, 0.4202],
        [0.5592, 0.4408],
        [0.5315, 0.4685],
        [0.5486, 0.4514],
        [0.5426, 0.4574],
        [0.5266, 0.4734],
        [0.5678, 0.4322],
        [0.5279, 0.4721],
        [0.5646, 0.4354],
        [0.5521, 0.4479],
        [0.5796, 0.4204],
        [0.5556, 0.4444],
        [0.5294, 0.4706],
        [0.5272, 0.4728],
        [0.5450, 0.4550],
        [0.5763, 0.4237],
        [0.5880, 0.4120],
        [0.5723, 0.4277],
        [0.5479, 0.4521],
        [0.5668, 0.4332],
        [0.5760, 0.4240],
        [0.5542, 0.4458],
        [0.5664, 0.4336]], device='cuda:0', grad_fn=<SoftmaxBackward>)


In [25]:
fc_output.shape

torch.Size([32, 2])

In [26]:
train_labels.shape

torch.Size([37])

In [28]:
train_labels.to(device)

tensor([1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2], device='cuda:0')

In [29]:
criterion = nn.CrossEntropyLoss()
criterion(fc_output, train_labels[:32])

RuntimeError: ignored

### BERT

In [21]:
# import BERT-based pretrained model
#bert = AutoModel.from_pretrained('bert-base-uncased')

bert_model = AutoModel.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased').to(device)#, output_hidden_states=True, output_attentions=True).to(device)


Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

In [22]:
#sample data
text = ['Me llamo Darya', 'vamos a probar un modelo de red neuronal.']

#encode text
tokenized_texts = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False, return_tensors='pt').to(device)
print(tokenized_texts)

{'input_ids': tensor([[    4,  1094,  5592,  1785,  1742,     5,     1,     1,     1,     1,
             1,     1],
        [    4,  1441,  1012,  6909,  1044,  4209,  1009,  2946, 12212,  7592,
          1008,     5]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [23]:
#input_ids = [tokenizer.convert_tokens_to_ids(x, return_tensors='pt') for x in tokenized_texts]
#print(input_ids)
outputs = bert_model(tokenized_texts['input_ids'])
print(outputs.keys())

odict_keys(['last_hidden_state', 'pooler_output'])


In [24]:
last_hidden_state = outputs['last_hidden_state']
pooler_output = outputs['pooler_output']

In [25]:
last_hidden_state.shape

torch.Size([2, 12, 768])

In [26]:
pooler_output.shape

torch.Size([2, 768])

We use `pooler_output` as context representation and pass it to a fully connected layer which outputs the prediction probabilities across all labels.

Two new feed-forward layers for classification are added on top of BERT. Each input is a context representation (`pooler_output`) that is a 768-dimensional vector, and the output is the probability distribution across all labels that is a 5-dimensional vector.

In [27]:
dense = nn.Linear(768, 768).to(device)
dropout = nn.Dropout(0.1).to(device)
fc = nn.Linear(768, 5).to(device)

In [28]:
dense_output = dense(pooler_output).to(device)
drop_output = dropout(dense_output).to(device)
print(drop_output)
fc_output = fc(drop_output).to(device)
print(fc_output)

tensor([[-0.0176,  0.4253,  0.7136,  ..., -0.0709,  0.0000, -0.1333],
        [ 0.1785, -0.0036, -0.0775,  ...,  0.4366,  0.0000, -0.0766]],
       device='cuda:0', grad_fn=<FusedDropoutBackward>)
tensor([[ 0.1812,  0.0752, -0.3178,  0.0171,  0.3602],
        [-0.0743,  0.0557, -0.0878, -0.0601,  0.0963]], device='cuda:0',
       grad_fn=<AddmmBackward>)


In [29]:
# We use nn.CrossEntropyLoss() as our loss function. 
#fc_output = torch.cuda.FloatTensor(fc_output)
#print(fc_output)
#fc_output = fc_output.long()
# criterion = nn.CrossEntropyLoss()
# labels = [1, 2, 3, 4, 5]
# criterion(fc_output, torch.Tensor(labels))

In [30]:
# FROM HUGGINGFACE BETO TUTORIAL

text = "[CLS] Para solucionar los [MASK] de Chile, el presidente debe [MASK] de inmediato. [SEP]"
masked_indxs = (4,11)

tokens = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])

predictions = bert(tokens_tensor)[0]

for i,midx in enumerate(masked_indxs):
    idxs = torch.argsort(predictions[0,midx], descending=True)
    predicted_token = tokenizer.convert_ids_to_tokens(idxs[:5])
    print('MASK',i,':',predicted_token)

NameError: ignored