<a href="https://colab.research.google.com/github/miataigeli/capstone_FHIS/blob/darya/bert_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## BERT

Based on tutorial here: https://www.youtube.com/watch?v=mw7ay38--ak

Imports and Installations

In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 2.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 18.8MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |███████

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, BertModel
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [5]:
#specify GPU
device = torch.device("cuda")

In [6]:
#connect to my drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load Dataset

In [7]:
# Read in all json files into one pandas dataframe
import os

corpus_dir = "/content/drive/MyDrive/capstone/corpus"
corpus_df = pd.DataFrame([], columns = ['content', 'level'])

for filename in os.listdir(corpus_dir):
    if filename.endswith("aventura.json"): 
         file_path = os.path.join(corpus_dir, filename)
         df = pd.read_json(file_path)
         df = df.drop(columns=['source', 'author', 'title'])
         corpus_df = pd.concat([corpus_df, df])
    else:
        continue

print(corpus_df.describe())

                                                  content level
count                                                  53    53
unique                                                 53     2
top     CAPÍtULO �\n\nMartín está distraído toda la ma...    A1
freq                                                    1    42


In [8]:
corpus_df['level'].value_counts(normalize = True)

A1    0.792453
A2    0.207547
Name: level, dtype: float64

In [9]:
labels = corpus_df['level'].unique()
print(labels)

['A1' 'A2']


In [10]:
# Make sure the texts have <512 words each
content_length = df.content.astype(str).map(len)
print(len(df.loc[content_length.argmax(), 'content']))
print(len(df.loc[content_length.argmax(), 'content'].split(" ")))

10787
1695


### Split into train, validation and test sets

In [11]:
train_text, test_text, train_levels, test_levels = train_test_split(list(df['content']), list(df['level']),
                                                                    random_state = 2021,
                                                                    test_size = 0.3) #did not include stratify

# split test into validation and test
val_text, test_text, val_levels, test_levels = train_test_split(test_text, test_levels,
                                                                random_state = 2021,
                                                                test_size=0.5)

End-to-end BERT Classification

In [12]:
model_path = 'dccuchile/bert-base-spanish-wwm-uncased'
# tokenizer from pre-trained BERT model
tokenizer = BertTokenizerFast.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', return_tensors='pt')
# Define label to number dictionary
lab2ind = {'A1': 1,
           'A2': 2,
           'B1': 3,
           'B2': 4,
           'B': 5}

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=247723.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=486125.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=134.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=310.0, style=ProgressStyle(description_…




In [13]:
# Prepare data
def prepare_data(text, levels, max_len = 32):
  ''' Preprocesses the data for classification. '''
  
  # Tokenize text
  tokenized_texts = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False, return_tensors='pt')

  # Create label tensor
  labels = [lab2ind[i] for i in levels]
  labels = torch.tensor(labels)

  # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
  input_ids = tokenized_texts['input_ids'][:, :512] #TODO: change length properly!!
  attention_masks = tokenized_texts['attention_mask'][:, :512] #TODO: change length properly!!

  # Convert all of our data into torch tensors, the required datatype for our model
  inputs = torch.tensor(input_ids)
  masks = torch.tensor(attention_masks)

  return inputs, labels, masks

In [14]:
# Training data
train_inputs, train_labels, train_masks = prepare_data(train_text, train_levels)
print(train_inputs.shape)
print(train_labels.shape)
print(train_masks.shape)

Token indices sequence length is longer than the specified maximum sequence length for this model (2648 > 512). Running this sequence through the model will result in indexing errors


torch.Size([37, 512])
torch.Size([37])
torch.Size([37, 512])




In [15]:
# Validation data
valid_inputs, valid_labels, valid_masks = prepare_data(val_text, val_levels)
print(valid_inputs.shape)
print(valid_labels.shape)
print(valid_masks.shape)

torch.Size([8, 512])
torch.Size([8])
torch.Size([8, 512])




In [16]:
# Test data
test_inputs, test_labels, test_masks = prepare_data(test_text, test_levels)
print(test_inputs.shape)
print(test_labels.shape)
print(test_masks.shape)

torch.Size([8, 512])
torch.Size([8])
torch.Size([8, 512])




In [17]:
# Create an iterator for our data
batch_size = 32
# We'll take training samples in random order in each epoch. 
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, 
                              sampler = RandomSampler(train_data), # Select batches randomly
                              batch_size=batch_size)

# We'll just read validation set sequentially.
validation_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
validation_dataloader = DataLoader(validation_data, 
                                   sampler = SequentialSampler(validation_data), # Pull out batches sequentially.
                                   batch_size=batch_size)

In [18]:
model_path = "dccuchile/bert-base-spanish-wwm-uncased"

bert_model = BertModel.from_pretrained(model_path, output_hidden_states=True, output_attentions=True).to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=650.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=439621341.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

In [19]:
dataiter = iter(train_dataloader)
batch = dataiter.next()
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from our dataloader
input_ids, input_mask, labels = batch

In [20]:
print(input_ids.shape)
print(input_mask.shape)
print(labels.shape)

torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([32])


In [21]:
outputs = bert_model(input_ids[:, :200], attention_mask = input_mask[:, :200])
print(outputs.keys())

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'attentions'])


In [22]:
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.4325,  0.5333,  0.7569,  ..., -0.1910, -0.3507, -1.4643],
         [ 0.3430,  0.5017,  0.1167,  ..., -0.2157,  0.4126, -0.2599],
         [-0.1601,  0.0942,  0.5456,  ..., -0.1404,  0.2308, -0.7090],
         ...,
         [-0.4014, -0.0861,  0.0471,  ...,  0.3179, -0.1852, -1.0770],
         [ 0.4082, -0.5063, -0.0763,  ..., -0.6184, -0.0905, -0.4736],
         [-0.4125, -0.3833,  0.1753,  ..., -1.0183,  0.0492, -0.1679]],

        [[ 0.0168,  0.5249,  0.4899,  ..., -0.1671, -0.5506, -0.7268],
         [ 0.4481,  0.6684,  0.4824,  ...,  0.1237, -0.3066,  0.0445],
         [ 0.5388,  0.7585,  0.5167,  ...,  0.1665, -0.0355,  0.5409],
         ...,
         [-0.7328,  0.4824,  0.7945,  ...,  0.3026, -0.8071, -0.0237],
         [-0.5723,  0.3550,  1.2703,  ...,  0.1808,  0.1919, -0.8794],
         [ 0.1656,  0.6171,  0.8978,  ...,  0.2475, -0.1118, -1.9047]],

        [[-0.3194, -0.0187,  0.3545,  ..., -0.4020, -

In [23]:
last_hidden_state = outputs["last_hidden_state"]
pooler_output = outputs["pooler_output"]
#hidden_states = outputs["hidden_states"]
#attentions = outputs["attentions"]
print(last_hidden_state.shape)

torch.Size([32, 200, 768])


In [24]:
dense = nn.Linear(768, 768).to(device)
dropout = nn.Dropout(0.1).to(device)
fc = nn.Linear(768, 2).to(device)
softmax = nn.Softmax(dim=1)

In [25]:
dense_output = dense(pooler_output)
drop_output = dropout(dense_output)
fc_output = fc(drop_output)
fc_softmax_output = softmax(fc_output)

print(fc_softmax_output)

tensor([[0.4921, 0.5079],
        [0.4743, 0.5257],
        [0.4878, 0.5122],
        [0.4681, 0.5319],
        [0.4777, 0.5223],
        [0.5185, 0.4815],
        [0.5181, 0.4819],
        [0.4825, 0.5175],
        [0.5293, 0.4707],
        [0.4774, 0.5226],
        [0.5300, 0.4700],
        [0.5221, 0.4779],
        [0.5038, 0.4962],
        [0.5061, 0.4939],
        [0.4976, 0.5024],
        [0.4339, 0.5661],
        [0.5007, 0.4993],
        [0.4908, 0.5092],
        [0.5123, 0.4877],
        [0.4933, 0.5067],
        [0.4573, 0.5427],
        [0.5079, 0.4921],
        [0.4759, 0.5241],
        [0.4648, 0.5352],
        [0.4913, 0.5087],
        [0.4798, 0.5202],
        [0.4922, 0.5078],
        [0.5068, 0.4932],
        [0.4796, 0.5204],
        [0.5067, 0.4933],
        [0.4626, 0.5374],
        [0.5056, 0.4944]], device='cuda:0', grad_fn=<SoftmaxBackward>)


In [26]:
# predicted_vals = []
# for val in fc_softmax_output:
#   max_arg = torch.argmax(val)
#   predicted_vals.append(max_arg.item() + 1)

# predicted_vals = torch.Tensor(predicted_vals)
# print(predicted_vals)

In [27]:
# train_labels.shape

In [33]:
train_labels.to(device)

tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0], device='cuda:0')

In [40]:
train_labels.to(device)

tensor([-1,  0, -1, -1, -1, -1, -1, -1, -1,  0, -1,  0, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1, -1,  0, -1],
       device='cuda:0')

In [35]:
train_labels = train_labels[:32] - 1
#train_labels

In [42]:
train_labels.to(device)
print(train_labels)
criterion = nn.CrossEntropyLoss()
criterion(fc_softmax_output, train_labels.to(device))

tensor([-1,  0, -1, -1, -1, -1, -1, -1, -1,  0, -1,  0, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1, -1,  0, -1])


RuntimeError: ignored

### BERT

In [2]:
# import BERT-based pretrained model
#bert = AutoModel.from_pretrained('bert-base-uncased')

bert_model = AutoModel.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased').to(device)#, output_hidden_states=True, output_attentions=True).to(device)


NameError: ignored

In [None]:
#sample data
text = ['Me llamo Darya', 'vamos a probar un modelo de red neuronal.']

#encode text
tokenized_texts = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False, return_tensors='pt').to(device)
print(tokenized_texts)

In [None]:
#input_ids = [tokenizer.convert_tokens_to_ids(x, return_tensors='pt') for x in tokenized_texts]
#print(input_ids)
outputs = bert_model(tokenized_texts['input_ids'])
print(outputs.keys())

In [None]:
last_hidden_state = outputs['last_hidden_state']
pooler_output = outputs['pooler_output']

In [None]:
last_hidden_state.shape

In [None]:
pooler_output.shape

We use `pooler_output` as context representation and pass it to a fully connected layer which outputs the prediction probabilities across all labels.

Two new feed-forward layers for classification are added on top of BERT. Each input is a context representation (`pooler_output`) that is a 768-dimensional vector, and the output is the probability distribution across all labels that is a 5-dimensional vector.

In [None]:
dense = nn.Linear(768, 768).to(device)
dropout = nn.Dropout(0.1).to(device)
fc = nn.Linear(768, 5).to(device)

In [None]:
dense_output = dense(pooler_output).to(device)
drop_output = dropout(dense_output).to(device)
print(drop_output)
fc_output = fc(drop_output).to(device)
print(fc_output)

In [None]:
# We use nn.CrossEntropyLoss() as our loss function. 
#fc_output = torch.cuda.FloatTensor(fc_output)
#print(fc_output)
#fc_output = fc_output.long()
# criterion = nn.CrossEntropyLoss()
# labels = [1, 2, 3, 4, 5]
# criterion(fc_output, torch.Tensor(labels))

In [None]:
# FROM HUGGINGFACE BETO TUTORIAL

text = "[CLS] Para solucionar los [MASK] de Chile, el presidente debe [MASK] de inmediato. [SEP]"
masked_indxs = (4,11)

tokens = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])

predictions = bert(tokens_tensor)[0]

for i,midx in enumerate(masked_indxs):
    idxs = torch.argsort(predictions[0,midx], descending=True)
    predicted_token = tokenizer.convert_ids_to_tokens(idxs[:5])
    print('MASK',i,':',predicted_token)