<a href="https://colab.research.google.com/github/miataigeli/capstone_FHIS/blob/darya/bert_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## BERT

Based on tutorial here: https://www.youtube.com/watch?v=mw7ay38--ak

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 15.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 46.8MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |██████

In [2]:
import transformers

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, BertModel
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [4]:
#specify GPU
device = torch.device("cuda")

In [5]:
#connect to my drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load Dataset

In [6]:
# Read in all json files into one dataframe
import os

corpus_dir = "/content/drive/MyDrive/capstone/corpus"
corpus_df = pd.DataFrame([], columns = ['content', 'level'])

for filename in os.listdir(corpus_dir):
    if filename.endswith("aalh.json"): 
         file_path = os.path.join(corpus_dir, filename)
         df = pd.read_json(file_path)
         df = df.drop(columns=['source', 'author', 'title'])
         corpus_df = pd.concat([corpus_df, df])
    else:
        continue

print(corpus_df.describe())

                                                  content level
count                                                  21    21
unique                                                 21     1
top     Cuando la rapaza entró, cargada con el haz de ...    B2
freq                                                    1    21


In [7]:
corpus_df['level'].value_counts(normalize = True)

B2    1.0
Name: level, dtype: float64

In [8]:
labels = corpus_df['level'].unique()
print(labels)

['B2']


In [9]:
# Make sure the texts have <512 words each
content_length = df.content.astype(str).map(len)
print(len(df.loc[content_length.argmax(), 'content']))
print(len(df.loc[content_length.argmax(), 'content'].split(" ")))

26680
3736


### Split into train, validation and test sets

In [10]:
train_text, test_text, train_levels, test_levels = train_test_split(list(df['content']), list(df['level']),
                                                                    random_state = 2021,
                                                                    test_size = 0.3) #did not include stratify

# split test into validation and test
val_text, test_text, val_levels, test_levels = train_test_split(test_text, test_levels,
                                                                random_state = 2021,
                                                                test_size=0.5)

End-to-end BERT Classification

In [11]:
model_path = 'dccuchile/bert-base-spanish-wwm-uncased'
# tokenizer from pre-trained BERT model
tokenizer = BertTokenizerFast.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased', return_tensors='pt')
# Define label to number dictionary
lab2ind = {'A1': 1,
           'A2': 2,
           'B1': 3,
           'B2': 4,
           'B': 5}

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=247723.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=486125.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=134.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=310.0, style=ProgressStyle(description_…




In [12]:
lst = [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]
lst = np.array(lst)
print(lst[:,:3])

[[1 2 3]
 [7 8 9]]


In [13]:
# Prepare data
def prepare_data(text, levels, max_len = 32):

  # Tokenize text
  tokenized_texts = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False, return_tensors='pt')
  print("Tokenize the first sentence:\n",tokenized_texts[0])
  print(tokenized_texts['input_ids'][0].shape)
  print(tokenized_texts['attention_mask'][0].shape)

  # Create label tensor
  labels = [lab2ind[i] for i in levels]
  labels = torch.tensor(labels)
  print("Labels:\n", labels)

  # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
  input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts][:512] #TODO: change length properly!!
  print ("Index numbers of the first sentence:\n",input_ids[0])

  # Pad our input seqeunce to the fixed length (i.e., max_len) with index of [PAD] token
  #pad_ind = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
  #input_ids = pad_sequences(list(input_ids), maxlen=max_len+2, dtype="long", truncating="post", padding="post", value=pad_ind)
  input_ids = tokenized_texts['input_ids'][:, :512]
  print ("Index numbers of the first sentence after padding:\n",input_ids[0])

  # Create attention masks
  #attention_masks = []

  # Create a mask of 1s for each token followed by 0s for pad tokens
  #for seq in input_ids:
  #    seq_mask = [float(i>1) for i in seq]
  #    attention_masks.append(seq_mask)
  attention_masks = tokenized_texts['attention_mask'][:, :512]

  # Convert all of our data into torch tensors, the required datatype for our model
  inputs = torch.tensor(input_ids)
  masks = torch.tensor(attention_masks)

  return inputs, labels, masks

In [14]:
# Training data
train_inputs, train_labels, train_masks = prepare_data(train_text, train_levels)
print(train_inputs.shape)
print(train_labels.shape)
print(train_masks.shape)

Token indices sequence length is longer than the specified maximum sequence length for this model (6363 > 512). Running this sequence through the model will result in indexing errors


Tokenize the first sentence:
 Encoding(num_tokens=6363, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
torch.Size([6363])
torch.Size([6363])
Labels:
 tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4])
Index numbers of the first sentence:
 3
Index numbers of the first sentence after padding:
 tensor([    4,  1076,  1039, 18531, 16764,  1019,  7184,  3814,  1019,  1035,
         1233, 21346,  1019,  1009, 21037,  2623,  5723,  1035,  4183,  1040,
         5171, 14884,  2574, 21344,  1076,  1067, 12365,  8547,  1018,  1009,
         1032,  2324,  1019,  3113,  1512,  1019, 18924,  1009,  5243, 22357,
         1019,  5243, 16147,  1019,  1048, 20987, 28226,  1009,  4132,  1019,
         1041, 16776,  1152, 14934,  1052,  1225,  1200,  1012,  1069, 19655,
         1040,  8893,  7106, 15456,  1008,  5012,  1067,  3154,  1019,  1094,
        15499,  5163,  1040, 22359,  1200,  1040,  1094,  2211,  1048,  1032,
        16692,  1179, 12680,  1009,



In [15]:
# Validation data
valid_inputs, valid_labels, valid_masks = prepare_data(val_text, val_levels)
print(valid_inputs[0])
print(valid_labels)
print(valid_masks)

Tokenize the first sentence:
 Encoding(num_tokens=5887, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
torch.Size([5887])
torch.Size([5887])
Labels:
 tensor([4, 4, 4])
Index numbers of the first sentence:
 3
Index numbers of the first sentence after padding:
 tensor([    4,  1120, 23286,  1081,  1265,  1109,  1120,  5688,  1009, 13218,
         1109,  1120,  5675,  6213,  1109,  1203,  7057,  4059,  1032,  1947,
        11580,  1048,  1069, 24156,  1171,  4523,  1019,  1151, 11580,  1039,
         4141,  1136,  5542,  2767,  1008,  1120,  1705,  1012,  3120,  1109,
            3,  1032,  7118, 30958,  1141, 23368,  1093,  1019,  1097, 13977,
         1190,  1081, 21592,  4606,  1085, 12677,  3795, 30956,  1019,  1040,
        23255,  1012,  1069,  9936,  8576,  1032, 25380,  2696,  1114,  1009,
         1203,  2911, 19666,  1008,  1120,  3351,  1019, 11030,  3416,  1019,
         3741,  1194,  4892,  1109,  1120,  1487,  1624,  1041,  103



In [16]:
# Test data
test_inputs, test_labels, test_masks = prepare_data(test_text, test_levels)
print(test_inputs[0])
print(test_labels)
print(test_masks)

Tokenize the first sentence:
 Encoding(num_tokens=1317, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
torch.Size([1317])
torch.Size([1317])
Labels:
 tensor([4, 4, 4, 4])
Index numbers of the first sentence:
 3
Index numbers of the first sentence after padding:
 tensor([    4,  1493,  1094,  3515,   995,  1054,  4878, 30978,  1136,  1922,
          993,  1493,  1094,  3515,   995,  1054,  4878, 30978,  1136,  6497,
          993,  1054,  1518, 19084,  1050,  1067,  2305,  1009,  1136,  6360,
         1019,  2314,  1009,  7884,  1008,  1337,  3836,  1155,  2725,  1593,
         1091, 11073, 16336,  1040,  1094,  7369,  1035,  1032,  4485,     3,
         1186, 12196,  1252,  1440, 19426,  1716,  1035,  1811, 11051,  1337,
         3153,  1008,  6731,  1626,  1019,  3249,  1626,  1041,  6820,  1019,
         4141,  1009,  7562,  8049,  1074,  6380,  1092,   995,  1857,  1019,
         1039,  3423,  4775, 10238,  1054,  1428,  1300,  1069,  



In [17]:
# Create an iterator for our data
batch_size = 32
# We'll take training samples in random order in each epoch. 
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, 
                              sampler = RandomSampler(train_data), # Select batches randomly
                              batch_size=batch_size)

# We'll just read validation set sequentially.
validation_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
validation_dataloader = DataLoader(validation_data, 
                                   sampler = SequentialSampler(validation_data), # Pull out batches sequentially.
                                   batch_size=batch_size)

In [18]:
model_path = "dccuchile/bert-base-spanish-wwm-uncased"

bert_model = BertModel.from_pretrained(model_path, output_hidden_states=True, output_attentions=True).to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=650.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=439621341.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

In [19]:
dataiter = iter(train_dataloader)
batch = dataiter.next()
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from our dataloader
input_ids, input_mask, labels = batch

In [20]:
outputs = bert_model(input_ids, attention_mask = input_mask)
print(outputs.keys())

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'attentions'])


In [34]:
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0878,  0.2755,  0.2995,  ..., -0.4584,  0.0567, -0.8371],
         [-0.3165,  0.3199, -0.1432,  ..., -0.1383,  0.1852, -0.9116],
         [-0.5792,  1.1189,  0.8135,  ...,  0.7403, -0.0974, -1.3424],
         ...,
         [ 0.1509,  0.5739,  0.0472,  ..., -0.2913,  0.0704, -0.6335],
         [ 0.1233,  0.5842,  0.0823,  ..., -0.2813,  0.0348, -0.6609],
         [ 0.0244,  0.5808,  0.1664,  ..., -0.3299,  0.0347, -0.7319]],

        [[-1.1553,  0.5077, -0.2936,  ...,  0.1498,  1.4441, -0.2687],
         [-0.7094, -0.1734,  0.0507,  ...,  0.3483,  1.2982,  0.5402],
         [-0.5140, -0.1226,  0.5642,  ...,  0.5088,  1.1387,  0.2346],
         ...,
         [-0.8605,  0.4890, -0.8784,  ..., -0.0933,  0.7292, -0.6575],
         [-1.4946,  0.4729, -0.5155,  ...,  0.0308,  1.2545, -0.1379],
         [-1.4704,  0.5382, -0.6398,  ...,  0.2199,  1.1024, -0.3601]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackw

In [37]:
last_hidden_state = outputs["last_hidden_state"]
pooler_output = outputs["pooler_output"]
#hidden_states = outputs["hidden_states"]
#attentions = outputs["attentions"]
print(last_hidden_state.shape)

torch.Size([2, 12, 768])


In [38]:
dense = nn.Linear(768, 768).to(device)
dropout = nn.Dropout(0.1).to(device)
fc = nn.Linear(768, 2).to(device)

In [39]:
dense_output = dense(pooler_output)
drop_output = dropout(dense_output)
fc_output = fc(drop_output)
print(fc_output)

tensor([[ 0.2304,  0.0614],
        [ 0.0948, -0.0628]], device='cuda:0', grad_fn=<AddmmBackward>)


In [41]:
criterion = nn.CrossEntropyLoss()
criterion(fc_output, train_labels)

ValueError: ignored

### BERT

In [21]:
# import BERT-based pretrained model
#bert = AutoModel.from_pretrained('bert-base-uncased')

bert_model = AutoModel.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased').to(device)#, output_hidden_states=True, output_attentions=True).to(device)


Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dens

In [22]:
#sample data
text = ['Me llamo Darya', 'vamos a probar un modelo de red neuronal.']

#encode text
tokenized_texts = tokenizer.batch_encode_plus(text, padding=True, return_token_type_ids=False, return_tensors='pt').to(device)
print(tokenized_texts)

{'input_ids': tensor([[    4,  1094,  5592,  1785,  1742,     5,     1,     1,     1,     1,
             1,     1],
        [    4,  1441,  1012,  6909,  1044,  4209,  1009,  2946, 12212,  7592,
          1008,     5]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [23]:
#input_ids = [tokenizer.convert_tokens_to_ids(x, return_tensors='pt') for x in tokenized_texts]
#print(input_ids)
outputs = bert_model(tokenized_texts['input_ids'])
print(outputs.keys())

odict_keys(['last_hidden_state', 'pooler_output'])


In [24]:
last_hidden_state = outputs['last_hidden_state']
pooler_output = outputs['pooler_output']

In [25]:
last_hidden_state.shape

torch.Size([2, 12, 768])

In [26]:
pooler_output.shape

torch.Size([2, 768])

We use `pooler_output` as context representation and pass it to a fully connected layer which outputs the prediction probabilities across all labels.

Two new feed-forward layers for classification are added on top of BERT. Each input is a context representation (`pooler_output`) that is a 768-dimensional vector, and the output is the probability distribution across all labels that is a 5-dimensional vector.

In [27]:
dense = nn.Linear(768, 768).to(device)
dropout = nn.Dropout(0.1).to(device)
fc = nn.Linear(768, 5).to(device)

In [28]:
dense_output = dense(pooler_output).to(device)
drop_output = dropout(dense_output).to(device)
print(drop_output)
fc_output = fc(drop_output).to(device)
print(fc_output)

tensor([[-0.0176,  0.4253,  0.7136,  ..., -0.0709,  0.0000, -0.1333],
        [ 0.1785, -0.0036, -0.0775,  ...,  0.4366,  0.0000, -0.0766]],
       device='cuda:0', grad_fn=<FusedDropoutBackward>)
tensor([[ 0.1812,  0.0752, -0.3178,  0.0171,  0.3602],
        [-0.0743,  0.0557, -0.0878, -0.0601,  0.0963]], device='cuda:0',
       grad_fn=<AddmmBackward>)


In [29]:
# We use nn.CrossEntropyLoss() as our loss function. 
#fc_output = torch.cuda.FloatTensor(fc_output)
#print(fc_output)
#fc_output = fc_output.long()
# criterion = nn.CrossEntropyLoss()
# labels = [1, 2, 3, 4, 5]
# criterion(fc_output, torch.Tensor(labels))

In [30]:
# FROM HUGGINGFACE BETO TUTORIAL

text = "[CLS] Para solucionar los [MASK] de Chile, el presidente debe [MASK] de inmediato. [SEP]"
masked_indxs = (4,11)

tokens = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])

predictions = bert(tokens_tensor)[0]

for i,midx in enumerate(masked_indxs):
    idxs = torch.argsort(predictions[0,midx], descending=True)
    predicted_token = tokenizer.convert_ids_to_tokens(idxs[:5])
    print('MASK',i,':',predicted_token)

NameError: ignored