# Initialization

In [1]:
import logging

import numpy as np
import pandas as pd

import torch
import transformers

# Load Data

Load the text data from the 'imdb_reviews_small.tsv' file. 

It is a tab-separated values (TSV) file, which means the fields are separated by tabs (rather than by commas, as you've seen in other Practicum tasks).

In [2]:
# Read the tab-separated reviews file into a DataFrame.
data = pd.read_csv('/datasets/imdb_reviews_small.tsv', sep='\t')

# BERT Tokenizer

Create the BERT tokenizer from the pre-trained model called `'bert-base-uncased'` in transformers. You can find a quick overview of it [here](https://huggingface.co/transformers/pretrained_models.html), and for more details, you can read [here](https://huggingface.co/bert-base-uncased).

In [3]:
# Instantiate the BERT tokenizer from the pre-trained 'bert-base-uncased' model.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

Below is an example of getting tokens for a single text.

You can use it to process the whole data you've loaded above. As there are already many texts, and you are likely to process them in a loop, the min/max lengths of vectors can be calculated in two ways: either within a loop or after a loop.

In the latter case, vectors of numerical identifiers of tokens (`ids`) and attention masks (`attention_mask`) need to be stored in two separate lists. They can be called `ids_list` and `attention_mask_list`, respectively. The first case allows us to avoid building those lists unless you would like to use them for another purpose, e.g. for propagating into a BERT model. It is not required in this task but will be required in the project.

Given the above, you may want to combine both ways, so that you calculate the min/max lengths of the token and attention mask vectors and also keep the result of the tokenizer for further processing. Please bear in mind that it does not make much sense to keep vectors longer than 512 elements, as this is the maximum length of vectors that BERT can accept.

In [4]:
# Worked example: turn one text into BERT token ids and an attention mask.
text = 'It is very handy to use transformers'

# Target vector length (the longest vector BERT accepts).
n = 512

# Suppress the warning about overly long outputs. Normally this is not
# needed, but here the ids are deliberately NOT truncated (no
# max_length=... / truncation=True) so that the true maximum length of ids
# for the set of reviews can be explored.
# NOTE(review): the logger that emits this warning may be named differently
# in other transformers releases - confirm against the installed version.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

ids = tokenizer.encode(text.lower(), add_special_tokens=True)

# Padding: append zeroes, then cut to exactly n elements (this also trims
# ids that are longer than n).
padded = np.array((ids + [0] * n)[:n])

# Attention mask: 1 for the tokens we are interested in, 0 for padding.
attention_mask = (padded != 0).astype(int)

In [5]:
# Token ids for the example text, special tokens included (add_special_tokens=True).
print(ids)

[101, 2009, 2003, 2200, 18801, 2000, 2224, 19081, 102]


In [6]:
# The same ids right-padded with zeroes to length n.
print(padded)

[  101  2009  2003  2200 18801  2000  2224 19081   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [7]:
# 1 where `padded` holds a non-zero id, 0 elsewhere.
print(attention_mask)

[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

Compose your code to tokenize the loaded text data.

In [8]:
def tokenize_with_bert(texts, max_length=512, bert_tokenizer=None):
    """Encode texts with a BERT tokenizer into fixed-length ids and masks.

    Each text is lower-cased, encoded with special tokens, truncated to
    ``max_length`` tokens and right-padded with zeroes to ``max_length``.
    The shortest and longest encoded lengths (after truncation, before
    padding) are printed.

    Parameters
    ----------
    texts : iterable of str
        Texts to encode.
    max_length : int, default 512
        Length every returned vector is truncated / padded to.
    bert_tokenizer : object with an ``encode`` method, optional
        Tokenizer to use. When omitted, the notebook-level ``tokenizer``
        is used.

    Returns
    -------
    ids_list : list of numpy.ndarray
        One padded vector of token ids per text.
    attention_mask_list : list of numpy.ndarray
        One vector per text: 1 at positions holding an encoded token,
        0 at padding positions.
    """
    encoder = tokenizer if bert_tokenizer is None else bert_tokenizer

    ids_list = []
    attention_mask_list = []
    lengths = []

    for text in texts:
        # Tokenize the text, truncating to at most max_length tokens
        ids = encoder.encode(
            text.lower(),
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
        )
        n_tokens = len(ids)
        n_padding = max_length - n_tokens
        lengths.append(n_tokens)

        # Pad the tokenized text with zeroes up to max_length
        ids_list.append(np.array(list(ids) + [0] * n_padding))

        # Build the mask from the token count rather than from `ids != 0`,
        # so an encoded token whose id happens to be 0 is not mistaken
        # for padding.
        attention_mask_list.append(np.array([1] * n_tokens + [0] * n_padding))

    # default=0 keeps the report sensible when `texts` is empty
    print(f'The minimum length of vectors: {min(lengths, default=0)}')
    print(f'The maximum length of vectors: {max(lengths, default=0)}')

    return ids_list, attention_mask_list


Run the tokenizer on the whole dataset. It can take some time, as every review is tokenized one by one in a loop.

In [9]:
# Tokenize the 'review' column; the function also prints the min/max encoded lengths.
ids_list, attention_mask_list = tokenize_with_bert(texts=data['review'])

The minimum length of vectors: 18
The maximum length of vectors: 512
