# CREATE EMBEDINGS WITH BERT MODEL

___

In [1]:
import torch
import torch.nn as nn
import numpy as np

from datasets import load_dataset

from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel

from tqdm import tqdm

from transformers import DataCollatorWithPadding

from torch.utils.data import Subset

from torch.utils.data import DataLoader



___

### LOAD DATA

In [2]:
dataset = load_dataset("imdb", split="train")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


___

### NECESSARY FUNCTIONS

In [3]:
def get_model(model_name):
    assert model_name in ['bert', 'roberta', 'distilbert']
    
    checkpoint_names = {
        'bert': 'bert-base-cased',  # https://huggingface.co/bert-base-cased
        'roberta': 'roberta-base',  # https://huggingface.co/roberta-base
        'distilbert': 'distilbert-base-cased'  # https://huggingface.co/distilbert-base-cased
    }
    
    model_classes = {
        'bert': BertModel,
        'roberta': RobertaModel,
        'distilbert': DistilBertModel
    }
    
    return AutoTokenizer.from_pretrained(checkpoint_names[model_name]), model_classes[model_name].from_pretrained(checkpoint_names[model_name])

In [4]:
def tokenization(example):
    return tokenizer.batch_encode_plus(example['text'], add_special_tokens=True, return_token_type_ids=False, truncation=True)

In [5]:
@torch.inference_mode()
def get_embeddings_labels(model, loader):
    model.eval()
    
    total_embeddings = []
    labels = []
    
    for batch in tqdm(loader):
        labels.append(batch['labels'].unsqueeze(1))

        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0), torch.cat(labels, dim=0).to(torch.float32)

___

### EMBEDDINGS WITH BERT MODEL

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)

cuda:0


In [16]:
tokenizer, model = get_model('bert')
model = model.to(device)

dataset = dataset.map(tokenization, batched=True)

dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/25 [00:00<?, ?ba/s]

KeyError: 'text'

In [8]:
np.random.seed(100)
idx = np.random.randint(len(dataset), size=200)
idx = idx.tolist()

train_subset = Subset(dataset, idx)

In [9]:
train_subset_loader = DataLoader(train_subset, 
                                 batch_size=32, 
                                 collate_fn=data_collator, 
                                 pin_memory=True, 
                                 shuffle=False)

In [10]:
train_embeddings, train_labels = get_embeddings_labels(model, train_subset_loader)

  0%|          | 0/7 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 7/7 [00:07<00:00,  1.14s/it]


In [11]:
train_embeddings.shape, train_labels.shape

(torch.Size([200, 768]), torch.Size([200, 1]))

In [12]:
torch.save(train_embeddings, 'bert_embeddings.pt')

___

### EMBEDDINGS WITH ROBERTA MODEL

In [7]:
tokenizer, model = get_model('roberta')

model = model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
dataset = dataset.map(tokenization, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

In [9]:
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
np.random.seed(100)
idx = np.random.randint(len(dataset), size=200)
idx = idx.tolist()

train_subset = Subset(dataset, idx)

In [12]:
train_subset_loader = DataLoader(train_subset, 
                                 batch_size=32, 
                                 collate_fn=data_collator, 
                                 pin_memory=True, 
                                 shuffle=False)

In [13]:
train_embeddings, train_labels = get_embeddings_labels(model, train_subset_loader)

  0%|          | 0/7 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 7/7 [00:08<00:00,  1.21s/it]


In [14]:
train_embeddings.shape, train_labels.shape

(torch.Size([200, 768]), torch.Size([200, 1]))

In [15]:
torch.save(train_embeddings, 'roberta_embeddings.pt')

___

### EMBEDDINGS WITH DISTILBERT MODEL

In [7]:
tokenizer, model = get_model('distilbert')

model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

In [8]:
dataset = dataset.map(tokenization, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

In [9]:
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
np.random.seed(100)
idx = np.random.randint(len(dataset), size=200)
idx = idx.tolist()

train_subset = Subset(dataset, idx)

In [12]:
train_subset_loader = DataLoader(train_subset, 
                                 batch_size=32, 
                                 collate_fn=data_collator, 
                                 pin_memory=True, 
                                 shuffle=False)

In [13]:
train_embeddings, train_labels = get_embeddings_labels(model, train_subset_loader)

  0%|          | 0/7 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 7/7 [00:05<00:00,  1.19it/s]


In [14]:
train_embeddings.shape, train_labels.shape

(torch.Size([200, 768]), torch.Size([200, 1]))

In [15]:
torch.save(train_embeddings, 'distilbert_embeddings.pt')

___