In [266]:
# For the BERT model (tokenizer,...)
from transformers import AutoModel, AutoTokenizer, BertTokenizer

# PyTorch
import torch

from torch.utils.data import DataLoader, Dataset

# For Progress Bars
from tqdm import tqdm

# For JSON input
import json

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from sentence_transformers import SentenceTransformer

## Load Data

In [267]:
with open('../../Datasets/RSICD/dataset_rsicd.json', 'r') as f:
    data = json.load(f)['images']
    
data[0]

{'filename': 'airport_1.jpg',
 'imgid': 0,
 'sentences': [{'tokens': ['many',
    'planes',
    'are',
    'parked',
    'next',
    'to',
    'a',
    'long',
    'building',
    'in',
    'an',
    'airport'],
   'raw': 'many planes are parked next to a long building in an airport .',
   'imgid': 0,
   'sentid': 0},
  {'tokens': ['many',
    'planes',
    'are',
    'parked',
    'next',
    'to',
    'a',
    'long',
    'building',
    'in',
    'an',
    'airport'],
   'raw': 'many planes are parked next to a long building in an airport .',
   'imgid': 0,
   'sentid': 1},
  {'tokens': ['many',
    'planes',
    'are',
    'parked',
    'next',
    'to',
    'a',
    'long',
    'building',
    'in',
    'an',
    'airport'],
   'raw': 'many planes are parked next to a long building in an airport .',
   'imgid': 0,
   'sentid': 2},
  {'tokens': ['many',
    'planes',
    'are',
    'parked',
    'next',
    'to',
    'a',
    'long',
    'building',
    'in',
    'an',
    'airport

## Tokenize Input

In [289]:
# Load the pretrained BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-mini')

sentences = []

# for elem in data:
#     for idx in range(5):
#         sentences.append(elem['sentences'][idx]['raw'])

for elem in data:
    sentences.append(elem['sentences'][0]['raw'])

sentences[20:25]

['many planes are parked in an airport near a large parking lot .',
 'many planes are parked in an airport .',
 'a airport in side while with some square meadow besides .',
 'four planes are parked in an airport near several buildings with parking lots .',
 'some planes are parked in an airport near a piece of green trees .']

In [290]:
len(sentences)

10921

In [291]:
# Create a custom Dataset class
class SentenceDataset(Dataset):
    
    def __init__(self, sentences, tokenizer, max_length=128):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]

        # Tokenize the sentence
        inputs = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        
        inputs['input_ids'] = inputs['input_ids'].squeeze(0)
        inputs['attention_mask'] = inputs['attention_mask'].squeeze(0)

        return inputs

# Create an instance of the custom dataset
custom_dataset = SentenceDataset(sentences, tokenizer, max_length=64)

# Create a DataLoader for the custom dataset
dataloader = DataLoader(custom_dataset, batch_size=64, shuffle=True)


## Embed

In [292]:
cuda_device = 'cuda:0'
device = torch.device(cuda_device if torch.cuda.is_available() else "cpu")

In [293]:
bert = AutoModel.from_pretrained("prajjwal1/bert-mini", output_hidden_states=True)

### Last Layer

In [None]:
with torch.no_grad():
    model = bert.to(device)
    
    batch_outputs = []
    
    for batch in tqdm(dataloader, desc='Getting Embeddings (batches): '):
        
        # Load to CUDA device for performance
        batch.to(device)
        
        # Run through pre-trained BERT_Mini
        out = bert(**batch)
        
        # Take last hidden state only
        last_hidden = out.last_hidden_state
        
        # Obtain sentence level embedding by mean pooling
        sentence_embeddings = torch.mean(last_hidden, dim=1)
        batch_outputs.append(sentence_embeddings)

    # combine outputs in vertical stack
    output = torch.vstack(batch_outputs)
    embeddings = output.cpu().numpy()

print(embeddings[:5])
print('Embeddings shape:', embeddings.shape)
print('Closest sentences:')
print_closest_sentences(embeddings)

Getting Embeddings (batches): 100%|███████████████████████████████████████████████████████████████████████████████████| 854/854 [10:03<00:00,  1.42it/s]


[[-0.21703902  0.24483639  0.84084994 ... -0.9410807   0.5486869
  -0.8347917 ]
 [-0.9579422   0.8365362  -0.07090097 ... -1.0124918   0.7626926
  -0.67816687]
 [-0.92578536  0.27937323  0.05952085 ... -0.7978405   0.31176674
  -0.23223938]
 [-0.2563166   0.312091    0.12094273 ... -0.77831846  0.10287943
  -1.0198553 ]
 [-0.6365057   0.45622563 -0.87495977 ... -0.4821485   0.8690314
   0.52157336]]
Embeddings shape: (54605, 256)
Closest sentences:


### Last **n** layers, sum

In [None]:
with torch.no_grad():
    model = bert.to(device)
    
    batch_outputs = []
    hs = [i for i in range(-4, 0)]
    
    for batch in tqdm(dataloader, desc='Getting Embeddings (batches): '):
        
        # Load to CUDA device for performance
        batch.to(device)
        
        # Run through pre-trained BERT_Mini
        out = bert(**batch)
        
        # Get last n hidden layers
        last_n_hidden = out.hidden_states[-4:]
        
        # stack and sum
        hiddens = torch.stack(last_n_hidden)
        resulting_states = torch.sum(hiddens, dim=0)
        
        # take the mean for combination of token level embeddings to sentence level
        sentence_embeddings = torch.mean(resulting_states, dim=1)
        batch_outputs.append(sentence_embeddings)
          
    # add everything to vstack and convert back to numpy
    output = torch.vstack(batch_outputs)
    embeddings = output.cpu().numpy()
    
# print interesting info
print(embeddings[:5])
print('Embeddings shape:', embeddings.shape)
print('Closest sentences:')
print_closest_sentences(embeddings)

### Last *n* layers, concat

In [None]:
with torch.no_grad():
    model = bert.to(device)
    
    batch_outputs = []
    hs = [i for i in range(-4, 0)]
    
    for batch in tqdm(dataloader, desc='Getting Embeddings (batches): '):
        
        # Load to CUDA device for performance
        batch.to(device)
        
        # Run through pre-trained BERT_Mini
        out = bert(**batch)
        
        # Get last n hidden layers
        last_n_hidden = out.hidden_states[-4:]
        
        # concatenate last n hidden states
        hiddens = torch.stack(last_n_hidden)
        resulting_states = torch.cat(last_n_hidden, dim=-1)
        
        # take the mean for combination of token level embeddings to sentence level
        sentence_embeddings = torch.mean(resulting_states, dim=1)
        batch_outputs.append(sentence_embeddings)
          
    # add everything to vstack and convert back to numpy
    output = torch.vstack(batch_outputs)
    embeddings = output.cpu().numpy()
    
# print interesting info
print(embeddings[:5])
print('Embeddings shape:', embeddings.shape)
print('Closest sentences:')
print_closest_sentences(embeddings)

### SBERT

In [None]:
# Well-performing SBERT model
model = SentenceTransformer('all-mpnet-base-v2')

# Embed sentences using the SBERT model
embeddings = model.encode(sentences)

# Print interesting info
print(embeddings[:5])
print('Embedding shape:', embeddings.shape)
print('Closest sentences:')
print_closest_sentences(embeddings)

## [CLS]

In [294]:
with torch.no_grad():
    model = bert.to(device)
    
    batch_outputs = []
    
    for batch in tqdm(dataloader, desc='Getting Embeddings (batches): '):
        
        # Load to CUDA device for performance
        batch.to(device)
        
        # Run through pre-trained BERT_Mini
        out = bert(**batch)
        
        sentence_embeddings = out.last_hidden_state[:, 0, :]
        batch_outputs.append(sentence_embeddings)

    # combine outputs in vertical stack
    output = torch.vstack(batch_outputs)
    embeddings = output.cpu().numpy()

print(embeddings[:5])
print('Embeddings shape:', embeddings.shape)
print('Closest sentences:')
print_closest_sentences(embeddings)

Getting Embeddings (batches): 100%|███████████████████████████████| 171/171 [02:12<00:00,  1.29it/s]


[[-0.3040098  -0.50292176  0.46082702 ... -1.324414   -0.4234199
  -1.3706305 ]
 [-0.4365894  -0.6877635   0.46830153 ... -1.6171923   0.47544035
  -0.8923465 ]
 [-0.31683505 -1.1239557   0.45919332 ... -1.0899997   0.05394313
  -0.7031704 ]
 [-0.6155357  -1.1996818   0.6698872  ...  0.5729731  -0.76200986
   0.06233907]
 [-0.4074057  -1.4478451  -0.22949722 ... -0.99145806  0.5018565
  -0.29991972]]
Embeddings shape: (10921, 256)
Closest sentences:
a building is surrounded by green trees and meadows . <-> the playground next to the street has a total of three tennis courts and an orange runway .
in the middle of the picture is the lawn around the building . <-> the playground next to the street has a total of three tennis courts and an orange runway .
a white plane is on the runway . <-> the playground next to the street has a total of three tennis courts and an orange runway .
look the mountain is green . <-> the playground next to the street has a total of three tennis courts and an

## Helpers

In [None]:
def print_closest_sentences(embeddings):

    cosine_similarities = cosine_similarity(embeddings, embeddings)
    np.fill_diagonal(cosine_similarities, -np.inf)

    closest_indices = np.argmax(cosine_similarities, axis=1)

    biggest_closest = np.argpartition(closest_indices,-5)[-5:]

    for idx in biggest_closest:
        value = closest_indices[idx]
        print(sentences[idx], '<->', sentences[value])