# Step 2. Representing the text dataset and building BERT vectors
Here we load the text dataset and build a BERT vector for each text.

## 0. Preparation

In [None]:
# import necessary libraries
import torch
import os
import numpy as np
import json
from transformers import BertTokenizer, BertModel

In [14]:
# Root folder of the project; every path built by file() is resolved against it.
path = '/Users/mishafoniakov/Documents/Thesis'


def file(folder, file):
    """Build the path of a file that lives under the project root.

    Args:
        folder: Sub-folder (possibly several levels deep) below the
            module-level ``path``.
        file: Name of the file inside ``folder``.

    Returns:
        ``path``, ``folder`` and ``file`` joined with ``os.path.join``.
    """
    components = (path, folder, file)
    return os.path.join(*components)

## 1. Classes and functions

### 1.1. Dataset class: responsible for all operations on the dataset

In [15]:
class TextDataset:
    """Reader for JSON-lines files (one JSON object per line).

    Attributes:
        text_dir_path: Path passed to the constructor, stored unchanged.
            ``json_file_opening`` receives the file to read as an explicit
            argument and does not consult this attribute.
    """

    def __init__(self, text_dir_path):
        self.text_dir_path = text_dir_path

    def json_file_opening(self, json_file, attr_1, attr_2, dictionary=False, reshaping=False, tensor=False):
        """Read two attributes from every record of a JSON-lines file.

        Args:
            json_file: Path of the file to read; each non-blank line must be
                a JSON object containing both ``attr_1`` and ``attr_2``.
            attr_1: Key whose values become the first list (or the dict keys).
            attr_2: Key whose values become the second list (or the dict values).
            dictionary: If falsy, return two parallel lists; if truthy, return
                a dict mapping ``attr_1`` values to ``attr_2`` values.
            reshaping: Only used when ``dictionary`` is truthy. Store each
                value as a NumPy array reshaped to ``(1, -1)``. Takes
                precedence over ``tensor``.
            tensor: Only used when ``dictionary`` is truthy and ``reshaping``
                is falsy. Store each value as a ``torch.FloatTensor`` with a
                leading dimension of size 1.

        Returns:
            ``(attr_1 values, attr_2 values)`` as two lists, or a dict as
            described above. When the same ``attr_1`` value occurs more than
            once, the dict keeps the last occurrence.

        Raises:
            KeyError: If a record lacks ``attr_1`` or ``attr_2``.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        img_list = []
        feature_list = []
        # JSON text is UTF-8; being explicit avoids depending on the locale.
        with open(json_file, 'r', encoding='utf-8') as handle:
            for line in handle:
                # Tolerate blank lines (e.g. a trailing empty line).
                if not line.strip():
                    continue
                # Parse each line once and pull both attributes from it.
                record = json.loads(line)
                img_list.append(record[attr_1])
                feature_list.append(record[attr_2])

        if not dictionary:
            return img_list, feature_list

        dict_list = {}
        for key, value in zip(img_list, feature_list):
            if reshaping:
                dict_list[key] = np.array(value).reshape(1, -1)
            elif tensor:
                dict_list[key] = torch.FloatTensor(value).unsqueeze(dim=0)
            else:
                dict_list[key] = value
        return dict_list

### 1.2. BERT class: responsible for building BERT embeddings

In [37]:
# Tokenizer and encoder both come from the multilingual cased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# The model is requested with output_hidden_states=True and immediately put in
# evaluation mode; eval() returns the module itself, so the call can be chained.
bert_model = BertModel.from_pretrained(
    'bert-base-multilingual-cased',
    output_hidden_states=True,
).eval()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:
class BERT:
    """Builds one BERT vector per text and stores them in a JSON-lines file.

    Attributes:
        model: Model invoked as ``model(**encodings)``; moved to the selected
            device by ``bert_embeddings_list``.
        img_list: Identifiers written under the ``'img'`` key, aligned
            index-by-index with ``text_data``.
        text_data: Texts to embed, one per entry of ``img_list``.
    """

    def __init__(self, model, img_list, text_data):
        self.model = model
        self.img_list = img_list
        self.text_data = text_data

    def bert_embeddings_list(self):
        """Embed every text and write the vectors to ``Dataset_preparation/bert.json``.

        Each output line is a JSON object ``{'img': <identifier>, 'bert': <list
        of floats>}``. The vector is taken from the model's first output at
        batch index 0, token position 0 (the leading special token, since
        ``add_special_tokens=True``). Tokenization uses the module-level
        ``tokenizer``. Any existing output file is overwritten.

        Returns:
            None. The embeddings are only written to disk.

        Raises:
            ValueError: If ``img_list`` and ``text_data`` differ in length.
        """
        # Fail before touching the output file rather than part-way through.
        if len(self.img_list) != len(self.text_data):
            raise ValueError(
                'img_list and text_data must have the same length, got '
                f'{len(self.img_list)} and {len(self.text_data)}'
            )

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(device)

        # Open the output once instead of re-opening it for every record.
        with open(file('Dataset_preparation', 'bert.json'), 'w', encoding='utf-8') as bert:
            for img, text_item in zip(self.img_list, self.text_data):
                # max_length only takes effect together with truncation=True;
                # without it the limit is ignored and over-long texts reach the
                # model at full length.
                encodings = tokenizer(
                    text_item,
                    padding=True,
                    truncation=True,
                    return_tensors='pt',
                    max_length=50,
                    add_special_tokens=True,
                )
                encodings = encodings.to(device)
                with torch.no_grad():
                    embeds = self.model(**encodings)
                sentence_embedding = embeds[0][0, 0, :].cpu().tolist()
                json.dump({'img': img, 'bert': sentence_embedding}, bert)
                bert.write('\n')
                # Keep finished records on disk even if a later text fails.
                bert.flush()

## 2. Building BERT vectors

In [None]:
# Point the dataset object at the JSON-lines file that holds the texts
text = TextDataset(file('Dataset_representation/Dataset_texts', 'image_big_text_dataset.json'))
# The dataset object stores the path it was built with, so that path is reused
# here; the result maps each 'img' value to its 'txt' value
text_description = text.json_file_opening(
    text.text_dir_path,
    'img',
    'txt',
    dictionary=True,
    reshaping=False,
    tensor=False,
)
# Image identifiers are the keys of that mapping, in insertion order
img_list = [*text_description]
img_number = len(text_description)

In [None]:
#We build BERT-vectors using obtained image and text array
# `text_data` was never defined in this notebook (NameError on a fresh kernel);
# build it from the loaded mapping so texts line up with img_list by index.
text_data = [text_description[img] for img in img_list]
bert = BERT(bert_model, img_list, text_data)
# bert_embeddings_list() writes the vectors to Dataset_preparation/bert.json and
# has no return statement, so bert_embeddings is None after this call.
bert_embeddings = bert.bert_embeddings_list()