In [1]:
import json
import os

import numpy
import torch
from transformers import BertModel, BertTokenizer

In [2]:
# Location of the ADARI furniture pairs JSON file.
# Set the ADARI_FURNITURE_PAIRS environment variable to point at a local copy;
# when it is unset, fall back to the original absolute path so existing
# setups keep working unchanged.
datapath = os.environ.get(
    'ADARI_FURNITURE_PAIRS',
    '/Users/manuelladron/iCloud_archive/Documents/_CMU/PHD-CD/PHD-CD_Research/ADARI/json_files/cleaned/ADARI_v2/furniture/ADARI_furniture_pairs.json',
)

In [3]:
def open_json(path):
    """Read a JSON file and return the decoded Python object.

    Args:
        path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # The context manager closes the handle even when json.load raises;
    # explicit UTF-8 avoids depending on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [4]:
# Load the furniture pairs file once; the cells below iterate over `data`
# and read sample['text'][0] from each entry.
data = open_json(datapath)

In [42]:
# Sentences-per-sample statistics. sample['text'][0] is a list of sentences,
# so its length is the number of sentences in that sample.
sents_per_sample = [len(sample['text'][0]) for sample in data]
if not sents_per_sample:
    # Fail with a clear message instead of a bare ZeroDivisionError below.
    raise ValueError('data is empty: no samples to summarise')

# Take the extremes from the data itself rather than from sentinel start
# values (a fixed starting minimum is wrong for any larger true minimum).
max_ = max(sents_per_sample)
min_ = min(sents_per_sample)
running_len = float(sum(sents_per_sample))
avg_len = running_len / len(sents_per_sample)
print(max_)
print(min_)
print(avg_len)

86
1
15.805355920602327


In [5]:
# Character length of every sentence across all samples, flattened.
sentence_char_lens = [
    len(sent)
    for sample in data
    for sent in sample['text'][0]
]
running_len = float(sum(sentence_char_lens))
num_sents = len(sentence_char_lens)
# Mean number of characters per sentence over the whole dataset.
avg_len = running_len / num_sents

In [6]:
# Mean number of characters per sentence, as computed in the cell directly above.
avg_len

128.70563100979973

In [8]:
# Peek at the first sample: ['text'][0] holds its list of sentence strings.
data[0]['text'][0]

['well-priced furniture with a modern craft heritage',
 'i had to refurbish a room and was looking for a certain object – a contemporary but country-friendly, simple three-legged stool that would also work as a side table',
 'ten years on and my passion for well-made, well-priced furniture with a modern craft heritage has not waned',
 "the collection was born from a mutual love between our brands of the countryside, namely dorset, and the idea to create furniture for the 'modern farmhouse', which draws inspiration from furniture found in old farmhouses and brings them up to date to be used in modern homes",
 'the modern farmhouse pieces are designed to be practical in the home, for example, the settle, could be used in the kitchen against a wall or to break a spacious living room, with concealed storage underneath to hide and clear a space',
 "the same goes for the hallway bench, a multi-functioning piece, allowing you to hang coats, store shoes and somewhere to sit when you're on your

In [9]:
# output_hidden_states=True makes the forward pass also return the per-layer
# hidden states, which a later cell reads from outputs[2].
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

In [10]:
# Switch to evaluation mode so the Dropout layers are inactive during the
# forward pass (no training happens in this notebook).
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [11]:
# Tokenizer for the same 'bert-base-uncased' checkpoint as the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [20]:
# Encode each sentence of the first sample on its own, padded/truncated to a
# fixed length of 130 tokens so the results can be concatenated into a batch.
encoded_sents = [
    tokenizer(
        sent,
        max_length=130,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    for sent in data[0]['text'][0]
]

# Lists of per-sentence tensors: token ids and matching attention masks.
inputs_ids = [enc['input_ids'] for enc in encoded_sents]
masks = [enc['attention_mask'] for enc in encoded_sents]


In [21]:
# Re-display the sentences that were just tokenized (first sample).
data[0]['text'][0]

['well-priced furniture with a modern craft heritage',
 'i had to refurbish a room and was looking for a certain object – a contemporary but country-friendly, simple three-legged stool that would also work as a side table',
 'ten years on and my passion for well-made, well-priced furniture with a modern craft heritage has not waned',
 "the collection was born from a mutual love between our brands of the countryside, namely dorset, and the idea to create furniture for the 'modern farmhouse', which draws inspiration from furniture found in old farmhouses and brings them up to date to be used in modern homes",
 'the modern farmhouse pieces are designed to be practical in the home, for example, the settle, could be used in the kitchen against a wall or to break a spacious living room, with concealed storage underneath to hide and clear a space',
 "the same goes for the hallway bench, a multi-functioning piece, allowing you to hang coats, store shoes and somewhere to sit when you're on your

In [22]:
# Each encoded sentence is a [1, max_length] tensor of token ids.
inputs_ids[0].shape

torch.Size([1, 130])

In [23]:
# Join the per-sentence [1, seq_len] tensors into [n_sentences, seq_len] batches.
# NOTE: `embeds` holds token ids (the model input), not embeddings; the name is
# kept because later cells refer to it.
embeds = torch.cat(inputs_ids, dim=0).long()

# `masks` is rebound from a list to a tensor here. Only concatenate while it is
# still the list built by the tokenization cell, so re-running this cell does
# not fail with torch.cat being handed an already-concatenated tensor.
if isinstance(masks, (list, tuple)):
    masks = torch.cat(masks, dim=0)
masks = masks.long()

print(embeds.shape)
print(masks.shape)


torch.Size([12, 130])
torch.Size([12, 130])


In [25]:
# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    outputs = model(input_ids=embeds, attention_mask=masks)

In [31]:
# The model was loaded with output_hidden_states=True; the third output holds
# the hidden states as a tuple of tensors (stacked into one tensor next).
hidden_states = outputs[2]

AttributeError: 'tuple' object has no attribute 'stack'

In [33]:
# Stack the tuple of per-layer tensors into a single tensor, layers on dim 0.
hs = torch.stack(hidden_states, dim=0)

In [35]:
# 13 is the number of stacked hidden-state tensors (presumably the embedding
# output plus 12 encoder layers for bert-base — confirm against the HF docs).
hs.shape # [13 layers, batch, seq_len, hid_dim]

torch.Size([13, 12, 130, 768])

In [43]:
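# Disabled experiment: permute(2, 1, 0, 3) would reorder hs from
# [layers, batch, seq_len, hid_dim] to [seq_len, batch, layers, hid_dim].
# Left commented out because the next cell indexes hs with layers on dim 0.
# hs = hs.permute(2,1,0,3)
# hs.shape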

torch.Size([13, 12, 130, 768])

In [49]:
# Token-level vectors from the second-to-last entry along the layer axis,
# giving a [batch, seq_len, hid_dim] tensor.
sents_emb = hs[-2]
sents_emb.shape

torch.Size([12, 130, 768])

In [50]:
# Sentence embedding = mean of the token vectors over *real* tokens only.
# Every sentence was padded to max_length, so a plain mean over dim=1 would
# mix the padded positions into the sentence vector.
# Reading from hs[-2] (rather than re-reducing sents_emb) keeps this cell
# safe to re-run.
token_embs = hs[-2]                                     # [batch, seq_len, hid_dim]
token_mask = masks.unsqueeze(-1).to(token_embs.dtype)   # [batch, seq_len, 1]
# clamp guards against division by zero for a row with no unmasked tokens.
sents_emb = (token_embs * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)

In [51]:
# One hid_dim-sized vector per sentence after pooling over the token axis.
sents_emb.shape

torch.Size([12, 768])