In [1]:
# Reload imported project modules automatically before each cell runs,
# so edits to the library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Change the working directory to the parent folder; the imports and the
# relative data path used below are presumably resolved from there.
%cd ..

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

/home/mbrzozowski/projects/media_monitoring/roberta_for_longer_texts


In [17]:
import pandas as pd
import numpy as np

from config import VISIBLE_GPUS

import os
# Restrict which GPUs this process may see. The variable is deliberately set
# BEFORE `import torch` below, presumably so that the restriction is already
# in place when CUDA gets initialised - do not reorder these lines.
os.environ["CUDA_VISIBLE_DEVICES"]= VISIBLE_GPUS
import torch

from lib.main import BERTClassificationModelWithPooling, load_pretrained_model
from lib.text_preprocessors import BERTTokenizerPooled

# Get embedding vectors for longer sequences

## Load example text longer than 512 tokens

In [3]:
# Location of the sample CSV, relative to the working directory
# selected by the `%cd ..` call in the first cell.
SAMPLE_DATA_PATH = 'test/sample_data/sample_data_eng.csv'

In [4]:
# Read the sample dataset; later cells use its `sentence` column.
df = pd.read_csv(SAMPLE_DATA_PATH)

In [5]:
def _count_words(text):
    """Return the number of whitespace-separated words in `text`."""
    return len(text.split())


# Word count per row - a rough proxy for text length, not the number of BERT tokens.
df['number_of_words'] = df['sentence'].apply(_count_words)

In [6]:
# Show the rows ordered from the longest to the shortest text.
# The sorted frame is only displayed; `df` itself is left unchanged.
df.sort_values(by='number_of_words', ascending=False)

Unnamed: 0,sentence,target,number_of_words
961,"Okay, so I'm not a big video game buff, but wa...",0,1316
1054,Jim Carrey is back to much the same role that ...,1,1277
1456,THE SHOP AROUND THE CORNER is one of the sweet...,1,1148
1711,I won't try to speculate as to what Brando was...,1,1000
68,"Pier Paolo Pasolini, or Pee-pee-pee as I prefe...",0,997
...,...,...,...
859,Predictable plot. Simple dialogue. Shockingly ...,0,33
1151,This unpretentious Horror film is probably des...,1,32
1374,"THis movie shows us once again, how genius the...",1,30
1655,The only thing serious about this movie is the...,1,28


In [8]:
# Pick the longest text (by word count) instead of hard-coding its row label,
# so the example stays valid if the sample data changes.
# On the current sample data this is row 961, the label that was hard-coded before.
longest_row_label = df['number_of_words'].idxmax()
example_text = df.loc[longest_row_label, 'sentence']

In [47]:
# Display the selected example text in full.
example_text

"Okay, so I'm not a big video game buff, but was the game House of the Dead really famous enough to make a movie from? Sure, they went as far as to actually put in quick video game clips throughout the movie, as though justifying any particular scene of violence, but there are dozens and dozens of games that look exactly the same, with the hand in the bottom on the screen, supposedly your own, holding whatever weapon and goo-ing all kinds of aliens or walking dead or snipers or whatever the case may be.<br /><br />It's an interesting premise in House of the Dead, with a lot of college kids (LOADED college kids, as it were, kids who are able to pay some fisherman something like $1,500 just for a ride after they miss their boat) trying to get out to this island for what is supposed to be the rave of the year. The first thing that comes to mind about House of the Dead after watching it is that it has become increasingly clear that modern horror movies have become nothing more than an exer

## Load BERT tokenizer and model

In [13]:
# Obtain the tokenizer and the BERT model from the project helper.
# NOTE(review): which checkpoint is loaded and on which device is decided inside
# `load_pretrained_model` - confirm in lib.main.
tokenizer, bert = load_pretrained_model()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Split longer texts into chunks and tokenize

### Set splitting parameters

In [18]:
# Chunking parameters handed to BERTTokenizerPooled in the next section.
# NOTE(review): the meanings below are inferred from the parameter names and the
# outputs in this notebook - confirm against lib.text_preprocessors.
size = 510  # presumably text tokens per chunk; 510 plus two special tokens would give the 512-wide chunks
step = 256  # presumably the stride between the starts of consecutive chunks (smaller than size, so chunks overlap)
minimal_length = 1  # presumably the minimum number of tokens a chunk must contain to be kept

### Split and tokenize

In [19]:
# Build the chunking preprocessor from the tokenizer and the splitting
# parameters `size`, `step` and `minimal_length`.
preprocessor = BERTTokenizerPooled(tokenizer,size,step,minimal_length)

In [20]:
# `preprocess` is given a list of texts; here the list holds only the long example.
model_inputs = preprocessor.preprocess([example_text])

Token indices sequence length is longer than the specified maximum sequence length for this model (1611 > 512). Running this sequence through the model will result in indexing errors


In [21]:
# Inspect the preprocessed result; the cells below read its
# 'input_ids' and 'attention_mask' entries.
model_inputs

{'input_ids': [tensor([[  101,  3100,  1010,  ...,  2893,  6248,   102],
          [  101,  2037,  2126,  ...,  2058,  2007,   102],
          [  101,  2730,  1012,  ...,  1998,  2007,   102],
          ...,
          [  101, 15843,  1997,  ...,  2074,  1996,   102],
          [  101,  3168,  1997,  ...,     0,     0,     0],
          [  101,  1996,  3185,  ...,     0,     0,     0]])],
 'attention_mask': [tensor([[1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 1, 1, 1],
          ...,
          [1, 1, 1,  ..., 1, 1, 1],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)]}

In [22]:
# Shape of the token ids for the first (and only) text:
# (number of chunks, tokens per chunk).
model_inputs['input_ids'][0].shape

torch.Size([7, 512])

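Hence the text was divided into 7 chunks, each consisting of 512 tokens (`size = 510` text tokens plus the two special tokens that open and close every chunk). The trailing chunks are shorter than that and are therefore zero-padded, with the padding masked out in `attention_mask`.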

### Get tensors

In [25]:
# Only one text was preprocessed, so take the first entry of each list.
input_ids, attention_masks = (
    model_inputs[key][0] for key in ('input_ids', 'attention_mask')
)

### Put them on GPU

In [32]:
# Move the token ids to the device the model lives on instead of hard-coding 'cuda':
# the tensors then always match the model, and the cell also runs on CPU-only machines.
input_ids = input_ids.to(bert.device)

In [34]:
# Move the attention masks to the device the model lives on instead of hard-coding 'cuda':
# the tensors then always match the model, and the cell also runs on CPU-only machines.
attention_masks = attention_masks.to(bert.device)

### Check that all tensors and model are on the same device

In [35]:
# Device currently holding the token ids.
input_ids.device

device(type='cuda', index=0)

In [36]:
# Device currently holding the attention masks.
attention_masks.device

device(type='cuda', index=0)

In [37]:
# Device currently holding the model; it must match the two tensors above.
bert.device

device(type='cuda', index=0)

## Get embedding vectors

In [39]:
# Inference only: run the forward pass without autograd so that no computation
# graph is built and kept in (GPU) memory alongside the embeddings.
# All chunks of the text are passed together as one batch; the second positional
# argument is the attention mask, exactly as in the original call.
with torch.no_grad():
    outputs = bert(input_ids, attention_masks)

In [45]:
# Keep the `last_hidden_state` field of the model output: one vector per token of every chunk.
embedding = outputs.last_hidden_state

In [46]:
# Shape of the embedding tensor: (chunks, tokens per chunk, embedding dimension).
embedding.shape

torch.Size([7, 512, 768])

The obtained embedding is a tensor of size:
- 7 (number of text chunks) times
- 512 (number of tokens per text chunk) times
- 768 (dimension of the embedding space)

In [48]:
# Display the embedding values (the tensor repr is abbreviated automatically).
embedding

tensor([[[ 0.1334, -0.1360,  0.0732,  ...,  0.1593,  0.7489,  0.0912],
         [ 0.6643,  0.3044,  0.2671,  ...,  0.6427,  1.4492,  0.5313],
         [-0.3382,  0.2624,  0.5635,  ..., -0.0744,  0.6455,  0.9595],
         ...,
         [ 0.2310,  0.5090,  0.0892,  ...,  0.1037,  0.0765, -0.2995],
         [ 0.9082,  0.1428, -0.1060,  ..., -0.3137,  0.3477, -1.2582],
         [ 0.5154,  0.3602,  0.1636,  ...,  0.5111,  0.3805, -0.1014]],

        [[ 0.0297, -0.0376,  0.2539,  ..., -0.1259,  0.7356,  0.1644],
         [-0.3283, -0.0473, -0.3935,  ...,  0.2280,  1.2034,  0.4672],
         [-0.2061, -0.4288,  0.2502,  ...,  0.0781,  0.2309,  0.4196],
         ...,
         [ 0.8271, -0.8116, -0.3419,  ...,  0.9582,  0.1719, -0.1047],
         [-0.1106, -0.0820, -0.6049,  ...,  0.3202,  0.4953, -0.3978],
         [ 0.2317,  0.5210,  0.3277,  ...,  0.6629,  0.2948, -0.2424]],

        [[ 0.1567, -0.1907,  0.3564,  ...,  0.0833,  0.4848,  0.2843],
         [ 1.1954,  0.3024,  0.9446,  ...,  0