In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModel
from argparse import Namespace

In [2]:
# Checkpoints this notebook can be pointed at (value of "model_ckpt"):
#   Smaller-LABSE -> setu4993/smaller-LaBSE
#   LABSE         -> setu4993/LaBSE
#   XLMT          -> cardiffnlp/twitter-xlm-roberta-base-sentiment
config = dict(
    # Checkpoint name used for both the tokenizer and the encoder model.
    model_ckpt="setu4993/smaller-LaBSE",
    # Rows per batch for both Dataset.map calls (tokenization and extraction).
    batch_size=1000,
    # Upper bound on tokens per example; longer texts are truncated.
    max_length=32,
    # Device string used when CUDA is available; otherwise the CPU is used.
    cuda_device="cuda:2",
    # NOTE(review): no cell visible in this notebook reads the seed.
    seed=42,
    # Input DatasetDict on disk (loaded with load_from_disk).
    dataset="../../Violence_data/geo_corpus.0.0.1_dataset_for_train",
    # Destination directory for the dataset with hidden states.
    output_dir_ckpt="../../Violence_data/geo_corpus.0.0.1_datasets_hidden_labse",
)

# Attribute-style access to the same settings (args.batch_size, ...).
args = Namespace(**config)

### Read the dataset and delete unnecessary columns

In [3]:
violence = load_from_disk(args.dataset)

In [4]:
violence

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 4192483
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 2329158
    })
})

In [5]:
train_ds = violence["train"]

In [6]:
train_ds.features

{'text': Value(dtype='string', id=None),
 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}

In [7]:
train_ds.column_names

['text', 'labels']

### From text to tokens

In [8]:
# Load the tokenizer that belongs to the configured checkpoint.
# model_ckpt is kept as a separate name because the model cell reuses it.
model_ckpt = args.model_ckpt
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [9]:
text = "Tokenizing text is a core task of NLP"
encoded_text = tokenizer(text)
print(encoded_text)

{'input_ids': [101, 71697, 10049, 125918, 100627, 74701, 85521, 125181, 109963, 63495, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [10]:
# Convert ids back into tokens
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

['[CLS]', 'Token', '##izing', 'text', 'is', 'a', 'core', 'task', 'of', 'NLP', '[SEP]']


In [11]:
print(tokenizer.convert_tokens_to_string(tokens))

[CLS] Tokenizing text is a core task of NLP [SEP]


In [12]:
tokenizer.vocab_size

173347

In [13]:
# The value shown below is astronomically large, so it does not act as a
# practical length cap; truncation is driven by the explicit
# max_length=args.max_length passed in tokenize().
# NOTE(review): presumably the checkpoint defines no maximum length - confirm.
tokenizer.model_max_length

1000000000000000019884624838656

In [14]:
tokenizer.model_input_names

['input_ids', 'token_type_ids', 'attention_mask']

### Tokenizing the whole dataset

In [15]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch with the notebook tokenizer.

    Args:
        batch: Mapping that provides the raw strings under the ``"text"`` key
            (this is what ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch (``input_ids``, ``attention_mask``
        and, for this checkpoint, ``token_type_ids``).
    """
    # padding=True pads to the longest sequence of *this* batch (per the
    # tokenizer docs), so the padded width can differ between batches. Any
    # later step that stacks rows into one tensor must therefore iterate with
    # the same batch boundaries that were used for tokenization.
    encoding_options = dict(
        padding=True,
        truncation=True,
        max_length=args.max_length,
    )
    return tokenizer(batch["text"], **encoding_options)

In [16]:
# Smoke test: run tokenize on the first two rows of a 1000-row slice of the
# training split and print the resulting encoding.
print(tokenize(violence["train"].select(range(1000))[:2]))

{'input_ids': [[101, 72921, 90930, 85944, 43099, 131735, 103297, 54882, 86754, 121491, 90930, 46377, 43095, 73126, 43096, 102], [101, 106, 60436, 371, 21940, 838, 15459, 335, 8112, 24129, 74695, 43101, 43101, 43101, 74697, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [17]:
# Tokenize every split of the DatasetDict, args.batch_size rows at a time.
# Quick variant for debugging on the first 1000 training rows only:
# %time violence_encoded = violence["train"].select(range(1000)).map(tokenize, batched=True, batch_size=None)
%time violence_encoded = violence.map(tokenize, batched=True, batch_size=args.batch_size)

  0%|          | 0/16770 [00:00<?, ?ba/s]

  0%|          | 0/4193 [00:00<?, ?ba/s]

  0%|          | 0/2330 [00:00<?, ?ba/s]

CPU times: user 3h 4min 27s, sys: 11min 51s, total: 3h 16min 18s
Wall time: 19min 11s


In [18]:
violence_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4192483
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2329158
    })
})

### Model

In [19]:
# Run on the configured GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device(args.cuda_device)
else:
    device = torch.device("cpu")

# Load the pretrained encoder for the chosen checkpoint and move its weights
# to that device.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [20]:
# Encode a single example sentence as PyTorch tensors (return_tensors="pt").
# The forward pass that produces the last hidden state runs in the next cell.
text = "this is a very interesting text"
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}") # [batch_size, n_tokens]

Input tensor shape: torch.Size([1, 8])


In [21]:
# Move every encoded tensor onto the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Inference only: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.7044,  0.1361,  0.3484,  ..., -0.1931,  0.8320, -0.0427],
         [ 1.3874, -0.7434,  0.5686,  ...,  0.1172,  0.0612, -0.3482],
         [ 1.1464, -0.6417,  0.4695,  ...,  0.0973,  0.2719, -0.4838],
         ...,
         [ 0.9029, -0.9030,  0.2529,  ...,  0.0120,  0.4795, -0.7121],
         [ 1.1578, -0.0563,  0.8323,  ...,  0.5555,  0.6348, -0.4267],
         [ 0.7044,  0.1361,  0.3484,  ..., -0.1931,  0.8320, -0.0427]]],
       device='cuda:2'), pooler_output=tensor([[ 8.3118e-01, -8.7583e-01, -7.3634e-01, -9.5450e-01, -8.1539e-01,
          6.2844e-01, -7.8560e-01, -6.3620e-01, -8.6172e-01, -9.1410e-02,
          3.7426e-01, -3.0649e-01,  2.5643e-01,  6.8717e-01, -6.3883e-01,
         -3.1957e-02, -3.3372e-02,  7.2407e-01,  9.1128e-01, -9.1514e-01,
         -8.9409e-01, -6.6875e-01,  4.3481e-01, -3.8829e-02, -1.6667e-01,
         -1.3169e-01, -9.8688e-01,  5.1277e-01, -4.8089e-01,  1.4972e-01,
         -9.

In [22]:
outputs.last_hidden_state.size() # batch_size, n_tokens, hidden_dim (768)

torch.Size([1, 8, 768])

In [23]:
# Extract CLS token
outputs.last_hidden_state[:,0].size()

torch.Size([1, 768])

In [24]:
# Create function to extract hidden state
def extract_hidden_states(batch):
    """Compute the first-token ([CLS]) hidden state for every row of a batch.

    Args:
        batch: Mapping from column name to tensor. Only the entries whose key
            appears in ``tokenizer.model_input_names`` are passed to the
            model; everything else (e.g. ``labels``) is ignored here.

    Returns:
        A dict with one key, ``"hidden_state"``, holding a NumPy array with
        the last-layer hidden state at sequence position 0 for each row.
    """
    # Keep only the columns the model accepts and move them to its device.
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        encoder_output = model(**model_inputs)

    # Position 0 of the sequence axis is the [CLS] token.
    cls_vectors = encoder_output.last_hidden_state[:, 0]

    # Bring the result back to host memory as NumPy before returning it.
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [25]:
violence_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4192483
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2329158
    })
})

In [26]:
# Expose only these columns, as torch tensors, when rows are read back.
# token_type_ids is not listed, so it is not handed to functions mapped over
# the dataset after this call.
violence_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

In [27]:
# Run extract_hidden_states over every split in batches of args.batch_size.
# This is the expensive step of the notebook: the recorded wall time below is
# about 3.5 hours.
%time violence_hidden = violence_encoded.map(extract_hidden_states, batched=True, batch_size=args.batch_size)

  0%|          | 0/16770 [00:00<?, ?ba/s]

  0%|          | 0/4193 [00:00<?, ?ba/s]

  0%|          | 0/2330 [00:00<?, ?ba/s]

CPU times: user 3h 33min 50s, sys: 1min 26s, total: 3h 35min 16s
Wall time: 3h 34min 51s


### Save the dataset with hidden states to disk

In [None]:
violence_hidden["train"].features

In [None]:
# Persist the dataset that now carries the hidden states to
# args.output_dir_ckpt. Only the dataset is written by this call.
%time violence_hidden.save_to_disk(args.output_dir_ckpt)

In [None]:
!nvidia-smi