In [7]:
import torch
import transformers
import pandas as pd
import numpy as np
from datasets import load_from_disk, DatasetDict
from transformers import AutoTokenizer, AutoModel
from argparse import Namespace

In [2]:
# Checkpoint choices for "model_ckpt" (hub id and/or local path, as available):
#   Smaller-LaBSE : setu4993/smaller-LaBSE
#                   /data3/mmendieta/models/small_labse
#   LaBSE         : setu4993/LaBSE
#                   /data3/mmendieta/models/labse
#   XLM-T         : cardiffnlp/twitter-xlm-roberta-base-sentiment
#                   /data3/mmendieta/models/xlmt
#   ML-E5-LARGE   : /data3/mmendieta/models/ml_e5_large
config = dict(
    model_ckpt="/data3/mmendieta/models/ml_e5_large",  # checkpoint for tokenizer and encoder
    batch_size=1024,                                   # batch size passed to Dataset.map
    max_length=32,                                     # truncation length used by the tokenizer
    cuda_device="cuda:0",                              # GPU to use when CUDA is available
    seed=42,                                           # random seed
    dataset="/data3/mmendieta/Violence_data/geo_corpus.0.0.1_dataset_for_train_all_labels",
    output_dir_ckpt="/data4/mmendieta/data/geo_corpus.0.0.1_datasets_hidden_e5_all_labels",
)

# Expose the same settings with attribute access (args.batch_size, args.dataset, ...).
args = Namespace(**config)

### Read the dataset and delete unnecessary columns

In [3]:
# Load the dataset previously saved at the path given by args.dataset.
violence = load_from_disk(args.dataset)

In [6]:
# Show the available splits with their columns and row counts.
violence

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 4192483
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 2329158
    })
})

In [None]:
# Keep a handle on the training split for the inspection cells that follow.
train_ds = violence["train"]

In [None]:
# Column names together with their feature types.
train_ds.features

In [None]:
# Column names only.
train_ds.column_names

### From text to tokens

In [4]:
# Load the tokenizer that belongs to the configured checkpoint.
# `model_ckpt` is kept as its own name because the model-loading cell reuses it.
model_ckpt = args.model_ckpt
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Tokenize one example sentence and print the encoding the tokenizer returns.
text = "Tokenizing text is a core task of NLP"
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Map the ids of the example encoding back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
# Join the token strings back into a single piece of text.
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

In [None]:
# Maximum sequence length reported by the tokenizer for this checkpoint.
tokenizer.model_max_length

In [None]:
# Input names the tokenizer produces for the model; extract_hidden_states uses
# this list to decide which batch columns are forwarded to the model.
tokenizer.model_input_names

### Tokenizing the whole dataset

In [5]:
def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch.

    Sequences are padded (``padding=True``) and truncated to
    ``args.max_length`` tokens.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings.
    """
    texts = batch["text"]
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=args.max_length,
    )

In [None]:
# Smoke-test `tokenize` on the first two training rows. Slicing the split
# directly returns the same two rows without first building a 1000-row subset.
print(tokenize(violence["train"][:2]))

In [9]:
# [WARNING] This cell and the next one sample the dataset before tokenization.
# The tokenization cell further down maps over `violence_sampled`, so run both
# cells unless that cell is switched to the full `violence` dataset.
# Draw a fixed number of shuffled rows from each split. The shuffle seed comes
# from the config (`args.seed`) so the sample is reproducible and follows the
# configured seed instead of a separately hard-coded value.
train_sample = violence["train"].shuffle(seed=args.seed).select(range(1500000))
val_sample = violence["validation"].shuffle(seed=args.seed).select(range(400000))
test_sample = violence["test"].shuffle(seed=args.seed).select(range(200000))

In [10]:
# Bundle the three sampled splits under their usual split names.
violence_sampled = DatasetDict(
    dict(train=train_sample, validation=val_sample, test=test_sample)
)

In [11]:
# Tokenize every split of the sampled dataset in batches of args.batch_size.
# Alternatives kept for reference (swap one in for the active line below):
#   quick check on the first 1000 training rows, tokenized as a single batch:
# %time violence_encoded = violence["train"].select(range(1000)).map(tokenize, batched=True, batch_size=None)
#   the full, unsampled dataset:
# %time violence_encoded = violence.map(tokenize, batched=True, batch_size=args.batch_size)
%time violence_encoded = violence_sampled.map(tokenize, batched=True, batch_size=args.batch_size)

  0%|          | 0/1465 [00:00<?, ?ba/s]

  0%|          | 0/391 [00:00<?, ?ba/s]

  0%|          | 0/196 [00:00<?, ?ba/s]

CPU times: user 21min 39s, sys: 50 s, total: 22min 29s
Wall time: 2min 17s


In [None]:
# Inspect the splits after tokenization.
violence_encoded

### Model

In [12]:
# Use the configured GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(args.cuda_device)
else:
    device = torch.device("cpu")

# Load the encoder from the same checkpoint as the tokenizer and move it to `device`.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# Tokenize one example sentence as PyTorch tensors; the forward pass that
# produces the last hidden state happens in the next cell.
text = "this is a very interesting text"
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}") # [batch_size, n_tokens]

In [None]:
# Move the example tensors to the model's device and run a forward pass
# without tracking gradients.
inputs = {k:v.to(device) for k,v in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

In [None]:
# Shape is (batch_size, n_tokens, hidden_dim). hidden_dim depends on the checkpoint:
# NOTE(review): 768 is typical of base-sized encoders; the "large" E5 checkpoint
# configured above presumably gives 1024 — confirm from the printed size.
outputs.last_hidden_state.size()

In [None]:
# Keep only the first token position of every sequence (the [CLS]-style token);
# the result has shape (batch_size, hidden_dim).
outputs.last_hidden_state[:,0].size()

In [13]:
# Create function to extract one sentence vector per example
def extract_hidden_states(batch, pooling="cls"):
    """Run the encoder on a tokenized batch and return one vector per example.

    Only the batch entries whose key is listed in ``tokenizer.model_input_names``
    are moved to ``device`` and passed to ``model``; other columns (for example
    ``labels``) are ignored. Gradients are not tracked.

    Args:
        batch: Mapping of column name to torch tensor, as produced by a dataset
            whose format was set to ``"torch"``.
        pooling: How the token vectors are reduced to a sentence vector.
            ``"cls"`` (default, the original behaviour) takes the vector at
            position 0. ``"mean"`` averages the token vectors, weighting by
            ``attention_mask`` so padding is excluded.
            NOTE(review): E5-family checkpoints are documented as being used
            with attention-masked average pooling; confirm against the model
            card and, if so, call ``Dataset.map`` with
            ``fn_kwargs={"pooling": "mean"}``.

    Returns:
        ``{"hidden_state": ndarray}`` with one row per example, on the CPU.

    Raises:
        ValueError: If ``pooling`` is neither ``"cls"`` nor ``"mean"``.
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"Unsupported pooling strategy: {pooling!r}")

    # Place model inputs on the same device as the model
    inputs = {k: v.to(device) for k, v in batch.items()
              if k in tokenizer.model_input_names}

    # Extract last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state

    if pooling == "mean":
        # Zero out padded positions, then divide by the number of real tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row
        counts = mask.sum(dim=1).clamp(min=1)
        pooled = summed / counts
    else:
        # Vector of the first ([CLS]-style) token
        pooled = last_hidden_state[:, 0]

    return {"hidden_state": pooled.cpu().numpy()}

In [None]:
# Re-check the tokenized splits before changing their output format.
violence_encoded

In [14]:
# Return the token columns and the labels as torch tensors when rows are read.
violence_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

In [15]:
# Run the encoder over every split in batches of args.batch_size; the returned
# vectors are stored in a new "hidden_state" column.
%time violence_hidden = violence_encoded.map(extract_hidden_states, batched=True, batch_size=args.batch_size)

  0%|          | 0/1465 [00:00<?, ?ba/s]

  0%|          | 0/391 [00:00<?, ?ba/s]

  0%|          | 0/196 [00:00<?, ?ba/s]

CPU times: user 1h 23min 47s, sys: 17.7 s, total: 1h 24min 5s
Wall time: 54min 20s


### Save the dataset (including the tokenized inputs and hidden states) to disk

In [16]:
# Check the column types of the training split before saving.
violence_hidden["train"].features

{'text': Value(dtype='string', id=None),
 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'hidden_state': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}

In [17]:
# Write all splits, including the new "hidden_state" column, to args.output_dir_ckpt.
%time violence_hidden.save_to_disk(args.output_dir_ckpt)

CPU times: user 262 ms, sys: 5.19 s, total: 5.45 s
Wall time: 5.45 s


In [18]:
# Final overview of the saved splits, their columns and row counts.
violence_hidden

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 1500000
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 400000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 200000
    })
})