In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModel

### Read the dataset and delete unnecessary columns

In [2]:
# Load the dataset stored on disk at this relative path.
violence = load_from_disk("../../Violence_data/geo_corpus.0.0.1_datasets")

In [None]:
violence

In [None]:
# Keep a handle on the "train" split for the inspection cells that follow.
train_ds = violence["train"]

In [None]:
train_ds.features

In [None]:
train_ds.column_names

In [3]:
# Names of the columns that are dropped from the dataset in the next step.
remove_cols = [
    'tweetid',
    'retweetid',
    'date',
    'timestamp',
    'username',
    'geo_x',
    'geo_y',
    'key',
]

In [4]:
# Drop the columns listed in `remove_cols` and rebind `violence` to the result,
# so every later cell works on the reduced dataset.
# NOTE(review): because `violence` is overwritten, re-running this cell without
# reloading the dataset will presumably fail (the columns are already gone) —
# confirm, and re-run the load cell first if needed.
violence = violence.remove_columns(remove_cols)

In [None]:
# Show the first five rows of the "train" split after the column removal.
print(violence["train"][:5])

### From text to tokens

In [5]:
# Load XLM-T: A Multilingual Language Model Toolkit for Twitter.
# `model_ckpt` names the checkpoint; it is used here to load the tokenizer and
# again further down when the model itself is loaded.
model_ckpt = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# Tokenize a single example sentence and print the resulting encoding.
text = "Tokenizing text is a core task of NLP"
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Convert ids back into tokens
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
tokenizer.vocab_size

In [None]:
tokenizer.model_max_length

In [None]:
tokenizer.model_input_names

### Tokenizing the whole dataset

In [6]:
def tokenize(batch, max_length=32):
    """Tokenize the ``"text"`` entry of a batch to a fixed sequence length.

    Args:
        batch: Mapping with a ``"text"`` entry holding the string(s) to encode.
        max_length: Number of tokens each sequence is truncated/padded to.
            Defaults to 32, the value that was previously hard-coded.

    Returns:
        The tokenizer's output for ``batch["text"]``.
    """
    # Pad to `max_length` instead of to the longest sequence in the call: with
    # ``padding=True`` the padded width depends on the batch contents, so
    # batches tokenized separately can end up with different lengths and can
    # no longer be combined into a single tensor downstream.
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Sanity check: run `tokenize` on the first two rows of a 1000-row subset of
# the "train" split and print the result.
print(tokenize(violence["train"].select(range(1000))[:2]))

In [None]:
# Tokenizing the entire dataset
# Commented-out variant: tokenize only the first 1000 rows of the "train" split
# with batch_size=None (presumably kept for quick trial runs).
# %time violence_encoded = violence["train"].select(range(1000)).map(tokenize, batched=True, batch_size=None)
# Active run: apply `tokenize` to `violence` in batches of 10000 rows; %time
# reports how long the call takes.
%time violence_encoded = violence.map(tokenize, batched=True, batch_size=10000)

  0%|          | 0/1677 [00:00<?, ?ba/s]

In [None]:
violence_encoded

### Model

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained model for `model_ckpt` and move it to `device`.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# Extract last hidden state
# This cell only prepares the input: one example sentence is tokenized into
# PyTorch tensors; the following cells run it through the model.
text = "this is a very interesting text"
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}") # [batch_size, n_tokens]

In [None]:
# Move every input tensor to `device`, the device the model was moved to.
inputs = {k:v.to(device) for k,v in inputs.items()}
# Inference only: disable gradient tracking for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

In [None]:
outputs.last_hidden_state.size() # batch_size, n_tokens, hidden_dim (768)

In [None]:
# Extract CLS token
outputs.last_hidden_state[:,0].size()

In [None]:
# Create function to extract hidden state
def extract_hidden_states(batch):
    """Compute the first-token hidden state for every sequence in ``batch``.

    Args:
        batch: Mapping from column name to tensor. Only the entries whose name
            is listed in ``tokenizer.model_input_names`` are passed to the model.

    Returns:
        A dict with a single key, ``"hidden_state"``, holding a NumPy array
        with the last-layer hidden state at token position 0 of each sequence
        (the position the notebook refers to as the [CLS] token).
    """
    # Select the model inputs and move them to the device the model runs on.
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        model_output = model(**model_inputs)

    # Keep only token position 0 and bring it back to host memory as NumPy.
    first_token_state = model_output.last_hidden_state[:, 0]
    return {"hidden_state": first_token_state.cpu().numpy()}

In [None]:
# Have the dataset return "input_ids" and "attention_mask" as torch tensors;
# `extract_hidden_states` calls `.to(device)` on the values it receives.
violence_encoded.set_format("torch",
                           columns=["input_ids", "attention_mask"])

In [None]:
%time violence_hidden = violence_encoded.map(extract_hidden_states, 
                                             batched=True, batch_size=1000)

### Save dataset (including the tokenized inputs and hidden states) to disk

In [None]:
violence_hidden["train"].features

In [None]:
# Write the dataset with the extracted hidden states to disk; %time reports how
# long the write takes.
%time violence_hidden.save_to_disk("../../Violence_data/geo_corpus.0.0.1_datasets_hidden_xlmt")

In [None]:
violence_hidden

In [None]:
!nvidia-smi