In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModel
from transformers import BertModel, BertTokenizerFast
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler

2022-10-29 17:58:36.460534: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


### Read the dataset and delete unnecessary columns

In [2]:
# Load the previously saved dataset from a path relative to the notebook's working directory.
violence = load_from_disk("../../Violence_data/geo_corpus.0.0.1_datasets")

In [None]:
violence

In [None]:
train_ds = violence["train"]

In [None]:
train_ds.features

In [None]:
train_ds.column_names

In [3]:
# Columns to strip from the dataset before tokenization; the text, language and
# pre*/post* columns are kept.
remove_cols = ['tweetid', 'retweetid', 'date', 'timestamp', 'username', 'geo_x', 'geo_y', 'key']

In [4]:
# Drop only the columns that are still present so this cell can be re-run safely:
# remove_columns raises an error when asked to drop a column that is already gone.
# Assumes every split shares the "train" split's columns.
violence = violence.remove_columns(
    [col for col in remove_cols if col in violence["train"].column_names]
)

In [None]:
print(violence["train"][:5])

### From text to tokens

In [5]:
# Hugging Face checkpoint name for LaBSE; reused below when loading the model.
model_ckpt = "setu4993/LaBSE"
# Load the tokenizer through the explicit BERT fast-tokenizer class
# (AutoTokenizer.from_pretrained(model_ckpt) is the generic alternative).
tokenizer = BertTokenizerFast.from_pretrained(model_ckpt)

In [6]:
# Tokenize a sample sentence to inspect the fields the tokenizer returns.
text = "Tokenizing text is a core task of NLP"
encoded_text = tokenizer(text)
print(encoded_text)

{'input_ids': [101, 154820, 73766, 19733, 15002, 170, 31721, 52725, 14997, 124755, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [7]:
# Convert ids back into tokens
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

['[CLS]', 'Token', '##izing', 'text', 'is', 'a', 'core', 'task', 'of', 'NLP', '[SEP]']


In [8]:
print(tokenizer.convert_tokens_to_string(tokens))

[CLS] Tokenizing text is a core task of NLP [SEP]


In [9]:
tokenizer.vocab_size

501153

In [10]:
tokenizer.model_max_length

1000000000000000019884624838656

In [11]:
tokenizer.model_input_names

['input_ids', 'token_type_ids', 'attention_mask']

### Tokenizing the whole dataset

In [12]:
def tokenize(batch, max_length=32):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode
            (e.g. a batch supplied by ``Dataset.map(..., batched=True)``).
        max_length: Truncation limit in tokens. Defaults to 32, the value
            that was previously hard-coded here.

    Returns:
        The tokenizer's output for the batch. Sequences longer than
        ``max_length`` are truncated; ``padding=True`` pads shorter ones to
        the longest sequence in this batch.
    """
    return tokenizer(
        batch["text"], padding=True, truncation=True, max_length=max_length
    )

In [None]:
print(tokenize(violence["train"].select(range(1000))[:2]))

In [13]:
# Tokenize every split of the dataset, 10,000 examples per batch.
# For a quick trial run, map over violence["train"].select(range(1000)) instead.
%time violence_encoded = violence.map(tokenize, batched=True, batch_size=10000)

  0%|          | 0/1677 [00:00<?, ?ba/s]

  0%|          | 0/420 [00:00<?, ?ba/s]

  0%|          | 0/233 [00:00<?, ?ba/s]

CPU times: user 2h 34min 28s, sys: 7min 38s, total: 2h 42min 7s
Wall time: 40min 27s


In [14]:
violence_encoded

DatasetDict({
    train: Dataset({
        features: ['lang', 'text', 'post1geo10', 'post1geo20', 'post1geo30', 'post1geo50', 'post1geo70', 'post2geo10', 'post2geo20', 'post2geo30', 'post2geo50', 'post2geo70', 'post3geo10', 'post3geo20', 'post3geo30', 'post3geo50', 'post3geo70', 'post7geo10', 'post7geo20', 'post7geo30', 'post7geo50', 'post7geo70', 'pre1geo10', 'pre1geo20', 'pre1geo30', 'pre1geo50', 'pre1geo70', 'pre2geo10', 'pre2geo20', 'pre2geo30', 'pre2geo50', 'pre2geo70', 'pre3geo10', 'pre3geo20', 'pre3geo30', 'pre3geo50', 'pre3geo70', 'pre7geo10', 'pre7geo20', 'pre7geo30', 'pre7geo50', 'pre7geo70', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['lang', 'text', 'post1geo10', 'post1geo20', 'post1geo30', 'post1geo50', 'post1geo70', 'post2geo10', 'post2geo20', 'post2geo30', 'post2geo50', 'post2geo70', 'post3geo10', 'post3geo20', 'post3geo30', 'post3geo50', 'post3geo70', 'post7geo10', 'post7geo20', 'post7g

### Model

In [15]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the encoder through the explicit BertModel class
# (AutoModel.from_pretrained(model_ckpt) is the generic alternative) and move it to `device`.
model = BertModel.from_pretrained(model_ckpt).to(device)

In [16]:
# Tokenize a sample sentence as PyTorch tensors; the hidden states themselves
# are extracted in the cells that follow.
text = "this is a very interesting text"
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}") # [batch_size, n_tokens]

Input tensor shape: torch.Size([1, 8])


In [17]:
# Move every input tensor onto the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
# Inference only, so run the forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.7190,  0.1322,  0.3555,  ..., -0.1876,  0.8301, -0.0487],
         [ 1.3976, -0.7485,  0.5753,  ...,  0.1236,  0.0599, -0.3537],
         [ 1.1573, -0.6460,  0.4748,  ...,  0.1060,  0.2727, -0.4909],
         ...,
         [ 0.9173, -0.9071,  0.2581,  ...,  0.0162,  0.4743, -0.7233],
         [ 1.1724, -0.0632,  0.8376,  ...,  0.5579,  0.6323, -0.4354],
         [ 0.7190,  0.1322,  0.3555,  ..., -0.1876,  0.8301, -0.0487]]],
       device='cuda:0'), pooler_output=tensor([[ 8.3196e-01, -8.7722e-01, -7.3517e-01, -9.5406e-01, -8.1483e-01,
          6.2774e-01, -7.8682e-01, -6.3098e-01, -8.6309e-01, -8.9142e-02,
          3.8321e-01, -3.0272e-01,  2.5195e-01,  6.8621e-01, -6.3897e-01,
         -2.9318e-02, -2.5280e-02,  7.2706e-01,  9.1096e-01, -9.1336e-01,
         -8.9190e-01, -6.6604e-01,  4.3566e-01, -4.7941e-02, -1.6338e-01,
         -1.2722e-01, -9.8685e-01,  5.1037e-01, -4.7686e-01,  1.4591e-01,
         -9.

In [18]:
outputs.last_hidden_state.size() # batch_size, n_tokens, hidden_dim (768)

torch.Size([1, 8, 768])

In [19]:
# Extract CLS token
outputs.last_hidden_state[:,0].size()

torch.Size([1, 768])

In [20]:
# Create function to extract hidden state
def extract_hidden_states(batch):
    """Return the last-layer [CLS] vector for every example in ``batch``.

    Only the entries named in ``tokenizer.model_input_names`` are forwarded to
    the model; any other entries in the batch are ignored. The result is a
    dict with a single ``"hidden_state"`` key holding a NumPy array.
    """
    # Keep just the model inputs and move them to the target device.
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        model_output = model(**model_inputs)

    # Position 0 along the token axis holds the [CLS] token.
    cls_vectors = model_output.last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [30]:
# Return the model inputs and the selected pre7geo*/post7geo* columns as torch
# tensors when the dataset is indexed. set_format is called without
# reassignment, so it changes violence_encoded itself.
violence_encoded.set_format("torch",
                           columns=["input_ids", "token_type_ids", "attention_mask", 
                                    "pre7geo10", "pre7geo30", "pre7geo50",
                                    "post7geo10", "post7geo30", "post7geo50"])

In [32]:
# Compute the [CLS] hidden state for every example, 1,000 examples per batch.
# The recorded run took roughly 12.5 hours of wall time.
%time violence_hidden = violence_encoded.map(extract_hidden_states, batched=True, batch_size=1000)

  0%|          | 0/16770 [00:00<?, ?ba/s]

  0%|          | 0/4193 [00:00<?, ?ba/s]

  0%|          | 0/2330 [00:00<?, ?ba/s]

CPU times: user 22h 37min 52s, sys: 4min 14s, total: 22h 42min 6s
Wall time: 12h 32min 25s


### Save dataset (including the tokenized inputs and hidden states) to disk

In [34]:
# Persist the dataset together with the computed hidden states so the
# long-running extraction does not have to be repeated.
%time violence_hidden.save_to_disk("../../Violence_data/geo_corpus.0.0.1_datasets_hidden_labse")

CPU times: user 9.37 s, sys: 1min 14s, total: 1min 23s
Wall time: 2min 11s


In [33]:
violence_hidden

DatasetDict({
    train: Dataset({
        features: ['lang', 'text', 'post1geo10', 'post1geo20', 'post1geo30', 'post1geo50', 'post1geo70', 'post2geo10', 'post2geo20', 'post2geo30', 'post2geo50', 'post2geo70', 'post3geo10', 'post3geo20', 'post3geo30', 'post3geo50', 'post3geo70', 'post7geo10', 'post7geo20', 'post7geo30', 'post7geo50', 'post7geo70', 'pre1geo10', 'pre1geo20', 'pre1geo30', 'pre1geo50', 'pre1geo70', 'pre2geo10', 'pre2geo20', 'pre2geo30', 'pre2geo50', 'pre2geo70', 'pre3geo10', 'pre3geo20', 'pre3geo30', 'pre3geo50', 'pre3geo70', 'pre7geo10', 'pre7geo20', 'pre7geo30', 'pre7geo50', 'pre7geo70', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['lang', 'text', 'post1geo10', 'post1geo20', 'post1geo30', 'post1geo50', 'post1geo70', 'post2geo10', 'post2geo20', 'post2geo30', 'post2geo50', 'post2geo70', 'post3geo10', 'post3geo20', 'post3geo30', 'post3geo50', 'post3geo70', 'post7geo10', 'post

In [26]:
violence_hidden["train"][0]

{'post7geo30': tensor(1),
 'pre7geo30': tensor(0),
 'input_ids': tensor([   101,  31932,  14980,  38446,    117,    194,  14979, 168932,  14976,
          27403,  14980,  73122,    113,  16222,    114,    102,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0]),
 'token_ty