In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModel
from transformers import BertModel, BertTokenizerFast
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler

2022-10-09 23:30:52.562795: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


### Read the dataset and delete unnecessary columns

In [2]:
violence = load_from_disk("../../Violence_data/geo_corpus.0.0.1_datasets")

In [None]:
violence

In [None]:
train_ds = violence["train"]

In [None]:
train_ds.features

In [None]:
train_ds.column_names

In [3]:
# Metadata columns that the next cell drops from the dataset
remove_cols = [
    'tweetid',
    'retweetid',
    'date',
    'timestamp',
    'username',
    'geo_x',
    'geo_y',
    'key',
]

In [None]:
# Drop only the columns that are still present, so re-running this cell after the
# columns are already gone is a no-op instead of an error.
# Assumes every split shares the column layout of "train" — TODO confirm.
cols_present = [col for col in remove_cols if col in violence["train"].column_names]
if cols_present:
    violence = violence.remove_columns(cols_present)

In [None]:
print(violence["train"][:5])

### From text to tokens

In [4]:
# Checkpoint identifier used for the tokenizer here and for the model further down
model_ckpt = "setu4993/smaller-LaBSE"
# Load the fast BERT tokenizer for this checkpoint.
# (AutoTokenizer.from_pretrained(model_ckpt) is the earlier, now-disabled alternative.)
tokenizer = BertTokenizerFast.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/275 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
text = "Tokenizing text is a core task of NLP"
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Convert ids back into tokens
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
tokenizer.vocab_size

In [None]:
tokenizer.model_max_length

In [None]:
tokenizer.model_input_names

### Tokenizing the whole dataset

In [5]:
def tokenize(batch):
    """Tokenize the ``"text"`` entry of ``batch`` with the notebook-level ``tokenizer``.

    Padding is enabled and sequences are truncated to at most 128 tokens.

    Args:
        batch: Mapping with a ``"text"`` entry holding the text(s) to encode.

    Returns:
        The encoding object produced by the tokenizer call.
    """
    texts = batch["text"]
    encoding = tokenizer(texts, padding=True, truncation=True, max_length=128)
    return encoding

In [None]:
print(tokenize(violence["train"].select(range(1000))[:2]))

In [None]:
# Tokenizing the entire dataset
# %time violence_encoded = violence["train"].select(range(1000)).map(tokenize, batched=True, batch_size=None)
%time violence_encoded = violence.map(tokenize, batched=True, batch_size=10000)

  0%|          | 0/1677 [00:00<?, ?ba/s]

In [9]:
violence_encoded

DatasetDict({
    train: Dataset({
        features: ['tweetid', 'retweetid', 'date', 'timestamp', 'username', 'geo_x', 'geo_y', 'lang', 'text', 'key', 'post1geo10', 'post1geo20', 'post1geo30', 'post1geo50', 'post1geo70', 'post2geo10', 'post2geo20', 'post2geo30', 'post2geo50', 'post2geo70', 'post3geo10', 'post3geo20', 'post3geo30', 'post3geo50', 'post3geo70', 'post7geo10', 'post7geo20', 'post7geo30', 'post7geo50', 'post7geo70', 'pre1geo10', 'pre1geo20', 'pre1geo30', 'pre1geo50', 'pre1geo70', 'pre2geo10', 'pre2geo20', 'pre2geo30', 'pre2geo50', 'pre2geo70', 'pre3geo10', 'pre3geo20', 'pre3geo30', 'pre3geo50', 'pre3geo70', 'pre7geo10', 'pre7geo20', 'pre7geo30', 'pre7geo50', 'pre7geo70', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['tweetid', 'retweetid', 'date', 'timestamp', 'username', 'geo_x', 'geo_y', 'lang', 'text', 'key', 'post1geo10', 'post1geo20', 'post1geo30', 'post1geo50', 'post1geo70', 'post2geo10

### Model

In [10]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the BERT encoder for model_ckpt and move it to the selected device.
# (AutoModel.from_pretrained(model_ckpt) is the earlier, now-disabled alternative.)
model = BertModel.from_pretrained(model_ckpt).to(device)

In [11]:
# Extract last hidden state
text = "this is a very interesting text"
inputs = tokenizer(text, return_tensors="pt")
print(f"Input tensor shape: {inputs['input_ids'].size()}") # [batch_size, n_tokens]

Input tensor shape: torch.Size([1, 8])


In [12]:
inputs = {k:v.to(device) for k,v in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.7044,  0.1361,  0.3484,  ..., -0.1931,  0.8320, -0.0427],
         [ 1.3874, -0.7434,  0.5686,  ...,  0.1172,  0.0612, -0.3482],
         [ 1.1464, -0.6417,  0.4695,  ...,  0.0973,  0.2719, -0.4838],
         ...,
         [ 0.9029, -0.9030,  0.2529,  ...,  0.0120,  0.4795, -0.7121],
         [ 1.1578, -0.0563,  0.8323,  ...,  0.5555,  0.6348, -0.4267],
         [ 0.7044,  0.1361,  0.3484,  ..., -0.1931,  0.8320, -0.0427]]],
       device='cuda:0'), pooler_output=tensor([[ 8.3118e-01, -8.7583e-01, -7.3634e-01, -9.5450e-01, -8.1539e-01,
          6.2844e-01, -7.8560e-01, -6.3620e-01, -8.6172e-01, -9.1410e-02,
          3.7426e-01, -3.0649e-01,  2.5643e-01,  6.8717e-01, -6.3883e-01,
         -3.1956e-02, -3.3372e-02,  7.2407e-01,  9.1128e-01, -9.1514e-01,
         -8.9409e-01, -6.6875e-01,  4.3481e-01, -3.8829e-02, -1.6667e-01,
         -1.3169e-01, -9.8688e-01,  5.1277e-01, -4.8089e-01,  1.4972e-01,
         -9.

In [13]:
outputs.last_hidden_state.size() # batch_size, n_tokens, hidden_dim (768)

torch.Size([1, 8, 768])

In [14]:
# Extract CLS token
outputs.last_hidden_state[:,0].size()

torch.Size([1, 768])

In [15]:
# Create function to extract hidden state
def extract_hidden_states(batch):
    """Run one batch through ``model`` and return the first-token hidden state per row.

    Only the entries of ``batch`` whose key is listed in
    ``tokenizer.model_input_names`` are forwarded to the model; each of them is
    moved to ``device`` first. The forward pass runs without gradient tracking.

    NOTE(review): the LaBSE model card derives sentence embeddings from
    ``pooler_output``; confirm that the raw first-token state is what is wanted.

    Args:
        batch: Mapping from column name to tensor (the values must support ``.to``).

    Returns:
        ``{"hidden_state": array}`` where ``array`` is ``last_hidden_state[:, 0]``
        copied to the CPU and converted to NumPy.
    """
    # Keep only what the model accepts and place it on the target device
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        model_output = model(**model_inputs)

    # Position 0 along the token axis is the [CLS] token
    cls_vectors = model_output.last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [16]:
# Have the dataset hand back torch tensors for the listed columns.
# The call's return value is not used; later cells keep working with `violence_encoded`.
# extract_hidden_states forwards only the keys in tokenizer.model_input_names to the
# model, so "pre7geo30" and "post7geo30" are exposed here but not consumed by it.
violence_encoded.set_format("torch",
                           columns=["input_ids", "token_type_ids", "attention_mask", "pre7geo30", "post7geo30"])

In [None]:
%time violence_hidden = violence_encoded.map(extract_hidden_states, batched=True, 
                                             batch_size=10000)

  0%|          | 0/16770 [00:00<?, ?ba/s]

### Save dataset (including the tokenized inputs and hidden states) to disk

In [None]:
%time violence_hidden.save_to_disk("../../Violence_data/geo_corpus.0.0.1_datasets_hidden_small_labse")

In [None]:
violence_hidden

In [None]:
!nvidia-smi