In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
from argparse import Namespace
from tqdm.auto import tqdm
from datasets import load_from_disk
from transformers import AutoTokenizer

In [2]:
# Load the previously saved inference dataset from local disk.
ds = load_from_disk("/data3/mmendieta/Violence_data/geo_corpus.0.0.1_dataset_for_inference")

In [3]:
# Show the dataset summary (column names and number of rows).
ds

Dataset({
    features: ['tweetid', 'geo_x', 'geo_y', 'lang', 'text', 'labels'],
    num_rows: 2329158
})

In [4]:
# Peek at the first record to see what a single example looks like.
ds[0]

{'tweetid': '388328898662268928',
 'geo_x': 35.49442,
 'geo_y': 33.888940000000005,
 'lang': 'en',
 'text': 'talking abt my case ☺️',
 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

In [21]:
# Settings for this run.
# Checkpoints that can be used for "model_ckpt":
#   - Smaller-LaBSE: setu4993/smaller-LaBSE
#   - LaBSE:         setu4993/LaBSE
#   - XLM-T:         cardiffnlp/twitter-xlm-roberta-base-sentiment
config = dict(
    model_ckpt="cardiffnlp/twitter-xlm-roberta-base-sentiment",
    batch_size=1024,
    num_labels=6,
    max_length=32,
    seed=42,
    fout="/data3/mmendieta/Violence_data/geo_corpus.0.0.1_tok_ds_xlmt_inference",
)

# Expose the settings as attributes (args.model_ckpt, args.max_length, ...).
args = Namespace(**config)

In [22]:
# Load the tokenizer that belongs to the configured checkpoint.
# model_max_length is set from the config so that truncation uses that limit.
model_ckpt = args.model_ckpt
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt,
    model_max_length=args.max_length,
)

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

In [15]:
def tokenize(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Args:
        batch: Mapping with a "text" key holding the strings to encode.

    Returns:
        Whatever the tokenizer returns for those texts with truncation enabled.
    """
    texts = batch["text"]
    # `tokenizer` is resolved from the notebook namespace at call time.
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [23]:
# Apply `tokenize` to the dataset in batches; %time reports how long the call takes.
%time tokenized_ds = ds.map(tokenize, batched=True)

  0%|          | 0/2330 [00:00<?, ?ba/s]

CPU times: user 24min 5s, sys: 17.6 s, total: 24min 22s
Wall time: 1min 32s


In [None]:
# Optional step, currently disabled: drop the raw 'text' column from the tokenized dataset.
# tokenized_ds = tokenized_ds.remove_columns('text')

In [24]:
# Switch the dataset's output format to 'torch' (modifies tokenized_ds in place).
tokenized_ds.set_format('torch')

In [25]:
# Show the tokenized dataset summary to check which columns it now has.
tokenized_ds

Dataset({
    features: ['tweetid', 'geo_x', 'geo_y', 'lang', 'text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2329158
})

In [26]:
# Inspect the feature types (dtypes) of every column.
tokenized_ds.features

{'tweetid': Value(dtype='string', id=None),
 'geo_x': Value(dtype='float64', id=None),
 'geo_y': Value(dtype='float64', id=None),
 'lang': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [27]:
# Write the tokenized dataset to the output path configured in args.fout.
tokenized_ds.save_to_disk(args.fout)