In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
from argparse import Namespace
from tqdm.auto import tqdm
from datasets import load_from_disk
from transformers import AutoTokenizer

In [2]:
# Load the saved dataset; later cells expect a "train" split and a "text" column in it.
raw_dataset_dir = "/data3/mmendieta/Violence_data/geo_corpus.0.0.1_dataset_for_train_all_labels"
ds = load_from_disk(raw_dataset_dir)

In [None]:
# Peek at the first training row to see what the raw (untokenized) records look like.
first_train_example = ds["train"][0]
first_train_example

In [3]:
# Preprocessing settings, available both as a plain dict (`config`) and with
# attribute access (`args`).
config = dict(
    # Checkpoint directory the tokenizer is loaded from.
    model_ckpt="/data3/mmendieta/models/ml_e5_large",
    # Number of rows passed to the tokenizer per map() batch.
    batch_size=1024,
    # Not referenced by the cells shown in this notebook.
    num_labels=40,
    # Used as the tokenizer's model_max_length.
    max_length=32,
    # Not referenced by the cells shown in this notebook.
    seed=42,
    # Directory the tokenized dataset is written to.
    fout="/data4/mmendieta/data/geo_corpus.0.0.1_tok_ds_e5large_all_labels",
)

args = Namespace(**config)

In [4]:
# Build the tokenizer from the configured checkpoint. model_max_length is set to
# args.max_length, and padding is applied on the right-hand side.
model_ckpt = args.model_ckpt
tokenizer_options = {
    "model_max_length": args.max_length,
    "padding_side": "right",
    "local_files_only": False,
    "use_fast": True,
}
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, **tokenizer_options)

In [5]:
def tokenize(batch):
    """Tokenize the "text" entries of a batch with the module-level `tokenizer`.

    Args:
        batch: Mapping with a "text" key holding the inputs to encode.

    Returns:
        Whatever `tokenizer` returns when called with truncation enabled and
        padding="max_length". No explicit max_length is passed here, so the
        length limit is the one the tokenizer itself was configured with.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [6]:
# Run `tokenize` over the dataset in batches of args.batch_size rows and time it.
# keep_in_memory=True keeps the result in RAM instead of writing a cache file, so the
# kernel must have room for the whole tokenized dataset.
%time tokenized_ds = ds.map(tokenize, batched=True, batch_size=args.batch_size, keep_in_memory=True)

  0%|          | 0/16377 [00:00<?, ?ba/s]

  0%|          | 0/4095 [00:00<?, ?ba/s]

  0%|          | 0/2275 [00:00<?, ?ba/s]

CPU times: user 2h 57min 21s, sys: 8min 39s, total: 3h 6min 1s
Wall time: 25min 10s


In [7]:
# Drop the raw strings now that they have been tokenized.
# Re-running this cell fails because the column is already gone.
columns_to_drop = ["text"]
tokenized_ds = tokenized_ds.remove_columns(columns_to_drop)

In [8]:
# Switch the output format to "torch" (set in place on tokenized_ds).
tokenized_ds.set_format(type="torch")

In [9]:
# Sanity check: show the splits with their columns and row counts.
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 4192483
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2329158
    })
})

In [10]:
# Inspect the column types of the training split.
train_split = tokenized_ds["train"]
train_split.features

{'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [11]:
# Persist the tokenized dataset to the configured output directory (args.fout).
output_dir = args.fout
tokenized_ds.save_to_disk(output_dir)