In [1]:
import json
import os
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from the .env file two directories up, when present.
# NOTE(review): presumably this is where PROJECT_HOME_DIR comes from — confirm.
dotenv_path = Path("../../.env")
if dotenv_path.exists():
    load_dotenv(str(dotenv_path))
    # Report only after the load call has actually run.
    print("Loaded .env file!")


# Read the training records. The context manager closes the file handle that a bare
# open() passed straight to json.load() would leave open, and the explicit encoding
# avoids depending on the platform's locale default.
train_path = Path(os.environ["PROJECT_HOME_DIR"]) / "data/train.json"
with open(train_path, encoding="utf-8") as train_file:
    data = json.load(train_file)

Loaded .env file!


In [2]:
from transformers import AutoTokenizer
from datasets import Dataset

# Build a single-column ("text") dataset from the `full_text` field of every record.
ds = Dataset.from_dict(
    {
        "text": [record["full_text"] for record in data],
    }
)

In [3]:
# Checkpoint identifiers passed to AutoTokenizer.from_pretrained, one per tokenizer
# whose token-length distribution is measured.
models = list(
    (
        "allenai/longformer-large-4096",
        "roberta-base",
        "microsoft/deberta-v3-base",
        "mistralai/Mistral-7B-v0.1",
    )
)

In [4]:
import numpy as np

def tokenize(batch):
    """Tokenize a batch of texts and record how many tokens each one produced.

    Uses the module-level ``tokenizer``, which must be bound before this is called.

    Args:
        batch: Mapping with a "text" entry holding the texts to encode.

    Returns:
        The tokenizer's output with an extra "length" entry: one token count per
        item of "input_ids".
    """
    # No truncation or padding, so the counts reflect the full, unmodified texts.
    encoded = tokenizer(batch["text"], padding=False, truncation=False)
    encoded["length"] = list(map(len, encoded["input_ids"]))
    return encoded
                           

# Report token-length percentiles of the texts under each model's tokenizer.
for m in models:

    # `tokenize` reads this module-level name, so rebind it before mapping.
    tokenizer = AutoTokenizer.from_pretrained(m)

    # Keep `ds` as the raw text dataset and bind the result to a new name. Rebinding
    # `ds` would carry the previous tokenizer's columns into the next iteration and
    # stop a re-run of this cell from starting from the raw text dataset.
    tokenized_ds = ds.map(tokenize, batched=True, num_proc=8)

    # The 100th percentile is the maximum sequence length.
    quantiles = np.percentile(tokenized_ds["length"], [25, 50, 75, 90, 95, 100])

    print(f"Model: {m}")
    print(f"Quantiles: {quantiles}")

Map (num_proc=8):   0%|          | 0/6807 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4412 > 4096). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4271 > 4096). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4162 > 4096). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4227 > 4096). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4205 > 4096). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Model: allenai/longformer-large-4096
Quantiles: [ 560.  744.  965. 1220. 1423. 5557.]


Map (num_proc=8):   0%|          | 0/6807 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (977 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (612 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (740 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (835 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Model: roberta-base
Quantiles: [ 560.  744.  965. 1220. 1423. 5557.]




Map (num_proc=8):   0%|          | 0/6807 [00:00<?, ? examples/s]

Model: microsoft/deberta-v3-base
Quantiles: [ 493.   657.   850.5 1053.4 1216.  3076. ]


Map (num_proc=8):   0%|          | 0/6807 [00:00<?, ? examples/s]

Model: mistralai/Mistral-7B-v0.1
Quantiles: [ 579.   767.   994.  1255.  1471.7 5943. ]
