In [3]:
import os
from functools import partial
import datetime
from pathlib import Path

import torch
from transformers import Trainer, TrainingArguments, AutoConfig, DataCollatorWithPadding, AutoTokenizer
from transformers.trainer_utils import set_seed
from data import NERDataModule
from config import get_configs
from model import get_pretrained
from datasets import Dataset
import numpy as np


config_file = "j-dv1l-idpt-pl-0.yml"
# The output directory is named after the config file (text before the first ".").
output = config_file.split(".")[0]

# Load the configuration once. `cfg` is used below for data/model setup and
# `args` holds the keyword arguments handed to `TrainingArguments`.
cfg, args = get_configs(config_file)
set_seed(args["seed"])

args["output_dir"] = output
# NOTE: `args` is rebound from the raw dict to the `TrainingArguments` object.
args = TrainingArguments(**args)

# On-disk cache of the processed dataset, so the preparation step below only
# runs when the cache directory does not exist yet.
processed_dir = Path("dv1l-processed")

if processed_dir.exists():
    dataset = Dataset.load_from_disk(str(processed_dir))
else:
    datamodule = NERDataModule(cfg)
    datamodule.prepare_datasets()

    dataset = datamodule.dataset
    # Record each example's token count and sort by it, so that consecutive
    # examples (and therefore batches) have similar lengths.
    dataset = dataset.map(
        lambda x: {"length": len(x["input_ids"])},
        num_proc=cfg["num_proc"],
    )
    dataset = dataset.sort("length")

    dataset.save_to_disk(str(processed_dir))

tokenizer = AutoTokenizer.from_pretrained(cfg["model_name_or_path"])

print(f"dataset length: {len(dataset)}")

# The auth token is read from the environment; when the variable is unset the
# value `True` is passed instead.
model_config = AutoConfig.from_pretrained(
    cfg["model_name_or_path"],
    use_auth_token=os.environ.get("HUGGINGFACE_HUB_TOKEN", True),
)
# NOTE(review): `get_pretrained` is project-local; it presumably loads the
# weights from the given checkpoint file -- confirm in `model.py`.
model = get_pretrained(
    model_config,
    os.path.join(cfg["model_name_or_path"], "pytorch_model.bin"),
)

# Pad each batch to its longest sequence, rounded up to a multiple of 8.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="pt",
    padding="longest",
    pad_to_multiple_of=8,
)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

dataset length: 71500


In [5]:
# Number of examples passed to `trainer.predict` per call; each chunk's
# predictions are written to their own .npy file.
chunk = 10_000

# Keep only these columns and drop every other column from the dataset.
model_inputs = {"input_ids", "token_type_ids", "attention_mask"}
ds = dataset.remove_columns(
    [name for name in dataset.column_names if name not in model_inputs]
)

total = len(dataset)
for i, start in enumerate(range(0, total, chunk)):
    # The final chunk may be shorter than `chunk`.
    end = min(start + chunk, total)
    small_ds = ds.select(range(start, end))

    preds = trainer.predict(small_ds)

    # `predictions` is the first field of the tuple returned by `predict`.
    np.save(f"dv1l-ner-3-f0-chunk{i}.npy", preds.predictions)