In [15]:
from transformers import BertForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import json

In [74]:
# JSONL file that bert_train reads (one JSON object per line).
data_path = "../data/yelpreview/yelp-train-tiny-gptj.jsonl"
# JSONL file that bert_train writes the augmented rows to.
output_path = "../data/yelpreview/yelp-train-tiny-bert-trunc.jsonl"

In [107]:
def bert_train(data_path, output_path, pretrained_model, max_length=51, device=None):
    """Pseudo-label a JSONL dataset with a pretrained BERT sequence classifier.

    Despite the name, no weights are updated: the model is only run forward.
    For every row of the input file the function

    * tokenizes ``row["text"]`` (truncated to ``max_length`` token ids,
      special tokens included),
    * overwrites ``row["text"]`` with the decoded, truncated text,
    * stores the arg-max class index under ``row["pseudo_gt"]``, and
    * stores the softmax probability of class ``i`` under the key ``str(i)``
      for every class the model outputs.

    All rows are then written to ``output_path``, one JSON object per line.

    Args:
        data_path: Path of the input JSONL file. Each non-blank line must be
            a JSON object with a ``"text"`` key.
        output_path: Path of the JSONL file to (over)write.
        pretrained_model: Model name or path accepted by ``from_pretrained``
            for both the classifier and its tokenizer.
        max_length: Maximum number of token ids kept per text.
        device: Torch device to run on. ``None`` selects CUDA when available
            and falls back to the CPU otherwise.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    bertmodel = BertForSequenceClassification.from_pretrained(pretrained_model).to(device)
    bertmodel.eval()  # inference only; make sure dropout is disabled
    tkn = AutoTokenizer.from_pretrained(pretrained_model)

    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    with open(data_path, "r") as f:
        data = [json.loads(line) for line in f if line.strip()]

    # The decoded string looks like "<cls> ...text... <sep>"; these are the
    # numbers of characters to drop from each end to recover the text alone.
    # NOTE(review): assumes a BERT-style tokenizer that wraps a single
    # sequence in exactly one cls_token and one sep_token.
    prefix_len = len(tkn.cls_token) + 1  # +1 for the space after the token
    suffix_len = len(tkn.sep_token)

    # No gradients are needed, so avoid building an autograd graph per row.
    with torch.no_grad():
        for item in tqdm(data):
            tokenized = tkn([item["text"]], truncation=True, max_length=max_length)
            token_ids = tokenized["input_ids"]
            item["text"] = tkn.decode(token_ids[0])[prefix_len:-suffix_len]

            # Only the input ids are fed to the model, so only they are moved
            # to the device.
            input_ids = torch.tensor(token_ids, device=device)
            probs = F.softmax(bertmodel(input_ids).logits, dim=-1)[0]

            item["pseudo_gt"] = torch.argmax(probs).item()
            # One key per class the model actually has, not a fixed five.
            for label, prob in enumerate(probs.tolist()):
                item[str(label)] = prob

    with open(output_path, "w") as f:
        for row in data:
            f.write(json.dumps(row) + "\n")

    return

In [None]:
# Run the classifier over the configured input file and write the result.
bert_train(
    data_path=data_path,
    output_path=output_path,
    pretrained_model="rttl-ai/bert-base-uncased-yelp-reviews",
)

 77%|███████▋  | 9989/13000 [01:42<00:30, 98.60it/s] 