In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer

# Local directory that holds both the model weights and the tokenizer files.
llama_path = "models/vicuna-13b"

# Tokenizer first: register the EOS token as the pad token so that the
# `padding=True` tokenizer calls in the cells below have a pad token to use.
tokenizer = LlamaTokenizer.from_pretrained(llama_path)
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

# Load the causal LM with 8-bit weights.
model = LlamaForCausalLM.from_pretrained(llama_path, load_in_8bit=True)

In [None]:
from spoiler_generation.utils.spoiler_generation.utils.dataset_class import Dataset
import torch

# Read the training split and extract the "postText" column as a plain list;
# these are the clickbait inputs that get turned into questions below.
dataset = Dataset.from_jsonl("data/train.jsonl")
post_text_column = dataset.df["postText"]
clickbaits = post_text_column.tolist()

In [None]:
# Instruction template; `{question}` is filled with the clickbait text.
PROMPT = (
    "Below is a sentence from which write question.\n"
    "### Sentence:\n{question}\n### Question:"
)

# Trial run on the first 20 clickbaits to eyeball the generations.
sample_prompts = [PROMPT.format(question=c) for c in clickbaits[:20]]
input_ids = tokenizer(sample_prompts, padding=True, return_tensors="pt").to("cuda")

# Gradients are not needed for generation.
with torch.no_grad():
    generated_ids = model.generate(**input_ids, max_new_tokens=30)

# Decode the generated token ids back to text, dropping special tokens.
batch = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
# Inspect a single decoded generation (the fifth element of the trial batch).
print(batch[4])

In [None]:
import re

# Patterns for pulling the pieces back out of a decoded generation:
#   sen_comp - the line that follows "Sentence:" (the clickbait put in the prompt)
#   sen_ques - the line that follows "Question:" (the model's generated question)
sen_comp = re.compile(r"Sentence:\n(.*)\n", re.MULTILINE)
sen_ques = re.compile(r"Question:\n(.*)\n?", re.MULTILINE)

# Shown when a pattern is absent, e.g. when the model did not start its answer
# on a new line after "Question:". Without this the cell would abort with
# StopIteration on the first generation that does not match.
NO_MATCH = "<no match>"

for b in batch:
    sentence_match = sen_comp.search(b)
    question_match = sen_ques.search(b)
    print(sentence_match.group(1) if sentence_match else NO_MATCH)
    print(question_match.group(1) if question_match else NO_MATCH)

    print("__________________________")

In [None]:
# Total number of clickbaits that the full generation run has to cover.
len(clickbaits)

In [None]:
import torch
import re
from tqdm import tqdm

# Number of prompts sent to the model per `generate` call.
BATCH_SIZE = 50

# One entry per clickbait, in the same order as `clickbaits`.
generated_questions = []
# Captures the line that follows "Question:" in the decoded text.
sen_ques = re.compile(r"Question:\n(.*)\n?", re.MULTILINE)

# Step through the data in strides of BATCH_SIZE so that the final, possibly
# shorter, batch is processed too. Iterating over `len(clickbaits) // 50`
# batches would drop the remainder and leave `generated_questions` shorter
# than `clickbaits`.
for start in tqdm(range(0, len(clickbaits), BATCH_SIZE)):
    clickbaits_batch = clickbaits[start : start + BATCH_SIZE]
    # Move the tokenized inputs onto the same device as the model.
    input_ids = tokenizer(
        [PROMPT.format(question=c) for c in clickbaits_batch],
        padding=True,
        return_tensors="pt",
    ).to(model.device)
    with torch.inference_mode():
        generated_ids = model.generate(**input_ids, max_new_tokens=30)

    questions_batch = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    # Pair every decoded text with the clickbait it was generated from, so the
    # fallback below never depends on a separately maintained counter.
    for clickbait, question in zip(clickbaits_batch, questions_batch):
        match = sen_ques.search(question)
        # Fall back to the original clickbait when no question line is found.
        generated_questions.append(match.group(1) if match else clickbait)

# Every clickbait must have produced exactly one output row.
assert len(generated_questions) == len(clickbaits)

In [None]:
import pandas as pd

# Load the columns stored in train_questions.csv and attach the Vicuna
# generations and the source clickbaits as two extra columns.
df = pd.read_csv("data/spoiler_generation/train_questions.csv").assign(
    vicuna=generated_questions,
    clickbait=clickbaits,
)

In [None]:
# Write the comparison table to the working directory, without the index column.
df.to_csv("question_comparison.csv", index=False)

In [None]:
# Number of questions collected by the generation loop.
len(generated_questions)

In [None]:
import pandas as pd

# Store the generated questions on their own as a single-column CSV.
questions_df = pd.DataFrame(generated_questions, columns=["generated_questions"])
questions_df.to_csv("data/spoiler_generation/vicuna/train_questions.csv", index=False)