# Spoiler generation

In [24]:
import pandas as pd
from datasets import load_dataset
import torch
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from evaluate import load
import numpy as np
import itertools

from spoiler_generation.utils.stats import prepare_stats
from spoiler_generation.utils.dataset_class import Dataset


In [None]:
# The dataset is private (access presumably requires Hugging Face credentials).
dataset = load_dataset("MateuszW/clickbait_spoiling_test")
# Turn the "test" split into a DataFrame and run the reference spoilers through
# Dataset.preprocess_func, the same function applied to the model outputs below.
test_df = pd.DataFrame(dataset["test"]).assign(
    spoiler=lambda frame: frame["spoiler"].apply(Dataset.preprocess_func)
)

In [31]:
# Sanity check: (rows, columns) of the reference test set.
test_df.shape

(1000, 6)

In [2]:
# Load every model's prediction CSV from the "MateuszW/spoiler_generation" repo;
# each (split name, file) pair below becomes one split of the returned object.
models_outputs = load_dataset(
    "MateuszW/spoiler_generation",
    data_files=dict(
        [
            ("baseline", "models_output/deberta-baseline_output.csv"),
            ("deberta_paqott", "models_output/deberta-paqott_output.csv"),
            ("llama_pwot", "models_output/llama-pwot_output.csv"),
            ("vicuna_pwot", "models_output/vicuna-pwot_output.csv"),
            ("opt_pwot", "models_output/opt-pwot_output.csv"),
            ("llama_pwt", "models_output/llama-pwt_output.csv"),
            ("vicuna_pwt", "models_output/vicuna-pwt_output.csv"),
            ("opt_pwt", "models_output/opt-pwt_output.csv"),
            ("vicuna_ppt", "models_output/vicuna-ppt_output.csv"),
        ]
    ),
)

Found cached dataset csv (C:/Users/Mateusz/.cache/huggingface/datasets/MateuszW___csv/MateuszW--spoiler_generation-2d60d0350a7a6926/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/9 [00:00<?, ?it/s]

In [32]:
# Replace each loaded split with a DataFrame whose "spoiler" column has gone
# through the same preprocessing as the reference spoilers.
for output_name in models_outputs.keys():
    frame = pd.DataFrame(models_outputs[output_name])
    frame["spoiler"] = frame["spoiler"].apply(Dataset.preprocess_func)
    models_outputs[output_name] = frame

# Regressor

### Load model

In [None]:
# Regressor checkpoint: a sequence-classification model with a single output
# (num_labels=1), used below to score candidate spoilers.
model_path = "MateuszW/regressor-deberta-iter1-iter2"
# Fall back to CPU when no GPU is visible instead of failing on a hard-coded "cuda".
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Regressor input template; the positional slots are filled, in order, with the
# question, one candidate spoiler, and the context.
PROMPT = "For given question:\n {} \nanswer:\n {} \ncontext:\n{}"

### Select spoilers without using models for specific type

In [10]:
# One row per test example, one column per candidate model
# (0 = llama_pwt, 1 = vicuna_pwt, 2 = opt_pwt).
merged_df = pd.DataFrame(
    zip(*(models_outputs[name]["spoiler"] for name in ("llama_pwt", "vicuna_pwt", "opt_pwt")))
)

In [12]:
# Sanity check: (examples, candidate models).
merged_df.shape

(1000, 3)

In [105]:
# For every test example, score each candidate spoiler with the regressor and
# keep the candidate that receives the highest score.
selected_spoilers = []
for n in tqdm(range(merged_df.shape[0])):
    spoilers = merged_df.loc[n].tolist()
    # Question and context are shared by all candidates of this example.
    question, context = test_df.loc[n, "question"], test_df.loc[n, "context"]
    data = [PROMPT.format(question, candidate, context) for candidate in spoilers]
    # Send the batch to the device the model actually lives on rather than assuming CUDA.
    encoded = tokenizer(data, return_tensors="pt", padding="max_length", truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**encoded)

    # argmax over the logits gives the winning candidate's position; .item()
    # turns the 0-d tensor into a plain int before indexing the Python list.
    best = outputs.logits.argmax().item()
    selected_spoilers.append(spoilers[best])

100%|██████████| 988/988 [02:57<00:00,  5.56it/s]


In [137]:
# Evaluate the regressor-selected spoilers against the references.
prepare_stats(test_df, pd.DataFrame({"spoiler": selected_spoilers}))

### Select spoilers with models for specific type

In [33]:
# Split the references by spoiler type: everything that is not "multi" vs. "multi".
# Both parts get a fresh 0..n-1 index so they can be addressed by position below.
test_not_multi_spoilers, test_multi_spoilers = (
    test_df[keep].reset_index(drop=True)
    for keep in (test_df["type"].ne("multi"), test_df["type"].eq("multi"))
)

In [34]:
# Predictions of the models considered for "multi" examples, restricted to the
# rows whose reference type is "multi" (mask taken from test_df) and re-indexed.
multi_vicuna_pwt_output, multi_llama_pwt_output, multi_vicuna_ppt_output = (
    models_outputs[name][test_df["type"] == "multi"].reset_index(drop=True)
    for name in ("vicuna_pwt", "llama_pwt", "vicuna_ppt")
)

In [20]:
# Candidate table for "multi" examples: one column per model
# (0 = vicuna_pwt, 1 = vicuna_ppt, 2 = llama_pwt).
multi_merged_df = pd.DataFrame(
    zip(
        *(
            candidate["spoiler"]
            for candidate in (multi_vicuna_pwt_output, multi_vicuna_ppt_output, multi_llama_pwt_output)
        )
    )
)

In [35]:
# Predictions of the models considered for non-"multi" examples, restricted to
# the rows whose reference type is not "multi" and re-indexed from zero.
not_multi_vicuna_ppt_output, not_multi_deberta_paqott_output = (
    models_outputs[name][test_df["type"] != "multi"].reset_index(drop=True)
    for name in ("vicuna_ppt", "deberta_paqott")
)

In [22]:
# Candidate table for non-"multi" examples: column 0 = vicuna_ppt, column 1 = deberta_paqott.
not_multi_merged_df = pd.DataFrame(
    zip(
        *(
            candidate["spoiler"]
            for candidate in (not_multi_vicuna_ppt_output, not_multi_deberta_paqott_output)
        )
    )
)

In [126]:
# Regressor-based selection for non-"multi" examples: score every candidate
# and keep the one with the highest score.
not_multi_selected_spoilers = []
for n in tqdm(range(not_multi_merged_df.shape[0])):
    spoilers = not_multi_merged_df.loc[n].tolist()
    # Question and context are shared by all candidates of this example.
    question = test_not_multi_spoilers.loc[n, "question"]
    context = test_not_multi_spoilers.loc[n, "context"]
    data = [PROMPT.format(question, candidate, context) for candidate in spoilers]
    # Send the batch to the device the model actually lives on rather than assuming CUDA.
    encoded = tokenizer(data, return_tensors="pt", padding="max_length", truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model(**encoded)

    # .item() converts the 0-d argmax tensor to a plain int before list indexing.
    best = outputs.logits.argmax().item()
    not_multi_selected_spoilers.append(spoilers[best])

100%|██████████| 814/814 [01:39<00:00,  8.16it/s]


In [127]:
# Regressor-based selection for "multi" examples: score every candidate and
# keep the one with the highest score.
multi_selected_spoilers = []
for n in tqdm(range(multi_merged_df.shape[0])):
    spoilers = multi_merged_df.loc[n].tolist()
    # Question and context are shared by all candidates of this example.
    question = test_multi_spoilers.loc[n, "question"]
    context = test_multi_spoilers.loc[n, "context"]
    data = [PROMPT.format(question, candidate, context) for candidate in spoilers]
    # Send the batch to the device the model actually lives on rather than assuming CUDA.
    encoded = tokenizer(data, return_tensors="pt", padding="max_length", truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model(**encoded)

    # .item() converts the 0-d argmax tensor to a plain int before list indexing.
    best = outputs.logits.argmax().item()
    multi_selected_spoilers.append(spoilers[best])

100%|██████████| 174/174 [00:21<00:00,  7.94it/s]


In [130]:
# Stack predictions and references in the same order (non-multi first, then
# multi) so that rows stay aligned, then compute the metrics.
pred = pd.concat(
    [
        pd.DataFrame(picks, columns=["spoiler"])
        for picks in (not_multi_selected_spoilers, multi_selected_spoilers)
    ],
    ignore_index=True,
)
ref = pd.concat([test_not_multi_spoilers, test_multi_spoilers], ignore_index=True)
prepare_stats(ref, pred)

{'bleu': 0.44303314639062596,
 'precision': 0.9096897679422549,
 'recall': 0.913005078852418,
 'f1': 0.9108124781354718,
 'exact_match': 0.3208502024291498,
 'meteor': 0.5301155410177051}

### Upper bound: best score when selecting the max-BLEU candidate per example

In [36]:
# BLEU metric object obtained through `evaluate.load`; used by calc_bleu below.
bleu = load("bleu")

def calc_bleu(true_spoiler, predicted_spoiler):
    """Compute a BLEU score for each (reference, hypothesis) pair.

    Args:
        true_spoiler: iterable of reference strings.
        predicted_spoiler: iterable of predicted strings, paired positionally
            with ``true_spoiler`` (pairing stops at the shorter iterable).

    Returns:
        A list with one score per pair. The n-gram order is capped at the
        number of space-separated tokens in the reference (at most 4). A pair
        for which the metric raises ``ZeroDivisionError`` is scored 0.
    """

    def _pair_bleu(gold, guess):
        # Short references cannot support 4-grams, so shrink the order to fit.
        order = min(4, len(gold.split(" ")))
        try:
            return bleu.compute(predictions=[guess], references=[[gold]], max_order=order)["bleu"]
        except ZeroDivisionError:
            return 0

    return [_pair_bleu(gold, guess) for gold, guess in zip(true_spoiler, predicted_spoiler)]


# Per-example BLEU of every candidate model: rows are examples, columns are
# models (non-multi: 0 = vicuna_ppt, 1 = deberta_paqott;
# multi: 0 = llama_pwt, 1 = vicuna_pwt, 2 = vicuna_ppt).
best_bleu_not_multi = pd.DataFrame(
    zip(
        *(
            calc_bleu(test_not_multi_spoilers["spoiler"], candidate["spoiler"])
            for candidate in (not_multi_vicuna_ppt_output, not_multi_deberta_paqott_output)
        )
    )
)
best_bleu_multi = pd.DataFrame(
    zip(
        *(
            calc_bleu(test_multi_spoilers["spoiler"], candidate["spoiler"])
            for candidate in (multi_llama_pwt_output, multi_vicuna_pwt_output, multi_vicuna_ppt_output)
        )
    )
)

In [37]:
# Mean per-example BLEU of each individual model (one value per column).
best_bleu_not_multi.mean(axis=0), best_bleu_multi.mean(axis=0)

(0    0.429699
 1    0.453047
 dtype: float64,
 0    0.256311
 1    0.255186
 2    0.259893
 dtype: float64)

In [38]:
# Oracle selection: take the best-scoring candidate for every example, then
# average over all examples (non-multi and multi together).
best_possible_bleu = pd.concat(
    [per_model.max(axis=1) for per_model in (best_bleu_not_multi, best_bleu_multi)]
).mean()

##### Best achievable BLEU

In [39]:
# Mean of the per-example maximum BLEU across the candidate models.
best_possible_bleu

0.5363586793501569

# Classifier

#### Load model

In [None]:
# Pairwise classifier checkpoint: a two-class sequence-classification model
# (num_labels=2) that decides which of two candidate answers is better.
model_path = "MateuszW/classifier-distilbert"
# Fall back to CPU when no GPU is visible instead of failing on a hard-coded "cuda".
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Classifier input template with named slots: question, ans1, ans2, context.
PROMPT = "For given question:\n{question}\nchoose what answer is better\n\n\n## Answer1:\n{ans1}\n\n## Answer2:\n{ans2}\n\n## Context:\n{context}"

### Select spoilers without using models for specific type

In [45]:
# Candidate table for the classifier: column 0 = vicuna_pwt, column 1 = llama_pwt.
merged_df = pd.DataFrame(
    zip(*(models_outputs[name]["spoiler"] for name in ("vicuna_pwt", "llama_pwt")))
).reset_index(drop=True)

In [46]:
# Pairwise voting: the classifier compares every pair of candidates and the
# candidate with the most wins is selected (np.argmax keeps the first one on ties).
# numpy, itertools and tqdm are already imported in the notebook's first cell.
selected_spoilers = []
for n in tqdm(range(merged_df.shape[0])):
    spoilers = merged_df.loc[n].tolist()
    # Question and context do not change between pairs of the same example.
    question, context = test_df.loc[n, "question"], test_df.loc[n, "context"]
    score = [0] * len(spoilers)
    for first, second in itertools.combinations(range(len(spoilers)), 2):
        # An empty candidate loses its comparison without querying the model.
        if spoilers[first] == "":
            score[second] += 1
            continue
        if spoilers[second] == "":
            score[first] += 1
            continue
        data = [PROMPT.format(question=question, ans1=spoilers[first], ans2=spoilers[second], context=context)]
        # Send the input to the device the model actually lives on rather than assuming CUDA.
        encoded = tokenizer(data, return_tensors="pt", padding="max_length", truncation=True).to(model.device)
        with torch.no_grad():
            outputs = model(**encoded)
        # Predicted class 1 counts as a win for Answer1, any other class for Answer2.
        if outputs.logits.argmax().item() == 1:
            score[first] += 1
        else:
            score[second] += 1

    selected_spoilers.append(spoilers[np.argmax(score)])

100%|██████████| 988/988 [00:21<00:00, 45.73it/s]


In [50]:
# Evaluate the classifier-selected spoilers against the references.
prepare_stats(test_df, pd.Series(selected_spoilers, name="spoiler").to_frame())



### Select spoilers with models for specific type

In [None]:
# Same split as in the regressor section: references whose type is not "multi"
# vs. "multi", each re-indexed from zero.
test_not_multi_spoilers, test_multi_spoilers = (
    test_df.loc[keep].reset_index(drop=True)
    for keep in (test_df["type"] != "multi", test_df["type"] == "multi")
)

In [29]:
# Candidate table for non-"multi" examples: column 0 = vicuna_ppt, column 1 = deberta_paqott.
not_multi_merged_df = pd.DataFrame(
    zip(
        *(
            candidate["spoiler"]
            for candidate in (not_multi_vicuna_ppt_output, not_multi_deberta_paqott_output)
        )
    )
)

In [None]:
# Candidate table for "multi" examples: one column per model
# (0 = vicuna_pwt, 1 = vicuna_ppt, 2 = llama_pwt).
multi_merged_df = pd.DataFrame(
    zip(
        *(
            candidate["spoiler"]
            for candidate in (multi_vicuna_pwt_output, multi_vicuna_ppt_output, multi_llama_pwt_output)
        )
    )
)

In [30]:
# Classifier-based selection for non-"multi" examples. There are exactly two
# candidates per example, so a single comparison decides the winner.
not_multi_selected_spoilers = []
for n in tqdm(range(not_multi_merged_df.shape[0])):
    spoilers = not_multi_merged_df.loc[n].tolist()
    # An empty candidate forfeits to the other one without querying the model.
    if spoilers[0] == "":
        not_multi_selected_spoilers.append(spoilers[1])
        continue
    if spoilers[1] == "":
        not_multi_selected_spoilers.append(spoilers[0])
        continue
    data = [
        PROMPT.format(
            question=test_not_multi_spoilers.loc[n, "question"],
            ans1=spoilers[0],
            ans2=spoilers[1],
            context=test_not_multi_spoilers.loc[n, "context"],
        )
    ]
    # Send the input to the device the model actually lives on rather than assuming CUDA.
    encoded = tokenizer(data, return_tensors="pt", padding="max_length", truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**encoded)

    # Class 1 selects Answer1 (list position 0), class 0 selects Answer2 (position 1);
    # .item() yields a plain int so the list is not indexed with a tensor.
    winner = 1 - outputs.logits.argmax().item()
    not_multi_selected_spoilers.append(spoilers[winner])

100%|██████████| 814/814 [00:17<00:00, 47.46it/s]


In [43]:
# Classifier-based selection for "multi" examples via pairwise voting: every
# pair of candidates is compared and the candidate with the most wins is kept
# (np.argmax keeps the first one on ties).
multi_selected_spoilers = []
for n in tqdm(range(multi_merged_df.shape[0])):
    spoilers = multi_merged_df.loc[n].tolist()
    # Question and context do not change between pairs of the same example.
    question = test_multi_spoilers.loc[n, "question"]
    context = test_multi_spoilers.loc[n, "context"]
    score = [0] * len(spoilers)
    for first, second in itertools.combinations(range(len(spoilers)), 2):
        # An empty candidate loses its comparison without querying the model.
        if spoilers[first] == "":
            score[second] += 1
            continue
        if spoilers[second] == "":
            score[first] += 1
            continue
        data = [PROMPT.format(question=question, ans1=spoilers[first], ans2=spoilers[second], context=context)]
        # Send the input to the device the model actually lives on rather than assuming CUDA.
        encoded = tokenizer(data, return_tensors="pt", padding="max_length", truncation=True).to(model.device)
        with torch.no_grad():
            outputs = model(**encoded)
        # Predicted class 1 counts as a win for Answer1, any other class for Answer2.
        if outputs.logits.argmax().item() == 1:
            score[first] += 1
        else:
            score[second] += 1

    multi_selected_spoilers.append(spoilers[np.argmax(score)])

100%|██████████| 174/174 [00:03<00:00, 44.88it/s]


In [44]:
# Stack predictions and references in the same order (non-multi first, then
# multi) so that rows stay aligned, then compute the metrics.
pred = pd.concat(
    [
        pd.DataFrame(picks, columns=["spoiler"])
        for picks in (not_multi_selected_spoilers, multi_selected_spoilers)
    ],
    ignore_index=True,
)
ref = pd.concat([test_not_multi_spoilers, test_multi_spoilers], ignore_index=True)
prepare_stats(ref, pred)

{'bleu': 0.43455372191988867,
 'precision': 0.9114617712584584,
 'recall': 0.9109391218978866,
 'f1': 0.9106571890323268,
 'exact_match': 0.31275303643724695,
 'meteor': 0.5198940468784699}