In [1]:
# Enable IPython's autoreload extension in mode 2, so that edited project
# modules are re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [25]:
from pathlib import Path

import numpy as np
import pandas as pd
from datasets.utils.logging import disable_progress_bar

from src.pipeline import *
from src.evaluation import evaluate_conversation
from src.utils import PropertyDict
from src.utils import print_5_worst_source_answers

In [3]:
# Run configuration. The settings are gathered in a plain mapping and expanded
# into PropertyDict as keyword arguments; the cells below read them as
# attributes (e.g. `config.seed`).
run_settings = {
    # Reproducibility
    "seed": 42,
    # Model selection
    "checkpoint_name": "bert_tiny",
    "model_name": "bert_tiny",
    "model_type": "encoder_decoder",
    "initialize_cross_attention": True,
    # Loss weights. Alternative weighting kept for reference:
    #   generative_loss_weight=0.06, rationale_loss_weight=1.44
    "generative_loss_weight": 1.,
    "rationale_loss_weight": 1.,
    "yng_loss_weight": 1.,
    # Batching / data loading
    "batch_size": 32,
    "val_batch_size": 256,
    "num_workers": 0,
    # Optimisation (disabled option kept for reference: loss_learning_rate=5e-1)
    "num_epochs": 3,
    "optimizer_name": "AdamW",
    "learning_rate": 5e-4,
    "scheduler": "linear",
    "warmup_fraction": 0.1,
    "accumulation_steps": 1,
    "mixed_precision": "fp16",
    # Checkpointing / logging intervals (units are defined by the training
    # code, which is not visible in this notebook)
    "checkpoint_interval": 600,
    "log_interval": 600,
    # Device selection
    "cpu": False,
}
config = PropertyDict(**run_settings)

# Seed the run from the configuration.
set_seed(config.seed)
# disable_progress_bar()

In [6]:
# Train the model described by `config` and save its weights.
#
# The checkpoint location is resolved, and its directory created, BEFORE the
# training run starts: torch.save is the very last step, so a missing
# `checkpoints/` directory would otherwise only surface after training has
# finished and the trained weights would be lost.
checkpoint_dir = Path("checkpoints")
checkpoint_dir.mkdir(parents=True, exist_ok=True)
checkpoint_path = checkpoint_dir / f"{config.model_name}_{config.seed}_nh.pt"

tokenizer, model = make_model(config)
# print(model)

# Make the data
train_data = get_data("train", config).shuffle(42)
val_data = get_data("validation", config).shuffle(42)
train_dataloader = make_dataloader(train_data, tokenizer, config, split="train")
val_dataloader = make_dataloader(val_data, tokenizer, config, split="validation")

# Make the loss, the optimizer and the scheduler
loss_fn = make_loss(config)
optimizer = make_optimizer(model, loss_fn, config)
scheduler = make_scheduler(
    optimizer, steps_per_epoch=len(train_dataloader), config=config
)

train(
    model,
    train_dataloader,
    val_dataloader,
    loss_fn,
    optimizer,
    scheduler,
    config,
)

# Persist only the weights (state dict), not the whole model object.
torch.save(model.state_dict(), checkpoint_path)

SyntaxError: f-string: empty expression not allowed (3015379063.py, line 27)

In [5]:
# Evaluate the saved bert_tiny checkpoint on the train / validation / test
# splits and print a few generated answers plus the aggregate scores.
tokenizer, model = make_model(config)
# map_location="cpu" lets a checkpoint that was saved from a GPU run be loaded
# on a machine without CUDA; load_state_dict then copies the values into the
# model's existing parameters.
model.load_state_dict(torch.load("checkpoints/bert_tiny.pt", map_location="cpu"))

# Number of rows to evaluate per split; None evaluates the full splits.
set_size = None
train_set = get_data("train", config).shuffle(42)
val_set = get_data("validation", config).shuffle(42)
test_set = get_data("test", config).shuffle(42)

if set_size is not None:
    train_set = train_set.select(range(set_size))
    val_set = val_set.select(range(set_size))
    test_set = test_set.select(range(set_size))

results = evaluate(model, tokenizer, train_set, val_set, test_set, config)

print()
for key, (data, res) in results.items():
    print(f"\033[1m Evaluating {key} set: \033[0m")

    print("Example of Q&A generated:")
    # Show at most 5 examples, fewer when the split produced fewer predictions.
    for i in range(min(5, len(data["pred_answer"]))):
        print("A_pred:", data["pred_answer"][i])
        # NOTE(review): np.abs presumably maps negative ignore-index label ids
        # to a non-negative id so that decode() accepts them -- confirm that
        # the resulting id is a special token dropped by skip_special_tokens.
        print("A_true:", tokenizer.decode(np.abs(data["labels"][i]), skip_special_tokens=True))
        print()

    print(f"YesNoGen head f1: {res['yng_f1']*100:.2f} %")
    print(f"Rationale f1: {res['rationales_f1']*100:.2f} %")
    print(f"SQUAD-f1: {res['answers_squad_f1']*100:.2f} %")
    print("_"*30)
    print()

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertLMHeadModel: ['cls.seq_re

Cross-attention has been initialized with self-attention weights.


Loading cached shuffled indices for dataset at /home/matteo/uni/nlp/nlp_assignment2/data/train/train_no_history/bert_tiny/train/cache-63078ed47e5b4d10.arrow
Loading cached shuffled indices for dataset at /home/matteo/uni/nlp/nlp_assignment2/data/train/train_no_history/bert_tiny/validation/cache-6228d23398cad876.arrow
Loading cached shuffled indices for dataset at /home/matteo/uni/nlp/nlp_assignment2/data/train/train_no_history/bert_tiny/test/cache-20451cac29f656b2.arrow


Map:   0%|          | 0/85574 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Map:   0%|          | 0/85574 [00:00<?, ? examples/s]

Map:   0%|          | 0/85574 [00:00<?, ? examples/s]

Map:   0%|          | 0/85574 [00:00<?, ? examples/s]

Map:   0%|          | 0/85574 [00:00<?, ? examples/s]

Map:   0%|          | 0/85574 [00:00<?, ? examples/s]

Map:   0%|          | 0/21441 [00:00<?, ? examples/s]

Map:   0%|          | 0/21441 [00:00<?, ? examples/s]

Map:   0%|          | 0/21441 [00:00<?, ? examples/s]

Map:   0%|          | 0/21441 [00:00<?, ? examples/s]

Map:   0%|          | 0/21441 [00:00<?, ? examples/s]

Map:   0%|          | 0/21441 [00:00<?, ? examples/s]

Map:   0%|          | 0/7918 [00:00<?, ? examples/s]

Map:   0%|          | 0/7918 [00:00<?, ? examples/s]

Map:   0%|          | 0/7918 [00:00<?, ? examples/s]

Map:   0%|          | 0/7918 [00:00<?, ? examples/s]

Map:   0%|          | 0/7918 [00:00<?, ? examples/s]

Map:   0%|          | 0/7918 [00:00<?, ? examples/s]


[1m Evaluating train set: [0m
Example of Q&A generated:
A_pred: progressive growth
A_true: progressive growth

A_pred: mr. weston
A_true: place of rendezvous

A_pred: waterman
A_true: junk food

A_pred: sam
A_true: sam and kumar are

A_pred: mawda
A_true: dawda jawara

YesNoGen head f1: 77.66 %
Rationale f1: 50.78 %
SQUAD-f1: 36.94 %
______________________________

[1m Evaluating val set: [0m
Example of Q&A generated:
A_pred: every day
A_true: each month

A_pred: four dollars
A_true: swim across

A_pred: scott
A_true: the public

A_pred: a guning
A_true: with the pontoon train

A_pred: admiration
A_true: it concerns him

YesNoGen head f1: 72.05 %
Rationale f1: 50.26 %
SQUAD-f1: 25.48 %
______________________________

[1m Evaluating test set: [0m
Example of Q&A generated:
A_pred: yes
A_true: yes

A_pred: six
A_true: over 11 million

A_pred: yes
A_true: no

A_pred: no
A_true: no

A_pred: photos
A_true: with a prayer

YesNoGen head f1: 71.66 %
Rationale f1: 50.48 %
SQUAD-f1: 26.42 

In [14]:
# Evaluate the checkpoint on a fixed-size sample of every split and print the
# SQUAD-f1 breakdown returned by `evaluate`.

# Reuse the tokenizer/model already living in the kernel; rebuild them only
# when this cell is run on its own.
if 'tokenizer' not in locals() or 'model' not in locals():
    tokenizer, model = make_model(config)
    # map_location="cpu" keeps the load working on machines without CUDA.
    model.load_state_dict(torch.load("checkpoints/bert_tiny.pt", map_location="cpu"))

# Number of rows to evaluate per split; None evaluates the full splits.
set_size = 1000
if 'train_set' not in locals():
    train_set = datasets.load_from_disk(CONFIG.dataset.processed_dir)["train"].shuffle(42)
if 'val_set' not in locals():
    val_set = datasets.load_from_disk(CONFIG.dataset.processed_dir)["validation"].shuffle(42)
if 'test_set' not in locals():
    test_set = datasets.load_from_disk(CONFIG.dataset.processed_dir)["test"].shuffle(42)

# Build the evaluation subsets under separate names. Overwriting train_set /
# val_set / test_set with their truncated versions would make the guards above
# keep the truncated data on every re-run, so a later change of `set_size`
# would silently be ignored. The size is also capped at the split length so
# that `select` cannot be asked for rows that do not exist.
eval_sets = []
for full_split in (train_set, val_set, test_set):
    if set_size is not None:
        full_split = full_split.select(range(min(set_size, len(full_split))))
    eval_sets.append(full_split)
train_eval, val_eval, test_eval = eval_sets

results = evaluate(model, tokenizer, train_eval, val_eval, test_eval, config)

print()
for key, (data, res) in results.items():
    print(f"\033[1mEvaluating {key} set: \033[0m\n")

    print("Example of Q&A generated:")
    # Show at most 5 examples, fewer when the split produced fewer predictions.
    for i in range(min(5, len(data["pred_answer"]))):
        print("Q:", data["question"][i])
        print("A_pred:", data["pred_answer"][i])
        print("A_true:", data["answer"][i])
        print()

    # Read the overall score without popping it, so `results` stays intact.
    tot_squad_f1 = res["tot_squad_f1"]

    print(f"Total {key} dataset SQUAD-f1: {tot_squad_f1[0]:.2f}")

    # Every remaining entry is indexed as v[0] (printed as the score) and
    # v[1] (printed as a percentage).
    for k, v in res.items():
        if k == "tot_squad_f1":
            continue
        print(f" - {k} = {v[0]:.2f} ({v[1]:.1f} %)")

    print("_"*30)
    print()


[1mEvaluating train set: [0m

Example of Q&A generated:
Q: How did they greet?
A_pred: a pipe
A_true: The two men shook hands

Q: what?
A_pred: gloves
A_true: gloves

Q: Are they proud?
A_pred: yes
A_true: Yes

Q: How many terms did he serve?
A_pred: two
A_true: Three

Q: What was her opponent ranked?
A_pred: fourth
A_true: Fourth

Total train dataset SQUAD-f1: 0.37
 - yes_ans_f1 = 0.74 (10.0 %)
 - no_ans_f1 = 0.61 (8.2 %)
 - mc_quest_f1 = 0.40 (0.6 %)
 - wh_quest_f1 = 0.31 (76.4 %)
______________________________

[1mEvaluating val set: [0m

Example of Q&A generated:
Q: How often?
A_pred: every day
A_true: each month

Q: How?
A_pred: four dollars
A_true: swim across

Q: Who will decide the outcome of the election?
A_pred: scott
A_true: the public

Q: With what?
A_pred: a guning
A_true: With the pontoon train

Q: How does Horn feel about the plan?
A_pred: admiration
A_true: It concerns him

Total val dataset SQUAD-f1: 0.26
 - yes_ans_f1 = 0.70 (10.4 %)
 - no_ans_f1 = 0.49 (9.6 %)
 

In [None]:
# Evaluate whole conversations from the first 10 test rows and keep the 5
# lowest-scoring conversations of the first source in `conversations_results`.

# Reuse the tokenizer/model already living in the kernel; rebuild them only
# when this cell is run on its own.
if 'tokenizer' not in locals() or 'model' not in locals():
    tokenizer, model = make_model(config)
    # map_location="cpu" keeps the load working on machines without CUDA.
    model.load_state_dict(torch.load("checkpoints/bert_tiny.pt", map_location="cpu"))

if 'dataset' not in locals():
    dataset = datasets.load_from_disk(CONFIG.dataset.filtered_dir).shuffle(42)

df_test = pd.DataFrame(dataset["test"]).iloc[:10]

# Group by the column name (not a one-element list) so the group key is the
# plain source value rather than a 1-tuple on recent pandas versions.
for source, df in df_test.groupby('source'):
    conversations_results = (
        pd.DataFrame(evaluate_conversation(model, tokenizer, df))
        .sort_values(by='conversation_f1_score', ascending=True)
        .head(5)
        .reset_index(drop=True)
    )
    print(conversations_results)
    # Only the first source is processed for now. The original used a bare
    # `raise` as a stop, which always ended the cell with a RuntimeError;
    # `break` stops at the same point without failing the cell. Remove it to
    # process every source.
    break

In [36]:
# Relies on `conversations_results` left in the kernel by the conversation
# evaluation cell. The helper comes from src.utils and is not visible here;
# presumably it prints the five worst-scoring answers -- confirm in src.utils.
print_5_worst_source_answers(conversations_results)

  source                                            passage  \
0    cnn  Los Angeles (CNN) -- A man convicted of stalki...   
1    cnn  (CNN) -- Lionel Messi is not for sale. \n\nTha...   
2    cnn  (CNN) -- We should all be so lucky to have fri...   

                                           questions  \
0  [Who was arrested?, Did he escape from anywher...   
1  [who is not for sale, who was this message fro...   
2  [What author is this about?, What news media w...   

                                             answers  \
0  [Robert Dewey Hoskins,, A mental hospital, A w...   
1  [Lionel Messi, Barcelona's new president, the ...   
2  [Robert Crais, CNN, stars of crime novels, Pri...   

                                   predicted_answers  \
0  [a man, yes, friday, murder, police officer, p...   
1  [lionel messi, barcelona, manager, argentine, ...   
2  [billais, cnn, joe, oscar, los angeles, 1987, ...   

                                   answers_f1_scores  conversation_f1_sc