In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install rouge-score
!pip install bert_score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=affa607eb394f8f503425fb11106ee444e447c4cf0d7f5f3d672c6dae5e9dc11
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [3]:
import numpy as np
import pandas as pd
import csv
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import torch
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer,GPT2Model, GPT2Config, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import BERTScorer
from transformers import BertTokenizer, BertForMaskedLM, BertModel

In [4]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and digits
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [5]:
# Define a function to tokenize, convert text to indices, and pad sequences
def tokenize_and_pad(data_list, max_article_length=1021, max_highlights_length=1024):
    tokenized_data_list = []
    for article, highlights in data_list:
        # Tokenize and convert to indices
        article_tokens = tokenizer.encode(article, add_special_tokens=True)
        highlights_tokens = tokenizer.encode(highlights, add_special_tokens=True)

        # Pad sequences to specified lengths
        padded_article_tokens = torch.tensor(article_tokens + [tokenizer.convert_tokens_to_ids(pad_token)] * (max_article_length - len(article_tokens)))
        padded_highlights_tokens = torch.tensor(highlights_tokens + [tokenizer.convert_tokens_to_ids(pad_token)] * (max_highlights_length - len(highlights_tokens)))

        # Append to the tokenized_data_list only if both token lists are not empty
        if len(article_tokens) > 0 and len(highlights_tokens) > 0:
            tokenized_data_list.append((padded_article_tokens, padded_highlights_tokens))

    return tokenized_data_list

In [30]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


In [31]:
def calculate_bleu_score(machine_results, reference_texts):
    bleu_score = corpus_bleu([[ref.split()] for ref in reference_texts], [gen.split() for gen in machine_results],smoothing_function=SmoothingFunction.method1)
    return bleu_score

In [7]:
def calculate_rouge_scores(generated_answers, ground_truth):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_rouge1, total_rouge2, total_rougeL = 0, 0, 0
    for gen, ref in zip(generated_answers, ground_truth):
        scores = scorer.score(gen, ref)
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure
    average_rouge1 = total_rouge1 / len(generated_answers)
    average_rouge2 = total_rouge2 / len(generated_answers)
    average_rougeL = total_rougeL / len(generated_answers)
    return average_rouge1, average_rouge2, average_rougeL

In [8]:
def calculate_bert_score(generated_answers, ground_truth):
    scorer = BERTScorer(model_type='bert-base-uncased')
    P, R, F1 = scorer.score(generated_answers, ground_truth)
    avg_precision = sum(p.mean() for p in P) / len(P)
    avg_recall = sum(r.mean() for r in R) / len(R)
    avg_f1 = sum(f1.mean() for f1 in F1) / len(F1)
    return avg_precision, avg_recall, avg_f1

In [9]:
import csv

def read_csv_data(file_path):
    """Read a CSV file and return a list of tuples containing articles and highlights."""
    data_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        csv_reader = csv.DictReader(file)
        for row in csv_reader:
            article = row.get('article', '')
            highlights = row.get('highlights', '')
            data_list.append((article, highlights))
    return data_list

# File paths
test_csv_path = '/content/drive/MyDrive/test.csv'
train_csv_path = '/content/drive/MyDrive/train.csv'
val_csv_path = '/content/drive/MyDrive/validation.csv'

# Read data from each CSV file
test_data_list = read_csv_data(test_csv_path)
train_data_list = read_csv_data(train_csv_path)
val_data_list = read_csv_data(val_csv_path)

In [10]:
def sample_one_percent(data_list):
    # Calculate 1% of the original data size
    one_percent_size = int(0.01 * len(data_list))

    # Randomly sample 1% of the data
    return random.sample(data_list, one_percent_size)

random.seed(14)

# Sample 1% from each dataset
onetest_data_list = sample_one_percent(test_data_list)
onetrain_data_list = sample_one_percent(train_data_list)
oneval_data_list = sample_one_percent(val_data_list)

In [11]:
print(len(onetest_data_list))
print(len(onetrain_data_list))
print(len(oneval_data_list))

114
2871
133


In [12]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
ptrain_data_list = [(preprocess_text(article), preprocess_text(highlights)) for article, highlights in onetrain_data_list]
ptest_data_list = [(preprocess_text(article), preprocess_text(highlights)) for article, highlights in onetest_data_list]
pval_data_list = [(preprocess_text(article), preprocess_text(highlights)) for article, highlights in oneval_data_list]


In [14]:
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define a pad token and add it to the tokenizer
pad_token = tokenizer.eos_token
tokenizer.add_tokens([pad_token])

# Apply tokenization and padding to your datasets
max_article_length = 1021
max_highlights_length = 1024
tokenized_train_data_list = tokenize_and_pad(ptrain_data_list, max_article_length, max_highlights_length)
tokenized_test_data_list = tokenize_and_pad(ptest_data_list, max_article_length, max_highlights_length)
tokenized_val_data_list = tokenize_and_pad(pval_data_list, max_article_length, max_highlights_length)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1114 > 1024). Running this sequence through the model will result in indexing errors


In [15]:
import torch

def process_tokenized_data(tokenized_data_list, max_article_length, max_highlights_length):
    input_ids = []
    target_ids = []

    for padded_article_tokens, padded_highlights_tokens in tokenized_data_list:
        # Truncate article tokens if greater than max_article_length
        truncated_article_tokens = padded_article_tokens[:max_article_length]

        # Truncate highlights tokens if greater than max_highlights_length
        truncated_highlights_tokens = padded_highlights_tokens[:max_highlights_length]

        # Append truncated tokens to the respective lists
        input_ids.append(truncated_article_tokens)
        target_ids.append(truncated_highlights_tokens)

    # Convert the lists to PyTorch tensors
    input_ids_tensor = torch.stack(input_ids)
    target_ids_tensor = torch.stack(target_ids)

    return input_ids_tensor, target_ids_tensor

# Specify maximum lengths
max_article_length = 1021
max_highlights_length = 1024

# Process training and validation data
input_ids_train, target_ids_train = process_tokenized_data(tokenized_train_data_list, max_article_length, max_highlights_length)
input_ids_val, target_ids_val = process_tokenized_data(tokenized_val_data_list, max_article_length, max_highlights_length)

In [29]:
input_ids_test, target_ids_test = process_tokenized_data(tokenized_test_data_list, max_article_length, max_highlights_length)

In [16]:

print(input_ids_train.shape)
print(input_ids_train)

print(target_ids_train.shape)
print(target_ids_train)

torch.Size([2871, 1021])
tensor([[   83,  7167, 24044,  ..., 50256, 50256, 50256],
        [  283, 11098, 11277,  ..., 50256, 50256, 50256],
        [   76,  1228,  2429,  ..., 50256, 50256, 50256],
        ...,
        [   71,  1018,    78,  ..., 50256, 50256, 50256],
        [   75,  3301,   308,  ..., 50256, 50256, 50256],
        [  368,  2611,   266,  ..., 50256, 50256, 50256]])
torch.Size([2871, 1024])
tensor([[25878,   785,   649,  ..., 50256, 50256, 50256],
        [  418,   292, 16298,  ..., 50256, 50256, 50256],
        [24622,  6294,  2994,  ..., 50256, 50256, 50256],
        ...,
        [ 4102,   289,  2040,  ..., 50256, 50256, 50256],
        [ 1102,  8534,   276,  ..., 50256, 50256, 50256],
        [  368,  2611,   266,  ..., 50256, 50256, 50256]])


In [17]:
# Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)

# Define the number of prompts and embedding size
num_prompts_token = 3  # "summarize the following text"
embedding_size = 768

# Define a specific sentence
sentence = "summarize"

# Tokenize the sentence
input_ids = tokenizer.encode(sentence, return_tensors='pt')

# Get the embeddings for the input_ids from the GPT-2 model
gpt2_embeddings = gpt2_model.transformer.wte(input_ids)

# Create an embedding layer for soft prompts and initialize with the sentence embeddings
soft_prompt_embeddings = nn.Embedding(num_prompts_token, embedding_size)
soft_prompt_embeddings.weight.data.copy_(gpt2_embeddings.squeeze(0))


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tensor([[ 0.0004, -0.1206,  0.0394,  ...,  0.2581, -0.1128,  0.0265],
        [ 0.1030,  0.0311,  0.0340,  ..., -0.0216, -0.0657, -0.2662],
        [ 0.0113,  0.1548, -0.0212,  ...,  0.1057,  0.2224, -0.0694]],
       grad_fn=<CopyBackwards>)

In [18]:
print("Shape of soft prompt embeddings:", soft_prompt_embeddings.weight.data.shape)

Shape of soft prompt embeddings: torch.Size([3, 768])


In [19]:
# Concatenate soft prompt embeddings at the beginning of the input sequence
class GPT2WithPromptTuning(nn.Module):
    def __init__(self, gpt2_model, soft_prompt_embeddings):
        super(GPT2WithPromptTuning, self).__init__()
        self.gpt2_model = gpt2_model
        self.soft_prompt_embeddings = soft_prompt_embeddings

    def forward(self, input_ids, soft_prompt_ids):
        # Get the embeddings for the input_ids from the GPT-2 model
        gpt2_embeddings = self.gpt2_model.transformer.wte(input_ids)
        # Get the embeddings for the soft prompts
        soft_prompt_embeds = self.soft_prompt_embeddings(soft_prompt_ids)


        # Concatenate the embeddings
        embeddings = torch.cat([soft_prompt_embeds, gpt2_embeddings], dim=0)


        # Pass the concatenated embeddings through the GPT-2 model
        outputs = self.gpt2_model(inputs_embeds=embeddings)

        return outputs

In [20]:
# Initialize the model
model = GPT2WithPromptTuning(gpt2_model, soft_prompt_embeddings)

# Freeze GPT-2 model weights
for param in model.gpt2_model.parameters():
    param.requires_grad = False

# Define hyperparameters
batch_size = 8
epochs = 1
learning_rate = 2e-3
gradient_clip_value = 1.0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to GPU
model.to(device)

# Define optimizer and criterion
optimizer = torch.optim.AdamW(model.soft_prompt_embeddings.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=-100)

soft_prompt_ids = torch.tensor([0, 1, 2])

In [21]:
# Lists to store scores
train_bleu_scores = []
train_bert_scores = []
train_rouge1_scores = []
train_rouge2_scores = []
train_rougeL_scores = []

val_bleu_scores = []
val_bert_scores = []
val_rouge1_scores = []
val_rouge2_scores = []
val_rougeL_scores = []

In [22]:
def calculate_metrics(predictions, references):
    # Convert tensor predictions and references to lists
    predictions_decoded = tokenizer.decode(predictions)
    references_decoded =tokenizer.decode(references)

    # Calculate BLEU Score
    bleu_score = calculate_bleu_score([predictions_decoded], [references_decoded])

    # Calculate BERTScore
    bert_precision, bert_recall, bert_f1 = calculate_bert_score([predictions_decoded], [references_decoded])

    # Calculate ROUGE Scores
    rouge1, rouge2, rougeL = calculate_rouge_scores([predictions_decoded], [references_decoded])

    return bleu_score, bert_f1, rouge1, rouge2, rougeL

In [23]:
def train_epoch(model, data_iterator, optimizer, criterion, device, soft_prompt_ids, gradient_clip_value):
    model.train()
    train_bleu_scores = []
    train_bert_scores = []
    train_rouge1_scores = []
    train_rouge2_scores = []
    train_rougeL_scores = []

    for input_ids, target_ids in data_iterator:
        optimizer.zero_grad()

        # Move input and target tensors to GPU
        input_ids, target_ids = input_ids.to(device), target_ids.to(device)

        # Forward pass
        outputs = model(input_ids, soft_prompt_ids.to(device))
        logits = outputs.logits if hasattr(outputs, "logits") else outputs.last_hidden_state

        loss = criterion(logits, target_ids)
        loss.backward()

        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip_value)
        optimizer.step()

        # Update the progress bar description with the current loss
        data_iterator.set_postfix(loss=loss.item())

        # Predictions and references
        predictions = logits.argmax(dim=-1).squeeze(0).tolist()
        references = target_ids.squeeze(0).tolist()

        # Calculate metrics
        bleu_score, bert_f1, rouge1, rouge2, rougeL = calculate_metrics(predictions, references)
        train_bleu_scores.append(bleu_score)
        train_bert_scores.append(bert_f1)
        train_rouge1_scores.append(rouge1)
        train_rouge2_scores.append(rouge2)
        train_rougeL_scores.append(rougeL)

    return train_bleu_scores, train_bert_scores, train_rouge1_scores, train_rouge2_scores, train_rougeL_scores

In [24]:
def validate_epoch(model, input_ids_val, target_ids_val, criterion, device, soft_prompt_ids):
    model.eval()
    val_losses = []
    val_bleu_scores_epoch = []
    val_bert_scores_epoch = []
    val_rouge1_scores_epoch = []
    val_rouge2_scores_epoch = []
    val_rougeL_scores_epoch = []

    with torch.no_grad():
        for input_ids_val, target_ids_val in zip(input_ids_val, target_ids_val):
            input_ids_val, target_ids_val = input_ids_val.to(device), target_ids_val.to(device)
            outputs_val = model(input_ids_val, soft_prompt_ids.to(device))
            logits_val = outputs_val.logits if hasattr(outputs_val, "logits") else outputs_val.last_hidden_state

            loss_val = criterion(logits_val, target_ids_val)
            val_losses.append(loss_val.item())

            # Predictions and references
            predictions_val = logits_val.argmax(dim=-1).squeeze(0).tolist()
            references_val = target_ids_val.squeeze(0).tolist()

            # Calculate metrics
            bleu_score_val, bert_f1_val, rouge1_val, rouge2_val, rougeL_val = calculate_metrics(predictions_val, references_val)
            val_bleu_scores_epoch.append(bleu_score_val)
            val_bert_scores_epoch.append(bert_f1_val)
            val_rouge1_scores_epoch.append(rouge1_val)
            val_rouge2_scores_epoch.append(rouge2_val)
            val_rougeL_scores_epoch.append(rougeL_val)

    return val_losses, val_bleu_scores_epoch, val_bert_scores_epoch, val_rouge1_scores_epoch, val_rouge2_scores_epoch, val_rougeL_scores_epoch

In [25]:
# Training loop
for epoch in range(epochs):
    data_iterator = tqdm(zip(input_ids_train, target_ids_train), desc=f'Epoch {epoch + 1}', total=len(input_ids_train))
    train_bleu_scores, train_bert_scores, train_rouge1_scores, train_rouge2_scores, train_rougeL_scores = train_epoch(
        model, data_iterator, optimizer, criterion, device, soft_prompt_ids, gradient_clip_value
    )

    # Validation loop
    val_losses, val_bleu_scores_epoch, val_bert_scores_epoch, val_rouge1_scores_epoch, val_rouge2_scores_epoch, val_rougeL_scores_epoch = validate_epoch(
        model, input_ids_val, target_ids_val, criterion, device, soft_prompt_ids
    )

    # Average validation loss
    avg_val_loss = sum(val_losses) / len(val_losses)
    print(f"Epoch: {epoch + 1}, Train Loss: {train_bleu_scores[-1]}, Val Loss: {avg_val_loss}")

    # Average validation scores
    avg_bleu_score_val = sum(val_bleu_scores_epoch) / len(val_bleu_scores_epoch)
    avg_bert_score_val = sum(val_bert_scores_epoch) / len(val_bert_scores_epoch)
    avg_rouge1_score_val = sum(val_rouge1_scores_epoch) / len(val_rouge1_scores_epoch)
    avg_rouge2_score_val = sum(val_rouge2_scores_epoch) / len(val_rouge2_scores_epoch)
    avg_rougeL_score_val = sum(val_rougeL_scores_epoch) / len(val_rougeL_scores_epoch)

    print("Validation BLEU Score:", avg_bleu_score_val)
    print("Validation BERTScore:", avg_bert_score_val)
    print("Validation ROUGE-1 Score:", avg_rouge1_score_val)
    print("Validation ROUGE-2 Score:", avg_rouge2_score_val)
    print("Validation ROUGE-L Score:", avg_rougeL_score_val)

    # Append validation scores
    val_bleu_scores.append(avg_bleu_score_val)
    val_bert_scores.append(avg_bert_score_val)
    val_rouge1_scores.append(avg_rouge1_score_val)
    val_rouge2_scores.append(avg_rouge2_score_val)
    val_rougeL_scores.append(avg_rougeL_score_val)

# Close the tqdm progress bar
data_iterator.close()

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Epoch 1: 100%|██████████| 2871/2871 [1:11:54<00:00,  1.50s/it, loss=0.456]


Epoch: 1, Train Loss: 0, Val Loss: 0.44752691137163264
Validation BLEU Score: 0.0
Validation BERTScore: tensor(0.9558)
Validation ROUGE-1 Score: 0.9637385689459647
Validation ROUGE-2 Score: 0.9636880463128791
Validation ROUGE-L Score: 0.9637385689459647


In [26]:
def print_average_scores(scores, prefix):
    avg_bleu_score = sum(scores['bleu']) / len(scores['bleu']) if scores['bleu'] else 0
    avg_bert_score = sum(scores['bert']) / len(scores['bert']) if scores['bert'] else 0
    avg_rouge1_score = sum(scores['rouge1']) / len(scores['rouge1']) if scores['rouge1'] else 0
    avg_rouge2_score = sum(scores['rouge2']) / len(scores['rouge2']) if scores['rouge2'] else 0
    avg_rougeL_score = sum(scores['rougeL']) / len(scores['rougeL']) if scores['rougeL'] else 0

    print(f"Average {prefix} BLEU Score:", avg_bleu_score)
    print(f"Average {prefix} BERTScore:", avg_bert_score)
    print(f"Average {prefix} ROUGE-1 Score:", avg_rouge1_score)
    print(f"Average {prefix} ROUGE-2 Score:", avg_rouge2_score)
    print(f"Average {prefix} ROUGE-L Score:", avg_rougeL_score)

In [27]:
# Calculate and print average scores for training
train_scores = {
    'bleu': train_bleu_scores,
    'bert': train_bert_scores,
    'rouge1': train_rouge1_scores,
    'rouge2': train_rouge2_scores,
    'rougeL': train_rougeL_scores,
}

print_average_scores(train_scores, "Training")

# Calculate and print average scores for validation
val_scores = {
    'bleu': val_bleu_scores,
    'bert': val_bert_scores,
    'rouge1': val_rouge1_scores,
    'rouge2': val_rouge2_scores,
    'rougeL': val_rougeL_scores,
}

print_average_scores(val_scores, "Validation")

Average Training BLEU Score: 1.3087328600458394e-05
Average Training BERTScore: tensor(0.9268)
Average Training ROUGE-1 Score: 0.8945867793209665
Average Training ROUGE-2 Score: 0.8905843839267907
Average Training ROUGE-L Score: 0.894264279662624
Average Validation BLEU Score: 0.0
Average Validation BERTScore: tensor(0.9558)
Average Validation ROUGE-1 Score: 0.9637385689459647
Average Validation ROUGE-2 Score: 0.9636880463128791
Average Validation ROUGE-L Score: 0.9637385689459647


In [28]:
 # Save model weights
torch.save(model.state_dict(), 'Summarize_weights1.pth')

In [36]:
test_losses, test_bleu_scores_epoch, test_bert_scores_epoch, test_rouge1_scores_epoch, test_rouge2_scores_epoch, test_rougeL_scores_epoch = validate_epoch(
        model, input_ids_test, target_ids_test, criterion, device, soft_prompt_ids
  )



In [39]:
# Calculate and print average scores for training
test_scores = {
    'bleu': test_bleu_scores_epoch,
    'bert': test_bert_scores_epoch,
    'rouge1': test_rouge1_scores_epoch,
    'rouge2': test_rouge2_scores_epoch,
    'rougeL': test_rougeL_scores_epoch,
}

print_average_scores(test_scores, "Test")


Average Test BLEU Score: 0.0
Average Test BERTScore: tensor(0.9559)
Average Test ROUGE-1 Score: 0.9643900862838889
Average Test ROUGE-2 Score: 0.9643464746319005
Average Test ROUGE-L Score: 0.9643900862838889
