In [1]:
import sys
import os

# Put the directory two levels above the current working directory on
# sys.path (presumably so the project-local `src.*` imports resolve -- confirm
# against the repository layout).
#
# The path is derived with os.path rather than by splitting on "\\", which
# only worked on Windows: on POSIX the split produced a single element, the
# [:-2] slice was empty, and root_path silently became "".
#
# The historical string format is preserved: forward slashes and exactly one
# trailing "/", in case later cells concatenate onto root_path.
root_path = (
    os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    .replace(os.sep, "/")
    .rstrip("/")
    + "/"
)
sys.path.insert(1, root_path)

In [None]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import yaml
import datasets
from torch import cuda
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    SwitchTransformersForConditionalGeneration,
)

from src.data.preparing import CustomDataset
from src.model.switch_transformer import trainer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def validate(epoch, tokenizer, model, device, loader):
    """Generate text for every batch in ``loader`` and decode it alongside the targets.

    Args:
        epoch: Index of the current validation pass. It is not used inside the
            function; the parameter is kept so existing callers keep working.
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=...,
            clean_up_tokenization_spaces=...)``.
        model: Object exposing ``eval()`` and ``generate(...)``.
        device: Device the batch tensors are moved to before generation.
        loader: Iterable of batches. Each batch is a mapping holding tensors
            under the keys ``"source_ids"``, ``"source_mask"`` and
            ``"target_ids"``.

    Returns:
        A ``(predictions, actuals)`` tuple of lists of decoded strings:
        ``predictions`` has one entry per generated sequence and ``actuals``
        one entry per row of ``target_ids``, accumulated over all batches.
    """
    model.eval()
    predictions = []
    actuals = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch_idx, data in enumerate(loader):
            y = data["target_ids"].to(device, dtype=torch.long)
            ids = data["source_ids"].to(device, dtype=torch.long)
            mask = data["source_mask"].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            preds = [
                tokenizer.decode(
                    g, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                for g in generated_ids
            ]
            target = [
                tokenizer.decode(
                    t, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                for t in y
            ]

            predictions.extend(preds)
            actuals.extend(target)

            # Progress log every 100 batches. The previous version also had a
            # `break` here, which fired on batch 0 (0 % 100 == 0) before any
            # result was stored, so the function always returned empty lists.
            if batch_idx % 100 == 0:
                print(f"Completed {batch_idx}")
    return predictions, actuals


# NOTE(review): `encoded_dataset`, `tokenizer`, `config`, `model` and `device`
# are not defined in the cells shown here; they must come from earlier cells
# for this cell to survive "Restart Kernel -> Run All". Confirm.

# Wrap the validation split in the project's CustomDataset.
val_dataset = encoded_dataset["validation"]
val_set = CustomDataset(
    val_dataset, tokenizer, config["MAX_LEN"], config["SUMMARY_LEN"]
)
val_params = {
    "batch_size": config["VALID_BATCH_SIZE"],
    "shuffle": False,  # keep the dataset order so predictions line up with it
    "num_workers": 0,
}

# DataLoader for the validation set; it is consumed by validate() below.
val_loader = DataLoader(val_set, **val_params)

# Validation loop: collect predictions and actuals in a dataframe and save it
# as predictions.csv.
print(
    "Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe"
)
# Make sure the output directory exists; to_csv does not create it.
os.makedirs("./output", exist_ok=True)
# `config` is indexed like a mapping everywhere else in this cell, so the same
# access style is used here (attribute access fails on a plain dict).
for epoch in range(config["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
    # The same file is rewritten on every pass, so only the last pass is kept.
    final_df.to_csv("./output/predictions.csv")
    print("Output Files generated for review")