In [None]:
import os
import warnings
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from razdel import sentenize
from transformers import set_seed

import src.utils.data as data_utils

In [None]:
load_dotenv()

warnings.filterwarnings("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2

EXTERNAL = Path(os.getenv("EXTERNAL_STORAGE_DIR"))
SRC = EXTERNAL / "data" / "raw" / "gazeta_validation.jsonl"
ROOT = Path(os.getenv("ROOT"))
METRICS_DIR = ROOT / "metrics"
METRICS_DIR.mkdir(parents=True, exist_ok=True)
RANDOM_STATE = 42
set_seed(RANDOM_STATE)
text_col, summ_col, title_col = "text", "reference_summary", "title"
device = "cuda" if torch.cuda.is_available() else "cpu"
N_SAMPLE = 400

In [None]:
# Load the dataset from SRC; lines=True reads the file as one JSON record per line.
df = pd.read_json(SRC, lines=True)

# Quick look at the size and the first two rows.
print(df.shape)
df.head(2)

In [None]:
# Show the reference summary of the row labelled 0.
df.loc[0, "reference_summary"]

In [None]:
def lead_k(text: Optional[str], k: Optional[int]) -> str:
    """Return the first ``k`` sentences of ``text`` joined by single spaces.

    Sentences are split with ``sentenize`` and stripped of surrounding
    whitespace before joining.

    Args:
        text: Source document. ``None``, non-string values (e.g. the NaN that
            pandas uses for a missing cell) and blank strings all yield ``""``.
        k: Number of leading sentences to keep; ``None`` keeps every sentence.

    Returns:
        The joined lead sentences, or ``""`` when there is nothing to extract.

    Raises:
        ValueError: If ``k`` is negative (a negative slice bound would silently
            drop trailing sentences instead of selecting leading ones).
    """
    if k is not None and k < 0:
        raise ValueError(f"k must be non-negative or None, got {k}")
    # NaN is truthy, so `text or ""` would let it through to the tokenizer;
    # check the type explicitly instead.
    if not isinstance(text, str) or not text.strip() or k == 0:
        return ""
    sents = [s.text.strip() for s in sentenize(text)]
    return " ".join(sents[:k])

In [None]:
# Reference summaries and lead-3 baseline predictions, one entry per row of df.
refs = df[summ_col].tolist()
preds = [lead_k(article, k=3) for article in df[text_col]]

In [None]:
# Show the first generated prediction.
preds[0]

In [None]:
# Computed on the complete preds/refs lists (no N_SAMPLE slicing here).
# NOTE(review): presumably returns ROUGE F1 values, judging by the helper's name — confirm in src.utils.data.
rouge_scores = data_utils.get_rouge_f1(preds, refs)

rouge_scores

In [None]:
# Display both list lengths; they are expected to match (one prediction per reference).
len(preds), len(refs)

In [None]:
# Only the first N_SAMPLE prediction/reference pairs are scored here.
# NOTE(review): the exact set of metrics returned by get_all_scores is defined in src.utils.data — confirm there.
scores = data_utils.get_all_scores(preds[:N_SAMPLE], refs[:N_SAMPLE], device=device)
scores

In [None]:
# Persist the aggregate metrics and a few qualitative examples for the lead-3 baseline.
METRICS_DIR.mkdir(parents=True, exist_ok=True)

# Metric columns copied from `scores`, in the order they are written to the CSV.
# A metric missing from `scores` is written as 0.0.
metric_keys = (
    "rouge1",
    "rouge2",
    "rougeL",
    "rougeLsum",
    "bertscore_precision",
    "bertscore_recall",
    "bertscore_f1",
    "avg_len_pred",
    "avg_len_ref",
    "len_ratio_pred_to_ref",
)

df_metrics = pd.DataFrame(
    [
        {
            "system": "extractive_lead3",
            # NOTE(review): `scores` covers only the first N_SAMPLE examples, so
            # "validation_full" may be a misleading label — left unchanged in
            # case downstream readers filter on it; confirm and rename if not.
            "split": "validation_full",
            **{key: scores.get(key, 0.0) for key in metric_keys},
            "k": 3,
            # `scores` was computed on preds[:N_SAMPLE], so record the size of
            # that slice rather than the size of the whole frame.
            "n_examples": len(preds[:N_SAMPLE]),
        }
    ]
)
df_metrics.to_csv(METRICS_DIR / f"lead3_validation_{N_SAMPLE}.csv", index=False)

# A handful of title / reference / prediction triples for manual inspection.
# The fallback title column is sized from the reference slice so that all three
# columns have equal length even when fewer than 3 rows are available.
example_refs = refs[:3]
df_sampels = pd.DataFrame(
    {
        "title": (
            df[title_col].head(3).tolist()
            if title_col in df
            else [""] * len(example_refs)
        ),
        "reference": example_refs,
        "prediction": preds[:3],
    }
)
df_sampels.to_csv(METRICS_DIR / "lead3_validation_examples.tsv", sep="\t", index=False)