In [None]:
# !pip install gdown

In [None]:
from transformers import (
    AdamW,
    PegasusForConditionalGeneration,
    PegasusTokenizerFast as PegasusTokenizer
)
from tqdm.auto import tqdm

In [None]:
import os

# Read the Hugging Face access token from the environment instead of embedding
# the secret in the notebook. A token that was ever committed in a notebook
# should be treated as leaked and revoked.
# When HF_TOKEN is unset this is None, which is then passed to from_pretrained
# as `token=None`.
token = os.environ.get("HF_TOKEN")

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger

In [None]:
import json
import pandas as pd
import numpy as np
import torch
import transformers
from pathlib import Path
from torch.utils.data import Dataset, DataLoader

#from pytorch_lightning.callbacks import ModelCheckpoint
#from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

In [None]:
# !gdown 15WyzzCmFGXQjQuSzABIxaEjXWmE1aDJv

In [None]:
# Load the article/summary table from the Kaggle input directory.
df = pd.read_csv(
    '/kaggle/input/Dataset_articles_NoID-2.csv',
    encoding="utf-8",
)

In [None]:
import pandas as pd

In [None]:
# Keep only the two columns used for training and give them the lowercase
# names the rest of the notebook expects.
df = df[["Summary", "Contents"]].rename(
    columns={"Summary": "summary", "Contents": "text"}
)

In [None]:
# Discard rows where either the summary or the article text is missing,
# then preview the first rows.
df = df.dropna(how="any")
df.head()

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

In [None]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(style='whitegrid', palette='muted', font_scale = 1.2)
rcParams['figure.figsize'] = 16, 10

In [None]:
# Fix the random seed before the train/test split and training.
pl.seed_everything(seed=42)

In [None]:
# Hold out a very small fraction (0.01%) of the rows; everything else is
# used for training.
train_df, test_df = train_test_split(df, test_size=0.0001)
(train_df.shape, test_df.shape)

In [None]:
class DataSet(Dataset):
    """Map-style dataset that tokenizes article/summary rows on demand.

    Each item is built from one row of ``data`` (columns ``"text"`` and
    ``"summary"``). Both fields are padded/truncated to a fixed length so
    items can be stacked into batches.

    Args:
        data: Frame with one example per row.
        tokenizer: Tokenizer used for both fields; it is called with
            ``return_tensors="pt"`` and must expose ``pad_token_id``.
        text_max_token_len: Fixed token length of the encoded article.
        summary_max_token_len: Fixed token length of the encoded summary.
    """

    def __init__(
            self,
            data: pd.DataFrame,
            tokenizer: "PegasusTokenizer",
            text_max_token_len: int = 512,
            summary_max_token_len: int = 300,
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Return the number of rows (examples)."""
        return len(self.data)

    def _encode(self, value, max_length: int):
        """Tokenize one string, padded/truncated to exactly ``max_length`` tokens."""
        # Always use the tokenizer given to this dataset rather than a
        # notebook-level global, so the class works with any tokenizer
        # instance and regardless of cell execution order.
        return self.tokenizer(
            value,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

    def __getitem__(self, index: int):
        """Return the raw strings plus flattened model inputs for row ``index``.

        Returns:
            dict with keys ``text``, ``summary``, ``text_input_ids``,
            ``text_attention_mask``, ``labels`` and ``labels_attention_mask``.
        """
        data_row = self.data.iloc[index]
        text = data_row["text"]
        summary = data_row["summary"]

        text_encoding = self._encode(text, self.text_max_token_len)
        summary_encoding = self._encode(summary, self.summary_max_token_len)

        # Padding positions must not contribute to the loss: -100 is the
        # label value ignored by the cross-entropy loss. The pad id comes
        # from the tokenizer instead of assuming it is 0.
        labels = summary_encoding["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            text=text,
            summary=summary,
            text_input_ids=text_encoding["input_ids"].flatten(),
            text_attention_mask=text_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding["attention_mask"].flatten(),
        )


In [None]:
class SummaryDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train and held-out frames.

    Args:
        train_df: Rows used for training.
        test_df: Held-out rows, served for both validation and testing.
        tokenizer: Tokenizer forwarded to every ``DataSet``.
        batch_size: Batch size of every dataloader.
        text_max_token_len: Fixed token length of encoded articles.
        summary_max_token_len: Fixed token length of encoded summaries.
        num_workers: Worker processes per dataloader (default 2, as before).
    """

    def __init__(
            self,
            train_df: pd.DataFrame,
            test_df: pd.DataFrame,
            tokenizer: PegasusTokenizer,
            batch_size: int = 8,
            text_max_token_len: int = 512,
            summary_max_token_len: int = 300,
            num_workers: int = 2
    ):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df

        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.num_workers = num_workers

        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def _build_dataset(self, frame: pd.DataFrame):
        """Wrap ``frame`` in a ``DataSet`` using this module's tokenizer and lengths."""
        return DataSet(
            frame,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )

    def _build_loader(self, dataset, shuffle: bool):
        """Create a dataloader with this module's batch size and worker count."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers
        )

    def setup(self, stage=None):
        """Create the train and held-out datasets (``stage`` is unused)."""
        self.train_dataset = self._build_dataset(self.train_df)
        self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training set."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Loader over the held-out set, in a fixed order.

        Evaluation data is not shuffled: shuffling adds nothing to the
        averaged metric and makes per-batch results non-comparable
        between epochs.
        """
        return self._build_loader(self.test_dataset, shuffle=False)

    def test_dataloader(self):
        """Loader over the held-out set for ``trainer.test``, in a fixed order."""
        return self._build_loader(self.test_dataset, shuffle=False)


In [None]:
# Pretrained checkpoint to fine-tune, and its matching tokenizer.
# Alternative checkpoint: MODEL_NAME = 'google/bigbird-pegasus-large-arxiv'
MODEL_NAME = 'google/pegasus-cnn_dailymail'
tokenizer = PegasusTokenizer.from_pretrained(
    MODEL_NAME,
    token=token,
)

In [None]:
# Token-length statistics of the training set, used for the histograms below.
text_token_counts, summary_token_counts = [], []

# iterrows() is a generator, so tqdm needs an explicit total to show
# progress and an ETA.
for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="counting tokens"):
    # Articles are capped at 1024 tokens for google/pegasus-cnn_dailymail
    # (4096 was used for bigbird-pegasus-large-arxiv).
    text_token_counts.append(
        len(tokenizer.encode(row["text"], max_length=1024, truncation=True))
    )
    # Summaries are encoded without truncation.
    summary_token_counts.append(len(tokenizer.encode(row["summary"])))

In [None]:
# Side-by-side histograms of the token counts: articles left, summaries right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

sns.histplot(data=text_token_counts, ax=ax1)
ax1.set_title("full text token counts")

sns.histplot(data=summary_token_counts, ax=ax2)
ax2.set_title("summary text token counts")

In [None]:
# Training hyper-parameters.
num_epochs = 4
BATCH_SIZE = 2

# Data module serving the train and held-out splits in batches of BATCH_SIZE.
data_module = SummaryDataModule(
    train_df=train_df,
    test_df=test_df,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)

In [None]:
class SummaryModel(pl.LightningModule):
    """Lightning wrapper that fine-tunes a pretrained Pegasus summarizer.

    The checkpoint name (``MODEL_NAME``) and access token (``token``) are
    read from notebook-level variables when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.model = PegasusForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True, token=token)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the seq2seq model.

        Returns:
            Tuple ``(loss, logits)`` taken from the model output; ``loss``
            is only meaningful when ``labels`` is given.
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Compute the loss for ``batch`` and log it under ``log_name``.

        Training, validation and test steps are identical apart from the
        metric name, so they all delegate here.
        """
        loss, _ = self(
            input_ids=batch["text_input_ids"],
            attention_mask=batch["text_attention_mask"],
            decoder_attention_mask=batch["labels_attention_mask"],
            labels=batch["labels"]
        )
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss (logged as ``train_loss``)."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss (logged as ``val_loss``)."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss (logged as ``test_loss``)."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Return the AdamW optimizer over all parameters (lr=5e-4).

        ``transformers.AdamW`` is deprecated in favour of the PyTorch
        implementation. ``eps`` and ``weight_decay`` are set explicitly to
        the values the transformers class used by default, since PyTorch's
        defaults differ (1e-8 and 0.01).
        """
        return torch.optim.AdamW(
            self.parameters(),
            lr=5e-4,
            eps=1e-6,
            weight_decay=0.0,
        )


In [None]:
# Build the model from the pretrained MODEL_NAME weights.
model = SummaryModel()
# To start from a previously saved checkpoint instead:
# model = SummaryModel.load_from_checkpoint('best.ckpt')

In [None]:
%load_ext tensorboard 
%tensorboard --logdir ./linghtning_logs

In [None]:
# TensorBoard logger: runs are written under lightning_logs/news-summary.
logger = TensorBoardLogger(save_dir="lightning_logs", name="news-summary")

# Keep only the single checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    dirpath="/kaggle/working/checkpoints",
    filename="best-checkpoint",
    verbose=True,
)

In [None]:
# Create the progress-bar callback.
progress_bar = TQDMProgressBar(refresh_rate=1)  # refresh_rate is counted in batches (per the Lightning docs), not seconds: redraw after every batch

In [None]:
# GPU trainer (devices=-1 selects all available devices) with the
# checkpoint and progress-bar callbacks attached.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=-1,
    max_epochs=num_epochs,
    logger=logger,
    callbacks=[checkpoint_callback, progress_bar],
)

In [None]:
# Run fine-tuning; the data module supplies the train and validation loaders.
trainer.fit(model, datamodule=data_module)

In [None]:
# Reload the best checkpoint recorded by the checkpoint callback, freeze it
# for inference and move it to the CPU.
trained_model = SummaryModel.load_from_checkpoint(
    checkpoint_path=trainer.checkpoint_callback.best_model_path,
)
trained_model.freeze()
trained_model.to('cpu')

In [None]:
def summarize(text, max_input_length=512, max_summary_length=300, num_beams=2):
    """Generate a summary of ``text`` with the fine-tuned model.

    Uses the notebook-level ``tokenizer`` and ``trained_model``.

    Args:
        text: Article to summarize; tokens beyond ``max_input_length`` are
            truncated.
        max_input_length: Token length the input is padded/truncated to.
        max_summary_length: Upper bound on the generated sequence length.
        num_beams: Beam width used for beam search.

    Returns:
        The decoded summary as a single string.
    """
    # The former `text.encode('utf-8').decode('utf-8')` round-trip returned
    # the same string and has been dropped; the tokenizer consumes `str`
    # directly.
    text_encoding = tokenizer(
        text,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    # Run generation on whatever device the model currently lives on, so the
    # function keeps working if the model is moved off the CPU.
    device = trained_model.device
    generated_ids = trained_model.model.generate(
        input_ids=text_encoding["input_ids"].to(device),
        attention_mask=text_encoding["attention_mask"].to(device),
        max_length=max_summary_length,
        num_beams=num_beams,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    return "".join(
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    )


In [None]:
# Summarize the first held-out article so it can be compared with its
# reference summary in the next cells.
sample_row = test_df.iloc[0]
text = sample_row["text"]
model_summary = summarize(text)

In [None]:
# Source article of the sampled row.
text

In [None]:
# Reference summary from the dataset.
sample_row["summary"]

In [None]:
# Summary produced by the fine-tuned model.
model_summary

In [None]:
# Try the model on a hand-written paragraph that is not taken from test_df.
text = """
Speech processing is the study of speech signals and the processing methods of signals. The signals are usually processed in a digital representation, so speech processing can be regarded as a special case of digital signal processing, applied to speech signals. Aspects of speech processing includes the acquisition, manipulation, storage, transfer and output of speech signals. Different speech processing tasks include speech recognition, speech synthesis, speaker diarization, speech enhancement, speaker recognition, etc.[1]
"""
result = summarize(text)
result

In [None]:
!pip install torch-summary

In [None]:
# Print an overview of the fine-tuned model using the torch-summary package.
from torchsummary import summary
summary(trained_model)