In [None]:
!pip install -r requirements.txt
!python -m spacy download en_core_web_sm

In [None]:
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import os
import random
import time
import warnings
import zipfile
from tqdm import tqdm
import json

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import parquet
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertConfig, BertTokenizer
from unidecode import unidecode

from src.features.functions_preprocessing import (
    plot_text_length_distribution,
    preprocess_articles,
    preprocess_summaries,
)
from src.features.tokenization import parallel_tokenize
from src.models.bert import BertSummary
from src.models.rnn_encoder_decoder import Encoder, Decoder, Seq2Seq
from src.models.transformer import Transformer
from src.models.train_models import train_model
from src.evaluation.model_evaluation import (
    generate_summaries_seq2seq,
    generate_summaries_transformer,
    generate_summaries_bert,
)

In [None]:
# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
def get_allowed_cpu_count():
    """Return the number of CPU cores this process is allowed to use.

    Where the platform provides ``os.sched_getaffinity``, the size of the
    process's affinity set is returned. Otherwise the result falls back to
    ``os.cpu_count()``, or 1 when that is unknown.
    """
    if hasattr(os, "sched_getaffinity"):
        return len(os.sched_getaffinity(0))
    total_cores = os.cpu_count()
    return total_cores if total_cores else 1


cpu_count = get_allowed_cpu_count()
print(cpu_count)

In [None]:
# Use half of the available cores for parallel work, but never fewer than one.
n_process = max(1, cpu_count // 2)

In [None]:
# Limit PyTorch's CPU thread count to the same budget as n_process.
torch.set_num_threads(n_process)

# **Kaggle dataset**

Download and extract the news summarization dataset from Kaggle, then load it into a pandas DataFrame.

In [None]:
# Download the dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -d sbhatti/news-summarization

In [None]:
# Unpack the downloaded archive into the "news-summarization" directory.
with zipfile.ZipFile("news-summarization.zip") as archive:
    archive.extractall(path="news-summarization")

In [None]:
# Load the extracted CSV into a DataFrame.
news_data = pd.read_csv("news-summarization/data.csv")

In [None]:
# Preview the first rows of the raw dataset.
news_data.head()

We pick a random article from the dataset and display both its content and the corresponding summary to compare.

In [None]:
# Draw a valid row position. randrange excludes its upper bound, so the draw
# can reach the first row (position 0) and can never land one past the last
# row, which random.randint(1, len(news_data)) could.
N = random.randrange(len(news_data))

# Select by position so the lookup does not depend on the index labels.
print(news_data["Content"].iloc[N])
print()
print(news_data["Summary"].iloc[N])

We filter out very short and very long articles (outside the 10th and 90th percentiles) and then plot the length distribution of the remaining articles.

In [None]:
# Character length of every article, followed by its summary statistics.
lengths_article = news_data["Content"].str.len()
lengths_article.describe()

In [None]:
# Keep only the rows whose article length lies between the 10th and 90th
# percentiles of the article lengths (both bounds inclusive).
article_len_low = lengths_article.quantile(0.10)
article_len_high = lengths_article.quantile(0.90)
news_data = news_data[lengths_article.between(article_len_low, article_len_high)]

In [None]:
# Plot the length distribution of the remaining articles.
plot_text_length_distribution(news_data, "Content")

We do the same for summaries.

In [None]:
# Character length of every summary (computed on the article-filtered frame),
# followed by its summary statistics.
lengths_summary = news_data["Summary"].str.len()
lengths_summary.describe()

In [None]:
# Keep only the rows whose summary length lies between the 10th and 90th
# percentiles of the summary lengths (both bounds inclusive).
summary_len_low = lengths_summary.quantile(0.10)
summary_len_high = lengths_summary.quantile(0.90)
news_data = news_data[lengths_summary.between(summary_len_low, summary_len_high)]

In [None]:
# Summary-length statistics after filtering.
news_data["Summary"].str.len().describe()

In [None]:
# Plot the length distribution of the remaining summaries.
plot_text_length_distribution(news_data, "Summary")

In [None]:
# Number of rows left after both length filters.
len(news_data)

We preprocess the articles and summaries using parallel processing to clean and standardize the text data efficiently.

In [None]:
# Replace both text columns with their preprocessed versions.
# `assign` returns a new DataFrame instead of writing into the filtered frame
# in place, which avoids pandas' SettingWithCopyWarning on a frame that was
# produced by boolean filtering.
news_data = news_data.assign(
    Content=preprocess_articles(
        news_data["Content"].tolist(), n_process=n_process, batch_size=32
    ),
    Summary=preprocess_summaries(
        news_data["Summary"].tolist(), n_process=n_process, batch_size=32
    ),
)

In [None]:
# Persist the cleaned data (without the index) so preprocessing need not be repeated.
news_data.to_parquet("news_data_cleaned.parquet", index=False)

In [None]:
# Reload the cleaned data from disk.
news_data = pd.read_parquet("news_data_cleaned.parquet")

# **Tokenization**

We shuffle the dataset, split it into training and testing sets with an 80-20 ratio, and print the sizes of both subsets.

In [None]:
# Shuffle all rows with a fixed seed so the split is reproducible.
# (`sample` already returns a new DataFrame, so no prior copy is needed.)
data_copy = news_data.sample(frac=1, random_state=42)

train_ratio = 0.8
train_size = int(train_ratio * len(data_copy))

# Split by position. Explicit copies make the two subsets independent frames,
# so adding columns to them later does not trigger SettingWithCopyWarning.
train_data = data_copy.iloc[:train_size].copy()
test_data = data_copy.iloc[train_size:].copy()

print(f"Train size: {len(train_data)}")
print(f"Test size:  {len(test_data)}")

We tokenize the article contents and the summaries in parallel using a BERT tokenizer, then save each tokenized result to disk as a PyTorch tensor.

In [None]:
# Tokenize the training articles and save the resulting tensor to disk.
# NOTE(review): the __main__ guard presumably stops worker processes from
# re-running this cell's body when multiprocessing re-imports it — confirm.
if __name__ == "__main__":
    texts_content = list(train_data["Content"])
    print("Tokenizing Content...")
    tokenized_articles = parallel_tokenize(
        texts_content,
        tokenizer_name="bert-base-uncased",
        max_workers=n_process,
        chunk_size=2000,
        max_length=512,
    )
    print("tokenized_articles.shape =", tokenized_articles.shape)
    torch.save(tokenized_articles, "tokenized_articles.pt")

In [None]:
# Tokenize the training summaries and save the resulting tensor to disk.
if __name__ == "__main__":
    texts_summary = list(train_data["Summary"])
    print("Tokenizing Summaries...")
    tokenized_summaries = parallel_tokenize(
        texts_summary,
        tokenizer_name="bert-base-uncased",
        max_workers=n_process,
        chunk_size=2000,
        # NOTE(review): 129 is presumably 128 + 1 so the decoder input/target
        # can be shifted by one token — confirm against train_model.
        max_length=129,
    )
    print("tokenized_summaries.shape =", tokenized_summaries.shape)
    torch.save(tokenized_summaries, "tokenized_summaries.pt")

In [None]:
# Tokenize the test articles and save the resulting tensor to disk.
if __name__ == "__main__":
    texts_content = list(test_data["Content"])
    print("Tokenizing Content...")
    tokenized_articles_test = parallel_tokenize(
        texts_content,
        tokenizer_name="bert-base-uncased",
        max_workers=n_process,
        chunk_size=2000,
        max_length=512,
    )
    # Label the shape with the variable actually printed; the previous label
    # named the training tensor.
    print("tokenized_articles_test.shape =", tokenized_articles_test.shape)
    torch.save(tokenized_articles_test, "tokenized_articles_test.pt")

In [None]:
# Reload the tokenized tensors from disk.
# weights_only=True restricts unpickling to tensors and plain containers.
# That is all these files hold, and it removes the unsafe-pickle warning at
# its source, so no blanket warning suppression (which would also hide
# unrelated warnings) is needed.
tokenized_articles = torch.load("tokenized_articles.pt", weights_only=True)
tokenized_summaries = torch.load("tokenized_summaries.pt", weights_only=True)
tokenized_articles_test = torch.load("tokenized_articles_test.pt", weights_only=True)

In [None]:
# Cast the token-id tensors to 64-bit integers.
article_ids = tokenized_articles.long()
summary_ids = tokenized_summaries.long()

# **Transformer**

We create a dataset and dataloader, then initialize a Transformer model with BERT's vocabulary size, a hidden size of 128, 8 attention heads, and 3 layers.

In [None]:
batch_size = 32

# Build the dataset from the int64 token-id tensors (article_ids / summary_ids).
# They were cast with .long() for this purpose but were previously left unused
# in favour of the un-cast tensors.
dataset = TensorDataset(article_ids, summary_ids)
dataloader = DataLoader(
    dataset, batch_size=batch_size, num_workers=n_process, shuffle=True
)

In [None]:
# Transformer to be trained from scratch.
# dec_max_len is set to 128 so the architecture matches the one rebuilt when
# the 25-epoch checkpoint is reloaded; a model built with a different decoder
# length may not be loadable from that checkpoint.
# NOTE(review): summaries are tokenized to 129 tokens, presumably 128 decoder
# positions after the one-token shift — confirm against train_model.
modelTransformer = Transformer(
    pad_idx=0,
    voc_size=BertTokenizer.from_pretrained("bert-base-uncased").vocab_size,
    hidden_size=128,
    n_head=8,
    max_len=512,
    dec_max_len=128,
    ffn_hidden=128,
    n_layers=3,
)

We train the Transformer model for 25 epochs using the Adam optimizer and cross-entropy loss.

In [None]:
# Adam over all model parameters, and a cross-entropy loss that skips padding
# positions (identified by the BERT tokenizer's pad token id).
adam_optimizer = torch.optim.Adam(modelTransformer.parameters(), lr=2e-4)
bert_pad_id = BertTokenizer.from_pretrained("bert-base-uncased").pad_token_id
padding_aware_loss = nn.CrossEntropyLoss(ignore_index=bert_pad_id)

train_model(
    model=modelTransformer,
    dataloader=dataloader,
    num_epochs=25,
    optimizer=adam_optimizer,
    loss_fn=padding_aware_loss,
    model_name="Transformer",
    device=device,
)

We initialize the Transformer model, load the pre-trained weights from a previous run (after 25 epochs), and set the model to evaluation mode.

In [None]:
# Rebuild the Transformer with the architecture used for the saved checkpoint.
modelTransformer = Transformer(
    pad_idx=0,
    voc_size=BertTokenizer.from_pretrained("bert-base-uncased").vocab_size,
    hidden_size=128,
    n_head=8,
    max_len=512,
    dec_max_len=128,
    ffn_hidden=128,
    n_layers=3,
)
# map_location="cpu" lets a checkpoint saved on a GPU load on a machine that
# has none. weights_only=True restricts unpickling to tensors and plain
# containers, which is all a state dict holds.
modelTransformer.load_state_dict(
    torch.load(
        "model_weights/transformer_weights_25_epochs.pth",
        map_location="cpu",
        weights_only=True,
    )
)
# Switch to evaluation mode for inference.
modelTransformer.eval()

# **Evaluation**

We load the ROUGE evaluation metric, which is commonly used to assess the quality of generated text summaries by comparing them to reference summaries.

In [None]:
# Load the ROUGE metric through the `evaluate` library.
rouge = evaluate.load("rouge")

In [None]:
# Generate summaries for the tokenized test articles with the loaded Transformer.
# NOTE(review): limit=None presumably means "process every test article" —
# confirm in generate_summaries_transformer.
predictions_transformer = generate_summaries_transformer(
    model=modelTransformer,
    batch_size=32,
    tokenized_input=tokenized_articles_test,
    limit=None,
)

In [None]:
# Attach the generated summaries as a new column. `assign` returns a new
# DataFrame rather than writing into a slice of the shuffled data, which
# avoids pandas' SettingWithCopyWarning.
test_data = test_data.assign(predictions_transformer=predictions_transformer)

We compute ROUGE metrics by comparing the Transformer model's generated summaries to the reference summaries from the test set.

In [None]:
# Score the generated summaries against the reference summaries of the test set.
reference_summaries = test_data["Summary"].tolist()
results = rouge.compute(
    references=reference_summaries,
    predictions=predictions_transformer,
)
print("ROUGE metrics:", results)