In [1]:
!pip install -r requirements.txt
!python -m spacy download en_core_web_sm

In [2]:
import random
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import parquet
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertConfig, BertTokenizer
from unidecode import unidecode

from src.features.functions_preprocessing import (
    plot_text_length_distribution,
    preprocess_articles,
    preprocess_summaries,
)
from src.models.rnn_encoder_decoder import *
from src.models.transformer import Transformer
from src.models.bert import BertSummary

In [3]:
# Choose the compute backend: CUDA when available, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


# **Kaggle dataset**

In [None]:
# Download the `sbhatti/news-summarization` dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -d sbhatti/news-summarization

Dataset URL: https://www.kaggle.com/datasets/sbhatti/news-summarization
License(s): CC0-1.0
Downloading news-summarization.zip to /home/onyxia/work/ml-project-transformers
  9%|███▋                                   | 133M/1.37G [00:05<00:55, 24.1MB/s]

In [None]:
# Unpack the downloaded archive into the "news-summarization" directory.
with zipfile.ZipFile("news-summarization.zip", mode="r") as archive:
    archive.extractall(path="news-summarization")

In [None]:
# Load the extracted CSV; later cells read its "Content" and "Summary" columns.
news_data = pd.read_csv("news-summarization/data.csv")

In [None]:
# Preview the first rows of the freshly loaded dataset.
news_data.head()

In [None]:
# Show one randomly chosen article together with its reference summary.
# `randrange(n)` draws a position in 0..n-1. The previous `randint(1, n)` includes
# both ends, so it could return n (out of range) and could never return 0.
N = random.randrange(len(news_data))

# Positional access keeps this valid regardless of the frame's index labels.
print(news_data["Content"].iloc[N])
print()
print(news_data["Summary"].iloc[N])

In [None]:
# Character length of every article, followed by summary statistics of those lengths.
lengths_article = news_data["Content"].str.len()
lengths_article.describe()

In [None]:
# Keep only the rows whose article length lies between the 10th and 90th
# percentile of `lengths_article` (both bounds inclusive).
news_data = news_data[
    lengths_article.between(
        lengths_article.quantile(0.10), lengths_article.quantile(0.90)
    )
]

In [None]:
# Project helper from src.features.functions_preprocessing; judging by its name it
# plots the text-length distribution of the "Content" column — confirm in the module.
plot_text_length_distribution(news_data, "Content")

In [None]:
# Character length of every summary (computed on the already length-filtered frame),
# followed by summary statistics of those lengths.
lengths_summary = news_data["Summary"].str.len()
lengths_summary.describe()

In [None]:
# Keep only the rows whose summary length lies between the 10th and 90th
# percentile of `lengths_summary` (both bounds inclusive).
news_data = news_data[
    lengths_summary.between(
        lengths_summary.quantile(0.10), lengths_summary.quantile(0.90)
    )
]

In [None]:
# Recompute the summary-length statistics on the filtered frame.
news_data["Summary"].str.len().describe()

In [None]:
# Project helper from src.features.functions_preprocessing; judging by its name it
# plots the text-length distribution of the "Summary" column — confirm in the module.
plot_text_length_distribution(news_data, "Summary")

In [None]:
# Number of rows remaining after the preceding length filters.
len(news_data)

In [None]:
# Run the project preprocessing helpers over both text columns.
# `assign` returns a new frame with the two columns replaced. This avoids writing
# through `.loc` into a frame that was itself produced by boolean filtering, which
# is the pattern pandas reports as SettingWithCopyWarning.
news_data = news_data.assign(
    Content=preprocess_articles(news_data["Content"].tolist()),
    Summary=preprocess_summaries(news_data["Summary"].tolist()),
)

In [None]:
# Persist the cleaned frame as Parquet; the row index is not written (index=False).
news_data.to_parquet("news_data_cleaned.parquet", index=False)

In [None]:
# Load the pretrained tokenizer that ships with the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Use the first 200000 rows (by position) as the training set.
# NOTE(review): no held-out split is defined; the previously drafted
# `news_data[5000:6000]` test slice would fall inside this training range.
train_data = news_data.iloc[:200000]

In [None]:
# Tokenize the articles: every sequence is truncated/padded to exactly 512 ids.
tokenized_articles = tokenizer(
    list(train_data["Content"]),
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    max_length=512,
)["input_ids"].long()

# Tokenize the summaries with the same settings.
tokenized_summaries = tokenizer(
    list(train_data["Summary"]),
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    max_length=512,
)["input_ids"].long()

# Prepend one column of zeros to every summary, making the tensor 513 ids wide.
# The training cells slice it with `[:, :-1]` / `[:, 1:]`.
# The column is created as int64 on purpose: `torch.zeros` defaults to float32,
# and concatenating a float column would promote the whole id matrix to float
# before it is cast back to long.
tokenized_summaries = torch.cat(
    [
        torch.zeros(tokenized_summaries.size(0), 1, dtype=torch.long),
        tokenized_summaries,
    ],
    dim=1,
)

# Aliases of the two id tensors (both are already int64).
article_ids = tokenized_articles
summary_ids = tokenized_summaries

# **RNN**

# **Transformer**

In [None]:
# Pair each tokenized article with its zero-prefixed summary and iterate over
# them in shuffled mini-batches.
batch_size = 10

dataset = TensorDataset(tokenized_articles, tokenized_summaries)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Instantiate the project's Transformer model (src.models.transformer).
# NOTE(review): the argument meanings below are inferred from their names —
# confirm against the class definition.
model_transformer = Transformer(
    # Vocabulary / padding
    voc_size=tokenizer.vocab_size,
    pad_idx=0,  # presumably the tokenizer's padding id — TODO confirm
    # Sequence lengths (512 matches the tokenizer max_length used above)
    max_len=512,
    dec_max_len=512,
    # Model capacity
    hidden_size=128,
    ffn_hidden=128,
    n_head=8,
    n_layers=3,
)

In [None]:
# Train the Transformer on (article, summary) batches with a token-level
# cross-entropy loss; positions whose target is the padding id are ignored.
num_epochs = 50

loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
# Move the model to the target device *before* constructing the optimizer, so the
# optimizer is built over the parameters in their final location.
model = model_transformer.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(dataloader):
        input_batch, summary_batch = batch
        input_batch = input_batch.to(device)
        summary_batch = summary_batch.to(device)

        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: the model receives the summary without its last column...
        outputs = model(input_batch.long(), summary_batch[:, :-1])
        # ...and is scored against the summary without its first column,
        # i.e. the same sequence shifted by one position.
        summary_batch = summary_batch[:, 1:]

        # Flatten every position into one classification example
        loss = loss_fn(
            outputs.reshape(-1, outputs.shape[-1]), summary_batch.reshape(-1)
        )
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()
        if step % 1000 == 0:
            print("Step-{}, Loss-{}".format(step, loss.item()))

    # Mean of the per-batch losses over the epoch
    avg_loss = total_loss / len(dataloader)

    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

# **BERT model**

In [None]:
# Same pairing as for the Transformer, except that the first summary column
# (the prepended zero) is dropped.
batch_size = 10

dataset = TensorDataset(tokenized_articles, tokenized_summaries[:, 1:])
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)

In [None]:
# BERT configuration handed to the project's BertSummary model.
config = BertConfig(
    # Vocabulary and maximum input length follow the tokenizer settings used above
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=512,
    # Encoder size
    hidden_size=768,
    intermediate_size=3072,
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_act="gelu",
    # Regularisation
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)

In [None]:
# Train the BERT-based summarizer with a token-level cross-entropy loss;
# positions whose target is the padding id are ignored.
model = BertSummary(config)
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
# Place the model on the target device *before* constructing the optimizer, so the
# optimizer is built over the parameters in their final location.
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

num_epochs = 2

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(dataloader):

        input_batch, summary_batch = batch
        input_batch = input_batch.to(device)
        summary_batch = summary_batch.to(device)

        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass; the attention mask is True wherever the token id is non-zero
        outputs = model(input_batch, attention_mask=input_batch.ne(0))

        # Flatten every position into one classification example. `reshape`
        # (unlike `view`) also accepts non-contiguous tensors and matches the
        # Transformer training loop.
        loss = loss_fn(
            outputs.reshape(-1, outputs.shape[-1]), summary_batch.reshape(-1)
        )
        total_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()
        if step % 1000 == 0:
            print("Step-{}, Loss-{}".format(step, loss.item()))

    # Mean of the per-batch losses over the epoch
    avg_loss = total_loss / len(dataloader)

    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

# **Evaluation**

In [None]:
!pip install evaluate

In [None]:
import evaluate

# Load the BLEU metric from the `evaluate` library.
# NOTE(review): `bleu` is not used in the cells visible here.
bleu = evaluate.load("bleu")

In [None]:
# Pick the article at position 1000. The quantile filters above removed rows
# without resetting the index, so a label lookup (`[1000]`) raises KeyError
# whenever that label was filtered out; `.iloc` is positional and always valid here.
# NOTE(review): position 1000 lies inside `train_data` (the first 200000 rows), so
# this is a qualitative check on a training example, not a held-out one.
input_text = news_data["Content"].iloc[1000]

In [None]:
# Tokenize the single article with the same settings as the training data.
# The tokenizer is called directly (as in the training cells) instead of through
# the deprecated `encode_plus`; the result still exposes "input_ids" and
# "attention_mask" as tensors with a leading batch dimension of 1.
tokenized_input = tokenizer(
    input_text,
    max_length=512,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

In [None]:
# Move both encoded tensors to the compute device used by the model.
input_ids, attention_mask = (
    tokenized_input[key].to(device) for key in ("input_ids", "attention_mask")
)

In [None]:
# Switch the model to evaluation mode before running inference.
model.eval()

In [None]:
# Forward pass on the single encoded article with gradient tracking disabled.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

In [None]:
# Reshape the outputs tensor
reshaped_outputs = outputs

# Get the predicted summary
predicted_summary_ids = torch.argmax(reshaped_outputs, dim=-1)
predicted_summary = tokenizer.decode(predicted_summary_ids[0], skip_special_tokens=True)

print("Predicted Summary:", predicted_summary)