In [None]:
pip install -r requirements.txt

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unidecode import unidecode
import zipfile
import random

# **Kaggle dataset**

In [4]:
# Download the sbhatti/news-summarization dataset archive with the Kaggle CLI.
# As the recorded output shows, the CLI skips the download when a more recent
# local copy of news-summarization.zip exists (pass --force to re-download).
!kaggle datasets download -d sbhatti/news-summarization

Dataset URL: https://www.kaggle.com/datasets/sbhatti/news-summarization
License(s): CC0-1.0
news-summarization.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# Unpack every member of the downloaded archive into the local
# "news-summarization" directory.
with zipfile.ZipFile('news-summarization.zip', mode='r') as archive:
    archive.extractall(path='news-summarization')

In [6]:
# Load the extracted CSV into a DataFrame; every later cell works on `news_data`.
news_data = pd.read_csv("news-summarization/data.csv")

In [7]:
# Preview the first rows to check which columns were loaded.
news_data.head()

Unnamed: 0.1,Unnamed: 0,ID,Content,Summary,Dataset
0,0,f49ee725a0360aa6881ed1f7999cc531885dd06a,New York police are concerned drones could bec...,Police have investigated criminals who have ri...,CNN/Daily Mail
1,1,808fe317a53fbd3130c9b7563341a7eea6d15e94,By . Ryan Lipman . Perhaps Australian porn sta...,Porn star Angela White secretly filmed sex act...,CNN/Daily Mail
2,2,98fd67bd343e58bc4e275bbb5a4ea454ec827c0d,"This was, Sergio Garcia conceded, much like be...",American draws inspiration from fellow country...,CNN/Daily Mail
3,3,e12b5bd7056287049d9ec98e41dbb287bd19a981,An Ebola outbreak that began in Guinea four mo...,World Health Organisation: 635 infections and ...,CNN/Daily Mail
4,4,b83e8bcfcd51419849160e789b6658b21a9aedcd,By . Associated Press and Daily Mail Reporter ...,A sinkhole opened up at 5:15am this morning in...,CNN/Daily Mail


In [8]:
def _ascii_fold(text) -> str:
    """Transliterate ``text`` to ASCII token by token.

    The value is coerced to ``str``, split on whitespace, each token is passed
    through ``unidecode``, and the tokens are rejoined with single spaces.
    """
    return ' '.join(unidecode(token) for token in str(text).split())


# Apply the same normalisation to the article body and to its summary.
for column in ("Content", "Summary"):
    news_data[column] = news_data[column].apply(_ascii_fold)

In [42]:
# Pick one random row to eyeball an article next to its summary.
# random.randint(1, len) is inclusive on both ends: it could return len
# (one past the last row -> KeyError) and could never return row 0.
# randrange(len) yields exactly the valid positions 0 .. len-1.
N = random.randrange(len(news_data))

# Positional access (.iloc) keeps this cell working even when the index is no
# longer a contiguous 0..n-1 range, e.g. after rows have been filtered out.
print(news_data["Content"].iloc[N])
print()
print(news_data["Summary"].iloc[N])

(CNN) -- He apparently did NOT "mind the gap." A man in Perth, Australia, somehow slipped and got his leg stuck in the narrow space between a commuter train and the platform Wednesday. Closed circuit TV captured the incident at the Stirling Station and showed another passenger immediately raising the alarm. Authorities tried to pull the man out, but when that didn't work, they asked passengers to step out of the wagons and help push the six-car train in an effort to widen the gap, according to CNN affiliate Seven Network. About 50 commuters lined up along the side of the train, and after two collective pushes, the man was able to free his left leg. The commuter was examined by medics, but was not hurt, Seven Network said. He has not been identified. Fellow passenger Nicolas Taylor told PerthNow the man seemed a little embarrassed, "because right where he fell was the 'mind the gap' writing." The incident only delayed the busy train by 15 minutes.

A man gets his left leg caught between

In [None]:
# Length of every article, i.e. the number of characters in each "Content" string.
# Kept in a variable because the filtering cell below uses it as a mask source.
lengths_article = news_data["Content"].str.len()
# Summary statistics of those lengths, shown as the cell output.
lengths_article.describe()

In [None]:
# Drop length outliers: keep only the articles whose length lies between the
# 5th and 95th percentile of `lengths_article` (both bounds inclusive).
article_lo = lengths_article.quantile(0.05)
article_hi = lengths_article.quantile(0.95)
article_in_band = (lengths_article >= article_lo) & (lengths_article <= article_hi)
news_data = news_data[article_in_band]

In [None]:
# Histogram of article lengths for the rows that survived the quantile filter.
# The lengths and their mean are computed once instead of three times.
trimmed_article_lengths = news_data["Content"].str.len()
trimmed_article_mean = trimmed_article_lengths.mean()

pd.DataFrame(trimmed_article_lengths).plot(kind='hist', bins=50, edgecolor='black', alpha=0.7, color='blue')
# Dashed vertical line marking the mean length.
plt.axvline(trimmed_article_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {trimmed_article_mean:.2f}')
# .str.len() counts characters, not tokens, so label the axis accordingly.
plt.xlabel("Article Length (characters)", fontsize=14)
plt.ylabel("")
plt.legend(fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Length of every summary, i.e. the number of characters in each "Summary" string,
# computed on the rows remaining at this point in the notebook.
lengths_summary = news_data["Summary"].str.len()
# Summary statistics of those lengths, shown as the cell output.
lengths_summary.describe()

In [None]:
# Drop length outliers: keep only the rows whose summary length lies between the
# 5th and 95th percentile of `lengths_summary` (both bounds inclusive).
summary_lo = lengths_summary.quantile(0.05)
summary_hi = lengths_summary.quantile(0.95)
summary_in_band = (lengths_summary >= summary_lo) & (lengths_summary <= summary_hi)
news_data = news_data[summary_in_band]

In [None]:
# Re-check the summary length statistics on the filtered DataFrame.
news_data["Summary"].str.len().describe()

In [None]:
# Histogram of summary lengths for the rows that survived the quantile filters.
# The lengths and their mean are computed once instead of three times.
trimmed_summary_lengths = news_data["Summary"].str.len()
trimmed_summary_mean = trimmed_summary_lengths.mean()

trimmed_summary_lengths.plot(kind='hist', bins=50, edgecolor='black', alpha=0.7, color='blue')
# Dashed vertical line marking the mean length.
plt.axvline(trimmed_summary_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {trimmed_summary_mean:.2f}')
# .str.len() counts characters, not tokens, so label the axis accordingly.
plt.xlabel("Summary Length (characters)", fontsize=14)
plt.ylabel("")
plt.legend(fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# **RNN**

In [1]:
from typing import Tuple

import torch
from torch import nn

# Seed torch's global RNG so that runs are repeatable.
_ = torch.manual_seed(42)

# Choose the compute backend in order of preference:
# CUDA GPU first, then Apple's MPS backend, and finally the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [None]:
class RNN(nn.Module):
    """A single recurrent step with a softmax output head.

    Each call to ``forward`` consumes one input vector per sample, combines it
    with the previous hidden state through ``tanh``, and maps the new hidden
    state to a probability distribution over ``output_size`` entries.
    To process a sequence, call the module once per time step and feed the
    returned hidden state back in.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int) -> None:
        """
        Args:
        -----
        input_size: Number of features of your input vector
        hidden_size: Number of hidden neurons
        output_size: Number of features of your output vector
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # input -> hidden, hidden -> hidden and hidden -> output projections.
        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size, bias=True)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor, hidden_state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Run one recurrent step.

        Args:
        -----
        x: Input vector of shape (batch_size, input_size)
        hidden_state: Previous hidden state of shape (batch_size, hidden_size)

        Returns:
        --------
        output: Output vector of shape (batch_size, output_size); each row is a
            softmax distribution (non-negative, sums to 1), not raw logits.
        hidden_state: Updated hidden state of shape (batch_size, hidden_size)
        """
        x_transformed = self.i2h(x)
        hidden_transformed = self.h2h(hidden_state)
        # New hidden state: tanh of the summed input and recurrent projections.
        hidden_state = torch.tanh(x_transformed + hidden_transformed)

        output_transformed = self.h2o(hidden_state)
        # NOTE(review): probabilities are returned here; a loss that expects
        # logits (e.g. nn.CrossEntropyLoss) would apply softmax a second time.
        output = torch.softmax(output_transformed, dim=1)
        return output, hidden_state

    def init_hidden(self, batch_size: int) -> torch.Tensor:
        """
        Initialize the hidden state with zeros.

        The tensor is created on the same device and with the same dtype as the
        module's parameters, so it can be passed straight to ``forward`` after
        the model has been moved with ``.to(device)`` or cast (e.g. ``.double()``).

        Args:
        -----
        batch_size: Number of samples in the batch

        Returns:
        --------
        A tensor of shape (batch_size, hidden_size) initialized to zeros.
        """
        reference = self.i2h.weight
        return torch.zeros(
            batch_size,
            self.hidden_size,
            device=reference.device,
            dtype=reference.dtype,
        )

# **LLM Llama 3.2**

In [None]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
import yaml

# Read the Hugging Face access token from config.yaml instead of hard-coding
# it in the notebook. safe_load parses plain YAML without executing tags.
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

hf_token = config["huggingface"]["token"]
# Pass the token value itself. The previous call, login("hf_token"), sent the
# literal string "hf_token" and therefore never used the configured token.
login(token=hf_token)

In [None]:
# Load the tokenizer and the causal language model for the
# "meta-llama/Llama-3.2-1B" checkpoint from the Hugging Face Hub.
# NOTE(review): presumably this relies on the login performed in the previous
# cell to access the checkpoint; confirm.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")