In [1]:
!pip install -r requirements.txt
!python -m spacy download en_core_web_sm

Collecting ipywidgets (from -r requirements.txt (line 1))
  Downloading ipywidgets-8.1.6-py3-none-any.whl.metadata (2.4 kB)
Collecting kaggle (from -r requirements.txt (line 2))
  Downloading kaggle-1.7.4.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers (from -r requirements.txt (line 3))
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting unidecode (from -r requirements.txt (line 5))
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting spacy (from -r requirements.txt (line 6))
  Downloading spacy-3.8.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting parquet (from -r requirements.txt (line 7))
  Downloading parquet-1.3.1-py3-none-any.whl.metadata (4.1 kB)
Collecting evaluate (from -r requirements.txt (line 8))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score (from -r requirements.txt (line 9))
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing

In [2]:
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import os
import random
import time
import warnings
import zipfile
from tqdm import tqdm
import json
import s3fs
import io

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import parquet
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertConfig, BertTokenizer
from unidecode import unidecode


from src.features.functions_preprocessing import (
    plot_text_length_distribution,
    preprocess_articles,
    preprocess_summaries,
)
from src.features.tokenization import parallel_tokenize
from src.models.bert import BertSummary
from src.models.rnn_encoder_decoder import Encoder, Decoder, Seq2Seq
from src.models.transformer import Transformer
from src.models.train_models import train_model
from src.evaluation.model_evaluation import (
    generate_summaries_seq2seq,
    generate_summaries_transformer,
    generate_summaries_bert,
)

In [3]:
# Choose the compute backend in order of preference: CUDA GPU, then Apple
# MPS, then plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

Using cuda device


In [4]:
def get_allowed_cpu_count():
    """Return the number of CPU cores this process is allowed to use.

    Uses the scheduler affinity mask of the current process when the platform
    exposes ``os.sched_getaffinity``; otherwise falls back to
    ``os.cpu_count()``, or 1 when that is unknown.
    """
    affinity_of = getattr(os, "sched_getaffinity", None)
    if affinity_of is None:
        # Platform without an affinity API.
        return os.cpu_count() or 1
    return len(affinity_of(0))


cpu_count = get_allowed_cpu_count()
print(cpu_count)

72


In [5]:
# Worker budget for the parallel steps below: half of the available cores,
# but never fewer than one.
n_process = max(1, cpu_count // 2)

In [6]:
# Cap the number of CPU threads PyTorch uses at the same worker budget.
torch.set_num_threads(n_process)

In [7]:
# Read the project settings from config.json.
with open("config.json", "r") as settings_file:
    config = json.load(settings_file)

# Bucket name and object path used to build the S3 URIs in later cells.
MY_BUCKET, CHEMIN_FICHIER = config["MY_BUCKET"], config["CHEMIN_FICHIER"]

# **Import Dataset from S3**

Download and extract the news summarization dataset from S3, then load it into a pandas DataFrame.

In [None]:
# File-system client bound to the S3 endpoint.
s3_client_options = {"endpoint_url": "https://minio.lab.sspcloud.fr"}
fs = s3fs.S3FileSystem(client_kwargs=s3_client_options)

# Read the whole ZIP archive into memory so it can be opened as a file object.
archive_uri = f"s3://{MY_BUCKET}/{CHEMIN_FICHIER}"
with fs.open(archive_uri) as f:
    archive_bytes = f.read()
zip_content = io.BytesIO(archive_bytes)

# Unpack every member of the archive into the local data directory.
with zipfile.ZipFile(zip_content, "r") as zip_ref:
    zip_ref.extractall("news-summarization")

In [None]:
# Load the extracted CSV into a DataFrame.
# NOTE(review): the "Unnamed: 0" column dropped later in this notebook probably
# comes from an index column stored in this CSV — confirm, and consider
# reading it with index_col=0.
news_data = pd.read_csv("news-summarization/data.csv")

We pick a random article from the dataset and display both its content and the corresponding summary to compare.

In [None]:
# Pick a random row position. random.randrange excludes its upper bound, so
# every value in 0 .. len(news_data) - 1 can be drawn and none is out of range
# (random.randint(1, len(news_data)) could return len(news_data) and never 0).
N = random.randrange(len(news_data))

# Positional access keeps the lookup valid whatever the index labels are.
print(news_data["Content"].iloc[N])
print()
print(news_data["Summary"].iloc[N])

We filter out very short and very long articles (outside the 10th and 90th percentiles) and then plot the length distribution of the remaining articles.

In [None]:
# Character length of every article; describe() displays summary statistics
# of those lengths.
lengths_article = news_data["Content"].str.len()
lengths_article.describe()

In [None]:
# Keep only the articles whose length lies between the 10th and the 90th
# percentile, both bounds included.
article_len_low = lengths_article.quantile(0.10)
article_len_high = lengths_article.quantile(0.90)
news_data = news_data[lengths_article.between(article_len_low, article_len_high)]

In [None]:
# Plot the length distribution of the remaining articles.
plot_text_length_distribution(news_data, "Content")

We do the same for summaries.

In [None]:
# Character length of every summary in the article-filtered frame;
# describe() displays summary statistics of those lengths.
lengths_summary = news_data["Summary"].str.len()
lengths_summary.describe()

In [None]:
# Keep only the rows whose summary length lies between the 10th and the 90th
# percentile, both bounds included.
# .copy() makes the result an independent frame: later cells assign columns
# through news_data.loc[:, ...], which on a bare slice is ambiguous and
# triggers SettingWithCopyWarning.
news_data = news_data[
    (lengths_summary >= lengths_summary.quantile(0.10))
    & (lengths_summary <= lengths_summary.quantile(0.90))
].copy()

In [None]:
# Sanity check: summary-length statistics after filtering.
news_data["Summary"].str.len().describe()

In [None]:
# Plot the length distribution of the remaining summaries.
plot_text_length_distribution(news_data, "Summary")

In [None]:
# Number of rows left after both length filters.
len(news_data)

We preprocess the articles and summaries using parallel processing to clean and standardize the text data efficiently.

In [None]:
# Settings shared by both preprocessing passes.
preprocessing_options = {"n_process": n_process, "batch_size": 32}

# Clean each text column with its dedicated helper, articles first, and write
# the result back in place of the raw text.
for column, preprocess in (
    ("Content", preprocess_articles),
    ("Summary", preprocess_summaries),
):
    news_data.loc[:, column] = preprocess(
        news_data[column].tolist(), **preprocessing_options
    )

In [None]:
# Persist the cleaned dataset locally; index=False leaves the row index out
# of the file.
news_data.to_parquet("news_data_cleaned.parquet", index=False)

In [None]:
# Source file on disk and destination object in the bucket.
local_parquet_path = "news_data_cleaned.parquet"
s3_parquet_path = f"s3://{MY_BUCKET}/data/news_data_cleaned.parquet"

# Upload the cleaned dataset through the S3 file-system client.
s3_client_options = {"endpoint_url": "https://minio.lab.sspcloud.fr"}
fs = s3fs.S3FileSystem(client_kwargs=s3_client_options)
fs.put(local_parquet_path, s3_parquet_path)

print(f"Fichier envoyé avec succès à {s3_parquet_path}")

# **Tokenization**

In [21]:
# Reload the cleaned dataset from S3 so the notebook can resume from here.
s3_parquet_path = f"s3://{MY_BUCKET}/data/news_data_cleaned.parquet"
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"})

with fs.open(s3_parquet_path, 'rb') as f:
    news_data = pd.read_parquet(f)

# Work around a problem from the save step: a stray "Unnamed: 0" column may be
# present. errors="ignore" keeps the cell re-runnable and valid when the column
# is absent; assigning the result avoids mutating the frame in place.
news_data = news_data.drop(columns=["Unnamed: 0"], errors="ignore")

In [24]:
# Inspect the cleaned article text of the row at position 1.
news_data.iloc[1]['Content']

"ryan lipman australian porn star angela white think bring ultimate steamy study fantasy life partner secretly film sex act library la trobe university melbourne video believe shoot year library open use spark outrage university ire police despite presence nearby student white start video reveal bookshelf report herald sun raunchy angela white partner secretly film sex act library la trobe university melbourne white onscreen lover nearby desk partner keep lookout la trobe university spokesman say shock appal brazen act know video footage recently inform student 'permission seek give request assist fully police investigation victoria police spokeswoman say pair catch act face charge include wilful obscene exposure brazen act la trobe unaware video film leave outraged embarrassment controversy whiteâ€ ™ s dutch base production company agw entertainment issue statement address incident remove video publish online agw entertainment b.v. regret filming and/or posting video question offend m

We shuffle the dataset, split it into training and testing sets with an 80-20 ratio, and print the sizes of both subsets.

In [None]:
# Shuffle all rows once with a fixed seed so the split is reproducible.
# sample() returns a new frame, so no preliminary copy of news_data is needed.
data_copy = news_data.sample(frac=1, random_state=42)

train_ratio = 0.8
train_size = int(train_ratio * len(data_copy))

# Positional split of the shuffled frame. .copy() makes both subsets
# independent frames, so adding columns to them later (e.g. model predictions
# on test_data) does not write into a slice of data_copy.
train_data = data_copy.iloc[:train_size].copy()
test_data = data_copy.iloc[train_size:].copy()

print(f"Train size: {len(train_data)}")
print(f"Test size:  {len(test_data)}")

We tokenize the content of the articles & summaries in parallel using a BERT tokenizer, then save the tokenized result as a PyTorch tensor.

In [None]:
if __name__ == "__main__":
    # Training articles as a plain list, in row order.
    texts_content = train_data["Content"].tolist()
    print("Tokenizing Content...")

    # Tokenizer settings for articles: sequences of length 512.
    article_tokenizer_options = dict(
        tokenizer_name="bert-base-uncased",
        max_workers=n_process,
        chunk_size=2000,
        max_length=512,
    )
    tokenized_articles = parallel_tokenize(texts_content, **article_tokenizer_options)

    print("tokenized_articles.shape =", tokenized_articles.shape)
    # Cache the tensor on disk so later sections can reload it without re-tokenizing.
    torch.save(tokenized_articles, "tokenized_articles.pt")

In [None]:
if __name__ == "__main__":
    # Training summaries as a plain list, in row order.
    texts_summary = train_data["Summary"].tolist()
    print("Tokenizing Summaries...")

    # Tokenizer settings for summaries.
    # NOTE(review): max_length=129 is presumably 128 target tokens plus one for
    # the decoder input/target shift — confirm against train_model.
    summary_tokenizer_options = dict(
        tokenizer_name="bert-base-uncased",
        max_workers=n_process,
        chunk_size=2000,
        max_length=129,
    )
    tokenized_summaries = parallel_tokenize(texts_summary, **summary_tokenizer_options)

    print("tokenized_summaries.shape =", tokenized_summaries.shape)
    # Cache the tensor on disk so later sections can reload it without re-tokenizing.
    torch.save(tokenized_summaries, "tokenized_summaries.pt")

In [None]:
if __name__ == "__main__":
    # Test articles get their own variable so the training texts held in
    # `texts_content` are not silently replaced.
    texts_content_test = list(test_data["Content"])
    print("Tokenizing Content...")
    tokenized_articles_test = parallel_tokenize(
        texts_content_test,
        tokenizer_name="bert-base-uncased",
        max_workers=n_process,
        chunk_size=2000,
        max_length=512,
    )
    # Label the shape with the tensor it actually describes (the test set).
    print("tokenized_articles_test.shape =", tokenized_articles_test.shape)
    # Cache the tensor on disk for the evaluation section.
    torch.save(tokenized_articles_test, "tokenized_articles_test.pt")

# **Transformer**

In [17]:
# Reload the cached token tensors; warnings raised while loading are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    tokenized_articles, tokenized_summaries, tokenized_articles_test = (
        torch.load(tensor_file)
        for tensor_file in (
            "tokenized_articles.pt",
            "tokenized_summaries.pt",
            "tokenized_articles_test.pt",
        )
    )

# int64 views of the training token ids.
# NOTE(review): no later cell in this notebook reads article_ids or
# summary_ids — confirm whether the dataset was meant to use them.
summary_ids = tokenized_summaries.long()
article_ids = tokenized_articles.long()

In [20]:
# Shape of the tokenized training summaries: one row per summary, one column
# per token position.
tokenized_summaries.shape

torch.Size([446425, 129])

We create a dataset and dataloader, then initialize a Transformer model with BERT's vocabulary size, a hidden size of 128, 8 attention heads, and 3 layers.

In [None]:
batch_size = 32

# Pair each tokenized article with its tokenized summary.
dataset = TensorDataset(tokenized_articles, tokenized_summaries)

# Shuffled mini-batches, loaded with n_process worker processes.
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=n_process)
dataloader = DataLoader(dataset, **loader_options)

In [None]:
# The model shares the vocabulary of the BERT tokenizer used for tokenization.
bert_vocab_size = BertTokenizer.from_pretrained("bert-base-uncased").vocab_size

# Hyper-parameters of the model to train.
# NOTE(review): dec_max_len is 512 here but 128 in the cell that reloads the
# trained weights — confirm which value the checkpoint was trained with.
transformer_hparams = dict(
    pad_idx=0,
    voc_size=bert_vocab_size,
    hidden_size=128,
    n_head=8,
    max_len=512,
    dec_max_len=512,
    ffn_hidden=128,
    n_layers=3,
)
modelTransformer = Transformer(**transformer_hparams)

We train the Transformer model for 25 epochs using the Adam optimizer and cross-entropy loss.

In [None]:
# Adam over all model parameters.
adam_optimizer = torch.optim.Adam(modelTransformer.parameters(), lr=2e-4)

# Cross-entropy that skips padding positions, identified by the tokenizer's
# pad token id.
pad_token_id = BertTokenizer.from_pretrained("bert-base-uncased").pad_token_id
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)

# Train for 25 epochs on the selected device.
train_model(
    model=modelTransformer,
    dataloader=dataloader,
    num_epochs=25,
    optimizer=adam_optimizer,
    loss_fn=criterion,
    model_name="Transformer",
    device=device,
)

We initialize the Transformer model, load the pre-trained weights from a previous run (after 25 epochs), and set the model to evaluation mode.

# **API**

In [11]:
# Rebuild the architecture before restoring the trained parameters.
# NOTE(review): dec_max_len is 128 here but 512 in the training cell — confirm
# which value matches the checkpoint.
modelTransformer = Transformer(
    pad_idx=0,
    voc_size=BertTokenizer.from_pretrained("bert-base-uncased").vocab_size,
    hidden_size=128,
    n_head=8,
    max_len=512,
    dec_max_len=128,
    ffn_hidden=128,
    n_layers=3,
)

# map_location remaps the stored tensors onto the device selected at the top
# of the notebook, so a checkpoint saved on a GPU still loads on a machine
# without one.
modelTransformer.load_state_dict(
    torch.load(
        "model_weights/transformer_weights_25_epochs.pth", map_location=device
    )
)

# Inference mode, then move to the target device; the last call returns the
# model, so the notebook displays its structure.
modelTransformer.eval()
modelTransformer.to(device)

Transformer(
  (enc_embedding): TransformerEmbedding(
    (tok_emb): Embedding(30522, 128, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (dec_embedding): TransformerEmbedding(
    (tok_emb): Embedding(30522, 128, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-2): 3 x EncoderLayer(
      (attention): AttentionLayer(
        (w_q): Linear(in_features=128, out_features=128, bias=True)
        (w_k): Linear(in_features=128, out_features=128, bias=True)
        (w_v): Linear(in_features=128, out_features=128, bias=True)
        (w_o): Linear(in_features=128, out_features=128, bias=True)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (linear1): Linear(in_features=128, out_features=128, bias=True)
      (linear2): Linear(in_features=128, out_features=128, bias=Tr

In [None]:
# torch.save does not create missing parent directories, so make sure the
# export folder exists before writing the weights into it.
export_dir = "model"
os.makedirs(export_dir, exist_ok=True)
torch.save(modelTransformer.state_dict(), os.path.join(export_dir, "pytorch_model.bin"))

In [None]:
from huggingface_hub import create_repo, upload_folder

# Publishing the model to the Hugging Face Hub requires:
#   pip install huggingface_hub
#   huggingface-cli login    (run in a terminal, with a personal access token)

# exist_ok=True makes the cell re-runnable once the repository exists.
# The returned RepoUrl carries the fully qualified "<account>/model_test" id,
# so no account name has to be hard-coded or edited by hand.
repo_url = create_repo("model_test", private=False, exist_ok=True)
upload_folder(
    repo_id=repo_url.repo_id,
    folder_path="model"
)

pytorch_model.bin:   0%|          | 0.00/50.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Antoiner77/model_test/commit/68f5cb3a5da1219348a92992cf69213738030c73', commit_message='Upload folder using huggingface_hub', commit_description='', oid='68f5cb3a5da1219348a92992cf69213738030c73', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Antoiner77/model_test', endpoint='https://huggingface.co', repo_type='model', repo_id='Antoiner77/model_test'), pr_revision=None, pr_num=None)

In [None]:
# To copy the model directly:
# git clone https://huggingface.co/Antoiner77/model_test
# The model is not private, so it can be loaded directly.

In [8]:
import torch
import json
# NOTE(review): this rebinds `Transformer`, shadowing the class imported from
# src.models.transformer at the top of the notebook.
from model.model import Transformer 

# Load the model configuration.
# NOTE(review): this overwrites the `config` dict read from config.json earlier
# in the notebook.
with open("model/config_model.json", "r") as f:
    config = json.load(f)

In [9]:
# Créer le modèle avec les paramètres de la config
model = Transformer(
    pad_idx=config["pad_idx"],
    voc_size=config["voc_size"],
    hidden_size=config["hidden_size"],
    n_head=config["n_head"],
    max_len=config["max_len"],
    dec_max_len=config["dec_max_len"],
    ffn_hidden=config["ffn_hidden"],
    n_layers=config["n_layers"]
)

# Charger les poids
model.load_state_dict(torch.load("model/pytorch_model.bin", map_location="cpu"))

model.eval()


Transformer(
  (enc_embedding): TransformerEmbedding(
    (tok_emb): Embedding(30522, 128, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (dec_embedding): TransformerEmbedding(
    (tok_emb): Embedding(30522, 128, padding_idx=1)
    (pos_emb): PositionalEncoding()
    (drop_out): Dropout(p=0.1, inplace=False)
  )
  (encoder_layers): ModuleList(
    (0-2): 3 x EncoderLayer(
      (attention): AttentionLayer(
        (w_q): Linear(in_features=128, out_features=128, bias=True)
        (w_k): Linear(in_features=128, out_features=128, bias=True)
        (w_v): Linear(in_features=128, out_features=128, bias=True)
        (w_o): Linear(in_features=128, out_features=128, bias=True)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (linear1): Linear(in_features=128, out_features=128, bias=True)
      (linear2): Linear(in_features=128, out_features=128, bias=Tr

In [13]:
# Sample article used to try the model on a single input. It is lower-cased
# and already stripped of most function words.
# NOTE(review): presumably the output of the same preprocessing applied to the
# training articles — confirm.
text = "new york police concerned drone tool terrorist investigate way stop potential attack police acknowledge drone potential weapon nypd say technology advance use carry air assault chemical weapon firearm police want develop technology allow control drone scan sky major event nypd say drone carry explosive number threat investigate way stop attack deputy chief salvatore dipace left concern incident year drone land german chancellor angela merkel take chancellor people drone fly pack football stadium manchester england week ago result suspect pilot arrest consult military member counterterrorism bomb squad emergency service aviation unit work plan counter weaponize drone nypd receive intelligence indicate imminent threat increasingly concerned year deputy chief salvatore dipace tell cbs news look people jury rig drone carry gun carry different type explosive want possibility worried mr dipace say police see video show accurate attack drone see video drone fly different target route accurately hit target paintball nypd see drone carry explosive number threat mr dipace concern follow incident germany year drone able land german chancellor angela merkel deliver speech drone circle land ms merkel deliver speech sin germany spark fear device easily commit terrorist act say think happen drone hit target right mark take chancellor people dramatic increase incident involve drone new york city year 40 record case unmanned aircraft system drone fly airspace nypd helicopter incident summer drone 800 foot ground nearly collide police helicopter nypd aviation unit member sergeant antonio hernandez say fly dark night vision goggle try job thing know drone come altitude"

In [None]:
# Tokenize the single sample article with the same settings used for the
# article tensors (BERT vocabulary, sequences of length 512).
tokenized_input = parallel_tokenize(
    [text],
    tokenizer_name="bert-base-uncased",
    max_workers=n_process,
    chunk_size=2000,
    max_length=512,
)

# Generate a summary for every tokenized input (only one here).
summaries = generate_summaries_transformer(
    model, batch_size=32, tokenized_input=tokenized_input
)

first_summary = summaries[0]
print(first_summary)

Processing Batches: 100%|██████████| 1/1 [00:01<00:00,  1.48s/batch]

police are investigating whether the drone could be used in the world. the drone was carrying drones in the world's largest drone strike. the drone was carrying drones in the world's largest drone strike.





# **Evaluation**

We load the ROUGE evaluation metric, which is commonly used to assess the quality of generated text summaries by comparing them to reference summaries.

In [None]:
# Load the ROUGE metric through the `evaluate` library.
rouge = evaluate.load("rouge")

## Transformer

We generate summaries using the Transformer model.

In [None]:
# Generate one summary per tokenized test article, in batches of 32.
# NOTE(review): limit=None presumably means "process every test article" —
# confirm against generate_summaries_transformer.
predictions_transformer = generate_summaries_transformer(
    model=modelTransformer,
    batch_size=32,
    tokenized_input=tokenized_articles_test,
    limit=None,
)

In [None]:
# Store the generated summaries next to their reference rows.
# NOTE(review): if test_data is a bare slice of the shuffled frame, this
# assignment can raise SettingWithCopyWarning; it is safe when test_data is an
# independent copy.
test_data.loc[:, "predictions_transformer"] = predictions_transformer

We compute ROUGE metrics by comparing the Transformer model's generated summaries to the reference summaries from the test set.

In [None]:
# Reference summaries of the test split, in the same row order as the
# predictions.
reference_summaries = test_data["Summary"].tolist()

# Score the generated summaries against the references.
results = rouge.compute(
    references=reference_summaries,
    predictions=predictions_transformer,
)
print("ROUGE metrics:", results)