In [1]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
from rouge_score import rouge_scorer
from torch import optim

# Load the data from the CSV file
train_data = pd.read_csv("data/train.csv")
val_data = pd.read_csv("data/validation.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Select the columns containing the texts and summaries
train_texts = train_data["text"].tolist()
train_summaries = train_data["titles"].tolist()

val_texts = val_data["text"].tolist()
val_summaries = val_data["titles"].tolist()

# Load the tokenizer and the Bart model for text generation
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")


In [9]:

model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")



# Training loop
num_train_epochs = 3
logging_steps = 100
optimizer = optim.Adam(model.parameters(), lr=5e-5)

for epoch in range(num_train_epochs):
    for i in range(len(train_texts)):
        input_text = train_texts[i]
        input_encoding = tokenizer(input_text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
        input_ids = input_encoding.input_ids.to(model.device)
        
        target_summary = train_summaries[i]
        target_encoding = tokenizer(target_summary, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        labels = target_encoding.input_ids.to(model.device)
        
        # Forward pass
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        
        # Backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if i % logging_steps == 0:
            print(f"Epoch {epoch}, Step {i}, Loss: {loss.item()}")



loading file https://huggingface.co/facebook/bart-base/resolve/main/vocab.json from cache at C:\Users\mouha/.cache\huggingface\transformers\43978bdeaa326572886b44fcfed82f932f76571095ce31973e51c3da8ccade7f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/facebook/bart-base/resolve/main/merges.txt from cache at C:\Users\mouha/.cache\huggingface\transformers\3c167ed8af56e6605eeb794b63a79d65d85e6708c9b04408d41946337030f5cd.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/facebook/bart-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/facebook/bart-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/facebook/bart-base/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/facebook/bart-base/resolve/main/tokenizer.json from cache at C:\Users\mouha/.cache\huggingface\tran

Epoch 0, Step 0, Loss: 12.666672706604004
Epoch 0, Step 100, Loss: 1.2392264604568481
Epoch 0, Step 200, Loss: 0.3654614984989166
Epoch 0, Step 300, Loss: 1.3258610963821411
Epoch 0, Step 400, Loss: 0.7161915302276611
Epoch 0, Step 500, Loss: 1.114119529724121
Epoch 0, Step 600, Loss: 0.845808744430542
Epoch 0, Step 700, Loss: 0.9309849143028259
Epoch 0, Step 800, Loss: 2.3252220153808594
Epoch 0, Step 900, Loss: 0.4593234360218048
Epoch 0, Step 1000, Loss: 1.6621222496032715
Epoch 0, Step 1100, Loss: 1.4235011339187622
Epoch 0, Step 1200, Loss: 2.5234172344207764
Epoch 0, Step 1300, Loss: 1.4325523376464844
Epoch 0, Step 1400, Loss: 0.8728844523429871
Epoch 0, Step 1500, Loss: 1.5192229747772217
Epoch 0, Step 1600, Loss: 1.562177300453186
Epoch 0, Step 1700, Loss: 0.6252636313438416
Epoch 0, Step 1800, Loss: 0.6986242532730103
Epoch 0, Step 1900, Loss: 1.255396842956543
Epoch 0, Step 2000, Loss: 1.1824023723602295
Epoch 0, Step 2100, Loss: 1.311476230621338
Epoch 0, Step 2200, Loss: 1

In [12]:
# Save the model if needed
model.save_pretrained("trained_model")

Configuration saved in trained_model\config.json
Model weights saved in trained_model\pytorch_model.bin


In [5]:
# Charger le tokenizer et le modèle BART pour la génération de texte
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("trained_model")

In [6]:
import torch
device = "cuda"

In [None]:
# Evaluation loop
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
for i in range(len(val_texts)):
    input_text = val_texts[i]
    input_encoding = tokenizer(input_text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = input_encoding.input_ids.to(model.device)
    
    # Generate summary
    output_ids = model.generate(input_ids)
    output_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # Compute ROUGE score
    rouge_scores = scorer.score(output_summary, val_summaries[i])
    rouge_l_fmeasure = rouge_scores['rougeL'].fmeasure
    
    print(f"Example {i+1}, ROUGE-L F-Score: {rouge_l_fmeasure}")




In [7]:
test_data = pd.read_csv("data/test_text.csv")
model=model.to(device)
model.eval()

batch_size = 4  
num_batches = (len(test_data) + batch_size - 1) // batch_size

generated_summaries = []

for i in range(num_batches):
    batch_data = test_data.iloc[i * batch_size: (i + 1) * batch_size]
    
    test_encodings = tokenizer(batch_data["text"].tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids=test_encodings["input_ids"], max_length=50, num_beams=4, early_stopping=True)
    
    decoded_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    generated_summaries.extend(decoded_summaries)

submission_df = pd.DataFrame({"ID": test_data["ID"], "titles": generated_summaries})

submission_df.to_csv("submission.csv", index=False)



