In [8]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
from torch import optim
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
train_data = pd.read_csv("data/train.csv")
val_data = pd.read_csv("data/validation.csv")

train_texts = train_data["text"].tolist()
train_summaries = train_data["titles"].tolist()

val_texts = val_data["text"].tolist()
val_summaries = val_data["titles"].tolist()

#  Load Model and Tokenize
tokenizer = PreTrainedTokenizerFast.from_pretrained("ainize/bart-base-cnn")
model = BartForConditionalGeneration.from_pretrained("ainize/bart-base-cnn")


num_train_epochs = 3
logging_steps = 100
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(num_train_epochs):
    model.train()
    for i in range(len(train_texts)):
        input_text = train_texts[i]
        input_encoding = tokenizer(input_text, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
        input_ids = input_encoding.input_ids.to(model.device)
        
        target_summary = train_summaries[i]
        target_encoding = tokenizer(target_summary, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        labels = target_encoding.input_ids.to(model.device)
        
        # Forward pass
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if i % logging_steps == 0:
            print(f"Epoch {epoch}, Step {i}, Loss: {loss.item()}")

# Save the model if needed
model.save_pretrained("bart_cnn_trained")


Downloading: 100%|██████████| 261/261 [00:00<00:00, 259kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 260kB/s]
Downloading: 100%|██████████| 1.29M/1.29M [00:00<00:00, 13.6MB/s]
Downloading: 100%|██████████| 1.52k/1.52k [00:00<00:00, 776kB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Downloading: 100%|██████████| 532M/532M [00:33<00:00, 16.6MB/s] 


Epoch 0, Step 0, Loss: 1.3886562585830688
Epoch 0, Step 100, Loss: 1.4431841373443604
Epoch 0, Step 200, Loss: 0.41847679018974304
Epoch 0, Step 300, Loss: 1.6495763063430786
Epoch 0, Step 400, Loss: 0.7686737179756165
Epoch 0, Step 500, Loss: 1.298385500907898
Epoch 0, Step 600, Loss: 1.2216602563858032
Epoch 0, Step 700, Loss: 1.1133031845092773
Epoch 0, Step 800, Loss: 2.82553768157959
Epoch 0, Step 900, Loss: 0.5600195527076721
Epoch 0, Step 1000, Loss: 1.8677444458007812
Epoch 0, Step 1100, Loss: 1.697800636291504
Epoch 0, Step 1200, Loss: 2.640979766845703
Epoch 0, Step 1300, Loss: 1.6954879760742188
Epoch 0, Step 1400, Loss: 1.156732439994812
Epoch 0, Step 1500, Loss: 1.7583222389221191
Epoch 0, Step 1600, Loss: 1.7640764713287354
Epoch 0, Step 1700, Loss: 0.6852645874023438
Epoch 0, Step 1800, Loss: 0.8472453355789185
Epoch 0, Step 1900, Loss: 1.5269818305969238
Epoch 0, Step 2000, Loss: 1.2405385971069336
Epoch 0, Step 2100, Loss: 1.4261802434921265
Epoch 0, Step 2200, Loss: 1

In [11]:
test_data = pd.read_csv("data/test_text.csv")
model=model.to(device)
model.eval()

batch_size = 4  
num_batches = (len(test_data) + batch_size - 1) // batch_size

generated_summaries = []

for i in range(num_batches):
    batch_data = test_data.iloc[i * batch_size: (i + 1) * batch_size]
    
    test_encodings = tokenizer(batch_data["text"].tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids=test_encodings["input_ids"], max_length=50, num_beams=4, early_stopping=True)
    
    decoded_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    generated_summaries.extend(decoded_summaries)

submission_df = pd.DataFrame({"ID": test_data["ID"], "titles": generated_summaries})

submission_df.to_csv("bart_cnn_trained.csv", index=False)


