## Installing and Importing Packages

In [None]:
# Packages installed at runtime for this notebook.
# torch_optimizer provides the Lamb optimizer imported in the next cell.
# NOTE(review): faiss-gpu is not imported directly in the visible cells; presumably the
# Unlimiformer index code loaded later needs it — confirm.
!pip install faiss-gpu
!pip install torch_optimizer


In [None]:
import json
import os
import sys
from getpass import getpass

import numpy as np
import torch
import torch.nn as nn
from huggingface_hub import HfApi, HfFolder
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset
from torch_optimizer import Lamb
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_scheduler, Seq2SeqTrainingArguments, Seq2SeqTrainer, GenerationConfig

## Loading and Preparing Dataset

In [None]:
# Parse the UTF-8 encoded JSON dataset file into `data`
with open(
    "/kaggle/input/divan-ali/Divan_ali_Simplified.json",
    mode="r",
    encoding="utf-8",
) as dataset_file:
    data = json.loads(dataset_file.read())

In [2]:
# Every value of every entry in `data` is a record holding the original text
# (key "متن رای") and its simplified version (key "simplified text").
candidate_pairs = [
    (record.get("متن رای", ""), record.get("simplified text", ""))
    for entry in data
    for record in entry.values()
]

# Keep a pair only when both the original and the simplified text are non-empty
kept_pairs = [(source, simplified) for source, simplified in candidate_pairs if source and simplified]

# Parallel lists: texts[i] is the original text of simplified_texts[i]
texts = [source for source, _ in kept_pairs]
simplified_texts = [simplified for _, simplified in kept_pairs]

In [None]:
# Put the attached Kaggle input directory near the front of the module search path.
# NOTE(review): later cells import `unlimiformer` and `index_building`; presumably both
# modules live in this directory — confirm.
sys.path.insert(1, '/kaggle/input/nunlimiformer/') 

## Loading Model

In [None]:
# Imported here rather than at the top because the module only becomes importable
# after the sys.path change made in the previous cell.
from unlimiformer import UnlimiformerT5


model_name_or_path = "Moryjj/pt5_la9"
use_auth_token = False  # False: do not send a Hub token; set to a token string (or True) for private repos
# NOTE(review): the two settings below are not read by any visible cell — the dataset
# hard-codes a target length of 512 and the training arguments set fp16=False. Confirm
# whether they are still needed.
max_target_length = 512  # Maximum length of the target sequence
fp16 = True  # Intended mixed-precision switch (currently unused, see note above)

# `token` is the current name of the authentication argument of `from_pretrained`;
# the older `use_auth_token` keyword is deprecated. The value passed is unchanged.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    token=use_auth_token
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name_or_path,
    token=use_auth_token
)


# Wrap the loaded seq2seq model with Unlimiformer.
# NOTE(review): the meaning of the arguments below is taken from the original inline
# comments; verify against the `unlimiformer` module.
unlimiformer = UnlimiformerT5(
    model=model,
    layer_begin=10,  # Layer to start from for Unlimiformer indexing
    layer_end=11,  # Ending layer (or None to include all following layers)
    unlimiformer_head_num=None,
    exclude_attention=False,
    model_encoder_max_len=4096,
    chunk_overlap=0.5,
    tokenizer=tokenizer
)




tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
# Imported here because `index_building` only becomes importable after the earlier
# sys.path change.
from index_building import DatastoreBatch


# Build a datastore sized to the model's hidden dimension (`config.d_model`).
# NOTE(review): `datastore` is not referenced by any later visible cell — confirm whether
# it is consumed elsewhere or can be dropped.
datastore = DatastoreBatch(
    dim=unlimiformer.model.config.d_model,  # Hidden size read from the model config
    batch_size=8,  # Adjust batch size based on memory availability
    flat_index=False,  # presumably selects a non-flat index — confirm against index_building
    gpu_index=True,  # Request a GPU-backed index
    index_device=torch.device("cuda")  # CUDA device passed for the index
)


## Preparing Dataset

In [None]:
# Model inputs are the original texts; targets are their simplified versions
inputs, targets = texts, simplified_texts


class SummarizationDataset(Dataset):
    """Map-style dataset that yields fixed-length tensors for long-input seq2seq training.

    For each item the source text is tokenized without truncation, its token ids are cut
    into overlapping windows of ``max_chunk_length`` ids (stride
    ``max_chunk_length - chunk_overlap``), every short window is padded to full width, the
    windows are concatenated, and the result is truncated/padded to exactly
    ``max_sequence_length`` ids. The target text is tokenized and padded/truncated to
    ``max_target_length`` ids.

    NOTE(review): because the windows overlap, tokens inside an overlap occur more than once
    in ``input_ids``, and a padded window can leave pad ids in the middle of the sequence
    (``attention_mask`` is 0 there). Confirm this layout is intended.

    Args:
        inputs: Sequence of source texts.
        targets: Sequence of target texts, aligned index-by-index with ``inputs``.
        max_chunk_length: Width of each window, in tokens.
        max_sequence_length: Fixed length of the returned ``input_ids``/``attention_mask``.
        chunk_overlap: Number of tokens shared by consecutive windows; must be smaller
            than ``max_chunk_length``.
        tokenizer: Optional tokenizer to use. When ``None`` (the default) the module-level
            ``tokenizer`` is looked up each time an item is accessed.
        max_target_length: Fixed length of the returned ``labels``.

    Raises:
        ValueError: If ``inputs`` and ``targets`` differ in length, or if
            ``chunk_overlap >= max_chunk_length`` (the window stride would not be positive).
    """

    def __init__(self, inputs, targets, max_chunk_length=1024, max_sequence_length=4096,
                 chunk_overlap=512, tokenizer=None, max_target_length=512):
        if len(inputs) != len(targets):
            raise ValueError(
                f"inputs and targets must have the same length, got {len(inputs)} and {len(targets)}"
            )
        if chunk_overlap >= max_chunk_length:
            # A non-positive stride would otherwise surface later as an obscure
            # range()/torch.cat error inside __getitem__.
            raise ValueError(
                f"chunk_overlap ({chunk_overlap}) must be smaller than max_chunk_length ({max_chunk_length})"
            )
        self.inputs = inputs
        self.targets = targets
        self.max_chunk_length = max_chunk_length
        self.max_sequence_length = max_sequence_length  # Fixed length of input_ids
        self.chunk_overlap = chunk_overlap
        self.tokenizer = tokenizer
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.inputs)

    def _get_tokenizer(self):
        """Return the injected tokenizer, or the module-level ``tokenizer`` if none was given."""
        if self.tokenizer is not None:
            return self.tokenizer
        return tokenizer

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        tok = self._get_tokenizer()
        pad_id = tok.pad_token_id

        token_ids = tok(self.inputs[idx], return_tensors="pt", truncation=False)["input_ids"]

        # Cut the ids into overlapping windows and pad each short window to full width
        stride = self.max_chunk_length - self.chunk_overlap
        windows = []
        for start in range(0, token_ids.shape[1], stride):
            window = token_ids[:, start:start + self.max_chunk_length]
            missing = self.max_chunk_length - window.shape[1]
            if missing > 0:
                window = torch.cat([window, window.new_full((1, missing), pad_id)], dim=1)
            windows.append(window)

        if windows:
            full_input_ids = torch.cat(windows, dim=1)[:, :self.max_sequence_length]
        else:
            # Tokenizer produced no ids at all: start from an empty row and pad below
            full_input_ids = token_ids[:, :0]

        # Right-pad up to the fixed sequence length
        padding_length = self.max_sequence_length - full_input_ids.shape[1]
        if padding_length > 0:
            full_input_ids = torch.cat(
                [full_input_ids, full_input_ids.new_full((1, padding_length), pad_id)], dim=1
            )

        # Every pad id (inside a window or in the tail) is masked out
        attention_mask = (full_input_ids != pad_id).long()

        tokenized_target = tok(
            self.targets[idx],
            return_tensors="pt",
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
        )

        return {
            "input_ids": full_input_ids.squeeze(0),
            "attention_mask": attention_mask.squeeze(0),
            # squeeze(0) drops only the batch dimension, whatever the target length is
            "labels": tokenized_target["input_ids"].squeeze(0)
        }

# Verify the structure of a sample item
# Sanity check: fetch the first item by index and report the shape of each tensor
sample = SummarizationDataset(inputs, targets)[0]
print("Sample input_ids shape:", sample["input_ids"].shape)
print("Sample attention_mask shape:", sample["attention_mask"].shape)
print("Sample labels shape:", sample["labels"].shape)

Sample input_ids shape: torch.Size([4096])
Sample attention_mask shape: torch.Size([4096])
Sample labels shape: torch.Size([512])


In [None]:
# Source texts and their simplified targets collected earlier in the notebook
inputs = texts
targets = simplified_texts # List of target (simplified) texts

# First split: 85% for training, 15% held out (the held-out part is split again below)
train_inputs, remaining_inputs, train_targets, remaining_targets = train_test_split(
    inputs, targets, train_size =0.85, test_size = 0.15, random_state=42
)

# Second split of the held-out 15%: 25% of it becomes validation and 75% becomes test
# (roughly 3.75% and 11.25% of the full dataset, respectively)
val_inputs, test_inputs, val_targets, test_targets = train_test_split(
    remaining_inputs, remaining_targets, test_size=0.75, random_state=42
)

# Verify split sizes
print("Train size:", len(train_inputs))
print("Validation size:", len(val_inputs))
print("Test size:", len(test_inputs))

# Define dataset objects for each split (default chunking/length settings)
train_dataset = SummarizationDataset(train_inputs, train_targets)
val_dataset = SummarizationDataset(val_inputs, val_targets)
test_dataset = SummarizationDataset(test_inputs, test_targets)


Train size: 4522
Validation size: 199
Test size: 599


## Preparing Model

In [9]:
def count_parameters(model):
    """Print how many parameter elements of ``model`` are total, trainable and frozen.

    A parameter counts as trainable when its ``requires_grad`` flag is set. Nothing is
    returned; the three counts are written to standard output.
    """
    total_params = 0
    trainable_params = 0
    # Single pass: accumulate the element count of every parameter tensor
    for tensor in model.parameters():
        size = tensor.numel()
        total_params += size
        if tensor.requires_grad:
            trainable_params += size
    non_trainable_params = total_params - trainable_params
    print(f"Total parameters: {total_params}")
    print(f"Trainable parameters: {trainable_params}")
    print(f"Non-trainable parameters: {non_trainable_params}")

# Report the parameter counts while every parameter is still trainable
print("Before freezing layers:")
count_parameters(unlimiformer.model)

# Start from a fully frozen model
for frozen in model.parameters():
    frozen.requires_grad_(False)

# Re-enable gradients only for parameters whose name contains one of these substrings
# (block 11 of the encoder and of the decoder, plus `lm_head` and `shared`)
trainable_name_parts = ('encoder.block.11', 'decoder.block.11', 'lm_head', 'shared')
for name, weight in model.named_parameters():
    if not any(part in name for part in trainable_name_parts):
        continue
    weight.requires_grad = True
    print(f"Unfreezing layer: {name}")

# Report the parameter counts again now that most parameters are frozen
print("After freezing layers:")
count_parameters(unlimiformer.model)

Before freezing layers:
Total parameters: 247539456
Trainable parameters: 247539456
Non-trainable parameters: 0
Unfreezing layer: shared.weight
Unfreezing layer: encoder.block.11.layer.0.SelfAttention.q.weight
Unfreezing layer: encoder.block.11.layer.0.SelfAttention.k.weight
Unfreezing layer: encoder.block.11.layer.0.SelfAttention.v.weight
Unfreezing layer: encoder.block.11.layer.0.SelfAttention.o.weight
Unfreezing layer: encoder.block.11.layer.0.layer_norm.weight
Unfreezing layer: encoder.block.11.layer.1.DenseReluDense.wi_0.weight
Unfreezing layer: encoder.block.11.layer.1.DenseReluDense.wi_1.weight
Unfreezing layer: encoder.block.11.layer.1.DenseReluDense.wo.weight
Unfreezing layer: encoder.block.11.layer.1.layer_norm.weight
Unfreezing layer: decoder.block.11.layer.0.SelfAttention.q.weight
Unfreezing layer: decoder.block.11.layer.0.SelfAttention.k.weight
Unfreezing layer: decoder.block.11.layer.0.SelfAttention.v.weight
Unfreezing layer: decoder.block.11.layer.0.SelfAttention.o.weigh

In [None]:
# LAMB optimizer over the parameters that are still trainable after freezing
optimizer = Lamb(
    filter(lambda p: p.requires_grad, unlimiformer.model.parameters()),
    lr=4e-5,
    eps=1e-8,
    weight_decay=0.01
)

# Length of the cosine schedule, derived from the training set instead of a hard-coded
# count. One optimizer step per training example matches the trainer settings used in
# this notebook (batch size 1, one epoch, no gradient accumulation).
# NOTE(review): recompute if those settings or the number of devices change.
num_training_steps = len(train_dataset)

lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    # Warm-up can never be longer than the whole schedule
    num_warmup_steps=min(400, num_training_steps),
    num_training_steps=num_training_steps
)

In [None]:
# Define training arguments.
# NOTE(review): a pre-built optimizer and scheduler are handed to the trainer elsewhere in
# this notebook, so `learning_rate`, `weight_decay` and `warmup_ratio` below presumably do
# not drive the actual optimizer — confirm.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Directory to save checkpoints
    report_to="none",  # Disable reporting for this setup
    save_steps=200,  # Save a checkpoint every 200 steps
    save_total_limit=1,  # Keep at most one checkpoint on disk
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
    do_train=True,  # Perform training
    do_eval=True,  # Perform evaluation
    per_device_train_batch_size=1,  # One example per device per training step
    per_device_eval_batch_size=2,  # Two examples per device per evaluation step
    num_train_epochs=1,  # Single pass over the training set
    seed=42,  # Ensure reproducibility
    warmup_ratio=0.1,  # Warm-up fraction (see note above)
    weight_decay=0.01,  # Weight decay (see note above)
    learning_rate=3e-5,  # Learning rate (see note above; the Lamb optimizer is built with 4e-5)
    logging_dir="./logs",  # Directory for logs
    logging_steps=200,# Log every 200 steps
    eval_strategy="steps",
    eval_steps = 200, # Evaluate every 200 steps during training
    label_smoothing_factor=0.1,  # Label smoothing factor
    fp16=False,  # Mixed precision is disabled
)

In [None]:
# Custom Seq2SeqTrainer with overridden compute_loss method
class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose loss is token-level cross-entropy that skips padded label positions.

    Labels produced by the dataset are padded with the tokenizer's pad id (not -100), so the
    loss is recomputed here with ``ignore_index`` set to that pad id.
    """

    def _ignored_label_id(self):
        """Return the label id to exclude from the loss.

        Uses the pad id of the trainer's tokenizer. Newer transformers releases store it as
        ``processing_class`` and older ones as ``tokenizer``, so both are checked. Falls back
        to -100, the ``CrossEntropyLoss`` default, when no pad id is available.
        """
        tok = getattr(self, "processing_class", None)
        if tok is None:
            tok = getattr(self, "tokenizer", None)
        pad_id = getattr(tok, "pad_token_id", None)
        return -100 if pad_id is None else pad_id

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the mean cross-entropy over non-padding target tokens.

        Args:
            model: The model being trained (possibly wrapped, e.g. by ``DataParallel``).
            inputs: Batch dict; must contain ``labels`` and the model's forward arguments.
            return_outputs: When True, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted because newer ``Trainer`` versions pass it to
                ``compute_loss``; it is not used here since the loss is a per-token mean.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")

        # Forward pass (labels stay in `inputs` so the model can build its decoder inputs)
        outputs = model(**inputs)
        logits = outputs.get("logits")  # [batch_size, seq_len, vocab_size]

        # Use the logits' device: a wrapped model may not expose a `.device` attribute
        labels = labels.to(logits.device)  # [batch_size, seq_len]

        loss_fn = nn.CrossEntropyLoss(
            ignore_index=self._ignored_label_id(),  # skip padded target positions
            reduction='mean',
            # Honour the label smoothing configured in the training arguments
            label_smoothing=float(getattr(self.args, "label_smoothing_factor", 0.0) or 0.0),
        )

        # Flatten to [batch*seq, vocab] vs [batch*seq]; reshape also handles
        # non-contiguous tensors
        loss = loss_fn(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))

        if return_outputs:
            return loss, outputs
        return loss

In [None]:
# Attach a default GenerationConfig only if the model object lacks the attribute entirely
if not hasattr(unlimiformer.model, "generation_config"):
    unlimiformer.model.generation_config = GenerationConfig()

# Instantiate the custom trainer around the Unlimiformer-wrapped model.
# The externally built optimizer/scheduler pair is passed in through `optimizers`.
trainer = CustomSeq2SeqTrainer(
    model=unlimiformer.model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    optimizers=(optimizer, lr_scheduler),
)

## Pushing to the Hugging Face Hub

In [None]:
# Obtain the Hugging Face token without hard-coding it in the notebook: use the HF_TOKEN
# environment variable when it is set, otherwise prompt for it (input is not echoed).
huggingface_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
if not huggingface_token:
    raise ValueError("A Hugging Face token is required to push to the Hub.")

# Persist the token so that Hub operations can pick it up
HfFolder.save_token(huggingface_token)

In [None]:
# Name of the Hub repository the fine-tuned model and tokenizer are pushed to
repo_name = "pt5_la10"  # Replace with your desired model name (optionally "<username>/<model-name>")

## Fine-Tuning

In [None]:
# Start fine-tuning
trainer.train()


# Retrieve the fine-tuned model from the trainer
model = trainer.model
# Retrieve the tokenizer: newer transformers releases expose it as `processing_class`,
# older ones as `tokenizer`
tokenizer = getattr(trainer, "processing_class", None)
if tokenizer is None:
    tokenizer = trainer.tokenizer


# Push the model and tokenizer to the Hugging Face Hub.
# `token` is the current name of the authentication argument; `use_auth_token` is deprecated.
model.push_to_hub(repo_name, token=huggingface_token)
tokenizer.push_to_hub(repo_name, token=huggingface_token)


print(f"Model and tokenizer pushed to Hugging Face Hub under {repo_name}!")