## Installing and Importing Packages

In [1]:
!pip uninstall -y transformers torch

Found existing installation: transformers 4.44.2
Uninstalling transformers-4.44.2:
  Successfully uninstalled transformers-4.44.2
Found existing installation: torch 2.4.1+cu121
Uninstalling torch-2.4.1+cu121:
  Successfully uninstalled torch-2.4.1+cu121


In [None]:
!pip install transformers==4.45.1
!pip install torch==2.4.0
!pip install torch_optimizer

Collecting transformers==4.45.1
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers==4.45.1)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m86.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:


In [None]:
import torch
import json
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq, get_scheduler
from torch.utils.data import Dataset, random_split
from torch_optimizer import Lamb
import torch.nn as nn
from huggingface_hub import HfApi, HfFolder

## Loading Model

In [None]:
model_name_or_path = "agemagician/mlong-t5-tglobal-base"
use_auth_token = False  # Set to False if no authentication is needed
max_target_length = 512  # Maximum length of the target sequence
fp16 = True  # Enable mixed precision training

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name_or_path,
    use_auth_token=use_auth_token
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_auth_token=use_auth_token
)



config.json:   0%|          | 0.00/906 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.37G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Loading and Preparing Dataset

In [None]:
# Load the dataset from the JSON file
with open("/kaggle/input/divan-ali/Divan_ali_Simplified.json", "r", encoding="utf-8") as f:
    data = json.load(f)
    
# Initialize lists to store original and simplified texts
texts = []
simplified_texts = []

# Extract original and simplified texts
for entry in data:
    for key, content in entry.items():
        original_text = content.get("متن رای", "")
        simplified_text = content.get("simplified text", "")

        # Ensure both original and simplified texts are not empty
        if original_text and simplified_text:
            texts.append(original_text)
            simplified_texts.append(simplified_text)

In [None]:
# Load dataset
class LegalDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_input_length=4096, max_output_length=512):
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

        with open(file_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)  # Load the list of dictionaries

        # Flatten the nested structure into a list of relevant entries
        self.data = []
        for case in raw_data:
            for case_id, details in case.items():
                if "متن رای" in details and "simplified text" in details:
                    self.data.append({
                        "متن رای": details["متن رای"],
                        "simplified text": details["simplified text"]
                    })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item["متن رای"]
        target_text = item["simplified text"]

        inputs = self.tokenizer(
            input_text,
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        targets = self.tokenizer(
            target_text,
            max_length=self.max_output_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Ensure correct type and handle padding tokens
        input_ids = inputs["input_ids"].squeeze(0).long()
        attention_mask = inputs["attention_mask"].squeeze(0).long()
        labels = targets["input_ids"].squeeze(0).long()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [6]:
# Prepare dataset
data_file = "/kaggle/input/divan-ali/Divan_ali_Simplified.json"
dataset = LegalDataset(data_file, tokenizer)

## Split Data

In [7]:
# Split dataset
train_size = 4522
test_size = 599
val_size = 199

train_dataset, test_dataset, val_dataset = random_split(dataset, [train_size, test_size, val_size])

## Preparing Model

In [8]:
# Function to print the number of total, trainable, and non-trainable parameters
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    non_trainable_params = total_params - trainable_params
    print(f"Total parameters: {total_params}")
    print(f"Trainable parameters: {trainable_params}")
    print(f"Non-trainable parameters: {non_trainable_params}")

# Print parameter counts before freezing layers
print("Before freezing layers:")
count_parameters(model)

# Freeze all layers
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last three layers by adjusting the names
for name, param in model.named_parameters():
    if 'encoder.block.11' in name or 'decoder.block.11' in name or 'lm_head' in name or 'shared' in name :
            param.requires_grad = True
            print(f"Unfreezing layer: {name}")

# Print the number of parameters after freezing
print("After freezing layers:")
count_parameters(model)

Before freezing layers:
Total parameters: 985850496
Trainable parameters: 985850496
Non-trainable parameters: 0
Unfreezing layer: shared.weight
Unfreezing layer: encoder.block.11.layer.0.TransientGlobalSelfAttention.q.weight
Unfreezing layer: encoder.block.11.layer.0.TransientGlobalSelfAttention.k.weight
Unfreezing layer: encoder.block.11.layer.0.TransientGlobalSelfAttention.v.weight
Unfreezing layer: encoder.block.11.layer.0.TransientGlobalSelfAttention.o.weight
Unfreezing layer: encoder.block.11.layer.0.TransientGlobalSelfAttention.global_input_layer_norm.weight
Unfreezing layer: encoder.block.11.layer.0.layer_norm.weight
Unfreezing layer: encoder.block.11.layer.1.DenseReluDense.wi_0.weight
Unfreezing layer: encoder.block.11.layer.1.DenseReluDense.wi_1.weight
Unfreezing layer: encoder.block.11.layer.1.DenseReluDense.wo.weight
Unfreezing layer: encoder.block.11.layer.1.layer_norm.weight
Unfreezing layer: decoder.block.11.layer.0.SelfAttention.q.weight
Unfreezing layer: decoder.block.1

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save checkpoints
    report_to="none",  # Disable reporting for this setup
    save_steps=10,  # Save model checkpoints less frequently
    save_total_limit=2,  # Save only the last 3 checkpoints
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
    do_train=True,  # Perform training
    do_eval=True,  # Perform evaluation
    per_device_train_batch_size=1,  # Increase batch size if memory allows
    per_device_eval_batch_size=2,  # Increase batch size for evaluation
    num_train_epochs=3,  # Increase the number of epochs
    seed=42,  # Ensure reproducibility
    warmup_ratio=0.1,  # Keep warmup ratio for smoother start
    weight_decay=0.01,  # Weight decay to avoid overfitting
    learning_rate=3e-5,  # Reduced learning rate
    logging_dir="./logs",  # Directory for logs
    logging_steps=10,  # Log every 50 steps
    eval_strategy="steps",  # Evaluate during training
    eval_steps=10,  # Perform evaluation less frequently
    label_smoothing_factor=0.1,  # Apply label smoothing for better generalization
    fp16=False,  # Enable mixed precision for faster training
)

In [None]:
optimizer = Lamb(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=5e-4,
    eps=1e-8,
    weight_decay=0.01
)

lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=170,
    num_training_steps=1695
)

In [12]:
!CUDA_LAUNCH_BLOCKING=1

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to GPU

LongT5ForConditionalGeneration(
  (shared): Embedding(256384, 768)
  (encoder): LongT5Stack(
    (embed_tokens): Embedding(256384, 768)
    (block): ModuleList(
      (0): LongT5Block(
        (layer): ModuleList(
          (0): LongT5LayerTransientGlobalSelfAttention(
            (TransientGlobalSelfAttention): LongT5TransientGlobalAttention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
              (global_relative_attention_bias): Embedding(32, 12)
              (global_input_layer_norm): LongT5LayerNorm()
            )
            (layer_norm): LongT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): LongT5LayerFF(
            (DenseReluDen

In [None]:
class Seq2SeqTrainerWithLoss(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Shift logits and labels for loss computation
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainerWithLoss(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, lr_scheduler)
)

## Huggingface Pushing

In [None]:
# Save your Hugging Face token
huggingface_token = "YOURTOKENAPI"  # Replace with your actual token
HfFolder.save_token(huggingface_token)

repo_name = "USERNAME/REPONAME"

## Tunning

In [None]:
# Start fine-tuning
trainer.train()


# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub(repo_name, use_auth_token=huggingface_token)
tokenizer.push_to_hub(repo_name, use_auth_token=huggingface_token)

print(f"Model and tokenizer pushed to Hugging Face Hub under {repo_name}!")