In [1]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "2" 
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")  
print(f"Using device: {device}")

Using device: cuda:2


In [2]:
import pickle

with open('all_abbasid_poems.pkl', 'rb') as file:
    data = pickle.load(file)

In [3]:
import numpy as np
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm  # This auto-detects the environment (Jupyter/Terminal)


# Convert numpy array to DataFrame with column name "text"
df_data = pd.DataFrame(data, columns=["text"])
df_data = df_data.sample(n=5000, random_state=42)  
dataset = Dataset.from_pandas(df_data)

# Split into 80% train / 20% validation
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "core42/jais-13b"

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_auth_token=True,
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16,  
    use_auth_token=True,
    trust_remote_code=True
)




Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [5]:
from peft import get_peft_model, LoraConfig
import bitsandbytes as bnb

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['attn.c_attn', 'attn.c_proj'],
    lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)



In [6]:
# Custom Hook Function to Prevent In-Place Modification
def safe_grad_hook(grad):
    return grad.clone()  # Ensure gradients are cloned

# Applying the hook to all Linear8bitLt layers
from bitsandbytes import nn as bnb

for name, module in model.named_modules():
    if isinstance(module, bnb.Linear8bitLt):
        module.register_full_backward_hook(lambda module, grad_input, grad_output: (grad_input[0].clone(),))


In [7]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total Parameters: {total_params}")
print(f"Trainable Parameters (LoRA): {trainable_params}")


Total Parameters: 13030642360
Trainable Parameters (LoRA): 9830400


In [8]:
from transformers import DataCollatorForLanguageModeling

# Tokenization function - ensure padding=False for now
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors=None  # Important: returns lists not tensors
    )
    return tokenized

# Apply tokenization
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Add labels (same as input_ids for causal LM)
tokenized_train = tokenized_train.map(lambda x: {'labels': x['input_ids']})
tokenized_val = tokenized_val.map(lambda x: {'labels': x['input_ids']})

# Use the correct collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Important: False for causal LM
    pad_to_multiple_of=8  # Optional: helps with GPU efficiency
)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
from transformers import Trainer
from peft import PeftModel
#from transformers.trainer_utils import can_return_loss
from transformers.utils import find_labels, can_return_loss

class PeftTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if hasattr(self.model, "get_base_model"):
            model_to_inspect = self.model.get_base_model()
            default_label_names = find_labels(model_to_inspect.__class__)
            self.can_return_loss = can_return_loss(model_to_inspect.__class__)
        else:
            default_label_names = find_labels(self.model.__class__)
            self.can_return_loss = can_return_loss(self.model.__class__)
        # Override label_names if explicitly provided
        self.label_names = default_label_names if self.args.label_names is None else self.args.label_names

In [10]:
from transformers import DataCollatorWithPadding, EvalPrediction, TrainingArguments, Trainer

steps_per_epoch = len(tokenized_train)

training_args = TrainingArguments(
    output_dir="./jais-finetuned",  
    overwrite_output_dir=True,
    num_train_epochs=5, 
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,   
    eval_accumulation_steps=1,
    save_strategy="epoch",  
    logging_steps=steps_per_epoch,
    eval_strategy="epoch", 
    learning_rate=5e-5,
    weight_decay=0.01,
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_loss", 
    report_to="none",                  
    disable_tqdm=False 
)

In [11]:
from transformers import TrainerCallback

trainer = PeftTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,
2,No log,
3,No log,
4,0.000000,
5,0.000000,


TrainOutput(global_step=5000, training_loss=0.0, metrics={'train_runtime': 8352.921, 'train_samples_per_second': 2.394, 'train_steps_per_second': 0.599, 'total_flos': 7.738664951808e+17, 'train_loss': 0.0, 'epoch': 5.0})

In [13]:
model.save_pretrained("./jais_finetuned")
tokenizer.save_pretrained("./jais_finetuned")

('./jais_finetuned/tokenizer_config.json',
 './jais_finetuned/special_tokens_map.json',
 './jais_finetuned/tokenizer.json')

In [30]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the saved model and tokenizer
model_path = "./jais_finetuned"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move model to the appropriate device (GPU if available)
device = "cpu"
model = model.to(device)

The repository for core42/jais-13b contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/core42/jais-13b.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


JAISLMHeadModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [33]:
prompt = ["و لا يحب ترى نجوم في سماء في سماء اكتب قصيدة عن الحب يا حبيبي"]
#inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=512, truncation=True).to(model.device)

for p in prompt:
    inputs = tokenizer(p, return_tensors="pt", padding="max_length", max_length=512, truncation=True).to(model.device)
    generated_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=512,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(generated_text)
    print("\n")

و لا يحب ترى نجوم في سماء في سماء اكتب قصيدة عن الحب يا حبيبي!




In [37]:
# Adjusted Generation with Higher Creativity
prompt = "اكتب قصيدة عن الحب"

# Tokenizing the entire prompt
inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=512, truncation=True).to(model.device)

# Generating text with higher creativity
generated_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=100, 
    num_return_sequences=1,
    no_repeat_ngram_size=3,
    temperature=1.5,        # Higher creativity
    top_p=0.95,             # More diverse word selection
    top_k=0,                # No restriction on top-k (disable)
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# Decoding and printing the generated text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)


اكتب قصيدة عن الحب!
