In [1]:
import json
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments

# 1. Load the custom dataset
def load_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Custom dataset file path
file_path = 'G:\\My Drive\\My Research\\Topics\\0-Arabic Finetune LLAMA3\\datasets\\alpaca_arabic_gpt4_hf\\alpaca-gpt4-arabic.json'
dataset = load_dataset(file_path)


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# 2. Prepare the dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        conversations = self.dataset[idx]['conversations']
        prompt = ''.join([conv['value'] for conv in conversations if conv['from'] == 'human'])
        response = ''.join([conv['value'] for conv in conversations if conv['from'] == 'gpt'])
        inputs = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        outputs = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': outputs['input_ids'].squeeze()
        }

In [3]:
# Load the tokens from the configuration file
with open("config.json", "r") as file:
    config = json.load(file)

hf_token = config["HUGGINGFACE_TOKEN"]

In [4]:
# 3. Initialize the model and tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login


# Login to Hugging Face
login(token=hf_token)


# 4. Initialize the model and tokenizer
# model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'  # The model from the provided line
model_name = 'meta-llama/Meta-Llama-3-8B'  # The model from the provided line

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
except TypeError as e:
    print(f"Error loading tokenizer: {e}")
    raise

# Set padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    #model.resize_token_embeddings(len(tokenizer))

try:
    model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=True)
except TypeError as e:
    print(f"Error loading model: {e}")
    raise

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\USER\.cache\huggingface\token
Login successful


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:10<00:00,  2.69s/it]


In [5]:
# 4. Prepare the dataset for training
custom_dataset = CustomDataset(dataset, tokenizer)

# 5. Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=1,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,  # Use mixed precision training
)

# 6. Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=custom_dataset,
    tokenizer=tokenizer,
)

# 7. Fine-tune the model
trainer.train()

# 8. Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-llama')
tokenizer.save_pretrained('./fine-tuned-llama')

print("Fine-tuning completed and model saved.")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmsfasha[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/149904 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
