<a href="https://colab.research.google.com/github/mastermindankur/AI-ML-Udemy/blob/main/LLM_TRAINED_TEST_DATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Step 1: Install Hugging Face libraries
%%capture
!pip install transformers datasets huggingface_hub -q
!pip install bitsandbytes accelerate peft transformers

In [14]:
# Step 2: Log in to Hugging Face Hub (you'll need to use your own token)
from huggingface_hub import notebook_login, login

# Replace "YOUR_TOKEN" with your actual Hugging Face token
#token = ""
login(token=token)
# or
#notebook_login() #This opens a widget for you to manually paste the token.

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [17]:
# Step 3: Load Dataset
from datasets import load_dataset

# Load the full dataset
dataset = load_dataset("mastermindankur/ankur")

# Select the first 200 rows from the 'train' split
#dataset = dataset["train"].select(range(200))
dataset = dataset["train"]

# Perform the train-test split on the selected 200 rows
dataset = dataset.train_test_split(test_size=0.2)

In [18]:
# Step 4: Load Model and Tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
import gc; gc.collect()

210

In [20]:
# Step 4: Load Model and Tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Load the model with 4-bit quantization and LoRA configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Use the 'offload_folder' parameter to store parts of the model on the disk.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0}, # Place the model on the first GPU
    trust_remote_code=True, # Trust the remote code for Llama models
    offload_folder="offload_dir"  # Specify a folder to offload to
)

# Apply LoRA to the model
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8, # Rank of the LoRA update matrices
    lora_alpha=32, # Scaling factor for the LoRA update matrices
    lora_dropout=0.05, # Dropout probability for the LoRA layers
    bias="none", # Whether to apply a bias to the LoRA layers
    task_type="CAUSAL_LM" # Type of task for the LoRA layers
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters() # Print the number of trainable parameters

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


In [21]:
# Step 5
# Tokenize function to process `combined_text` and create labels
def tokenize_function(examples):
    # Tokenize combined_text and create input_ids and labels
    tokenized = tokenizer(
        examples["combined_text"],           # Use the combined_text field for input
        truncation=True,                     # Truncate to fit max length
        max_length=tokenizer.model_max_length
    )
    # Set labels to be a copy of input_ids for causal language modeling loss
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Map tokenize_function to the entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=[
    'id', 'test_case_id', 'topic', 'test_scenario', 'test_steps', 'expected_intended_result', 'pass_fail_criteria', 'input_text', 'target_text'
])

# Check if the transformation was successful
print(tokenized_dataset)

Map:   0%|          | 0/3738 [00:00<?, ? examples/s]

Map:   0%|          | 0/935 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['combined_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3738
    })
    test: Dataset({
        features: ['combined_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 935
    })
})


In [22]:
# Step 6: Fine-tuning with Trainer
import traceback
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Create DataCollatorForLanguageModeling for padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False # We are not doing masked language modeling
)


# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for checkpoints and logs
    per_device_train_batch_size=1,  # Batch size per GPU for training
    per_device_eval_batch_size=1,   # Batch size per GPU for evaluation
    gradient_accumulation_steps=2,
    num_train_epochs=3,              # Number of training epochs
    learning_rate=2e-5,             # Learning rate
    weight_decay=0.01,               # Weight decay for regularization
    fp16=True,                       # Enable mixed precision training (if supported)
    logging_dir="./logs",            # Directory for storing logs
    push_to_hub=False,              # Whether to push the model to the Hugging Face Hub
)


trainer = Trainer(
        model=model,                         # The model to train
        args=training_args,                  # Training arguments
        train_dataset=tokenized_dataset['train'],# Training dataset
        #data_collator=data_collator
        eval_dataset=tokenized_dataset['test'],      # Evaluation dataset (optional)
    )

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
500,1.5794
1000,1.2245
1500,1.1568
2000,1.1077
2500,1.0817
3000,1.0716
3500,1.059
4000,1.0415
4500,1.0396
5000,1.0227


TrainOutput(global_step=5607, training_loss=1.126214390879373, metrics={'train_runtime': 1999.3181, 'train_samples_per_second': 5.609, 'train_steps_per_second': 2.804, 'total_flos': 7476348874199040.0, 'train_loss': 1.126214390879373, 'epoch': 3.0})

In [27]:
# Save the model
trainer.save_model("ankur")
tokenizer.save_pretrained("ankur")

('ankur/tokenizer_config.json',
 'ankur/special_tokens_map.json',
 'ankur/tokenizer.json')

In [31]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the path to your trained model
# Replace "path_to_trained_model" with the actual directory where your fine-tuned model is saved
model_path = "ankur"

# Load the fine-tuned model and tokenizer from the directory
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Prepare a sample input for testing
input_text = "Topic: Test Case Generation for Credit card Failed transactions, Verify incorrect CVV. Include test steps"  # Replace with your sample text
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)

# Generate predictions
outputs = model.generate(**inputs, max_length=200, num_return_sequences=1)
predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Display the output
print("Input:", input_text)
print("Generated Output:", predicted_text)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input: Topic: Test Case Generation for Credit card Failed transactions, Verify incorrect CVV
Generated Output: Topic: Test Case Generation for Credit card Failed transactions, Verify incorrect CVV validation on failed transactions., Scenario: Verify that the system displays a message indicating incorrect CVV validation for a failed transaction., Expected Result: The system should display a message indicating "Incorrect CVV" for a failed transaction., Pass/Fail Criteria: The system should correctly identify the failed transaction and display the appropriate message.


In [44]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.42M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mastermindankur/results/commit/686026999321ed8cd065aa22a8fb1b3ffb671632', commit_message='End of training', commit_description='', oid='686026999321ed8cd065aa22a8fb1b3ffb671632', pr_url=None, pr_revision=None, pr_num=None)