# Joke Generator using LLM Fine-tuning
- **Initial Idea/Motivation:**
  - Taking a small model like Phi-2 (2.7B) or even TinyLlama (1.1B) and fine-tuning it on a specific genre of jokes within 4-5 lines max (e.g., clever, silly, punny, tech). Then, quantizing it to the highest possible level (e.g., 4-bit or 3-bit GGUF) and running it.
  - This is the ultimate test of the (quantization + fine-tuning) combination by creating a fun, usable application with a model under 2GB in size.
- **Tech Stack**:
    - **Base Model**: Phi-2 (2.7B).
    - **Fine-tuning (PEFT+Quantization)**: Implemented QLoRA (Quantized Low-Rank Adaptation), combined with 4-bit quantization, for parameter-efficient training.
- **Curated Dataset Approach**:
    - **Targeted Creation**: Generated 400 high-quality examples (100 per category) across four safe topics: ["Technology & Programming", "Coffee & Beverages", "Food & Cooking", "Animals & Pets"].
    - **Generation Method**: Used DeepSeek Chat with carefully crafted prompts to ensure: Consistent Format (4-5 lines), Content Safety (avoiding offensive content), Category Relevance.
- **Demo**:
    - **HuggingFace Model Repository**: https://huggingface.co/nanditab35/phi-2-jokebot-peft
    - **HuggingFace Space Repository**: https://huggingface.co/spaces/nanditab35/jokebot

## Code for **JokeBot - AI Comedy Generator**

### Installation & Setup

In [None]:
# Step 1: Install required packages

#!pip install -qU transformers accelerate peft bitsandbytes datasets trl huggingface_hub
!pip install -qU transformers accelerate peft bitsandbytes datasets huggingface_hub

### Imports & GPU Check

In [None]:
# All imports and GPU verification
# Step 2: Import libraries

import os
import json
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from google.colab import userdata

# Step 3: Check GPU availability
# The device queries raise on a CPU-only runtime, so they are only issued once
# CUDA has been confirmed; otherwise the cell reports the situation and continues.
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("No CUDA GPU detected - switch the runtime to a GPU before running the training cells.")

### CONFIG DICT

In [None]:
# Notebook-wide settings: the base checkpoint id and the joke category labels.
CONFIG_DICT = dict(
    MODEL_NAME="microsoft/phi-2",
    JOKE_TYPES_ARR=["tech", "coffee", "foodie", "animals"],
)

### API Keys

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Read the Hugging Face token stored under the Colab secret name 'HF_API_KEY'
# and authenticate this session with it. The token is kept in HF_API_KEY so
# later cells can reuse it.
HF_API_KEY = userdata.get('HF_API_KEY')
login(token=HF_API_KEY)

### Data Preparation

In [None]:
# Load and prepare training data
# The encoding is pinned to UTF-8 so non-ASCII characters in the jokes are read
# the same way regardless of the runtime's locale default.
with open('jokebot_training_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Formatting the data
def format_instruction(example, eos_token="<|endoftext|>"):
    """Render one training record as an instruction/response prompt.

    Args:
        example: Mapping with an ``'instruction'`` and an ``'output'`` entry.
        eos_token: Terminator appended directly after the response. Defaults
            to ``<|endoftext|>``, the literal that used to be hard-coded here.

    Returns:
        The text ``### Instruction:`` / ``### Response:`` block for the record,
        ending with ``eos_token``.

    Raises:
        KeyError: If ``example`` lacks ``'instruction'`` or ``'output'``.
    """
    return (
        f"### Instruction:\n{example['instruction']}\n\n"
        f"### Response:\n{example['output']}{eos_token}"
    )

# Turn every raw record into its prompt string, then wrap the strings in a
# single-column ("text") dataset.
formatted_data = list(map(format_instruction, training_data))
train_dataset = Dataset.from_dict({"text": formatted_data})

# Quick sanity check: dataset size plus the first rendered example.
print(f"Training dataset size: {len(train_dataset)}")
print("Sample training example:", formatted_data[0], sep="\n")

### Tokenization

In [None]:
# Load tokenizer
model_name = CONFIG_DICT["MODEL_NAME"]
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# The EOS token is reused as the padding token.
# NOTE(review): with pad == EOS, a collator that masks pad positions out of the
# loss also masks the real end-of-text token - confirm the model still learns
# where to stop.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of formatted examples, truncated to 512 tokens.

    No padding is applied and no ``labels`` column is produced here. The
    causal-LM data collator used for training pads each batch dynamically and
    derives the labels from ``input_ids``, so padding at this stage would only
    bake one map-batch's longest length into the stored rows.

    Args:
        examples: Batch mapping with a ``"text"`` list of strings.

    Returns:
        The tokenizer output for the batch (plain Python lists).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        return_tensors=None,
    )

# Apply tokenization
# The raw "text" column is dropped so only the tokenizer outputs remain.
tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    # batch_size = 100,
    remove_columns=train_dataset.column_names
)

# The status message previously contained a mis-encoded emoji; the intended
# check-mark character is restored.
print("✅ Dataset tokenized successfully!")
print(f"Sample tokenized keys: {list(tokenized_dataset[0].keys())}")

### Model Setup with QLoRA

In [None]:
# 4-bit quantization settings: NF4 quantization type, double quantization
# switched on, float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Fetch the base checkpoint with the quantization settings applied
print("Loading Phi-2 model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# Hand the quantized model to PEFT's k-bit training preparation step
model = prepare_model_for_kbit_training(model)

In [None]:
# Configure LoRA
# target_modules lists the attention projections under both naming schemes used
# for Phi-2: "q_proj"/"k_proj"/"v_proj"/"dense" (transformers-native Phi layers)
# and "Wqkv"/"out_proj" (older remote-code checkpoint layout). With only the old
# names, a natively loaded model would match just fc1/fc2 and leave attention
# without adapters.
# NOTE(review): names that do not exist in the loaded model are expected to be
# ignored by PEFT - confirm via the trainable-parameter count printed below.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "dense",  # native Phi attention layers
        "Wqkv", "out_proj",                     # legacy remote-code attention layers
        "fc1", "fc2",                           # MLP layers (same name in both)
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

### Training Setup

In [None]:
# Collator for causal language modeling (mlm=False turns masked-LM mode off);
# batches come back as PyTorch tensors padded to a multiple of 8.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="pt",
    pad_to_multiple_of=8,
)

# Hyper-parameters and runtime switches, grouped by concern
training_args = TrainingArguments(
    # -- output and checkpoints --
    output_dir="./phi-2-jokebot",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=3,
    # -- batching and data loading --
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    # -- optimisation --
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    fp16=True,
    # -- logging and evaluation --
    logging_steps=10,
    eval_steps=500,
    prediction_loss_only=True,
    report_to="none",  # Disable wandb in Colab
)

# Wire model, arguments, data and collator together
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    args=training_args,
    model=model,
)

### Training Execution

In [None]:
# Start training
# (The status messages previously contained mis-encoded emoji; the intended
# characters are restored.)
print("🚀 Starting training...")
trainer.train()

# Save the final model
# save_model() is called without a path, and the tokenizer is written to the
# trainer's configured output_dir instead of a repeated literal, so both sets
# of files follow the same setting.
trainer.save_model()
tokenizer.save_pretrained(trainer.args.output_dir)
print("✅ Training completed and model saved!")

### Testing the Fine-tuned Model

In [None]:
# Load the fine-tuned model for testing
from transformers import pipeline

print("Testing fine-tuned model...")

# Load the saved model
model_path = "./phi-2-jokebot"

# Create text generation pipeline
joke_pipe = pipeline(
    "text-generation",
    model=model_path,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16
)

# The training examples were rendered as "### Instruction: ... ### Response: ..."
# (see format_instruction). Inference prompts use the same layout, stopping right
# where the response starts, so the model is queried the way it was trained.
PROMPT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Response:\n"

# Test with different categories
test_prompts = [
    "Generate a short joke within 4-5 lines that is coming from tech topic",
    "Generate a short joke within 4-5 lines that is coming from food topic",
    "Generate a short joke within 4-5 lines that is coming from animals topic"
]

for i, prompt in enumerate(test_prompts):
    # The label's emoji was mis-encoded before; the intended character is restored.
    print(f"\n🎯 Test {i+1}: {prompt}")
    result = joke_pipe(
        PROMPT_TEMPLATE.format(instruction=prompt),
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,
        return_full_text=False,  # print only the generated joke, not the prompt scaffold
    )
    print("Generated:")
    print(result[0]['generated_text'])
    print("-" * 50)

### Merge LLM PEFT Model with the Base Model Before GGUF Conversion
- Avoided this option for simplicity; the PEFT model files were uploaded directly to the HuggingFace Model Repository instead.

In [None]:
# import torch
# from peft import PeftModel
# from transformers import AutoModelForCausalLM, AutoTokenizer

# # During DEVELOPMENT - Use PEFT model
# def load_peft_model():
#     base_model = AutoModelForCausalLM.from_pretrained(CONFIG_DICT["MODEL_NAME"], device_map="auto")
#     model = PeftModel.from_pretrained(base_model, "./phi-2-jokebot", device_map="auto",)
#     return model

# # Merge LoRA model with the Base Model - Helps llama.cpp understand BaseModel Architecture
# def merge_lora_adapter():
#     # Load tokenizer first (very lightweight)
#     tokenizer = AutoTokenizer.from_pretrained(
#         CONFIG_DICT["MODEL_NAME"],
#         trust_remote_code=True
#     )
#     tokenizer.pad_token = tokenizer.eos_token
#     # Load base model with low memory
#     base_model = AutoModelForCausalLM.from_pretrained(
#         CONFIG_DICT["MODEL_NAME"],
#         torch_dtype=torch.float16,
#         device_map="auto",
#         trust_remote_code=True,
#         low_cpu_mem_usage=True  # Critical for low memory
#     )
#     # Load PEFT model
#     model = PeftModel.from_pretrained(base_model, "./phi-2-jokebot")
#     # merged_model = model.merge_and_unload() # Making the Application Crash on Colab T4 GPU
#     # Export both model AND tokenizer
#     model.save_pretrained(
#         "./phi-2-jokebot-merged",
#         safe_serialization=True,
#         max_shard_size="2GB"
#     )
#     tokenizer.save_pretrained("./phi-2-jokebot-merged")

#     print("âœ… LoRA adapter merged!")

# # For DEPLOYMENT - Convert to GGUF once, then use GGUF
# # def convert_for_deployment():
# #     # Merge only for conversion
# #     merged_model = merge_lora_adapter()
# #     convert_to_gguf(merged_model)
# #     # Then use the GGUF file forever

# # In PRODUCTION - Use GGUF
# def load_production_model():
#     return Llama(model_path="./phi-2-jokebot-q4_k_m.gguf")

In [None]:
# merge_lora_adapter()

### Zip the Saved Model files for Download purpose

In [None]:
import shutil

# Pack the saved model folder into a zip archive named after it, ready for download.
shutil.make_archive(base_name="phi-2-jokebot", format="zip", root_dir="./phi-2-jokebot")

### Push the PEFT model to HF Space (if HF_API_KEY has Write Permission)
- Purpose: make the model available so that an HF Space can run the JokeGen Bot through a Gradio UI.
- If HF_API_KEY does not have write permission, upload the model files manually instead.

In [None]:
# from huggingface_hub import HfApi
# from google.colab import userdata

# # Login
# from huggingface_hub import login
# login(token=HF_API_KEY) # Needs a HF_API_KEY with Write Access

# # Upload your PEFT model
# api = HfApi()
# api.upload_folder(
#     folder_path="./phi-2-jokebot",
#     # repo_id="your-username/phi-2-jokebot-peft",
#     repo_type="model"
# )

# print("Uploaded to HF Hub!")
# print("Now you can:")
# print("1. Use the HF interface to create GGUF versions")
# print("2. Or download and use Spaces with your PEFT model")
# print("3. Or use their conversion tools via API")