In [1]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb
%pip install -q google-api-python-client google-auth google-auth-httplib2 google-auth-oauthlib

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload

import os, torch, wandb, io, bitsandbytes as bnb

In [3]:
# Set up secrets and logins
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("hf_token")
wb_token = user_secrets.get_secret("wandb")
login(token=hf_token)
wandb.login(key=wb_token)
run = wandb.init(project='Fine-tune Llama 3.2 on Customer Support Dataset', job_type="training", anonymous="allow")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnitichavda24[0m ([33mnitichavda24-nirma-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Google Drive authentication
SERVICE_ACCOUNT_FILE = '/kaggle/input/credential2/avid-influence-451503-v3-d8e2e9fce49c.json'  # Upload this file to Kaggle
SCOPES = ['https://www.googleapis.com/auth/drive']
credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
drive_service = build('drive', 'v3', credentials=credentials)


In [5]:
def read_query_from_drive(file_id):
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
    fh.seek(0)
    return fh.read().decode()

In [6]:
base_model = "meta-llama/Llama-3.2-3b-instruct"
new_model = "llama-3.2-3b-it-Ecommerce-ChatBot"
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

In [7]:
import torch

if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"


In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [9]:
# Dataset prep
instruction = """You are a top-rated customer service agent named John. Be polite to customers and answer all their questions."""
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.shuffle(seed=65).select(range(1000))

def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["response"]}
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(format_chat_template, num_proc=4)

README.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

(…)t_Training_Dataset_27K_responses-v11.csv:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
# LoRA config

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)

In [11]:
model = get_peft_model(model, peft_config)
tokenizer.pad_token = tokenizer.eos_token

training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)


In [12]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=training_arguments,
)

Converting train dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
trainer.train()



Step,Training Loss,Validation Loss
100,0.7487,0.752964
200,0.5275,0.642907
300,0.6031,0.56569
400,0.4414,0.520692
500,0.4244,0.501356


TrainOutput(global_step=500, training_loss=0.7136747350692749, metrics={'train_runtime': 1434.4591, 'train_samples_per_second': 0.697, 'train_steps_per_second': 0.349, 'total_flos': 3328650423035904.0, 'train_loss': 0.7136747350692749})

In [14]:
wandb.finish()

0,1
eval/loss,█▅▃▂▁
eval/mean_token_accuracy,▁▄▆▇█
eval/num_tokens,▁▃▅▆█
eval/runtime,▁▇█▂▄
eval/samples_per_second,█▂▁▇▅
eval/steps_per_second,█▂▁▇▅
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇█████
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/grad_norm,█▆▅▅▅▂▅▃▃▅▂▂▃▁▁▂▄▃▃▄▂▂▃▂▂▃▂▄▁▁▅▄▃▅▄▃▄▄▄▃
train/learning_rate,▆████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁

0,1
eval/loss,0.50136
eval/mean_token_accuracy,0.84222
eval/num_tokens,195133.0
eval/runtime,196.3901
eval/samples_per_second,5.092
eval/steps_per_second,5.092
total_flos,3328650423035904.0
train/epoch,1.0
train/global_step,500.0
train/grad_norm,0.9579


In [27]:
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import io

In [28]:
# Inference from query.txt and save response
query_file_id = "1h6H4XiYDCGtEzK2RQMJrfQVVhxeUvUH-"  # 🔁 Replace with your actual file ID from Google Drive
query_text = read_query_from_drive(query_file_id)

In [29]:
messages = [
    {"role": "system", "content": instruction},
    {"role": "user", "content": query_text}
]

In [30]:
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("assistant")[-1].strip()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [31]:
print("Generated Response:\n", response)

Generated Response:
 We offer a variety of payment options to cater to your needs. You can choose from credit/debit cards, PayPal, bank transfer, Apple Pay, and Google Wallet. Feel free to explore these options and select the one that suits you best. If you have any specific questions or need further assistance, please don't hesitate to let me know.


In [32]:
def upload_response_to_drive(text, filename, folder_id=None):
    file_metadata = {'name': filename}
    if folder_id:
        file_metadata['parents'] = [folder_id]  # ⬅️ ensures upload goes into shared folder
    media = MediaIoBaseUpload(io.BytesIO(text.encode()), mimetype='text/plain')
    file = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
    print(f"✅ Uploaded! View: https://drive.google.com/file/d/{file.get('id')}/view")


In [33]:
from googleapiclient.http import MediaIoBaseUpload
import io

In [34]:
folder_id = "1mYtpp5m_RDrD7CLk8BbWLXAX0WIjcYRa"  # 🔁 Replace with the one you copied

# Uploading model output to Drive inside that folder
upload_response_to_drive(response, "response.txt", folder_id)

✅ Uploaded! View: https://drive.google.com/file/d/1JSePKiEYSXL1xw36KmdXjlFeEjbC4nr_/view
