In [7]:
# -*- coding: utf-8 -*-
"""Dataprep with Drive Integration
Automatically rewritten to read/write files directly from Google Drive.
"""

# === Mount Google Drive ===
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import re
import random
from sklearn.model_selection import train_test_split

# === Drive locations ===
# Working folder on the mounted Drive; created if it does not exist yet.
BASE_PATH = "/content/drive/MyDrive/LLM_Project/data"
os.makedirs(BASE_PATH, exist_ok=True)

# The seed CSV is read from inside the working folder.
SEED_FILE = f"{BASE_PATH}/Support_intent_seed.csv"

# === Read the seed examples and print a quick summary ===
df = pd.read_csv(SEED_FILE)
seed_preview = df.head()
print("Seed data loaded. Sample:")
print(seed_preview)
seed_label_counts = df['label'].value_counts()
print("Label distribution:\n", seed_label_counts)

# === Cleaning function ===
def clean_text(t):
    """Normalize one raw support message for de-duplication and modelling.

    Steps, in order: lowercase; drop URLs, e-mail-like tokens and digit runs
    of five or more characters; replace every remaining character that is not
    an ASCII letter, digit or whitespace with a space; collapse whitespace
    and trim.

    Args:
        t: Raw message. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as an empty message.

    Returns:
        The cleaned string, or ``""`` when nothing is left.
    """
    # Missing values would otherwise become the literal words "none" / "nan".
    if t is None or (isinstance(t, float) and t != t):
        return ""
    t = str(t).lower()
    # The dot after "www" is escaped and the match must start at a word
    # boundary, so ordinary words that merely begin with "www" are kept.
    t = re.sub(r"http\S+|\bwww\.\S+", " ", t)
    t = re.sub(r"\S+@\S+", " ", t)
    t = re.sub(r"\d{5,}", " ", t)
    t = re.sub(r"[^a-zA-Z0-9\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

# Add the normalized column, keep one row per distinct cleaned message,
# then discard rows whose cleaned message is empty.
df['clean_text'] = df['text'].apply(clean_text)
df = (
    df
    .drop_duplicates(subset=['clean_text'])
    .loc[lambda frame: frame['clean_text'].str.strip() != ""]
)

print("After cleaning, sample data:")
print(df.head())
cleaned_label_counts = df['label'].value_counts()
print("Label distribution after cleaning:\n", cleaned_label_counts)

# === Split seed data ===
# Both splits stratify on the label column and use the same RNG seed.
split_kwargs = dict(stratify=df['label'], random_state=42)

# 67% / 33% split; test_seed later becomes part of the final test set.
train_seed, test_seed = train_test_split(df, test_size=0.33, **split_kwargs)
print("Seed split - train size:", len(train_seed), "test size:", len(test_seed))

# === Another split if needed ===
# NOTE(review): apart from the size printouts, train_seed and this second
# split (train_df / test_df) are not used in the visible part of this cell.
train_df, test_df = train_test_split(df, test_size=0.2, **split_kwargs)
print("Full split - train size:", len(train_df), "test size:", len(test_df))

# === Define label templates for synthetic data ===
# Maps each intent label to the base sentences used for synthetic rows.
# The keys become the list of labels sampled during generation, and
# generate_synthetic_row() picks one sentence from the chosen label's list.
label_templates = {
    "billing_issue": [
        "I have a problem with my billing statement.",
        "My invoice amount seems incorrect.",
        "Why was I charged extra this month?",
        "Please check my billing details.",
    ],
    "tech_support": [
        "My device is not working properly.",
        "The app keeps crashing.",
        "I need help fixing a technical issue.",
        "Troubleshooting is not resolving my problem."
    ],
    "refund_request": [
        "I want to request a refund for my purchase.",
        "Please initiate a refund.",
        "The product did not meet expectations; need refund.",
        "How can I get my money back?"
    ],
    "shipping_delay": [
        "My delivery is taking too long.",
        "Package has not arrived yet.",
        "Why is my order delayed?",
        "Shipment is late and still not received."
    ],
    "product_question": [
        "I have a question about the product features.",
        "Does this item come with warranty?",
        "Need more details about this product.",
        "Is this product compatible with my device?"
    ],
    "account_access": [
        "I cannot access my account.",
        "Forgot my login password.",
        "Account locked — need help.",
        "Unable to sign in to my account."
    ]
}

def generate_synthetic_row(label):
    """Build one synthetic message for ``label``.

    A template sentence is drawn at random from ``label_templates[label]``
    and a randomly chosen suffix (possibly empty) is appended to it.
    """
    suffixes = [
        "",
        " please assist.",
        " urgent help needed.",
        " can you check?",
        " let me know."
    ]
    # Draw order matters for reproducibility: template first, then suffix.
    base_sentence = random.choice(label_templates[label])
    suffix = random.choice(suffixes)
    return base_sentence + suffix

# === Generate synthetic training data ===
# Seed Python's RNG so the sampled labels/templates/suffixes are the same on
# every run. Previously only the shuffle below (random_state=42) was
# reproducible; the rows themselves changed between runs.
random.seed(42)

N_TRAIN_SYNTHETIC = 1000  # number of synthetic training rows to generate
labels = list(label_templates.keys())

synthetic_rows = []
for _ in range(N_TRAIN_SYNTHETIC):
    label = random.choice(labels)
    text = generate_synthetic_row(label)
    synthetic_rows.append([text, label])

synthetic_df = pd.DataFrame(synthetic_rows, columns=['text', 'label'])

# Shuffle every generated row; the sample size follows the frame length
# instead of repeating the row count as a second literal.
train_final = synthetic_df.sample(n=len(synthetic_df), random_state=42).reset_index(drop=True)
print("Synthetic training data generated. Size:", len(train_final))

# === Generate synthetic test data if needed ===
# Number of synthetic rows required to top the seed test split up to 500;
# when the shortfall is zero or negative no synthetic rows are produced.
test_needed = 500 - len(test_seed)

test_synthetic = []
remaining = test_needed
while remaining > 0:
    label = random.choice(labels)
    # Each synthetic test message carries a trailing " test." marker.
    test_synthetic.append([generate_synthetic_row(label) + " test.", label])
    remaining -= 1

test_synthetic_df = pd.DataFrame(test_synthetic, columns=['text', 'label'])

# Combine seed test data + synthetic test data.
# The seed part contributes its cleaned text under the column name 'text';
# the synthetic part is the raw generated sentence.
seed_test_part = test_seed[['clean_text','label']].rename(columns={'clean_text':'text'})
test_final = pd.concat([seed_test_part, test_synthetic_df], ignore_index=True)

print("Final test data size:", len(test_final))

# === Save CSVs to Drive ===
# Each frame is written without its index column.
csv_outputs = (
    (train_final, f"{BASE_PATH}/train.csv"),
    (test_final, f"{BASE_PATH}/test.csv"),
)
for frame, destination in csv_outputs:
    frame.to_csv(destination, index=False)

print("Train and test CSVs saved to:", BASE_PATH)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Seed data loaded. Sample:
                                              text             label
0        The screen is stuck on a loading spinner.      tech_support
1      My order seems stuck at a sorting facility.    shipping_delay
2     Received the wrong item—requesting a refund.    refund_request
3  How do I change the email linked to my account?    account_access
4     What's the warranty period for this product?  product_question
Label distribution:
 label
tech_support        20
shipping_delay      20
refund_request      20
account_access      20
product_question    20
billing_issue       20
Name: count, dtype: int64
After cleaning, sample data:
                                              text             label  \
0        The screen is stuck on a loading spinner.      tech_support   
1      My order seems stuck at a sorting facility.    shipping_dela

In [None]:
# -*- coding: utf-8 -*-
"""Fine-Tuning TinyLlama Using QLoRA (Drive-ready version)
Automatically rewritten to use train/test data from Drive.
"""

# 1) Install dependencies
# NOTE(review): no versions are pinned, so the resolved package versions can
# differ between runs — consider pinning once a working combination is known.
!pip install -q transformers datasets accelerate bitsandbytes peft sentencepiece evaluate safetensors

# 2) Imports
import os
import json
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel

# 3) Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# === Paths ===
BASE_PATH = "/content/drive/MyDrive/LLM_Project/data"  # folder where train.csv/test.csv are saved
OUTPUT_PATH = "/content/drive/MyDrive/LLM_Project/fine_tune_output"
os.makedirs(OUTPUT_PATH, exist_ok=True)

TRAIN_FILE, TEST_FILE = (
    os.path.join(BASE_PATH, csv_name) for csv_name in ("train.csv", "test.csv")
)

# 4) Load CSVs
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)

# Quick look at what was loaded: shapes first, then column names.
print('train_df shape:', train_df.shape)
print('test_df shape:', test_df.shape)
print('train columns:', list(train_df.columns))
print('test columns :', list(test_df.columns))

# Ensure columns exist
# The loop variable is called `frame` rather than `df` so that this check
# does not rebind a notebook-level `df` left behind by an earlier cell.
required_columns = {'text', 'label'}
for frame, name in [(train_df, 'train'), (test_df, 'test')]:
    if not required_columns.issubset(frame.columns):
        raise ValueError(f"{name} dataframe must have columns 'text' and 'label'. Found: {frame.columns.tolist()}")

# Optional: label map
# Sorted list of the distinct training labels, stored next to the model
# outputs as JSON.
labels = sorted(train_df['label'].unique().tolist())
with open(os.path.join(OUTPUT_PATH, 'label_map.json'), 'w') as f:
    json.dump(labels, f)
print('Labels:', labels)

# 5) Format prompts
def format_row(text, label):
    """Render one prompt string.

    Layout: the fixed instruction line, a blank line, the customer message,
    another blank line, then ``Intent: <label>``.
    """
    instruction = "Classify the customer intent:"
    return f"{instruction}\n\n{text}\n\nIntent: {label}"

def _row_to_prompt(row):
    """Map one (text, label) row to a single-key record holding its prompt."""
    return {'text': format_row(row['text'], row['label'])}

# One-column ('text') frames of formatted prompts for each split.
train_prompts = train_df.apply(_row_to_prompt, axis=1, result_type='expand')
test_prompts = test_df.apply(_row_to_prompt, axis=1, result_type='expand')

train_ds, test_ds = Dataset.from_pandas(train_prompts), Dataset.from_pandas(test_prompts)

# 6) Load tokenizer + model (TinyLlama) with 4-bit quantization
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Quantization settings: 4-bit NF4 with double quantization, fp16 compute.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": 'nf4',
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
# Fall back to the end-of-sequence token when the tokenizer has no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): trust_remote_code=True is passed through unchanged — confirm
# it is actually required for this model id.
model_load_kwargs = dict(
    device_map='auto',
    quantization_config=bnb_config,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)

# 7) Prepare for k-bit training + attach LoRA
# Adapter settings: rank 16, alpha 32, 5% dropout, no bias terms, applied to
# the q_proj and v_proj modules of the causal LM.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj","v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# 8) Tokenize dataset
max_len = 256

def tokenize_and_add_labels(examples):
    """Tokenize prompts and build causal-LM labels with padding masked out.

    Every text is padded/truncated to ``max_len`` tokens. Labels mirror
    ``input_ids`` on real tokens and are set to -100 on padding positions,
    the value the loss ignores, so that padding is not a training target.
    Short prompts padded to ``max_len`` would otherwise contribute mostly
    padding targets.

    An end-of-sequence token is appended to each text before tokenization so
    the sequence still ends with one supervised stop token. The pad token can
    be the same token as EOS, which is why masking uses ``attention_mask``
    rather than the token id.
    NOTE(review): assumes the tokenizer does not already append EOS itself.

    Args:
        examples: Mapping with a 'text' entry holding either a list of
            strings (batched use) or a single string.

    Returns:
        The tokenizer output with an added 'labels' entry. For a single
        string input, a plain dict of un-batched lists.
    """
    raw = examples['text']
    is_single = isinstance(raw, str)
    texts = [raw] if is_single else list(raw)

    eos = tokenizer.eos_token or ""
    out = tokenizer(
        [t + eos for t in texts],
        truncation=True,
        padding='max_length',
        max_length=max_len,
    )
    out['labels'] = [
        [tok if keep else -100 for tok, keep in zip(ids, mask)]
        for ids, mask in zip(out['input_ids'], out['attention_mask'])
    ]
    if is_single:
        return {key: values[0] for key, values in out.items()}
    return out

# Tokenize both splits with identical options; the raw 'text' column is
# dropped so only the tokenizer outputs and labels remain.
map_options = dict(batched=True, remove_columns=['text'], keep_in_memory=True)
train_ds = train_ds.map(tokenize_and_add_labels, **map_options)
test_ds = test_ds.map(tokenize_and_add_labels, **map_options)

# Switch both datasets to the torch output format.
for tokenized_split in (train_ds, test_ds):
    tokenized_split.set_format(type='torch')

# 9) TrainingArguments + Trainer
# NOTE(review): max_steps=2 is smaller than warmup_steps (50), logging_steps
# (10) and save_steps (100) — this looks like a smoke-test setting; confirm
# before a real training run.
training_config = {
    "output_dir": os.path.join(OUTPUT_PATH,'lora-output'),
    # batch shape: 2 examples per device, gradients accumulated over 4 steps
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    # schedule
    "warmup_steps": 50,
    "max_steps": 2,
    "learning_rate": 2e-4,
    "fp16": True,
    # logging / checkpointing
    "logging_steps": 10,
    "save_steps": 100,
    "save_total_limit": 2,
    "report_to": "none",
}
training_args = TrainingArguments(**training_config)

trainer = Trainer(model=model, args=training_args, train_dataset=train_ds, eval_dataset=test_ds)

# 10) Run training
trainer.train()

# 11) Save LoRA adapter
adapter_path = os.path.join(OUTPUT_PATH,'lora_adapter')
model.save_pretrained(adapter_path)
tokenizer.save_pretrained(adapter_path)
print('Saved adapter to:', adapter_path)

# 12) Merge adapter into base weights (optional)
# The base model is reloaded in float16 WITHOUT the 4-bit quantization config.
# Merging LoRA weights into a quantized base would fold the adapter into
# quantized weights; merging into half-precision weights yields a regular
# fp16 checkpoint instead.
try:
    base = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map='auto',
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    peft_model = PeftModel.from_pretrained(base, adapter_path)
    merged = peft_model.merge_and_unload()
    merged_path = os.path.join(OUTPUT_PATH,'merged_model')
    merged.save_pretrained(merged_path)
    tokenizer.save_pretrained(merged_path)
    print('Saved merged model to:', merged_path)
except Exception as e:
    # Deliberately best-effort: the adapter itself is already saved above,
    # so a failed merge is reported and the cell continues.
    print('Merge step failed:', e)

# 13) Simple evaluation (first 50 examples)
def predict_text(text, max_new_tokens=16):
    """Greedy-decode a continuation of ``text`` and return only the new part.

    Args:
        text: Prompt passed to the tokenizer as-is.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    # Put the model in inference mode: it has just been trained, and dropout
    # must not be active while generating.
    model.eval()
    inputs = tokenizer(text, return_tensors='pt').to(model.device)
    with torch.no_grad():
        gen = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            # Explicit pad id so generation does not have to guess one.
            pad_token_id=tokenizer.pad_token_id,
        )
    # Keep only the tokens produced after the prompt.
    prompt_len = inputs['input_ids'].shape[-1]
    return tokenizer.decode(gen[0][prompt_len:], skip_special_tokens=True).strip()

# Count how often the gold label appears in the generated continuation.
correct = 0
N = min(50, len(test_df))
for i in range(N):
    row = test_df.iloc[i]
    # The prompt must NOT contain the gold label, otherwise the model is
    # shown the answer it is being scored on. Formatting with an empty label
    # and trimming the trailing space leaves a prompt that ends in "Intent:".
    prompt = format_row(row['text'], "").rstrip()
    pred = predict_text(prompt)
    if row['label'] in pred:
        correct += 1

if N:
    print(f'Exact-match label in model output (first {N}) = {correct}/{N} =', correct/N)
else:
    # An empty test set would otherwise raise ZeroDivisionError.
    print('No test rows available; evaluation skipped.')


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDrive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
train_df shape: (1000, 2)
test_df shape: (500, 2)
train columns: ['text', 'label']
test columns : ['text', 'label']
Labels: ['account_access', 'billing_issue', 'product_question', 'refund_request', 'shipping_delay', 'tech_support']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]