In [None]:
import pandas as pd
import re
import json
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

In [None]:
# Read the ticket export into a DataFrame.
df = pd.read_csv("Zoho Tickets 2021-2024.csv")

# Add the three output columns as empty strings; the classification loop
# fills them in row by row.
for output_column in ("Category", "Sub Category", "Tags"):
    df[output_column] = ""

# (category name, its subcategories) pairs, in the order they are shown to the model.
_category_tree = (
    ("Technical Issues", ("Internet Issue", "Hardware Problem", "Software Glitch")),
    ("Account/Password Related", ("Login Issue", "Password Reset", "Account Suspension")),
    ("Service Related", ("Change Dependent", "Intake Questionnaire", "Service Activation")),
    ("General Enquiries", ("Other Issues", "Questions and Concerns")),
)

# Classification taxonomy: a list of categories (each with its subcategories)
# plus a flat list of allowed tags.
taxonomy = {
    "categories": [
        {"name": category_name, "subcategories": list(children)}
        for category_name, children in _category_tree
    ],
    "Tags": [
        "internet", "issue", "change", "dependent", "account", "billing",
        "intake", "questionnaire", "support", "requests", "b2b", "reacted",
        "fully delete file", "login", "password", "service", "general",
        "inquiry", "technical",
    ],
}

# Pre-rendered text fragments that get interpolated into the prompt.
category_names = [entry["name"] for entry in taxonomy["categories"]]
categories_str = ", ".join(category_names)

# One "- <category>: <sub1>, <sub2>, ..." line per category; a category with
# no subcategories is rendered with the literal text 'None'.
subcategory_lines = []
for entry in taxonomy["categories"]:
    joined_children = ", ".join(entry["subcategories"])
    subcategory_lines.append(f"- {entry['name']}: {joined_children or 'None'}")
subcategory_str = "\n".join(subcategory_lines)

tags_str = ", ".join(taxonomy["Tags"])

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    # Half precision is kept for the GPU path only; on the CPU fallback the
    # weights are loaded in float32 instead (needs roughly twice the memory).
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)

# Build the pipeline on the same device the model was moved to. The previous
# hard-coded `device=0` ignored the CPU fallback computed above and broke on
# machines without CUDA.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

In [None]:
# A flat {...} object (no nested braces) that mentions the three required
# keys in this order.
_CLASSIFICATION_RE = re.compile(
    r'\{[^{}]*"Category"[^{}]*"Sub Category"[^{}]*"Tags"[^{}]*\}'
)


def _ticket_field(row, column):
    """Return ``row[column]`` for the prompt, or ``""`` if it is absent or missing.

    Empty CSV cells are read by pandas as NaN, which an f-string would render
    as the literal text "nan"; those are mapped to an empty string instead.
    """
    value = row.get(column, "")
    if not isinstance(value, str) and pd.isna(value):
        return ""
    return value


def _extract_classification(generated_text):
    """Pull the first usable classification out of the model's output.

    Args:
        generated_text: Text produced by the model.

    Returns:
        A ``(category, sub_category, tags)`` tuple of strings, where ``tags``
        is the tag list joined with ", "; or ``None`` when no candidate JSON
        object parses and satisfies the checks (non-empty category and
        subcategory, 3 to 5 tags, every tag a string).
    """
    for json_text in _CLASSIFICATION_RE.findall(generated_text):
        try:
            parsed = json.loads(json_text)
            category = parsed.get("Category", "").strip()
            sub_category = parsed.get("Sub Category", "").strip()
            tags = parsed.get("Tags", [])
        except (ValueError, AttributeError):
            # Malformed JSON, or a field that is not a string: try the next candidate.
            continue

        if not (category and sub_category):
            continue
        if not (isinstance(tags, list) and 3 <= len(tags) <= 5):
            continue
        if not all(isinstance(tag, str) for tag in tags):
            continue

        return category, sub_category, ", ".join(tag.strip() for tag in tags)

    return None


# Classify every ticket: build a prompt from the row, generate, parse the JSON
# answer and write the three output columns. A row is only written once the
# whole answer has been validated, so a bad answer never leaves it half-filled.
for i, row in tqdm(df.iterrows(), total=len(df)):
    context = f"""
    Subject: {_ticket_field(row, 'Subject')}
    Description: {_ticket_field(row, 'Description')}
    Department: {_ticket_field(row, 'Department')}
    """

    prompt = f"""
    You are a smart classification assistant specialized in support ticket information.

    Analyze the following ticket details and classify them according to the taxonomy provided.

    Ticket details:
    {context}

    Taxonomy:
    Categories: {categories_str}
    Subcategories (each belongs to one category):
    {subcategory_str}
    Tags: {tags_str}

    Instructions:
    - Return your answer **ONLY** as a valid JSON object.
    - The JSON must contain **exactly** the keys: `"Category"`, `"Sub Category"`, and `"Tags"`.
    - `"Category"` must be one of the provided categories.
    - `"Sub Category"` must be a valid subcategory of the selected category.
    - `"Tags"` must be an array of 3 to 5 relevant tags from the list.
    - **Do NOT include** any extra text, examples, labels like "Classification:", or multiple outputs.
    - **Do NOT wrap** the JSON in markdown (```json ... ```), return only the raw JSON.
    - If a classification is unclear, infer the best possible match.
    - All three fields are **required**. Never return an empty JSON.
    - Return **only one** valid JSON object.

    Begin classification.
    """

    try:
        result = pipe(
            prompt,
            max_new_tokens=128,
            # Temperature only matters when sampling, so make sampling explicit
            # rather than relying on the model's default generation config.
            do_sample=True,
            temperature=0.3,
            # Return only the completion, so the JSON search below never scans
            # the echoed prompt or the ticket text.
            return_full_text=False,
        )[0]["generated_text"]

        classification = _extract_classification(result)

        if classification is None:
            print(f"Row {i} skipped due to no valid JSON classification.")
        else:
            # `tag_text` deliberately avoids the name `tags_str`, which holds
            # the taxonomy tag list interpolated into the prompt above.
            category, sub_category, tag_text = classification
            df.at[i, "Category"] = category
            df.at[i, "Sub Category"] = sub_category
            df.at[i, "Tags"] = tag_text
            print(f"Row {i} SUCCESS.")

    except Exception as e:
        # Best effort: a failure on one ticket must not abort the whole run.
        print(f"Row {i} failed: {e}")

In [None]:
# Write the tickets, including the three classification columns, to CSV
# without the DataFrame index.
output_path = "classified_output_2124.csv"
df.to_csv(output_path, index=False)
print("Classification complete.")