In [None]:
!pip install -q transformers datasets accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import pandas as pd
import numpy as np
import torch
import time
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline,
    DataCollatorWithPadding,
    EvalPrediction
)
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility.
# SEED is the single value fed to every RNG seeded here; change it in one place.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# Configuration
# WORD_THRESHOLD: texts with more whitespace-separated words than this are sent to the
# summarizer; the same value caps the word-truncation fallback used when summarization fails.
WORD_THRESHOLD = 300  # Summarize texts longer than this many words
# MIN/MAX_SUMMARY_LENGTH are forwarded as min_length / max_length to the summarizer call.
# NOTE(review): presumably measured in tokens rather than words - confirm against the pipeline docs.
MIN_SUMMARY_LENGTH = 50  # Minimum summary length
MAX_SUMMARY_LENGTH = 200  # Maximum summary length
# Default `batch_size` of summarize_texts (number of texts handled per progress step).
BATCH_SIZE_SUMMARY = 8  # Batch size for summarization
# Not referenced in the cells visible here; presumably consumed by the training setup - confirm.
BATCH_SIZE_TRAINING = 16  # Batch size for training

# Model saving configuration
# All artifacts live under MODEL_SAVE_DIR. load_model_for_inference rebuilds these paths from
# the literal names "tokenizer", "model", "metadata.json" and "label_encoder.pkl", so the
# names below must stay in sync with that function.
MODEL_SAVE_DIR = "./saved_ticket_classifier"
TOKENIZER_SAVE_DIR = os.path.join(MODEL_SAVE_DIR, "tokenizer")
MODEL_WEIGHTS_DIR = os.path.join(MODEL_SAVE_DIR, "model")
METADATA_FILE = os.path.join(MODEL_SAVE_DIR, "metadata.json")
LABEL_ENCODER_FILE = os.path.join(MODEL_SAVE_DIR, "label_encoder.pkl")

def create_save_directories():
    """Ensure the save root and its tokenizer / model sub-folders exist.

    Directories that already exist are left untouched. The root path is
    printed once all three have been created.
    """
    for directory in (MODEL_SAVE_DIR, TOKENIZER_SAVE_DIR, MODEL_WEIGHTS_DIR):
        os.makedirs(directory, exist_ok=True)
    print(f"Created save directories: {MODEL_SAVE_DIR}")

def save_model_components(model, tokenizer, label_encoder, training_config, results):
    """Persist every artifact needed to run inference later.

    Written in order: model weights, tokenizer files, the pickled label
    encoder, and a JSON metadata file describing the run.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        label_encoder: Fitted encoder; it is pickled as-is and its
            ``classes_`` array is recorded in the metadata.
        training_config: Stored under the "training_config" metadata key.
        results: Stored under the "results" metadata key.

    Returns:
        MODEL_SAVE_DIR, the root directory everything was written to.

    Note:
        "model_name", "max_length" and the preprocessing model names in the
        metadata are hard-coded literals; they are not read from ``model``
        or ``tokenizer``.
    """
    create_save_directories()

    print("Saving model components...")

    # Weights first, then the tokenizer, each into its own sub-folder
    model.save_pretrained(MODEL_WEIGHTS_DIR)
    print(f"✓ Model saved to: {MODEL_WEIGHTS_DIR}")

    tokenizer.save_pretrained(TOKENIZER_SAVE_DIR)
    print(f"✓ Tokenizer saved to: {TOKENIZER_SAVE_DIR}")

    # The encoder is pickled whole so inverse_transform works after loading
    with open(LABEL_ENCODER_FILE, 'wb') as encoder_file:
        pickle.dump(label_encoder, encoder_file)
    print(f"✓ Label encoder saved to: {LABEL_ENCODER_FILE}")

    # Assemble the metadata record (key order is preserved in the JSON file)
    class_names = label_encoder.classes_
    preprocessing_params = {
        "summarization_model": "facebook/bart-large-cnn",
        "tokenizer_model": "distilbert-base-uncased"
    }
    metadata = {
        "model_name": "distilbert-base-uncased",
        "num_labels": len(class_names),
        "label_classes": class_names.tolist(),
        "max_length": 512,
        "word_threshold": WORD_THRESHOLD,
        "min_summary_length": MIN_SUMMARY_LENGTH,
        "max_summary_length": MAX_SUMMARY_LENGTH,
        "training_config": training_config,
        "results": results,
        "save_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "preprocessing_params": preprocessing_params
    }

    # default=str stringifies anything json cannot encode natively
    with open(METADATA_FILE, 'w') as metadata_file:
        json.dump(metadata, metadata_file, indent=2, default=str)
    print(f"✓ Metadata saved to: {METADATA_FILE}")

    print(f"\n✅ All model components saved successfully to: {MODEL_SAVE_DIR}")
    return MODEL_SAVE_DIR


In [None]:
def load_model_for_inference(model_dir=MODEL_SAVE_DIR):
    """Restore the saved classifier, tokenizer, label encoder and metadata.

    Expects ``model_dir`` to contain ``metadata.json``, ``label_encoder.pkl``
    and the ``tokenizer`` / ``model`` sub-folders.

    Args:
        model_dir: Root directory holding the saved artifacts.

    Returns:
        Tuple ``(model, tokenizer, label_encoder, metadata)``.

    Example:
        model, tokenizer, label_encoder, metadata = load_model_for_inference()

    Warning:
        The label encoder is restored with ``pickle.load``, which can execute
        arbitrary code; only point ``model_dir`` at directories you trust.
    """
    print(f"Loading model components from: {model_dir}")

    metadata_path = os.path.join(model_dir, "metadata.json")
    with open(metadata_path, 'r') as metadata_file:
        metadata = json.load(metadata_file)

    tokenizer_path = os.path.join(model_dir, "tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    weights_path = os.path.join(model_dir, "model")
    model = AutoModelForSequenceClassification.from_pretrained(weights_path)

    # Trusted input only: unpickling runs whatever the file tells it to
    encoder_path = os.path.join(model_dir, "label_encoder.pkl")
    with open(encoder_path, 'rb') as encoder_file:
        label_encoder = pickle.load(encoder_file)

    print("✅ Model components loaded successfully!")
    print(f"Model: {metadata['model_name']}")
    print(f"Number of classes: {metadata['num_labels']}")
    print(f"Classes: {metadata['label_classes']}")

    return model, tokenizer, label_encoder, metadata

In [None]:
def predict_ticket_category(text, model, tokenizer, label_encoder, device='cpu', max_length=512):
    """Predict the category of a single ticket text.

    Args:
        text: Ticket text; it is passed through ``str()`` before tokenization.
        model: Sequence-classification model whose output exposes ``.logits``.
            It is switched to eval mode and moved to ``device``.
        tokenizer: Callable returning ``input_ids`` and ``attention_mask``
            tensors when invoked with ``return_tensors='pt'``.
        label_encoder: Fitted encoder providing ``classes_`` and
            ``inverse_transform``.
        device: Device the model and inputs are moved to.
        max_length: Inputs longer than this are truncated by the tokenizer.

    Returns:
        dict with keys:
            'predicted_class': label returned by ``inverse_transform``,
            'confidence': softmax probability of the predicted class (float),
            'class_probabilities': mapping of every class label to its
                softmax probability (float).
    """
    model.eval()
    model.to(device)

    # Only one sequence is encoded, so no padding is requested: padding a lone
    # text out to max_length would make every call pay for a full-length
    # forward pass. Truncation still bounds the input at max_length.
    encoding = tokenizer(
        str(text),
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Row 0 is the only row: exactly one text was encoded
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].cpu()

    predicted_index = int(torch.argmax(probabilities))
    # One host-side conversion instead of one tensor-to-numpy round trip per class
    class_probs = probabilities.tolist()

    # Convert the class index back to the original label
    predicted_label = label_encoder.inverse_transform([predicted_index])[0]

    return {
        'predicted_class': predicted_label,
        'confidence': float(class_probs[predicted_index]),
        'class_probabilities': {
            label: float(class_probs[i])
            for i, label in enumerate(label_encoder.classes_)
        }
    }


In [None]:
!pip install -q transformers datasets accelerate

import pandas as pd
import numpy as np
import torch
import time
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline,
    DataCollatorWithPadding,
    EvalPrediction
)
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility.
# SEED is the single value fed to every RNG seeded here; change it in one place.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Configuration
# WORD_THRESHOLD: texts with more whitespace-separated words than this are sent to the
# summarizer; the same value caps the word-truncation fallback used when summarization fails.
WORD_THRESHOLD = 300  # Summarize texts longer than this many words
# MIN/MAX_SUMMARY_LENGTH are forwarded as min_length / max_length to the summarizer call.
# NOTE(review): presumably measured in tokens rather than words - confirm against the pipeline docs.
MIN_SUMMARY_LENGTH = 50  # Minimum summary length
MAX_SUMMARY_LENGTH = 200  # Maximum summary length
# Default `batch_size` of summarize_texts (number of texts handled per progress step).
BATCH_SIZE_SUMMARY = 8  # Batch size for summarization
# Not referenced in the cells visible here; presumably consumed by the training setup - confirm.
BATCH_SIZE_TRAINING = 16  # Batch size for training

# Model saving configuration
# All artifacts live under MODEL_SAVE_DIR. load_model_for_inference rebuilds these paths from
# the literal names "tokenizer", "model", "metadata.json" and "label_encoder.pkl", so the
# names below must stay in sync with that function.
MODEL_SAVE_DIR = "./saved_ticket_classifier"
TOKENIZER_SAVE_DIR = os.path.join(MODEL_SAVE_DIR, "tokenizer")
MODEL_WEIGHTS_DIR = os.path.join(MODEL_SAVE_DIR, "model")
METADATA_FILE = os.path.join(MODEL_SAVE_DIR, "metadata.json")
LABEL_ENCODER_FILE = os.path.join(MODEL_SAVE_DIR, "label_encoder.pkl")

def create_save_directories():
    """Ensure the save root and its tokenizer / model sub-folders exist.

    Directories that already exist are left untouched. The root path is
    printed once all three have been created.
    """
    for directory in (MODEL_SAVE_DIR, TOKENIZER_SAVE_DIR, MODEL_WEIGHTS_DIR):
        os.makedirs(directory, exist_ok=True)
    print(f"Created save directories: {MODEL_SAVE_DIR}")

def save_model_components(model, tokenizer, label_encoder, training_config, results):
    """Persist every artifact needed to run inference later.

    Written in order: model weights, tokenizer files, the pickled label
    encoder, and a JSON metadata file describing the run.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        label_encoder: Fitted encoder; it is pickled as-is and its
            ``classes_`` array is recorded in the metadata.
        training_config: Stored under the "training_config" metadata key.
        results: Stored under the "results" metadata key.

    Returns:
        MODEL_SAVE_DIR, the root directory everything was written to.

    Note:
        "model_name", "max_length" and the preprocessing model names in the
        metadata are hard-coded literals; they are not read from ``model``
        or ``tokenizer``.
    """
    create_save_directories()

    print("Saving model components...")

    # Weights first, then the tokenizer, each into its own sub-folder
    model.save_pretrained(MODEL_WEIGHTS_DIR)
    print(f"✓ Model saved to: {MODEL_WEIGHTS_DIR}")

    tokenizer.save_pretrained(TOKENIZER_SAVE_DIR)
    print(f"✓ Tokenizer saved to: {TOKENIZER_SAVE_DIR}")

    # The encoder is pickled whole so inverse_transform works after loading
    with open(LABEL_ENCODER_FILE, 'wb') as encoder_file:
        pickle.dump(label_encoder, encoder_file)
    print(f"✓ Label encoder saved to: {LABEL_ENCODER_FILE}")

    # Assemble the metadata record (key order is preserved in the JSON file)
    class_names = label_encoder.classes_
    preprocessing_params = {
        "summarization_model": "facebook/bart-large-cnn",
        "tokenizer_model": "distilbert-base-uncased"
    }
    metadata = {
        "model_name": "distilbert-base-uncased",
        "num_labels": len(class_names),
        "label_classes": class_names.tolist(),
        "max_length": 512,
        "word_threshold": WORD_THRESHOLD,
        "min_summary_length": MIN_SUMMARY_LENGTH,
        "max_summary_length": MAX_SUMMARY_LENGTH,
        "training_config": training_config,
        "results": results,
        "save_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "preprocessing_params": preprocessing_params
    }

    # default=str stringifies anything json cannot encode natively
    with open(METADATA_FILE, 'w') as metadata_file:
        json.dump(metadata, metadata_file, indent=2, default=str)
    print(f"✓ Metadata saved to: {METADATA_FILE}")

    print(f"\n✅ All model components saved successfully to: {MODEL_SAVE_DIR}")
    return MODEL_SAVE_DIR

def load_model_for_inference(model_dir=MODEL_SAVE_DIR):
    """Restore the saved classifier, tokenizer, label encoder and metadata.

    Expects ``model_dir`` to contain ``metadata.json``, ``label_encoder.pkl``
    and the ``tokenizer`` / ``model`` sub-folders.

    Args:
        model_dir: Root directory holding the saved artifacts.

    Returns:
        Tuple ``(model, tokenizer, label_encoder, metadata)``.

    Example:
        model, tokenizer, label_encoder, metadata = load_model_for_inference()

    Warning:
        The label encoder is restored with ``pickle.load``, which can execute
        arbitrary code; only point ``model_dir`` at directories you trust.
    """
    print(f"Loading model components from: {model_dir}")

    metadata_path = os.path.join(model_dir, "metadata.json")
    with open(metadata_path, 'r') as metadata_file:
        metadata = json.load(metadata_file)

    tokenizer_path = os.path.join(model_dir, "tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    weights_path = os.path.join(model_dir, "model")
    model = AutoModelForSequenceClassification.from_pretrained(weights_path)

    # Trusted input only: unpickling runs whatever the file tells it to
    encoder_path = os.path.join(model_dir, "label_encoder.pkl")
    with open(encoder_path, 'rb') as encoder_file:
        label_encoder = pickle.load(encoder_file)

    print("✅ Model components loaded successfully!")
    print(f"Model: {metadata['model_name']}")
    print(f"Number of classes: {metadata['num_labels']}")
    print(f"Classes: {metadata['label_classes']}")

    return model, tokenizer, label_encoder, metadata

def predict_ticket_category(text, model, tokenizer, label_encoder, device='cpu', max_length=512):
    """Predict the category of a single ticket text.

    Args:
        text: Ticket text; it is passed through ``str()`` before tokenization.
        model: Sequence-classification model whose output exposes ``.logits``.
            It is switched to eval mode and moved to ``device``.
        tokenizer: Callable returning ``input_ids`` and ``attention_mask``
            tensors when invoked with ``return_tensors='pt'``.
        label_encoder: Fitted encoder providing ``classes_`` and
            ``inverse_transform``.
        device: Device the model and inputs are moved to.
        max_length: Inputs longer than this are truncated by the tokenizer.

    Returns:
        dict with keys:
            'predicted_class': label returned by ``inverse_transform``,
            'confidence': softmax probability of the predicted class (float),
            'class_probabilities': mapping of every class label to its
                softmax probability (float).
    """
    model.eval()
    model.to(device)

    # Only one sequence is encoded, so no padding is requested: padding a lone
    # text out to max_length would make every call pay for a full-length
    # forward pass. Truncation still bounds the input at max_length.
    encoding = tokenizer(
        str(text),
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Row 0 is the only row: exactly one text was encoded
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].cpu()

    predicted_index = int(torch.argmax(probabilities))
    # One host-side conversion instead of one tensor-to-numpy round trip per class
    class_probs = probabilities.tolist()

    # Convert the class index back to the original label
    predicted_label = label_encoder.inverse_transform([predicted_index])[0]

    return {
        'predicted_class': predicted_label,
        'confidence': float(class_probs[predicted_index]),
        'class_probabilities': {
            label: float(class_probs[i])
            for i, label in enumerate(label_encoder.classes_)
        }
    }

def count_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Missing values (as judged by ``pd.isna``) and the empty string count as
    zero words; anything else is converted with ``str()`` before splitting.
    """
    is_blank = pd.isna(text) or text == ""
    return 0 if is_blank else len(str(text).split())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
def summarize_texts(texts, summarizer, batch_size=BATCH_SIZE_SUMMARY):
    """Return one output string per input text, summarizing only the long ones.

    Per text:
      * missing or empty input becomes ``""``;
      * texts of at most WORD_THRESHOLD words are returned as ``str(text)``;
      * longer texts are passed to ``summarizer`` and its 'summary_text' is used;
      * if anything raises, the error is printed and the first WORD_THRESHOLD
        words of the original text are used instead.

    Note that the summarizer is invoked once per text. ``batch_size`` only
    controls how many texts are handled between progress messages and CUDA
    cache clears.

    Args:
        texts: Sliceable sequence of texts.
        summarizer: Callable returning a list whose first item has a
            'summary_text' entry.
        batch_size: Number of texts per progress step.

    Returns:
        List of strings, in input order.
    """
    full_batches, leftover = divmod(len(texts), batch_size)
    total_batches = full_batches + (1 if leftover else 0)

    results = []
    for i in range(0, len(texts), batch_size):
        print(f"Processing summarization batch {i//batch_size + 1}/{total_batches}")

        for text in texts[i:i + batch_size]:
            try:
                if pd.isna(text) or text == "":
                    condensed = ""
                elif count_words(text) <= WORD_THRESHOLD:
                    # Short enough already: keep the text verbatim
                    condensed = str(text)
                else:
                    generated = summarizer(
                        str(text),
                        max_length=MAX_SUMMARY_LENGTH,
                        min_length=MIN_SUMMARY_LENGTH,
                        do_sample=False,
                        truncation=True
                    )
                    condensed = generated[0]['summary_text']
            except Exception as e:
                print(f"Error summarizing text: {str(e)[:100]}...")
                # Best-effort fallback: keep the leading words of the original
                condensed = " ".join(str(text).split()[:WORD_THRESHOLD])

            results.append(condensed)

        # Release cached GPU memory after every group of texts
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return results

In [None]:
# Load and Prepare Dataset
print("Loading dataset...")

# Read the CSV, drop rows missing the text or the target queue, and force the text column to str
df = (
    pd.read_csv("/kaggle/input/processes-tickets-1/processed_tickets.csv")
    .dropna(subset=['full_text', 'queue'])
    .astype({'full_text': str})
)
print(f"Dataset shape: {df.shape}")
print(f"Number of unique queues: {df['queue'].nunique()}")

# Word counts of the raw text show how many tickets exceed the summarization threshold
df['word_count'] = df['full_text'].map(count_words)
print(f"\nText length statistics:")
print(f"Mean words: {df['word_count'].mean():.1f}")
print(f"Median words: {df['word_count'].median():.1f}")
print(f"Max words: {df['word_count'].max()}")
print(f"Texts longer than {WORD_THRESHOLD} words: {(df['word_count'] > WORD_THRESHOLD).sum()}")

Loading dataset...
Dataset shape: (11923, 7)
Number of unique queues: 5

Text length statistics:
Mean words: 62.7
Median words: 58.0
Max words: 276
Texts longer than 300 words: 0


In [None]:
# Initialize summarization pipeline
print("\nInitializing summarization model...")

# Pipeline device index: first GPU when CUDA is available, otherwise -1 (CPU)
if torch.cuda.is_available():
    device = 0
else:
    device = -1

try:
    # Half precision on GPU, full precision on CPU
    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device=device,
        torch_dtype=torch.float16 if device == 0 else torch.float32
    )
    print("Summarization model loaded successfully")
except Exception as e:
    print(f"Error loading BART model, falling back to simpler approach: {e}")

    # Fallback defined only when the model could not be loaded: plain word truncation
    def simple_summarizer(text):
        """Return ``text`` unchanged when short, else its first WORD_THRESHOLD words joined by spaces."""
        tokens = str(text).split()
        return text if len(tokens) <= WORD_THRESHOLD else " ".join(tokens[:WORD_THRESHOLD])

    # None signals the next cell to use simple_summarizer instead of the pipeline
    summarizer = None


Initializing summarization model...


Device set to use cuda:0


Summarization model loaded successfully


In [None]:
# Summarize texts
print("\nStarting text summarization...")
start_time = time.time()
if summarizer is None:
    # Model failed to load earlier: fall back to simple truncation
    df['summarized_text'] = df['full_text'].apply(simple_summarizer)
else:
    # Model available: run every text through the BART-backed helper
    texts_to_process = df['full_text'].tolist()
    summarized_texts = summarize_texts(texts_to_process, summarizer)
    df['summarized_text'] = summarized_texts
summarization_time = time.time() - start_time
print(f"Summarization completed in {summarization_time:.2f} seconds")

# Drop the summarization model and release cached GPU memory before training.
# NOTE: once `summarizer` has been deleted, re-running this cell raises NameError
# unless the initialization cell is executed again first.
if summarizer is not None:
    del summarizer
if torch.cuda.is_available():
    torch.cuda.empty_cache()


Starting text summarization...
Processing summarization batch 1/1491
Processing summarization batch 2/1491
Processing summarization batch 3/1491
Processing summarization batch 4/1491
Processing summarization batch 5/1491
Processing summarization batch 6/1491
Processing summarization batch 7/1491
Processing summarization batch 8/1491
Processing summarization batch 9/1491
Processing summarization batch 10/1491
Processing summarization batch 11/1491
Processing summarization batch 12/1491
Processing summarization batch 13/1491
Processing summarization batch 14/1491
Processing summarization batch 15/1491
Processing summarization batch 16/1491
Processing summarization batch 17/1491
Processing summarization batch 18/1491
Processing summarization batch 19/1491
Processing summarization batch 20/1491
Processing summarization batch 21/1491
Processing summarization batch 22/1491
Processing summarization batch 23/1491
Processing summarization batch 24/1491
Processing summarization batch 25/1491
Pr

In [None]:
# Word counts after summarization, reported in the same form as the raw-text statistics
df['summarized_word_count'] = df['summarized_text'].map(count_words)

print(f"\nSummarized text statistics:")
print(f"Mean words: {df['summarized_word_count'].mean():.1f}")
print(f"Median words: {df['summarized_word_count'].median():.1f}")
print(f"Max words: {df['summarized_word_count'].max()}")


Summarized text statistics:
Mean words: 62.7
Median words: 58.0
Max words: 276


In [None]:
# Encode target labels: each distinct queue name becomes an integer class id
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['queue'])

# classes_ holds the fitted class names, so its length is the number of labels
num_labels = label_encoder.classes_.shape[0]
print(f"\nNumber of classes: {num_labels}")
print(f"Classes: {list(label_encoder.classes_)}")



Number of classes: 5
Classes: ['Billing Support', 'Customer Service', 'Product Support', 'Sales & HR', 'Technical Support']


In [None]:
# Keep only the model input text and the encoded target
df_final = df.loc[:, ['summarized_text', 'label']].copy()

# 80/20 train/validation split, stratified on the label, with a fixed random_state
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_final['summarized_text'],
    df_final['label'],
    test_size=0.2,
    random_state=42,
    stratify=df_final['label'],
)

print(f"\nDataset split:")
print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")


Dataset split:
Training samples: 9538
Validation samples: 2385
