In [1]:
!pip install transformers torch scikit-learn pandas numpy



In [2]:
import json
import numpy as np

# Custom JSON encoder to handle numpy types
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (np.integer, np.int64, np.int32)):
            return int(obj)
        elif isinstance(obj, (np.floating, np.float64, np.float32)):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)


In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import (DistilBertTokenizerFast,DistilBertForSequenceClassification,Trainer,TrainingArguments,EarlyStoppingCallback)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
import json
from datetime import datetime

In [4]:
df = pd.read_csv('training_data_comprehensive.csv')
print(f"Loaded {len(df)} samples with {df['category'].nunique()} categories")

#Encoding
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

#mapping
label_map = {label: int(idx) for idx, label in enumerate(label_encoder.classes_)}
id_map = {int(idx): label for label, idx in label_map.items()}

print(f"\nCategories: {list(label_encoder.classes_)}")

# Save mappings
with open('model_config.json', 'w') as f:
    json.dump({
        'label_map': label_map,
        'id_map': {str(k): v for k, v in id_map.items()},
        'num_labels': len(label_map),
        'categories': list(label_encoder.classes_)
    }, f, indent=2)

print("Label mappings saved to model_config.json")

Loaded 2040 samples with 17 categories

Categories: ['Bill Payment', 'Cashback', 'EMI', 'Education', 'Entertainment', 'Fees', 'Food', 'Funds Transfer', 'Groceries', 'Healthcare', 'Income', 'Insurance', 'Others', 'Recharge', 'Shopping', 'Travel', 'Utilities']
Label mappings saved to model_config.json


In [5]:

X = df['description'].values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

print(f"Train: {len(X_train)} samples | Test: {len(X_test)} samples")

# Prepare datasets
train_df = pd.DataFrame({'description': X_train, 'label': y_train})
test_df = pd.DataFrame({'description': X_test, 'label': y_test})

Train: 1734 samples | Test: 306 samples


In [6]:
print("\n[STEP 3/6] Computing class weights for balanced training...")

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y),
    y=y
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)
print(f"Class weights computed: {class_weights.round(3)}")


[STEP 3/6] Computing class weights for balanced training...
Class weights computed: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [14]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_data(texts, labels):
    encodings = tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=32,
        return_tensors='pt'
    )
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': torch.tensor(labels, dtype=torch.long)
    }

train_encodings = tokenize_data(X_train, y_train)
test_encodings = tokenize_data(X_test, y_test)

class ExpenseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.encodings['labels'][idx]
        }

    def __len__(self):
        return len(self.encodings['labels'])

train_dataset = ExpenseDataset(train_encodings)
test_dataset = ExpenseDataset(test_encodings)

print(f"Tokenization complete")

Tokenization complete


In [8]:
#Custom Trainer with class weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_map)
)
model.config.id2label = id_map
model.config.label2id = label_map

# Metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_weighted = f1_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted
    }

# Training arguments
training_args = TrainingArguments(
    output_dir='./training_results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_dir='./logs',
    logging_steps=20,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    save_total_limit=2,
    seed=42,
    report_to='none'
)

# Initialize trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

print("\nStarting training")

train_result = trainer.train()

print("\nTraining complete!")
print(f"Training time: {train_result.metrics.get('train_runtime', 0):.2f} seconds")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,2.6123,2.365688,0.647059,0.631225,0.631225
2,1.0581,0.693693,0.915033,0.915623,0.915623
3,0.1938,0.16554,0.970588,0.970614,0.970614
4,0.1076,0.098603,0.973856,0.973709,0.973709
5,0.0467,0.058651,0.980392,0.980349,0.980349
6,0.0273,0.044025,0.986928,0.986822,0.986822
7,0.0277,0.04913,0.986928,0.986822,0.986822
8,0.0241,0.086453,0.980392,0.980274,0.980274
9,0.0121,0.060263,0.986928,0.986822,0.986822
10,0.0216,0.059226,0.986928,0.986822,0.986822



Training complete!
Training time: 159.99 seconds


In [15]:
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

#Calculate metrics
test_accuracy = accuracy_score(y_true, y_pred)
test_f1_macro = f1_score(y_true, y_pred, average='macro')
test_f1_weighted = f1_score(y_true, y_pred, average='weighted')

print(f"\nOverall Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"F1 Score (Macro): {test_f1_macro:.4f} ({test_f1_macro*100:.2f}%)")
print(f"F1 Score (Weighted): {test_f1_weighted:.4f} ({test_f1_weighted*100:.2f}%)")

# Classification report
print("\n" + classification_report(y_true, y_pred, target_names=[id_map[i] for i in range(len(id_map))], digits=4))

# Per-category accuracy
category_results = []
for i in range(len(id_map)):
    mask = y_true == i
    if mask.sum() > 0:
        cat_accuracy = (y_pred[mask] == y_true[mask]).mean()
        category_results.append({
            'category': id_map[i],
            'accuracy': cat_accuracy,
            'samples': mask.sum()
        })
        print(f"{id_map[i]:20} : {cat_accuracy:.4f} ({cat_accuracy*100:.2f}%) - {mask.sum()} samples")

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print(cm)


Overall Accuracy: 0.9869 (98.69%)
F1 Score (Macro): 0.9868 (98.68%)
F1 Score (Weighted): 0.9868 (98.68%)

                precision    recall  f1-score   support

  Bill Payment     1.0000    1.0000    1.0000        18
      Cashback     1.0000    1.0000    1.0000        18
           EMI     1.0000    1.0000    1.0000        18
     Education     1.0000    1.0000    1.0000        18
 Entertainment     1.0000    1.0000    1.0000        18
          Fees     0.9444    0.9444    0.9444        18
          Food     0.9474    1.0000    0.9730        18
Funds Transfer     0.9474    1.0000    0.9730        18
     Groceries     1.0000    0.9444    0.9714        18
    Healthcare     1.0000    1.0000    1.0000        18
        Income     1.0000    0.8889    0.9412        18
     Insurance     1.0000    1.0000    1.0000        18
        Others     1.0000    1.0000    1.0000        18
      Recharge     1.0000    1.0000    1.0000        18
      Shopping     1.0000    1.0000    1.0000       

In [16]:
#Save Model
model.save_pretrained("expense_model_distilbert")
tokenizer.save_pretrained("expense_model_distilbert")

print("Model saved to: expense_model_distilbert/")

# Save test results
test_results = {
    'test_accuracy': float(test_accuracy),
    'test_f1_macro': float(test_f1_macro),
    'test_f1_weighted': float(test_f1_weighted),
    'category_results': category_results,
    'training_time': train_result.metrics.get('train_runtime', 0),
    'total_samples': int(len(df)),
    'accuracy': float(cat_accuracy),
    'train_samples': len(X_train),
    'test_samples': len(X_test),
    'num_categories': len(label_map)
}

with open('test_results.json', 'w') as f:
    json.dump(test_results, f, indent=2, cls=NumpyEncoder)

print("Test results saved to: test_results.json")

Model saved to: expense_model_distilbert/
Test results saved to: test_results.json


In [11]:
test_transactions = [
    "UPI/SWIGGY",
    "NETFLIX MONTHLY",
    "LIC PREMIUM",
    "UPI/AMAZON",
    "SALARY CREDIT",
    "TATA POWER",
    "JIO RECHARGE",
    "UPI/OLA",
    "UPI/DMART",
    "SCHOOL FEE PAYMENT",
    "HOME LOAN EMI",
    "UPI/APOLLO PHARMACY",
    "UPI/FLIPKART",
    "INTEREST CREDIT",
    "YOUTUBE PREMIUM"
]

# Get model's device
device = next(model.parameters()).device
print(f"Model is on: {device}")

model.eval()

print("\nTesting model with real-world transaction descriptions:\n")
for transaction in test_transactions:
    inputs = tokenizer(
        transaction,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=32
    )

    # Move inputs to same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[0]
        predicted_idx = probs.argmax().item()
        confidence = probs[predicted_idx].item()
        predicted_category = id_map[predicted_idx]

    print(f"{transaction:30} → {predicted_category:15} (Confidence: {confidence:.2%})")


Model is on: cuda:0

Testing model with real-world transaction descriptions:

UPI/SWIGGY                     → Food            (Confidence: 99.38%)
NETFLIX MONTHLY                → Entertainment   (Confidence: 99.46%)
LIC PREMIUM                    → Insurance       (Confidence: 99.41%)
UPI/AMAZON                     → Shopping        (Confidence: 98.39%)
SALARY CREDIT                  → Income          (Confidence: 99.44%)
TATA POWER                     → Utilities       (Confidence: 99.13%)
JIO RECHARGE                   → Recharge        (Confidence: 98.94%)
UPI/OLA                        → Travel          (Confidence: 99.13%)
UPI/DMART                      → Groceries       (Confidence: 99.35%)
SCHOOL FEE PAYMENT             → Education       (Confidence: 99.50%)
HOME LOAN EMI                  → EMI             (Confidence: 99.45%)
UPI/APOLLO PHARMACY            → Healthcare      (Confidence: 99.55%)
UPI/FLIPKART                   → Shopping        (Confidence: 99.44%)
INTEREST CRE

In [12]:
print(f"\nModel: DistilBERT for Sequence Classification")
print(f"Total Training Samples: {len(X_train)}")
print(f"Total Test Samples: {len(X_test)}")
print(f"Number of Categories: {len(label_map)}")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")
print(f"F1 Score (Macro): {test_f1_macro*100:.2f}%")
print(f"Model saved to: expense_model_distilbert/")
print(f"\nTraining pipeline completed successfully!")


Model: DistilBERT for Sequence Classification
Total Training Samples: 1734
Total Test Samples: 306
Number of Categories: 17
Test Accuracy: 98.69%
F1 Score (Macro): 98.68%
Model saved to: expense_model_distilbert/

Training pipeline completed successfully!


In [13]:
!zip -r expense_model_distilbert.zip expense_model_distilbert

  adding: expense_model_distilbert/ (stored 0%)
  adding: expense_model_distilbert/tokenizer_config.json (deflated 75%)
  adding: expense_model_distilbert/vocab.txt (deflated 53%)
  adding: expense_model_distilbert/special_tokens_map.json (deflated 42%)
  adding: expense_model_distilbert/tokenizer.json (deflated 71%)
  adding: expense_model_distilbert/config.json (deflated 55%)
  adding: expense_model_distilbert/model.safetensors (deflated 8%)
