In [10]:
# Upgrade to the latest transformers (and optionally accelerate)
!pip install --upgrade transformers accelerate



In [11]:
# Install the Tesseract engine
!apt-get update -y
!apt-get install -y tesseract-ocr libtesseract-dev

# Install the Python wrappers
!pip install pytesseract optuna


Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly 

In [12]:
import copy
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, random_split, Dataset

# OCR and text model imports
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, BertTokenizer, BertModel

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.ensemble import GradientBoostingClassifier
import pytesseract

# Hyperparameter tuning
import optuna

# Visualization
import matplotlib.pyplot as plt
from PIL import Image, UnidentifiedImageError

#### Configurations

In [13]:
# Root folder of the dataset; each sub-folder is treated as one class.
DATA_DIR = 'receipt_dataset'

# Training hyper-parameters.
BATCH_SIZE, NUM_EPOCHS = 4, 10
LEARNING_RATE = 1e-4

# Fraction of the samples held out for validation.
VAL_SPLIT = 0.2

# Use the GPU when one is available, otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#### Data Augmentation & Transforms

In [14]:
# Final two steps shared by both pipelines: PIL image -> tensor, then
# per-channel normalisation with the given mean / std.
# NOTE(review): these look like the ImageNet statistics expected by the
# pretrained backbone — confirm.
_to_normalized_tensor = [
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]

# Training pipeline: resize, then random crop / flip / colour jitter.
train_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    *_to_normalized_tensor,
])

# Validation pipeline: deterministic resize only.
val_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    *_to_normalized_tensor,
])

#### Dataset

In [15]:
class OCRImageTextDataset(Dataset):
    """Image-folder dataset that also yields tokenized OCR text for each image.

    ``root_dir`` must contain one sub-folder per class (hidden folders are
    ignored). Every file that PIL can verify becomes a sample. Each item is a
    tuple ``(image_tensor, input_ids, attention_mask, label_idx)``.

    Args:
        root_dir: Directory whose sub-folders are the class names.
        image_transform: Callable applied to the RGB PIL image; when omitted
            the image is only converted with ``transforms.ToTensor()``.
        ocr_processor: Processor paired with ``ocr_model``. It is called with
            ``images=...`` and its ``tokenizer`` decodes the generated ids.
        ocr_model: Generative OCR model exposing ``generate``. When either the
            processor or the model is missing, ``pytesseract`` is used instead.
        text_tokenizer: Callable tokenizer applied to the OCR text.
        max_length: Pad/truncate length passed to the tokenizer.
    """

    def __init__(self, root_dir, image_transform=None, ocr_processor=None, ocr_model=None, text_tokenizer=None, max_length=128):
        self.samples = []
        self.image_transform = image_transform
        self.ocr_processor = ocr_processor
        self.ocr_model = ocr_model
        self.text_tokenizer = text_tokenizer
        self.max_length = max_length
        # OCR runs on the un-transformed image, so its result depends only on
        # the file. Cache it per path instead of re-running OCR every epoch.
        self._text_cache = {}

        # Get valid class folders
        classes = [
            d for d in sorted(os.listdir(root_dir))
            if os.path.isdir(os.path.join(root_dir, d)) and not d.startswith(".")
        ]

        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(classes)

        # Collect and verify readable image paths. Sorting makes the sample
        # order (and therefore any index-based split) independent of the
        # arbitrary order returned by os.listdir.
        for cls in classes:
            cls_dir = os.path.join(root_dir, cls)
            for fname in sorted(os.listdir(cls_dir)):
                img_path = os.path.join(cls_dir, fname)
                try:
                    with Image.open(img_path) as img:
                        img.verify()  # validate image
                    self.samples.append((img_path, cls))
                except (UnidentifiedImageError, OSError):
                    print(f"⚠️ Skipping corrupted image during init: {img_path}")

    def __len__(self):
        """Return the number of verified samples."""
        return len(self.samples)

    def _load_rgb(self, img_path):
        """Open ``img_path`` as an RGB image; return ``None`` if it cannot be read."""
        try:
            # The context manager closes the underlying file; convert() returns
            # an independent in-memory copy.
            with Image.open(img_path) as img:
                return img.convert("RGB")
        except (UnidentifiedImageError, OSError):
            print(f"⚠️ Failed to load image at runtime: {img_path}")
            return None

    def _extract_text(self, image, img_path):
        """Return the OCR text for ``image``, or ``""`` if OCR fails.

        Successful results are cached per path; failures are not cached so a
        transient error can be retried on the next access.
        """
        cached = self._text_cache.get(img_path)
        if cached is not None:
            return cached

        try:
            if self.ocr_processor and self.ocr_model:
                pixel_values = self.ocr_processor(images=image, return_tensors='pt').pixel_values
                # Follow the OCR model's own device instead of a notebook global.
                pixel_values = pixel_values.to(next(self.ocr_model.parameters()).device)
                generated_ids = self.ocr_model.generate(pixel_values)
                text = self.ocr_processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
            else:
                text = pytesseract.image_to_string(image)
        except Exception as e:
            print(f"⚠️ OCR failed on {img_path}, error: {e}")
            return ""

        self._text_cache[img_path] = text
        return text

    def __getitem__(self, idx):
        """Return ``(image_tensor, input_ids, attention_mask, label_idx)``.

        If the requested image cannot be read, the following samples are tried
        in order (wrapping around) and the first readable one is returned.

        Raises:
            IndexError: If ``idx`` is out of range.
            RuntimeError: If no sample in the dataset can be read.
        """
        # Direct indexing keeps the IndexError that sequence iteration relies on.
        img_path, label = self.samples[idx]
        image = self._load_rgb(img_path)

        # Bounded fallback: try each other sample at most once.
        attempts = 1
        while image is None:
            if attempts >= len(self.samples):
                raise RuntimeError("No readable image found in the dataset")
            idx = (idx + 1) % len(self.samples)
            img_path, label = self.samples[idx]
            image = self._load_rgb(img_path)
            attempts += 1

        # Transform image
        if self.image_transform:
            image_tensor = self.image_transform(image)
        else:
            image_tensor = transforms.ToTensor()(image)

        # Extract text using the OCR model, or pytesseract as a fallback
        text = self._extract_text(image, img_path)

        # Tokenize text (calling the tokenizer replaces the deprecated encode_plus)
        encoding = self.text_tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop the leading batch dimension added by return_tensors='pt'.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Encode label
        label_idx = self.label_encoder.transform([label])[0]

        return image_tensor, input_ids, attention_mask, label_idx


In [16]:
class HybridClassifier(nn.Module):
    """Two-layer MLP head over concatenated image and text feature vectors.

    Args:
        img_feat_dim: Width of the image feature vector.
        txt_feat_dim: Width of the text feature vector.
        num_classes: Number of output logits.
        hidden_dim: Width of the hidden layer.
        dropout: Dropout probability applied after the hidden activation.
            The default 0.3 matches the previously hard-coded value.
    """

    def __init__(self, img_feat_dim, txt_feat_dim, num_classes, hidden_dim=256, dropout=0.3):
        super().__init__()
        # Layer order (and therefore state-dict keys fc.0.* / fc.3.*) is kept
        # stable so existing checkpoints still load.
        self.fc = nn.Sequential(
            nn.Linear(img_feat_dim + txt_feat_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, img_feats, txt_feats):
        """Concatenate both feature tensors along dim 1 and return class logits."""
        x = torch.cat([img_feats, txt_feats], dim=1)
        return self.fc(x)


In [17]:
# The processor and the model are loaded from the same checkpoint.
# NOTE(review): this is the "handwritten" TrOCR checkpoint; if the receipts
# are printed text, a printed-text checkpoint may be a better fit — confirm.
_TROCR_CHECKPOINT = 'microsoft/trocr-base-handwritten'
ocr_processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
ocr_model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)
ocr_model = ocr_model.to(DEVICE)

# Tokenizer that turns the OCR text into BERT inputs.
text_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [18]:
# Scan DATA_DIR once; the training transforms are attached as the default.
full_dataset = OCRImageTextDataset(
    DATA_DIR,
    image_transform=train_transforms,
    ocr_processor=ocr_processor,
    ocr_model=ocr_model,
    text_tokenizer=text_tokenizer,
)

# The number of classes follows the class folders the dataset discovered.
num_classes = len(full_dataset.label_encoder.classes_)

# NOTE(review): 1280 / 768 are presumably the EfficientNet-B0 feature width
# and the BERT-base hidden size — confirm against the feature extractors.
hybrid_model = HybridClassifier(img_feat_dim=1280, txt_feat_dim=768, num_classes=num_classes)
hybrid_model = hybrid_model.to(DEVICE)


⚠️ Skipping corrupted image during init: receipt_dataset/ai_generated/ChatGPT Image Apr 22, 2025, 09_30_14 PM.png
⚠️ Skipping corrupted image during init: receipt_dataset/ai_generated/ChatGPT Image Apr 22, 2025, 09_46_52 PM.png


In [19]:
# Sanity check: list the discovered classes and how many there are
# (expected: ['ai_generated', 'real'] and 2).
label_names = full_dataset.label_encoder.classes_
print(label_names)
print(len(label_names))


['ai_generated' 'real']
2


In [20]:
# Hold out VAL_SPLIT of the samples for validation. A seeded generator makes
# the split reproducible across kernel restarts (as long as the dataset
# enumerates its samples in the same order); without it every run validates
# on a different subset and results are not comparable.
SPLIT_SEED = 42
val_size = int(len(full_dataset) * VAL_SPLIT)
train_size = len(full_dataset) - val_size
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [21]:
# Give each split its own transform pipeline.
#
# Both subsets returned by random_split wrap the *same* dataset object, so
# setting image_transform through one subset also changes it for the other;
# assigning train and then val transforms on that shared object leaves the
# training subset without any augmentation. The validation subset therefore
# gets its own shallow copy of the dataset. The copy shares the sample list,
# label encoder and OCR/tokenizer objects, so nothing is rescanned or reloaded.
train_dataset.dataset.image_transform = train_transforms

val_source = copy.copy(full_dataset)
val_source.image_transform = val_transforms
val_dataset.dataset = val_source

In [22]:
# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

#### Feature Extractors

In [23]:
# Image model
# `pretrained=True` is deprecated in torchvision; the explicit weights enum
# selects the same ImageNet checkpoint without the deprecation warning.
image_model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
image_model.classifier = nn.Identity()  # remove classification head so the model returns pooled features
image_model = image_model.to(DEVICE)

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 139MB/s] 


In [24]:
# Text model: pretrained BERT encoder, moved to the training device
text_model = BertModel.from_pretrained('bert-base-uncased')
text_model = text_model.to(DEVICE)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

#### Combine Features & Classifier

In [25]:
# Classification head over the concatenated image + text features.
# num_classes comes from the dataset's label encoder rather than a hard-coded
# 2, so the head stays correct if class folders are added or removed.
hybrid_model = HybridClassifier(img_feat_dim=1280, txt_feat_dim=768, num_classes=num_classes).to(DEVICE)
criterion = nn.CrossEntropyLoss()

# One optimizer updates both feature extractors and the head (end-to-end fine-tuning).
optimizer = optim.Adam(list(image_model.parameters()) +
                       list(text_model.parameters()) +
                       list(hybrid_model.parameters()), lr=LEARNING_RATE)

In [26]:
# Early stopping: number of epochs to tolerate before stopping.
# NOTE(review): confirm the training loop actually consumes this value.
early_stop_patience = 3

# LR scheduler: reduce the learning rate when the monitored value (mode='min')
# has not improved for more than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2)

#### Training Loop

In [None]:
def _forward_batch(batch):
    """Run one batch through both encoders and the head.

    Moves the four batch tensors to DEVICE and returns ``(logits, labels, loss)``.
    Shared by the training and validation passes so they cannot drift apart.
    """
    imgs, ids, masks, labels = [x.to(DEVICE) for x in batch]
    img_feats = image_model(imgs)
    txt_out = text_model(input_ids=ids, attention_mask=masks)
    # Text feature = hidden state of the first token. The plain BERT encoder
    # returns no `logits`, and the per-layer hidden states were never used,
    # so neither is requested here.
    txt_feats = txt_out.last_hidden_state[:, 0, :]
    logits = hybrid_model(img_feats, txt_feats)
    return logits, labels, criterion(logits, labels)


# Must exist before the first comparison below; without this initialisation
# the checkpointing step fails with NameError on a fresh kernel.
best_val_loss = float('inf')
epochs_without_improvement = 0

for epoch in range(1, NUM_EPOCHS + 1):
    # ---- Training ----
    hybrid_model.train(); image_model.train(); text_model.train()
    total_loss = correct = 0

    for batch in train_loader:
        optimizer.zero_grad()
        logits, labels, loss = _forward_batch(batch)
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch average is per-sample.
        total_loss += loss.item() * labels.size(0)
        correct += (logits.argmax(1) == labels).sum().item()

    n_train = max(len(train_loader.dataset), 1)  # guard against an empty split
    avg_loss = total_loss / n_train
    acc = correct / n_train
    print(f"Epoch {epoch} — Train Loss: {avg_loss:.4f}, Accuracy: {acc:.4f}")

    # ---- Validation ----
    hybrid_model.eval(); image_model.eval(); text_model.eval()
    val_loss = val_correct = 0
    # Predictions of the most recent epoch; read by the evaluation cell.
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            logits, labels, loss = _forward_batch(batch)
            val_loss += loss.item() * labels.size(0)
            val_correct += (logits.argmax(1) == labels).sum().item()

            all_preds.extend(logits.argmax(1).cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    n_val = max(len(val_loader.dataset), 1)  # guard against an empty split
    avg_val_loss = val_loss / n_val
    val_acc = val_correct / n_val
    scheduler.step(avg_val_loss)

    print(f"   Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

    if avg_val_loss < best_val_loss:
        # New best: checkpoint all three networks together.
        best_val_loss = avg_val_loss
        epochs_without_improvement = 0
        torch.save({
            'image_model': image_model.state_dict(),
            'text_model': text_model.state_dict(),
            'hybrid_model': hybrid_model.state_dict()
        }, 'best_hybrid_model.pth')
    else:
        # Early stopping: give up after early_stop_patience stale epochs.
        epochs_without_improvement += 1
        if epochs_without_improvement >= early_stop_patience:
            print(f"Early stopping at epoch {epoch}: no improvement for {early_stop_patience} epochs")
            break

Epoch 1 — Train Loss: 0.5211, Accuracy: 0.7578
   Val Loss: 0.1987, Val Accuracy: 0.9750


#### Evaluation

In [None]:
# Label ids are positions in the encoder's classes_ array. Passing the full id
# list keeps the matrix and report at a fixed num_classes size even when a
# class is missing from this validation pass, and target_names makes the
# report show class names instead of integer ids.
class_names = [str(c) for c in full_dataset.label_encoder.classes_]
class_ids = list(range(len(class_names)))

cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", classification_report(
    all_labels, all_preds,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,  # a class with no predictions scores 0 instead of warning
))
print("F1 Score:", f1_score(all_labels, all_preds, average='weighted'))

In [None]:
# Draw the confusion matrix as a heat map using the explicit figure/axes API.
fig, ax = plt.subplots()
heatmap = ax.imshow(cm, cmap='Blues')
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
fig.colorbar(heatmap, ax=ax)
fig.tight_layout()
plt.show()

#### Optuna

In [None]:
def objective(trial):
    """Optuna objective (placeholder).

    Registers two hyper-parameters on ``trial`` — a log-uniform ``lr`` and a
    categorical ``batch_size`` — but does not train with them yet: it returns
    the module-level ``best_val_loss`` as-is.
    """
    # The sampled values are not consumed until a training loop is inserted here.
    learning_rate = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    batch = trial.suggest_categorical('batch_size', [8, 16, 32])

    # Placeholder — insert training loop here
    return best_val_loss

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=20)
# print('Best Params:', study.best_params)