In [1]:
import pandas as pd
import numpy as np
import os
import glob

In [2]:
# Mount Drive
from google.colab import drive
drive.mount("/content/drive")

# Directory
dir = "/content/drive/MyDrive/Personal/Apziva/MonReader"

# Setting random state for consistency
seed = 123
np.random.seed(seed)

# Confirm GPU


gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mounted at /content/drive
Tue Feb  3 23:47:23 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   33C    P0             53W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                      

In [3]:
# Image folders for each split/class; all of them live under `<dir>/images`.
_IMAGES_ROOT = f"{dir}/images"

TRAIN_FLIP = f"{_IMAGES_ROOT}/training/flip"
TRAIN_NOT = f"{_IMAGES_ROOT}/training/notflip"
TEST_FLIP = f"{_IMAGES_ROOT}/testing/flip"
TEST_NOT = f"{_IMAGES_ROOT}/testing/notflip"

def load_paths(pos_dir, neg_dir):
    """Collect file paths and binary labels from two class folders.

    Args:
        pos_dir: Folder whose files receive label 1.
        neg_dir: Folder whose files receive label 0.

    Returns:
        (paths, labels): two parallel lists. Negative files come first, then
        positive files; each group is sorted by path so the order is the same
        on every run. `labels[i]` is the label of `paths[i]`.
    """
    def _list_files(folder):
        # glob returns entries in filesystem order, which is not stable across
        # runs/machines, so sort it. Sub-directories also match "*" but cannot
        # be opened as images, so keep regular files only.
        matches = glob.glob(os.path.join(folder, "*"))
        return sorted(p for p in matches if os.path.isfile(p))

    neg = _list_files(neg_dir)
    pos = _list_files(pos_dir)

    paths = neg + pos
    labels = [0] * len(neg) + [1] * len(pos)
    return paths, labels

# Build (paths, labels) for each split; keyword arguments make the
# positive (flip) / negative (notflip) roles explicit.
train_files, train_labels = load_paths(pos_dir=TRAIN_FLIP, neg_dir=TRAIN_NOT)
test_files, test_labels = load_paths(pos_dir=TEST_FLIP, neg_dir=TEST_NOT)


In [4]:
from datasets import Dataset
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
import os
# Sets the WANDB_DISABLED environment variable (intended to switch off
# Weights & Biases logging).
os.environ["WANDB_DISABLED"] = "true"

# Pretrained ViT checkpoint and the image processor that matches it.
model_name = "google/vit-base-patch16-224"
processor = AutoImageProcessor.from_pretrained(model_name)

# Class index <-> class name. label2id is the inverse mapping of id2label.
id2label = {0: "notflip", 1: "flip"}
label2id = {name: idx for idx, name in id2label.items()}

# Dataset.from_dict takes column-oriented dicts: one list per column.
# The "image" column holds file paths at this point, not decoded images.
train_dict = {"image": train_files, "label": train_labels}
test_dict = {"image": test_files, "label": test_labels}

train_ds, test_ds = (Dataset.from_dict(columns) for columns in (train_dict, test_dict))

def preprocess_images(examples):
    """Turn a batch of image paths into model inputs (for `Dataset.map`, batched).

    Args:
        examples: Batch dict with an "image" list of file paths and a
            "label" list of integer class ids.

    Returns:
        Dict with "pixel_values" (output of the notebook-level `processor`)
        and "labels" (the batch's "label" values, unchanged).
    """
    images = []
    for path in examples["image"]:
        # Image.open holds the file open until the image is closed; convert()
        # returns a separate image, so the source handle can be released here
        # instead of lingering until garbage collection.
        with Image.open(path) as img:
            images.append(img.convert("RGB"))

    # return_tensors is deliberately not set: the processor output is stored
    # by `datasets` as plain arrays/lists.
    inputs = processor(images=images)
    return {"pixel_values": inputs["pixel_values"], "labels": examples["label"]}

# Settings shared by both splits:
#   batch_size=32         -> small chunks per preprocess_images call
#   writer_batch_size=100 -> flush processed rows to disk more often
_map_kwargs = dict(batched=True, batch_size=32, writer_batch_size=100)

# The "image" (file path) column is intentionally KEPT: the OCR timing cell
# later reads dataset["image"], and dropping it here made that cell fail with
# a KeyError. The Trainer's default `remove_unused_columns=True` drops it
# before batches reach the model, so training/prediction are unaffected.
train_ds = train_ds.map(preprocess_images, **_map_kwargs)
test_ds = test_ds.map(preprocess_images, **_map_kwargs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


Map:   0%|          | 0/2392 [00:00<?, ? examples/s]

Map:   0%|          | 0/597 [00:00<?, ? examples/s]

In [None]:
from transformers import set_seed

# The 1000-class checkpoint head does not fit a 2-class model, so the
# classifier weights are randomly re-initialised at load time. Seed every RNG
# (python, numpy, torch) *before* loading so that initialisation, and therefore
# the whole run, is repeatable.
set_seed(seed)

model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,  # allow the classifier head to be re-created for 2 labels
)

training_args = TrainingArguments(
    output_dir=f"{dir}/vit-flip-checkpoints",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    eval_strategy="epoch",   # evaluate on eval_dataset after every epoch
    save_strategy="epoch",   # write a checkpoint after every epoch
    logging_steps=50,
    seed=seed,               # same seed as the rest of the notebook (default would be 42)
    report_to="none",        # explicit opt-out of W&B / other logging integrations
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)

trainer.train()

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/200 [00:00<?, ?it/s]

ViTForImageClassification LOAD REPORT from: google/vit-base-patch16-224
Key               | Status   |                                                                                        
------------------+----------+----------------------------------------------------------------------------------------
classifier.bias   | MISMATCH | Reinit due to size mismatch ckpt: torch.Size([1000]) vs model:torch.Size([2])          
classifier.weight | MISMATCH | Reinit due to size mismatch ckpt: torch.Size([1000, 768]) vs model:torch.Size([2, 768])

Notes:
- MISMATCH	:ckpt weights were loaded, but they did not match the original empty weight shapes.


Epoch,Training Loss,Validation Loss
1,0.011356,0.009754
2,0.001068,0.010624
3,9.3e-05,0.008813


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Score the fine-tuned model on the test split.
pred_output = trainer.predict(test_ds)
logits, labels = pred_output.predictions, pred_output.label_ids

# Predicted class = index of the largest logit along the last axis.
preds = np.argmax(logits, axis=-1)

accuracy = accuracy_score(labels, preds)
weighted_f1 = f1_score(labels, preds, average="weighted")

print("Accuracy:", accuracy)
print("F1:", weighted_f1)


2 datasets:
1. Test images both flipped and not flipped (n = 100)
2. Images ViT predicted as not flipped (n=100)

In [None]:
# Run inference on the test split with the already-trained `trainer`.
pred_output = trainer.predict(test_ds)
logits, label_ids = pred_output.predictions, pred_output.label_ids  # label_ids = true labels

# Two logits per sample: the predicted class is the index of the larger one.
# (A single-logit/sigmoid head would instead need: (logits.squeeze(-1) > 0).astype(int))
preds = np.argmax(logits, axis=-1)

# Row positions predicted as class 0, which id2label maps to "notflip".
non_flipped_idx = np.flatnonzero(preds == 0)

# Subset of the test split holding only the predicted non-flipped rows.
non_flipped_ds = test_ds.select(non_flipped_idx)

print(f"Total images in test_ds: {len(test_ds)}")
print(f"Images predicted as not flipped: {len(non_flipped_ds)}")


In [None]:
from transformers import pipeline

import torch

# TrOCR (printed text) wrapped in an image-to-text pipeline.
# Use the first CUDA device when one is available; otherwise fall back to the
# CPU (-1) instead of failing, matching the "Not connected to a GPU" case the
# setup cell already anticipates.
ocr_pipe = pipeline(
    "image-to-text",
    model="microsoft/trocr-base-printed",
    device=0 if torch.cuda.is_available() else -1,
)


In [None]:
import time
from math import ceil

def run_ocr_and_time(dataset, batch_size=8, pipe=None):
    """Run OCR over every image in `dataset` and measure the wall-clock time.

    Args:
        dataset: Sized, sliceable collection whose slices are dicts with an
            'image' list (e.g. an HF Dataset). Each entry is handed to the
            pipeline unchanged.
        batch_size: Number of rows passed to the pipeline per call (>= 1).
        pipe: Callable that takes a list of images and returns one result per
            image. Defaults to the notebook-level `ocr_pipe`.

    Returns:
        (total_time_seconds, texts_list): elapsed seconds for the OCR loop and
        the recognised text of each image, in dataset order.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
        KeyError: if the dataset exposes `column_names` without 'image'.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if pipe is None:
        pipe = ocr_pipe

    columns = getattr(dataset, "column_names", None)
    if columns is not None:
        if "image" not in columns:
            raise KeyError(
                f"dataset has no 'image' column (available: {list(columns)}); "
                "keep the image column when preprocessing"
            )
        # Slicing materialises every column of the rows; narrow to 'image'
        # first so unrelated columns (e.g. pixel_values) do not inflate the timing.
        if len(columns) > 1 and hasattr(dataset, "select_columns"):
            dataset = dataset.select_columns(["image"])

    n = len(dataset)
    all_texts = []
    start = time.perf_counter()

    for i in range(0, n, batch_size):
        images = dataset[i:i + batch_size]["image"]
        outputs = pipe(images)

        # Depending on the pipeline, a per-image result is either a dict
        # {'generated_text': ...} or a list of such dicts; accept both and
        # take the first candidate.
        for result in outputs:
            if isinstance(result, (list, tuple)):
                result = result[0] if result else {"generated_text": ""}
            all_texts.append(result["generated_text"])

    end = time.perf_counter()
    return end - start, all_texts

# Warm-up (untimed): the first pipeline call pays one-off start-up costs
# (device initialisation, lazy loading). Without this, that cost lands
# entirely in the first timed run and overstates the speedup below.
if len(test_ds) > 0:
    run_ocr_and_time(test_ds.select(range(1)), batch_size=1)

# 3a. Time on full test set
time_full, texts_full = run_ocr_and_time(test_ds, batch_size=8)
print(f"OCR time on FULL dataset ({len(test_ds)} images): {time_full:.2f} seconds")

# 3b. Time on predicted non-flipped subset
time_nonflip, texts_nonflip = run_ocr_and_time(non_flipped_ds, batch_size=8)
print(f"OCR time on NON-FLIPPED subset ({len(non_flipped_ds)} images): {time_nonflip:.2f} seconds")

# Optional: relative speedup (guarded against a zero denominator)
speedup = time_full / time_nonflip if time_nonflip > 0 else float("inf")
print(f"Speedup from filtering (full / non-flipped): {speedup:.2f}x")
