In [None]:
!pip install -q pytorch-lightning

In [None]:
import torchvision
import os

class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO-format detection dataset whose samples are run through an image processor.

    The annotation file is looked up inside ``img_folder`` itself:
    ``"train_data-1 (1).json"`` when ``train`` is true, otherwise
    ``"test_data-1 (1).json"``.

    Args:
        img_folder: Directory holding both the images and the annotation JSON.
        processor: Callable invoked as
            ``processor(images=..., annotations=..., return_tensors="pt")``;
            its result must expose ``"pixel_values"`` and ``"labels"``.
        train: Selects which of the two annotation files is loaded.
    """

    def __init__(self, img_folder, processor, train=True):
        ann_name = "train_data-1 (1).json" if train else "test_data-1 (1).json"
        ann_file = os.path.join(img_folder, ann_name)
        super().__init__(img_folder, ann_file)
        self.processor = processor

    def __getitem__(self, idx):
        """Return ``(pixel_values, target)`` for sample ``idx`` with the batch axis removed."""
        # Read the image and its raw COCO annotations from the parent dataset.
        # Data augmentation could be inserted here before preprocessing.
        img, target = super().__getitem__(idx)

        # Wrap the annotations in the {'image_id', 'annotations'} dict passed to the processor.
        image_id = self.ids[idx]
        target = {'image_id': image_id, 'annotations': target}
        encoding = self.processor(images=img, annotations=target, return_tensors="pt")

        # Drop only the leading batch axis. A bare squeeze() would also remove any
        # other dimension that happens to have size 1.
        pixel_values = encoding["pixel_values"].squeeze(0)
        target = encoding["labels"][0]  # first (only) entry of the batch

        return pixel_values, target

In [None]:
# from transformers import DetrImageProcessor
from transformers import AutoImageProcessor

# Alternative checkpoints tried earlier (kept for reference):
# processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
# processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-structure-recognition")
# Image processor matching the Table Transformer detection checkpoint.
processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
# Override the processor's 'shortest_edge' size entry
# (presumably the resize target for the shorter image side - confirm against the processor docs).
processor.size['shortest_edge'] = 800
# Training split: uses the default train=True annotation file inside the folder.
train_dataset = CocoDetection(img_folder='/kaggle/input/table-structure-recognition/train_task02/', processor=processor)
# Validation split: train=False selects the "test" annotation file inside the folder.
val_dataset = CocoDetection(img_folder='/kaggle/input/table-structure-recognition/val_task02/', processor=processor, train=False)

In [None]:
# Report how many samples each split contains.
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of validation examples: {len(val_dataset)}")

In [None]:
import numpy as np
import os
from IPython.display import display
from PIL import Image, ImageDraw

# Collect the image IDs known to the COCO annotation index
image_ids = train_dataset.coco.getImgIds()

# Pick one image at random
image_id = image_ids[np.random.randint(0, len(image_ids))]
print('Image n°{}'.format(image_id))

# Load the image record and open the file from the dataset's own image folder.
# Using train_dataset.root keeps this in sync with the folder the dataset was
# built from, instead of a separately hard-coded directory.
image_info = train_dataset.coco.loadImgs(image_id)[0]
image_path = os.path.join(train_dataset.root, image_info['file_name'])
image = Image.open(image_path).convert("RGB")

# Fetch the annotations attached to that image
annotations = train_dataset.coco.imgToAnns[image_id]

# Drawing context for overlaying boxes and labels
draw = ImageDraw.Draw(image, "RGBA")

# Map category_id -> label name
cats = train_dataset.coco.cats
id2label = {k: v['name'] for k, v in cats.items()}

#if 0 not in id2label:
    #id2label[0] = "row" 

# Draw each bounding box and its label on the image
for annotation in annotations:
    box = annotation['bbox']  # bbox is [x, y, width, height]
    class_idx = annotation['category_id']
    
    x, y, w, h = map(int, box)  # convert the bbox to integer pixel coordinates
    draw.rectangle((x, y, x + w, y + h), outline='red', width=2)
    
    label_text = id2label.get(class_idx, f"class_{class_idx}")  # fall back to a default name when the id is unknown
    draw.text((x, y), label_text, fill='white')

# Show the annotated image
display(image)


In [None]:
# Get the list of image IDs from the COCO dataset
image_ids = train_dataset.ids  # or train_dataset.coco.getImgIds()

In [None]:
print(id2label)

In [None]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Combine ``(pixel_values, target)`` samples into one batch dict.

    The images are handed to the notebook-level ``processor.pad`` and the
    resulting ``pixel_values`` / ``pixel_mask`` are returned alongside the
    untouched list of per-sample targets under ``labels``.
    """
    images = []
    targets = []
    for sample in batch:
        images.append(sample[0])
        targets.append(sample[1])

    padded = processor.pad(images, return_tensors="pt")
    return {
        'pixel_values': padded['pixel_values'],
        'pixel_mask': padded['pixel_mask'],
        'labels': targets,
    }

# Training loader: shuffled, 4 samples per batch, batched by collate_fn.
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=4, shuffle=True)
# Validation loader: 2 samples per batch, default (unshuffled) order.
val_dataloader = DataLoader(val_dataset, collate_fn=collate_fn, batch_size=2)
# Pull one training batch; later cells reuse it for a forward-pass check.
batch = next(iter(train_dataloader))

In [None]:
batch.keys()

In [None]:
pixel_values, target = train_dataset[0]

In [None]:
pixel_values.shape

In [None]:
print(target)

In [None]:
import pytorch_lightning as pl
# from transformers import DetrForObjectDetection
from transformers import AutoModelForObjectDetection
import torch

class Detr(pl.LightningModule):
    """Lightning module wrapping the pretrained Table Transformer detection model.

    Args:
        lr: Learning rate for every trainable parameter outside the backbone.
        lr_backbone: Learning rate for trainable parameters whose name contains "backbone".
        weight_decay: Weight decay handed to AdamW.
    """

    def __init__(self, lr, lr_backbone, weight_decay):
        super().__init__()
        # Load the pretrained checkpoint with a head sized from the notebook-level
        # id2label mapping. ignore_mismatched_sizes=True is passed so the load
        # tolerates a head whose size differs from the checkpoint's.
        self.model = AutoModelForObjectDetection.from_pretrained(
            "microsoft/table-transformer-detection",
            num_labels=len(id2label),
            ignore_mismatched_sizes=True,
        )
        # see https://github.com/PyTorchLightning/pytorch-lightning/pull/1896
        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay

    def forward(self, pixel_values, pixel_mask):
        """Run the wrapped model without labels and return its outputs."""
        return self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    def common_step(self, batch, batch_idx):
        """Forward a batch with labels and return ``(loss, loss_dict)``."""
        # Move every tensor of every per-image target onto the module's device.
        targets = []
        for sample_target in batch["labels"]:
            targets.append({key: value.to(self.device) for key, value in sample_target.items()})

        outputs = self.model(
            pixel_values=batch["pixel_values"],
            pixel_mask=batch["pixel_mask"],
            labels=targets,
        )
        return outputs.loss, outputs.loss_dict

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log it together with each loss component."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        self.log("training_loss", loss)
        for name, value in loss_dict.items():
            self.log("train_" + name, value.item())
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and log it together with each loss component."""
        loss, loss_dict = self.common_step(batch, batch_idx)
        self.log("validation_loss", loss)
        for name, value in loss_dict.items():
            self.log("validation_" + name, value.item())
        return loss

    def configure_optimizers(self):
        """Build AdamW with a separate learning rate for the backbone parameters."""
        head_params = []
        backbone_params = []
        for name, param in self.named_parameters():
            if not param.requires_grad:
                continue
            if "backbone" in name:
                backbone_params.append(param)
            else:
                head_params.append(param)

        param_groups = [
            {"params": head_params},
            {"params": backbone_params, "lr": self.lr_backbone},
        ]
        return torch.optim.AdamW(param_groups, lr=self.lr, weight_decay=self.weight_decay)

    def train_dataloader(self):
        """Return the notebook-level training DataLoader."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level validation DataLoader."""
        return val_dataloader

In [None]:
# Instantiate the Lightning module with its optimizer hyperparameters.
model = Detr(lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4)

# Forward-pass check on the batch drawn earlier (no labels, so no loss is computed here).
outputs = model(pixel_values=batch['pixel_values'], pixel_mask=batch['pixel_mask'])

In [None]:
outputs.logits.shape

In [None]:
# Count trainable parameters versus all parameters of the module.
learnable_params = sum(weight.numel() for weight in model.parameters() if weight.requires_grad)
total_params = sum(weight.numel() for weight in model.parameters())
print("Số tham số học được: {}/{}".format(learnable_params, total_params))


In [None]:
from pytorch_lightning import Trainer

# Single-GPU trainer: up to 200 epochs, gradient clipping value 0.1.
trainer = Trainer(max_epochs=200, gradient_clip_val=0.1, accelerator='gpu', devices=1)
# No dataloaders are passed here; the Detr class defines train_dataloader()/val_dataloader() itself.
trainer.fit(model)

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
model.model.push_to_hub("10Ngoc/task02update")

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger
import pandas as pd
import matplotlib.pyplot as plt

# Build the model with its hyperparameters
model = Detr(lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4)
outputs = model(pixel_values=batch['pixel_values'], pixel_mask=batch['pixel_mask'])

# Logger that writes the logged metrics to a CSV file
logger = CSVLogger(save_dir="logs/", name="detr_structure_recognition")

# Trainer configuration
trainer = Trainer(
    max_epochs=10,
    logger=logger,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    log_every_n_steps=10,
)

# Train the model. The loaders defined earlier in this notebook are named
# train_dataloader / val_dataloader (train_loader / val_loader do not exist).
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

# Read the CSV log back for plotting
log_path = logger.log_dir + "/metrics.csv"
df = pd.read_csv(log_path)

# Train and validation metrics are expected to land on different rows of the CSV,
# leaving NaN gaps in each column when plotted row by row. Averaging per epoch
# (NaNs are skipped) yields one point per epoch for each curve.
epoch_df = df.groupby("epoch").mean(numeric_only=True)

# Plot the loss curves
plt.plot(epoch_df.index, epoch_df["training_loss"], label="Train Loss")
if "validation_loss" in epoch_df.columns:
    plt.plot(epoch_df.index, epoch_df["validation_loss"], label="Val Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training & Validation Loss")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from transformers import AutoImageProcessor, AutoModelForObjectDetection, TableTransformerForObjectDetection
import torch

# Load a fine-tuned checkpoint from the Hub for inference/evaluation.
# NOTE(review): this loads "10Ngoc/task01tabledetection" with a single "table" label,
# while an earlier cell pushes the trained weights to "10Ngoc/task02update" and later
# cells filter predictions by category ids 0 and 1 - confirm this is the intended checkpoint.
model = TableTransformerForObjectDetection.from_pretrained("10Ngoc/task01tabledetection", id2label={0:"table"})
# model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection",
#                                                                         revision="no_timm",
#                                                                         num_labels=len(id2label),
#                                                                         ignore_mismatched_sizes=True)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


In [None]:
!pip install -q coco-eval

In [None]:
def convert_to_xywh(boxes):
    """Convert rows of ``(xmin, ymin, xmax, ymax)`` into ``(xmin, ymin, width, height)``.

    ``boxes`` is unbound along dimension 1, so it must have four entries on
    that axis; the result is stacked back along the same dimension.
    """
    left, top, right, bottom = boxes.unbind(1)
    widths = right - left
    heights = bottom - top
    return torch.stack((left, top, widths, heights), dim=1)

def prepare_for_coco_detection(predictions):
    """Flatten per-image predictions into a list of COCO-style result dicts.

    Args:
        predictions: Mapping from image id to a prediction holding ``"boxes"``,
            ``"scores"`` and ``"labels"``. Entries whose ``len()`` is 0 are skipped.

    Returns:
        A list with one ``{"image_id", "category_id", "bbox", "score"}`` dict per
        box, where ``bbox`` is the box after ``convert_to_xywh``.
    """
    coco_results = []
    for original_id, prediction in predictions.items():
        if not len(prediction):
            continue

        xywh_boxes = convert_to_xywh(prediction["boxes"]).tolist()
        scores = prediction["scores"].tolist()
        labels = prediction["labels"].tolist()

        for k, box in enumerate(xywh_boxes):
            coco_results.append(
                {
                    "image_id": original_id,
                    "category_id": labels[k],
                    "bbox": box,
                    "score": scores[k],
                }
            )
    return coco_results

In [None]:
from coco_eval import CocoEvaluator
from tqdm.notebook import tqdm

import numpy as np

# initialize evaluator with ground truth (gt)
# NOTE(review): both the ground truth and the loader below come from the TRAINING
# split, so the reported numbers measure fit on training data - confirm this is
# intended rather than val_dataset / val_dataloader.
evaluator = CocoEvaluator(coco_gt=train_dataset.coco, iou_types=["bbox"])

print("Running evaluation...")
for idx, batch in enumerate(tqdm(train_dataloader)):
    # get the inputs
    pixel_values = batch["pixel_values"].to(device)
    pixel_mask = batch["pixel_mask"].to(device)
    labels = [{k: v.to(device) for k, v in t.items()} for t in batch["labels"]] # these are in DETR format, resized + normalized

    # forward pass
    with torch.no_grad():
      outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    # turn into a list of dictionaries (one item for each example in the batch)
    # each target's "orig_size" entry is passed as the size to post-process against
    orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
    # threshold=0 is passed so no prediction is filtered out by score here
    results = processor.post_process_object_detection(outputs, target_sizes=orig_target_sizes, threshold=0)

    # provide to metric
    # metric expects a list of dictionaries, each item
    # containing image_id, category_id, bbox and score keys
    predictions = {target['image_id'].item(): output for target, output in zip(labels, results)}
    predictions = prepare_for_coco_detection(predictions)
    evaluator.update(predictions)

# finalize: gather, accumulate and print the summary metrics
evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()

In [None]:
import requests
from io import BytesIO
from PIL import Image

url = "https://www.dropbox.com/scl/fi/glyymn5opvhmd929004ri/28_1.jpg?rlkey=d99hbhszy1z922ywei315bl06&st=ewtu61ki&dl=1"

# A Dropbox share link usually needs 'dl=0' changed to 'dl=1' (or 'raw=1') to
# become a direct download link. This URL already uses dl=1.

# Fail fast on a bad download: without the status check an HTML error page
# would be handed to PIL and surface as a confusing image-decoding error.
response = requests.get(url, timeout=30)
response.raise_for_status()
image = Image.open(BytesIO(response.content)).convert("RGB")

# Preprocess the image and run the detector without tracking gradients
inputs = processor(images=image, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**inputs)

# image.size is (width, height); reversing it gives (height, width) for post-processing
target_sizes = torch.tensor([image.size[::-1]]).to(device)
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.5)[0]

# One flat record per detection kept by the 0.5 score threshold
predictions = [{
    "image_id": 0,
    "category_id": label.item(),
    "bbox": box.tolist(),
    "score": score.item()
} for score, label, box in zip(results["scores"], results["labels"], results["boxes"])]


from PIL import Image
import itertools, math
import matplotlib.pyplot as plt

# ------------------------------------------------
# 0. Parameters
# ------------------------------------------------
PAD = 1                            # number of padding pixels around each cell

# ------------------------------------------------
# 1. Split the predicted boxes into rows / columns
# ------------------------------------------------
# category_id 0 is treated as a row and 1 as a column; coordinates are truncated to int
rows = [tuple(map(int, p["bbox"])) for p in predictions if p["category_id"] == 0]
cols = [tuple(map(int, p["bbox"])) for p in predictions if p["category_id"] == 1]

# ------------------------------------------------
# 2. Giao nhau -> box ô
# ------------------------------------------------
def intersect(a, b):
    """Return the overlap of two ``(x0, y0, x1, y1)`` boxes.

    The result is the overlapping box as a 4-tuple, or ``None`` when the
    overlap does not have strictly positive width and height.
    """
    left = max(a[0], b[0])
    top = max(a[1], b[1])
    right = min(a[2], b[2])
    bottom = min(a[3], b[3])
    if not (right > left and bottom > top):
        return None
    return (left, top, right, bottom)

# Every row x column pair yields a candidate cell box
cells = [intersect(r, c) for r, c in itertools.product(rows, cols)]
cells = [b for b in cells if b]                     # drop pairs with no real overlap (None)
cells.sort(key=lambda b: (b[1], b[0]))              # reading order: top-to-bottom, then left-to-right

# ------------------------------------------------
# 3. Hàm pad & cắt
# ------------------------------------------------
W, H = image.size
def pad_box(b, p=PAD):
    """Grow box ``b`` by ``p`` pixels on each side, clamped to the image bounds.

    The bounds are the notebook-level ``W`` and ``H`` read from ``image.size``.
    """
    left = max(0, b[0] - p)
    top = max(0, b[1] - p)
    right = min(W, b[2] + p)
    bottom = min(H, b[3] + p)
    return (left, top, right, bottom)

# Crop every padded cell box out of the page image
cell_imgs = [image.crop(pad_box(cell)) for cell in cells]

# ------------------------------------------------
# 4. Show the crops in a grid (optional)
# ------------------------------------------------
n = len(cell_imgs)
if n == 0:
    # Nothing to draw; also avoids dividing by a zero column count below.
    print("No cells detected - nothing to display.")
else:
    cols_grid = min(6, n)
    rows_grid = math.ceil(n / cols_grid)
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes without `.flat`.
    fig, axes = plt.subplots(rows_grid, cols_grid,
                             figsize=(2.5*cols_grid, 2.5*rows_grid),
                             squeeze=False)

    # Hide the frame on every slot, including unused ones in the last row
    for ax in axes.flat:
        ax.axis("off")
    for i, im in enumerate(cell_imgs):
        ax = axes.flat[i]
        ax.imshow(im)
        ax.set_title(f"Cell {i}", fontsize=8)

    plt.tight_layout()
    plt.show()


In [None]:
import requests, zipfile, io, os, itertools, math, shutil
from pathlib import Path
from PIL import Image, ImageOps
import torch


# ────────────────────────────────────────────────────────────────────
# CONFIGURATION (customize here)
# ────────────────────────────────────────────────────────────────────
# Stray whitespace inside the URL path has been removed.
ZIP_URL       = "https://www.dropbox.com/scl/fo/ovyizztnavkpf3929nu2g/AId_MJ7X-E4XcV96jMIT3yI?rlkey=6i37jm6nr4jsx5z3d2f1fwsze&dl=1"
TMP_DIR       = Path("ZIP_EXTR")
OUT_ROOT      = Path("OUT_CELLS")
PAD           = 1                  # pixel padding around each cell crop
DEVICE        = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# resize rules
SQUARE_SIZE   = (512, 512)
RECT_SIZE     = (640, 480)
ASPECT_TOLER  = 0.05               # |w/h − 1| ≤ tol → treat as square
KEEP_ASPECT   = False # Stretch the image
PAD_COLOR     = (255, 255, 255)    # for letter-box

# name of the final zip you will download
OUTPUT_ZIP    = "OUT_CELLS.zip"

# ────────────────────────────────────────────────────────────────────
# HOUSEKEEPING  (clean old runs)
# ────────────────────────────────────────────────────────────────────
# Runs after the configuration so OUTPUT_ZIP is defined on a fresh kernel.
# Remove the archive of a previous run, if any.
if Path(OUTPUT_ZIP).exists():
    Path(OUTPUT_ZIP).unlink()

# Wipe and recreate the working directories once.
for p in (TMP_DIR, OUT_ROOT):
    shutil.rmtree(p, ignore_errors=True)
    p.mkdir(parents=True, exist_ok=True)



In [None]:


# ────────────────────────────────────────────────────────────────────
# 1. DOWNLOAD ZIP
# ────────────────────────────────────────────────────────────────────
print("⬇️  Downloading ZIP …")
# The whole body is read into memory right away, so streaming is not requested.
# The timeout keeps a stalled connection from hanging the notebook.
resp = requests.get(ZIP_URL, timeout=60)
resp.raise_for_status()
zip_bytes = io.BytesIO(resp.content)

# ────────────────────────────────────────────────────────────────────
# 2. EXTRACT JPGs
# ────────────────────────────────────────────────────────────────────
print("📦 Extracting JPGs …")
with zipfile.ZipFile(zip_bytes) as zf:
    for member in zf.namelist():
        if member.lower().endswith(".jpg"):
            zf.extract(member, path=TMP_DIR)

# Match the extension case-insensitively, exactly as the extraction filter does.
# A plain rglob("*.jpg") would miss files extracted as ".JPG".
jpg_paths = sorted(
    path for path in TMP_DIR.rglob("*")
    if path.is_file() and path.suffix.lower() == ".jpg"
)
print(f"✅ Found {len(jpg_paths)} images\n")



### Utility functions

In [None]:
def intersect(a, b):
    """Return the overlapping region of two ``(x0, y0, x1, y1)`` boxes.

    Gives the overlap as a 4-tuple, or ``None`` unless the overlap has
    strictly positive width and height.
    """
    x_start = max(a[0], b[0])
    y_start = max(a[1], b[1])
    x_end = min(a[2], b[2])
    y_end = min(a[3], b[3])
    has_area = x_end > x_start and y_end > y_start
    if has_area:
        return (x_start, y_start, x_end, y_end)
    return None

def pad_box(b, p, w, h):
    """Expand box ``b`` by ``p`` on every side, clamped to ``[0, w] x [0, h]``.

    Args:
        b: Box given as ``(x0, y0, x1, y1)``.
        p: Padding added on each side.
        w: Upper bound for the right edge.
        h: Upper bound for the bottom edge.

    Returns:
        The padded box as an ``(x0, y0, x1, y1)`` tuple.
    """
    left = max(0, b[0] - p)
    top = max(0, b[1] - p)
    right = min(w, b[2] + p)
    bottom = min(h, b[3] + p)
    return (left, top, right, bottom)

def resize_with_letterbox(img, target, keep_aspect=True, fill=PAD_COLOR):
    """Resize ``img`` to ``target`` (w, h), optionally letter-boxing.

    Args:
        img: Source image exposing ``.size`` and ``.resize``.
        target: Output size as ``(width, height)``.
        keep_aspect: When false the image is stretched straight to ``target``.
            When true it is scaled to fit inside ``target`` and pasted centred
            on a canvas filled with ``fill``.
        fill: Canvas colour; only used when ``keep_aspect`` is true.

    Returns:
        The resized image: stretched, or a new RGB canvas of size ``target``.
    """
    if not keep_aspect:
        return img.resize(target, Image.LANCZOS)

    tw, th = target
    iw, ih = img.size
    scale  = min(tw/iw, th/ih)
    # int() truncates, so a very thin crop could end up with a 0-pixel side,
    # which is not a valid resize target; keep each side at least 1 pixel.
    nw = max(1, int(iw*scale))
    nh = max(1, int(ih*scale))
    img_r  = img.resize((nw, nh), Image.LANCZOS)
    canvas = Image.new("RGB", target, fill)
    # Centre the scaled image on the canvas.
    canvas.paste(img_r, ((tw-nw)//2, (th-nh)//2))
    return canvas


### Process each image

In [None]:

for img_path in jpg_paths:
    image = Image.open(img_path).convert("RGB")

    # ── Run detector ────────────────────────────────────────────────
    inputs = processor(images=image, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model(**inputs)

    # image.size is (width, height); reversing it gives (height, width)
    target_sizes = torch.tensor([image.size[::-1]]).to(DEVICE)
    results = processor.post_process_object_detection(
                    outputs, target_sizes=target_sizes, threshold=0.5)[0]

    predictions = [{
        "category_id": label.item(),
        "bbox": list(map(int, box.tolist()))
    } for label, box in zip(results["labels"], results["boxes"])]

    # category_id 0 is treated as a row, 1 as a column
    rows = [p["bbox"] for p in predictions if p["category_id"] == 0]
    cols = [p["bbox"] for p in predictions if p["category_id"] == 1]

    # Every row x column overlap is a candidate cell
    cell_boxes = [intersect(r, c) for r, c in itertools.product(rows, cols)]
    cell_boxes = [b for b in cell_boxes if b]
    cell_boxes.sort(key=lambda b: (b[1], b[0]))   # reading order

    # ── Crop, resize, save ──────────────────────────────────────────
    W, H = image.size
    out_dir = OUT_ROOT / img_path.stem
    out_dir.mkdir(parents=True, exist_ok=True)

    for idx, b in enumerate(cell_boxes):
        pb   = pad_box(b, PAD, W, H)
        crop = image.crop(pb)

        # choose target size
        w, h      = crop.size
        is_square = abs((w/h) - 1.0) <= ASPECT_TOLER
        target_sz = SQUARE_SIZE if is_square else RECT_SIZE

        # Honour the KEEP_ASPECT setting from the configuration cell instead of
        # a hard-coded False, so changing the config actually takes effect.
        final = resize_with_letterbox(
                    crop,
                    target=target_sz,
                    keep_aspect=KEEP_ASPECT,
                    fill=PAD_COLOR            # only used when keep_aspect is true
                )
        final.save(out_dir / f"cell_{idx:03d}.jpg", quality=95)

    print(f"{img_path.name:35s} ➜  {len(cell_boxes):3d} cells")

print("\n🥳  All done! Crops live in:", OUT_ROOT.resolve())


### Make zip file

In [None]:

# Drop any archive left over from a previous run before building a new one.
if os.path.exists(OUTPUT_ZIP):
    os.remove(OUTPUT_ZIP)

# make_archive appends the ".zip" suffix itself, so it is stripped from the base name.
shutil.make_archive(
    base_name=OUTPUT_ZIP.replace(".zip",""),
    format="zip",
    root_dir=OUT_ROOT,
)
print("📁  Created {} – grab it from the right-hand Files pane.".format(OUTPUT_ZIP))


In [None]:
from IPython.display import FileLink, display
display(FileLink(OUTPUT_ZIP))