# Table Dataset Exploration

**Purpose:** Explore and analyze table datasets for the Meddies-OCR project

**Datasets covered:**
1. FinePDFs-Full (24K images, unlabeled)
2. PubTables-1M (1M table images, fully annotated)

**Tasks:**
- Dataset statistics & quality analysis
- Sample visualization
- Annotation structure exploration
- Data preparation for fine-tuning

In [None]:
import os
import json
import random
from pathlib import Path
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from datasets import load_dataset

# Setup: set the default matplotlib figure size, and seed the `random` module
# so the random.sample() calls in later cells are repeatable when the cells
# are run in order.
plt.rcParams['figure.figsize'] = (15, 8)
random.seed(42)

## 1. FinePDFs-Full Dataset (Current)

Explore the 24K images we already downloaded.

In [None]:
# Load FinePDFs dataset
finepdfs_path = Path("../data/raw/finepdfs_full/images")
# Sort so that an index always refers to the same file for a given directory
# listing; the sampled indices below are used to look images up by position.
finepdfs_images = sorted(finepdfs_path.glob("*.jpg"))

print("üìä FinePDFs-Full Statistics:")
print(f"  Total images: {len(finepdfs_images):,}")
print(f"  Location: {finepdfs_path}")

# Sample up to 10 random images. random.sample() raises ValueError when asked
# for more items than the population holds, so clamp the request to what is
# actually available (an empty/missing directory yields an empty sample).
sample_count = min(10, len(finepdfs_images))
sample_indices = random.sample(range(len(finepdfs_images)), sample_count)
print(f"\nüé≤ Random samples: {sample_indices}")

In [None]:
# Visualize samples in a 2x5 grid
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.flatten()

# Switch the axis decorations off for every cell up front so that any cell
# left without an image stays blank instead of showing empty ticks.
for ax in axes:
    ax.axis('off')

for ax, img_idx in zip(axes, sample_indices):
    # Open inside a context manager so the file handle is released as soon as
    # the image has been drawn; imshow() is called while the file is open.
    with Image.open(finepdfs_images[img_idx]) as img:
        ax.imshow(img)
        ax.set_title(f"Image {img_idx:06d}\n{img.size[0]}x{img.size[1]}")

plt.tight_layout()
plt.suptitle("FinePDFs-Full: Random Samples", fontsize=16, y=1.02)
plt.show()

In [None]:
# Analyze image properties
def analyze_image_stats(image_paths, sample_size=1000):
    """Collect dimension, file-size and brightness figures for a random sample.

    Args:
        image_paths: Sequence of image file paths to draw the sample from.
        sample_size: Maximum number of images to inspect. When fewer paths
            are supplied, all of them are inspected.

    Returns:
        A dict with four parallel lists, one entry per inspected image:
        ``'widths'`` and ``'heights'`` (pixels, from ``img.size``),
        ``'sizes'`` (file size on disk in KB) and ``'brightness'`` (mean
        pixel value of the image converted to mode ``'L'``). All lists are
        empty when ``image_paths`` is empty.
    """
    sampled = random.sample(image_paths, min(sample_size, len(image_paths)))

    widths, heights, sizes, brightness = [], [], [], []

    for img_path in sampled:
        # Use a context manager so each file handle is released before the
        # next image is opened, rather than keeping every handle alive until
        # garbage collection.
        with Image.open(img_path) as img:
            w, h = img.size
            # Brightness = mean pixel value of the grayscale conversion
            mean_gray = np.array(img.convert('L')).mean()

        widths.append(w)
        heights.append(h)
        sizes.append(os.path.getsize(img_path) / 1024)  # bytes -> KB
        brightness.append(mean_gray)

    return {
        'widths': widths,
        'heights': heights,
        'sizes': sizes,
        'brightness': brightness,
    }

print("Analyzing 1000 random images...")
stats = analyze_image_stats(finepdfs_images, sample_size=1000)

# Mean-brightness cut-off above which a page is counted as blank. Defined
# once here so the plot marker and the ratio estimate cannot drift apart.
blank_threshold = 250

# Plot distributions
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

axes[0, 0].hist(stats['widths'], bins=50, edgecolor='black')
axes[0, 0].set_title('Width Distribution')
axes[0, 0].set_xlabel('Width (pixels)')

axes[0, 1].hist(stats['heights'], bins=50, edgecolor='black', color='orange')
axes[0, 1].set_title('Height Distribution')
axes[0, 1].set_xlabel('Height (pixels)')

axes[1, 0].hist(stats['sizes'], bins=50, edgecolor='black', color='green')
axes[1, 0].set_title('File Size Distribution')
axes[1, 0].set_xlabel('Size (KB)')

axes[1, 1].hist(stats['brightness'], bins=50, edgecolor='black', color='red')
axes[1, 1].set_title('Brightness Distribution (blank detection)')
axes[1, 1].set_xlabel('Mean pixel value (0=black, 255=white)')
axes[1, 1].axvline(x=blank_threshold, color='red', linestyle='--',
                   label=f'Likely blank (>{blank_threshold})')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

# Summary stats (mean and standard deviation of each measured property)
print(f"\nüìä Image Statistics:")
print(f"  Width: {np.mean(stats['widths']):.0f} ¬± {np.std(stats['widths']):.0f} pixels")
print(f"  Height: {np.mean(stats['heights']):.0f} ¬± {np.std(stats['heights']):.0f} pixels")
print(f"  File size: {np.mean(stats['sizes']):.1f} ¬± {np.std(stats['sizes']):.1f} KB")
print(f"  Brightness: {np.mean(stats['brightness']):.1f} ¬± {np.std(stats['brightness']):.1f}")

# Estimate blank ratio: share of sampled images whose mean brightness exceeds
# the threshold. Guard the division so an empty sample reports 0% instead of
# raising ZeroDivisionError.
measured_count = len(stats['brightness'])
blank_count = sum(1 for b in stats['brightness'] if b > blank_threshold)
blank_ratio = blank_count / measured_count if measured_count else 0.0
print(f"\n‚ö†Ô∏è  Estimated blank ratio: {blank_ratio:.1%} (brightness > {blank_threshold})")

## 2. PubTables-1M Dataset

Explore the annotated table dataset (once download completes).

In [None]:
# Locate PubTables-1M on disk and, when present, load it and report its splits
pubtables_path = Path("../data/raw/pubtables-1m")

if pubtables_path.exists():
    print(f"‚úÖ PubTables-1M found at {pubtables_path}")

    # Load the dataset, keeping the cache inside the dataset directory
    print("\nLoading dataset...")
    cache_location = pubtables_path / ".cache"
    ds = load_dataset("bsmock/pubtables-1m", cache_dir=str(cache_location))

    # One line per split with its sample count
    print(f"\nüìä PubTables-1M Statistics:")
    for name, split in ds.items():
        print(f"  {name}: {len(split):,} samples")

    feature_names = list(ds['train'].features.keys())
    print(f"\nüìã Features: {feature_names}")
else:
    # Directory is absent: point the user at the download log instead
    print("‚è≥ PubTables-1M is still downloading...")
    print(f"   Check progress: tail -f {pubtables_path}/download.log")

In [None]:
# Print a one-line summary of every field in the first training sample
if pubtables_path.exists():
    sample = ds['train'][0]

    print("üîç Sample annotation structure:")
    for field, content in sample.items():
        if field == 'image':
            # Show the image dimensions rather than the object repr
            summary = f"PIL.Image {content.size}"
        elif isinstance(content, (list, dict)):
            # Containers: report type and length only (list/dict always
            # support len(), so no extra capability check is needed)
            summary = f"{type(content).__name__} (len={len(content)})"
        else:
            summary = f"{content}"
        print(f"  {field}: {summary}")

In [None]:
# Visualize annotated table
if pubtables_path.exists():
    def visualize_table_annotation(sample, figsize=(15, 10)):
        """Show a sample's image twice: untouched, and as the annotation canvas.

        The right-hand panel is where bounding boxes are meant to be drawn;
        the drawing itself is still a placeholder (see the commented example
        below), so both panels currently show the same image.

        Args:
            sample: Mapping with at least an ``'image'`` entry that
                ``imshow`` can display.
            figsize: Figure size passed to ``plt.subplots``.
        """
        fig, axes = plt.subplots(1, 2, figsize=figsize)

        # Original image
        axes[0].imshow(sample['image'])
        axes[0].set_title('Original Image')
        axes[0].axis('off')

        # Image with annotations
        ax = axes[1]
        ax.imshow(sample['image'])

        # Draw bounding boxes (assuming bbox format: [x, y, width, height])
        # Note: Adapt this based on actual annotation format
        if 'bboxes' in sample or 'objects' in sample:
            # This is a placeholder - adjust based on actual format
            print("Drawing bounding boxes...")
            # Example:
            # for bbox in sample['bboxes']:
            #     rect = patches.Rectangle(
            #         (bbox[0], bbox[1]), bbox[2], bbox[3],
            #         linewidth=2, edgecolor='red', facecolor='none'
            #     )
            #     ax.add_patch(rect)

        ax.set_title('Annotated (with bboxes)')
        ax.axis('off')

        plt.tight_layout()
        plt.show()

    # Visualize 3 random samples drawn from the first 100 rows. Clamp both the
    # pool and the sample count to the split length so a smaller split neither
    # indexes past its end nor makes random.sample() raise ValueError.
    print("Visualizing 3 random samples...")
    pool_size = min(100, len(ds['train']))
    for i in random.sample(range(pool_size), min(3, pool_size)):
        sample = ds['train'][i]
        visualize_table_annotation(sample)

## 3. Dataset Comparison

Compare FinePDFs vs PubTables for table understanding tasks.

In [None]:
# Side-by-side summary of the two datasets, one row per dataset
import pandas as pd

comparison = pd.DataFrame(
    data=[
        (
            'FinePDFs-Full',
            '24,318',
            '21 GB',
            '‚ùå None',
            '‚ùå Mixed content',
            'Unlabeled pretraining',
        ),
        (
            'PubTables-1M',
            '1,000,000',
            '~100 GB',
            '‚úÖ Full structure',
            '‚úÖ 100% tables',
            'Supervised fine-tuning',
        ),
    ],
    columns=[
        'Dataset',
        'Images',
        'Size',
        'Annotations',
        'Table-focused',
        'Use case',
    ],
)

# Render without the numeric index column
print(comparison.to_string(index=False))

## 4. Data Preparation for Training

Prepare datasets for Qwen VL fine-tuning.

In [None]:
# Example: Convert PubTables to Qwen VL format
if pubtables_path.exists():
    def convert_to_qwen_format(sample):
        """Wrap one sample in a two-turn chat record (template only).

        The returned dict has a single ``"messages"`` list:

        * a ``"user"`` turn whose content is the sample's image followed by
          a fixed text prompt asking for the table text as markdown;
        * an ``"assistant"`` turn holding a single text item. The target
          text is currently a TODO placeholder and must be replaced once
          the real annotation format is known.

        Args:
            sample: Mapping that provides the image under ``'image'``.

        Returns:
            ``{"messages": [user_turn, assistant_turn]}``.
        """
        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": sample['image']},
                {"type": "text", "text": "Extract all text from this table and return as markdown."},
            ],
        }
        # Template target - adjust based on actual annotation format
        assistant_turn = {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "TODO: Format table structure here"},
            ],
        }
        return {"messages": [user_turn, assistant_turn]}

    # Convert the first training sample and preview the result
    sample = ds['train'][0]
    qwen_sample = convert_to_qwen_format(sample)

    print("üìù Qwen VL format example:")
    # default=str stringifies the non-JSON image object; show only the first
    # 500 characters of the dump
    serialized = json.dumps(qwen_sample, indent=2, default=str)
    print(serialized[:500] + "...")

## 5. Quality Checks

Validate dataset quality before training.

In [None]:
# Check for common issues
def quality_check_dataset(dataset, sample_size=100, *, min_width=800,
                          min_height=600, blank_threshold=250):
    """Run image quality checks on a random sample of ``dataset``.

    Args:
        dataset: Indexable collection supporting ``len()`` and integer
            indexing; each item is expected to expose the image under
            ``'image'``.
        sample_size: Maximum number of items to inspect. When the dataset is
            smaller, every item is inspected.
        min_width: Images narrower than this (pixels) are flagged as low
            resolution.
        min_height: Images shorter than this (pixels) are flagged as low
            resolution.
        blank_threshold: Images whose mean grayscale value exceeds this are
            flagged as blank.

    Returns:
        A dict of lists keyed by issue type:
        ``'corrupted_images'`` holds ``(index, error message)`` tuples;
        ``'blank_images'`` and ``'low_resolution'`` hold sample indices;
        ``'missing_annotations'`` is always empty for now because the
        annotation check is not implemented yet.
    """
    issues = {
        'corrupted_images': [],
        'missing_annotations': [],
        'blank_images': [],
        'low_resolution': []
    }

    samples = random.sample(range(len(dataset)), min(sample_size, len(dataset)))

    for idx in samples:
        # Fetch the sample inside the try block too: a failure while reading
        # the item must be recorded as a corrupted sample rather than abort
        # the whole check.
        try:
            sample = dataset[idx]
            img = sample['image']
            w, h = img.size

            # Low resolution
            if w < min_width or h < min_height:
                issues['low_resolution'].append(idx)

            # Blank detection: mean pixel value of the grayscale conversion
            brightness = np.array(img.convert('L')).mean()
            if brightness > blank_threshold:
                issues['blank_images'].append(idx)

        except Exception as e:
            # Deliberately broad: this is a best-effort survey, so any failure
            # for one sample is logged with its message and the loop continues.
            issues['corrupted_images'].append((idx, str(e)))

        # Check annotations (adapt based on format)
        # if not sample.get('bboxes'):
        #     issues['missing_annotations'].append(idx)

    return issues

# Run the checks on the training split and print one line per issue type
if pubtables_path.exists():
    print("Running quality checks on PubTables-1M...")
    issues = quality_check_dataset(ds['train'], sample_size=100)

    print("\n‚ö†Ô∏è  Quality Check Results:")
    for issue_type, issue_list in issues.items():
        if not issue_list:
            print(f"  {issue_type}: ‚úÖ None found")
            continue
        print(f"  {issue_type}: {len(issue_list)} found")

## 6. Next Steps

**Recommendations:**

1. **Filter blank images** from FinePDFs (use the blank-ratio estimate printed in Section 1, i.e. mean brightness > 250)
2. **Setup training pipeline** with PubTables-1M:
   - Start with small subset (1K samples) for quick iteration
   - Scale to full dataset once pipeline works
3. **Fine-tune Qwen VL** for table understanding:
   - Task: Table structure extraction
   - Output format: Markdown or HTML tables
4. **Evaluate** on held-out test set:
   - Structure accuracy (row/column correctness)
   - Cell-level CER (Character Error Rate)
   - End-to-end table extraction accuracy

**Reference CLAUDE.md:**
- Section 1: VLM Architecture Fundamentals
- Section 2.1: Qwen 2.5 VL (primary for OCR tasks)
- Section 5: Common Tasks & Commands