# Create raw baseline dataset 

## 1. Setup

In [None]:
# Change the working directory so that the relative 'data/...' paths used in
# the cells below resolve against this directory.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook on another machine.
%cd /home/minhquana/workspace/project_DeepLearning/computer_vision/Abnormal-Prediction-In-Chest-X-Ray

In [None]:
import shutil
from pathlib import Path
import yaml
from tqdm import tqdm

## 2. Prepare Raw Datasets

In [None]:
# Build the "baseline" dataset: the labels of the preprocessed dataset paired
# with the raw (not preprocessed) images found under data/<split>/images.
baseline_dir = Path('data/baseline_2classes')
preprocessed_dir = Path('data/preprocessed_2classes')

# Root under which the raw images are searched (data/<split>/images).
source_dir = Path('data')
splits = ['train', 'valid', 'test']
# Extensions are tried in this order; the first existing file wins.
image_exts = ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']


def _find_raw_image(stem):
    """Return the path of the raw image named *stem*, or None if none exists.

    Each extension in ``image_exts`` is tried against every split directory
    under ``source_dir`` (extension-major order), so the search is not
    restricted to the split the label came from.
    """
    for ext in image_exts:
        for src_split in splits:
            candidate = source_dir / src_split / 'images' / f"{stem}{ext}"
            if candidate.exists():
                return candidate
    return None


print("Creating baseline dataset (raw images, no preprocessing)...")
print("=" * 80)

for split in splits:
    src_img = preprocessed_dir / split / 'images'
    src_lbl = preprocessed_dir / split / 'labels'

    dst_img = baseline_dir / split / 'images'
    dst_lbl = baseline_dir / split / 'labels'

    # Destination folders are created even for a split that ends up skipped.
    dst_img.mkdir(parents=True, exist_ok=True)
    dst_lbl.mkdir(parents=True, exist_ok=True)

    if not src_img.exists():
        # Say so explicitly instead of skipping the split without a trace.
        print(f"  {split.upper()}: skipped ({src_img} not found)")
        continue

    # Copy the labels unchanged.
    label_files = list(src_lbl.glob('*.txt'))
    for lbl in tqdm(label_files, desc=f"  {split} labels"):
        shutil.copy2(lbl, dst_lbl / lbl.name)

    # Copy the raw image matching each label (same file stem).
    missing = []
    for lbl in tqdm(label_files, desc=f"  {split} images"):
        src_path = _find_raw_image(lbl.stem)
        if src_path is None:
            # Remember the label so the mismatch is reported below rather
            # than silently producing a label without an image.
            missing.append(lbl.stem)
            continue
        shutil.copy2(src_path, dst_img / src_path.name)

    img_count = len(list(dst_img.glob('*')))
    lbl_count = len(list(dst_lbl.glob('*.txt')))
    print(f"  {split.upper()}: {img_count:,} images, {lbl_count:,} labels")

    if missing:
        preview = ', '.join(missing[:5])
        suffix = ', ...' if len(missing) > 5 else ''
        print(f"  WARNING: {len(missing):,} label(s) without a raw image: {preview}{suffix}")

print("\n✓ Baseline dataset created")
print("=" * 80)

In [None]:
# Write the dataset description file (data.yaml) for the baseline dataset,
# then echo it back so the result can be checked in the cell output.
baseline_yaml = dict(
    path=str(baseline_dir.absolute()),
    train='train/images',
    val='valid/images',
    test='test/images',
    nc=2,
    names=['Aortic enlargement', 'Cardiomegaly'],
)

baseline_yaml_path = baseline_dir / 'data.yaml'
with baseline_yaml_path.open('w') as f:
    # sort_keys=False keeps the keys in the order they are declared above.
    yaml.dump(baseline_yaml, f, default_flow_style=False, sort_keys=False)

print(f"✓ Created {baseline_yaml_path}")
print(baseline_yaml_path.read_text())