# Download dataset (diurno/noturno)
Este notebook baixa o dataset a partir de um link compartilhado do Google Drive e extrai imagens e labels
para a estrutura YOLO em `data/diurno` ou `data/noturno`, conforme o valor de `SPLIT`.


In [None]:
from pathlib import Path
import sys
import subprocess
import shutil
import zipfile
import tempfile
from collections import Counter

# Resolve the project root: when the notebook is executed from a folder named
# 'notebooks' (case-insensitive) the root is its parent, otherwise it is the
# current working directory itself.
CWD = Path.cwd()
PROJECT_ROOT = CWD.parent if CWD.name.lower() == 'notebooks' else CWD

# ==== CONFIG ====
SHARED_LINK = 'https://drive.google.com/drive/folders/1Dz9A3vgyRTUeuQL8MQRMezptnFZnpAkO?usp=sharing'  # paste the shared Drive link here
DOWNLOAD_ROOT = PROJECT_ROOT / 'downloads'
DATASET_ROOT = None  # optional: PROJECT_ROOT / 'Estagio' / 'Lucas' / 'dataset'

SPLIT = 'diurno'  # 'diurno' or 'noturno'
VERSION = 'v2'  # e.g. 'v0', 'v1' (use None to only list the available versions)

OUTPUT_BASE = PROJECT_ROOT / 'data'
OVERWRITE_OUTPUT = True
DRY_RUN = False  # True = do not extract any files

In [None]:
# gdown is installed on demand. Only ImportError is handled: any other failure
# while importing would not be fixed by `pip install` and should surface as-is.
try:
    import gdown  # type: ignore
except ImportError:
    print('Instalando gdown...')
    # Install with the interpreter that runs this notebook so the package lands
    # in the same environment, then retry the import.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'gdown'])
    import gdown  # type: ignore

In [None]:
def is_drive_folder(link: str) -> bool:
    """Return True when ``link`` looks like a Google Drive *folder* URL.

    The check is purely textual: the link is treated as a folder when it
    contains either of the substrings ``drive/folders`` or ``folders/``.
    """
    folder_markers = ('drive/folders', 'folders/')
    return any(marker in link for marker in folder_markers)

def download_from_drive(link: str, output_dir: Path):
    """Download a shared Google Drive link into ``output_dir`` using gdown.

    Args:
        link: Shared Drive URL. An empty value skips the download entirely
            (a notice is printed and nothing is created on disk).
        output_dir: Destination directory; created if it does not exist.

    Folder links (as decided by ``is_drive_folder``) are fetched with
    ``gdown.download_folder``; any other link is fetched with
    ``gdown.download`` into ``output_dir / 'drive_download'``.
    """
    if not link:
        print('SHARED_LINK vazio. Pule o download se os arquivos ja estiverem no disco.')
        return

    output_dir.mkdir(parents=True, exist_ok=True)

    # Single-file links are handled first so the folder case can fall through.
    if not is_drive_folder(link):
        target_file = output_dir / 'drive_download'
        gdown.download(url=link, output=str(target_file), quiet=False, fuzzy=True)
        return

    gdown.download_folder(
        url=link,
        output=str(output_dir),
        quiet=False,
        use_cookies=False,
        remaining_ok=True,
    )

def is_dataset_root(path: Path) -> bool:
    """Return True when ``path`` is a directory holding a split folder.

    A split folder is a direct sub-directory named ``diurno`` or ``noturno``
    (compared case-insensitively). Non-directories always yield False.
    """
    if not path.is_dir():
        return False
    for entry in path.iterdir():
        if entry.is_dir() and entry.name.lower() in ('diurno', 'noturno'):
            return True
    return False

def get_child_dir(path: Path, name: str):
    """Return the first direct sub-directory of ``path`` matching ``name``.

    The child's name is lower-cased before the comparison while ``name`` is
    used as given, so callers are expected to pass a lower-case ``name``.
    Returns None when no sub-directory matches.
    """
    matching = (
        entry
        for entry in path.iterdir()
        if entry.is_dir() and entry.name.lower() == name
    )
    return next(matching, None)

def find_dataset_root(download_root: Path):
    """Locate the dataset folder (the one holding the split folders).

    ``download_root`` itself is returned when it already qualifies according
    to ``is_dataset_root``. Otherwise every directory below it is inspected
    and the best-ranked candidate is returned; when several candidates tie,
    the one discovered first wins. Returns None when nothing qualifies.

    Ranking: one point per split folder present (``diurno`` / ``noturno``)
    plus one extra point when that split contains at least one ``v*`` entry.
    """
    if is_dataset_root(download_root):
        return download_root

    split_names = ('diurno', 'noturno')
    found = []
    for entry in download_root.rglob('*'):
        if not entry.is_dir():
            continue
        if is_dataset_root(entry):
            found.append(entry)
        elif entry.name.lower() in split_names and entry.parent not in found:
            # A split folder was reached directly: its parent is a candidate.
            found.append(entry.parent)

    if not found:
        return None

    def rank(candidate: Path) -> int:
        points = 0
        for split in split_names:
            split_dir = get_child_dir(candidate, split)
            if split_dir:
                points += 1
                if any(split_dir.glob('v*')):
                    points += 1
        return points

    # max() keeps the first of equally ranked candidates, exactly like taking
    # element 0 of a stable descending sort.
    return max(found, key=rank)

# Fetch the shared Drive content into the local download folder.
download_from_drive(SHARED_LINK, DOWNLOAD_ROOT)

# A manually configured DATASET_ROOT takes precedence; otherwise search the
# download folder for it.
DATASET_ROOT = find_dataset_root(DOWNLOAD_ROOT) if DATASET_ROOT is None else DATASET_ROOT
if DATASET_ROOT is None:
    raise RuntimeError('Nao encontrei a pasta dataset. Ajuste DATASET_ROOT manualmente.')

print('DATASET_ROOT:', DATASET_ROOT)


In [None]:
def find_split_dir(dataset_root: Path, split_name: str) -> Path:
    """Return the sub-directory of ``dataset_root`` for the given split.

    The match on the directory name is case-insensitive. When no directory
    matches, ``dataset_root / split_name`` is returned (it may not exist),
    leaving the existence check to the caller.
    """
    wanted = split_name.lower()
    matching = (
        entry
        for entry in dataset_root.iterdir()
        if entry.is_dir() and entry.name.lower() == wanted
    )
    return next(matching, dataset_root / split_name)

# Locate the requested split folder and stop early when it is missing.
split_root = find_split_dir(DATASET_ROOT, SPLIT)
if not split_root.exists():
    raise RuntimeError(f'Nao encontrei split: {split_root}')

# Show every version folder (directories matching 'v*') found in the split.
versions = sorted(entry.name for entry in split_root.glob('v*') if entry.is_dir())
print('Versoes disponiveis:', versions)

# With VERSION unset the cell only lists the versions and then stops.
if VERSION is None:
    raise SystemExit('Defina VERSION e execute novamente.')

version_dir = split_root / VERSION
if not version_dir.exists():
    raise RuntimeError(f'Versao nao encontrada: {version_dir}')

# Each version is expected to provide both an imgs/ and a labels/ folder.
imgs_dir, labels_dir = version_dir / 'imgs', version_dir / 'labels'
if not (imgs_dir.exists() and labels_dir.exists()):
    raise RuntimeError('Esperado imgs/ e labels/ dentro da versao.')

print('Usando:', version_dir)


In [None]:
# Output layout: <OUTPUT_BASE>/<SPLIT>/images and <OUTPUT_BASE>/<SPLIT>/labels.
OUTPUT_DIR = OUTPUT_BASE / SPLIT
images_out = OUTPUT_DIR / 'images'
labels_out = OUTPUT_DIR / 'labels'

# A dry run must leave the disk untouched: previously an existing output folder
# was deleted even with DRY_RUN=True, destroying data without replacing it.
if not DRY_RUN:
    if OVERWRITE_OUTPUT and OUTPUT_DIR.exists():
        shutil.rmtree(OUTPUT_DIR)
    images_out.mkdir(parents=True, exist_ok=True)
    labels_out.mkdir(parents=True, exist_ok=True)

def extract_zip_to(zip_path: Path, dest_dir: Path):
    """Extract ``zip_path`` into ``dest_dir``, flattening a single root folder.

    The archive is first unpacked into a temporary directory. When its
    content is exactly one top-level directory, that directory is treated as
    the root so its files land directly in ``dest_dir``; otherwise the
    archive layout is kept as-is. Sub-directories are preserved and created
    on demand.

    The ``__MACOSX`` metadata folder added by the macOS archiver is ignored:
    it would otherwise count as a second top-level entry (disabling the
    flattening) and its ``._*`` companion files would be moved to the output
    next to the real images/labels.

    Args:
        zip_path: Path of the .zip archive to extract.
        dest_dir: Directory receiving the extracted files.
    """
    macos_metadata = '__MACOSX'
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(tmp_path)

        # Decide the effective root without counting the metadata folder.
        entries = [entry for entry in tmp_path.iterdir() if entry.name != macos_metadata]
        root = tmp_path
        if len(entries) == 1 and entries[0].is_dir():
            root = entries[0]

        for src in root.rglob('*'):
            if not src.is_file():
                continue
            rel = src.relative_to(root)
            if macos_metadata in rel.parts:
                continue
            dst = dest_dir / rel
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(src), dst)

# Collect the archives of the selected version in a deterministic order.
img_zips = sorted(imgs_dir.glob('*.zip'))
lbl_zips = sorted(labels_dir.glob('*.zip'))

print('img zips:', [archive.name for archive in img_zips])
print('label zips:', [archive.name for archive in lbl_zips])

# Both kinds of archive are required.
if not (img_zips and lbl_zips):
    raise RuntimeError('Nao encontrei .zip em imgs/ ou labels/.')

if not DRY_RUN:
    # Images first, then labels, each group into its own output folder.
    for archives, target in ((img_zips, images_out), (lbl_zips, labels_out)):
        for archive in archives:
            extract_zip_to(archive, target)
    print('Extracao concluida em:', OUTPUT_DIR)
else:
    print('DRY_RUN=True: sem extracao')

In [None]:
# File suffixes (lower-case, with the dot) that are treated as images.
IMG_EXTS = {'.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp'}

def list_images(root: Path):
    """Return every path under ``root`` (recursively) with an image suffix.

    The suffix comparison against ``IMG_EXTS`` is case-insensitive.
    """
    candidates = root.rglob('*')
    return [entry for entry in candidates if entry.suffix.lower() in IMG_EXTS]

def list_labels(root: Path):
    """Return every ``*.txt`` path found under ``root`` (recursively)."""
    return list(root.rglob('*.txt'))

# Consistency report for the extracted dataset; skipped on a dry run because
# nothing was extracted.
if not DRY_RUN:
    images = list_images(images_out)
    labels = list_labels(labels_out)

    # Images and labels are paired by file stem (name without the suffix).
    image_stems = {img.stem for img in images}
    label_stems = {lbl.stem for lbl in labels}
    missing_labels = sorted(image_stems - label_stems)
    missing_images = sorted(label_stems - image_stems)

    # A label file containing only whitespace is counted as background.
    empty_labels = sum(
        1
        for label_file in labels
        if not label_file.read_text(encoding='utf-8', errors='ignore').strip()
    )

    print(f'Imagens: {len(images)}')
    print(f'Labels: {len(labels)}')
    print(f'Labels vazios (background): {empty_labels}')
    print(f'Imagens sem label: {len(missing_labels)}')
    print(f'Labels sem imagem: {len(missing_images)}')

    # Show at most five examples of each kind of mismatch.
    for title, stems in (
        ('Exemplo imagens sem label:', missing_labels),
        ('Exemplo labels sem imagem:', missing_images),
    ):
        sample = stems[:5]
        if sample:
            print(title, sample)