In [None]:
import h5py
import numpy as np
from pathlib import Path
from tqdm import tqdm
from PIL import Image
import random

In [None]:
def get_dataset_image_filenames(data_dir, split, class_map, generated_class_name):
    """Collect the PNG files and integer labels of every non-generated class.

    Args:
        data_dir: Root directory of the dataset; images are read from
            ``<data_dir>/<split>/<class_name>/``.
        split: Name of the split sub-directory (e.g. ``"train"``).
        class_map: Mapping from class (directory) name to integer label.
        generated_class_name: Class that is skipped here; pass a name that is
            absent from ``class_map`` to keep every class.

    Returns:
        A ``(filenames, labels)`` tuple of two equally long lists: ``Path``
        objects and the matching class ids. Classes appear in ``class_map``
        order and files are sorted within each class.

    Raises:
        FileNotFoundError: If a class directory does not exist.
    """
    all_image_filenames = []
    all_labels = []

    for class_name, class_id in class_map.items():
        if class_name == generated_class_name:
            continue

        class_dir = Path(data_dir, split, class_name)
        # iterdir() yields entries in arbitrary, filesystem-dependent order;
        # sorting keeps the resulting dataset layout reproducible.
        image_filenames = sorted(
            entry
            for entry in class_dir.iterdir()
            if entry.is_file() and entry.suffix == ".png"
        )
        all_image_filenames.extend(image_filenames)
        all_labels.extend([class_id] * len(image_filenames))

    return all_image_filenames, all_labels

def get_generated_image_filenames(
    gen_data_dir, generated_class_id, n_gen_images, n_gen_samples_per_image, seed=None
):
    """Randomly pick generated PNG samples from the sub-directories of ``gen_data_dir``.

    Every sub-directory of ``gen_data_dir`` is treated as one generated image
    that holds several PNG samples.

    Args:
        gen_data_dir: Directory containing one sub-directory per generated image.
        generated_class_id: Integer label assigned to every returned file.
        n_gen_images: Number of sub-directories to draw at random, or the
            string ``"all"`` to use every sub-directory.
        n_gen_samples_per_image: Number of PNG files drawn (without
            replacement) from each selected sub-directory.
        seed: Optional seed for a private random generator. When ``None`` the
            module-level ``random`` generator is used, so ``random.seed(...)``
            set by the caller still controls the draw.

    Returns:
        A ``(filenames, labels)`` tuple of two equally long lists: the drawn
        ``Path`` objects and ``generated_class_id`` repeated once per file.

    Raises:
        ValueError: If more sub-directories are requested than exist, or a
            selected sub-directory holds fewer PNG files than requested.
    """
    rng = random if seed is None else random.Random(seed)

    # iterdir() order is arbitrary; sorting the populations makes a seeded
    # draw pick the same files on every machine and every run.
    gen_dirs = sorted(
        entry for entry in Path(gen_data_dir).iterdir() if entry.is_dir()
    )

    if n_gen_images == "all":
        gen_dirs_subset = gen_dirs
    else:
        if n_gen_images > len(gen_dirs):
            raise ValueError(
                f"n_gen_images={n_gen_images} requested, but {gen_data_dir} "
                f"contains only {len(gen_dirs)} sub-directories"
            )
        gen_dirs_subset = rng.sample(gen_dirs, n_gen_images)

    all_image_filenames = []
    for gen_dirname in gen_dirs_subset:
        gen_image_filenames = sorted(
            entry
            for entry in gen_dirname.iterdir()
            if entry.is_file() and entry.suffix == ".png"
        )
        if len(gen_image_filenames) < n_gen_samples_per_image:
            raise ValueError(
                f"{gen_dirname} contains {len(gen_image_filenames)} .png file(s), "
                f"but n_gen_samples_per_image={n_gen_samples_per_image}"
            )
        all_image_filenames.extend(
            rng.sample(gen_image_filenames, n_gen_samples_per_image)
        )

    # Derive the labels from what was actually collected so both lists always agree.
    all_labels = [generated_class_id] * len(all_image_filenames)

    return all_image_filenames, all_labels
         
def save_h5_dataset_file(h5_path, image_filenames, labels, split, target_size):
    """Resize the given images and store them with labels and file names in an HDF5 file.

    The file is written with three datasets:

    * ``images``    - uint8, shape ``(N, 3, target_size[0], target_size[1])``, gzip-compressed
    * ``labels``    - int32, shape ``(N,)``
    * ``filenames`` - UTF-8 strings, shape ``(N,)``, base names only

    Args:
        h5_path: Destination file; it is opened in ``"w"`` mode.
        image_filenames: ``Path`` objects of the images to store.
        labels: Integer class ids, one per image, in the same order.
        split: Split name, used only in the progress message.
        target_size: ``(height, width)`` of the stored images. For square
            sizes this is indistinguishable from ``(width, height)``.

    Raises:
        ValueError: If ``image_filenames`` and ``labels`` differ in length.
    """
    n_images = len(image_filenames)
    if n_images != len(labels):
        # zip() below would silently truncate and leave zero-filled rows behind.
        raise ValueError(
            f"Got {n_images} image files but {len(labels)} labels; "
            "both lists must have the same length"
        )

    height, width = target_size[0], target_size[1]

    with h5py.File(h5_path, "w") as dataset_file:
        str_h5 = h5py.string_dtype(encoding="utf-8")
        labels_h5 = dataset_file.create_dataset(
            "labels", shape=(len(labels),), dtype=np.int32
        )
        filenames_h5 = dataset_file.create_dataset(
            "filenames", shape=(n_images,), dtype=str_h5
        )
        imgs_h5 = dataset_file.create_dataset(
            "images",
            shape=(n_images, 3, height, width),
            dtype=np.uint8,
            compression="gzip",
        )
        print(f"{split} dataset:", n_images, "images")
        for i, (file, label) in enumerate(
            tqdm(zip(image_filenames, labels), total=n_images)
        ):
            # The context manager releases the underlying file handle promptly.
            with Image.open(file) as source_img:
                # PIL's resize() expects (width, height), whereas the dataset
                # is laid out as (height, width); swap so non-square sizes fit.
                img = source_img.convert("RGB").resize((width, height), Image.BICUBIC)
            img_arr = np.array(img, dtype=np.uint8)  # (height, width, 3)
            imgs_h5[i] = np.transpose(img_arr, (2, 0, 1))  # -> (3, height, width)
            labels_h5[i] = label
            # Only the base name is stored; the parent directory is dropped.
            filenames_h5[i] = file.name

def save_generated_dataset(
    data_dir,
    out_dir,
    generated_data_dir,
    generated_class_name,
    n_gen_images,
    n_gen_samples_per_image,
    split,
    class_map,
    target_size=(68, 68),
    prefix=None,
):
    """Build an HDF5 dataset in which one class is replaced by generated images.

    Images of every class in ``class_map`` except ``generated_class_name`` are
    taken from ``<data_dir>/<split>/<class_name>/``; the images of
    ``generated_class_name`` are drawn from ``generated_data_dir`` instead and
    labelled with ``class_map[generated_class_name]``. The result is written to
    ``<out_dir>/[<prefix>_]<split>_data.h5``.

    Args:
        data_dir: Root directory of the source dataset.
        out_dir: Directory for the output file; created if it does not exist.
        generated_data_dir: Directory with one sub-directory per generated image.
        generated_class_name: Key of ``class_map`` whose images are generated.
        n_gen_images: Number of generated-image directories to use, or ``"all"``.
        n_gen_samples_per_image: Number of samples drawn from each directory.
        split: Split name (sub-directory of ``data_dir`` and part of the file name).
        class_map: Mapping from class name to integer label.
        target_size: Size the images are resized to before storing.
        prefix: Optional prefix prepended to the output file name.
    """
    h5_filename = f"{split}_data.h5"
    if prefix is not None:
        h5_filename = prefix + "_" + h5_filename

    h5_path = Path(out_dir, h5_filename)

    filenames, labels = get_dataset_image_filenames(
        data_dir, split, class_map, generated_class_name
    )
    print("Dataset images: ", len(filenames))

    gen_filenames, gen_labels = get_generated_image_filenames(
        generated_data_dir,
        class_map[generated_class_name],
        n_gen_images,
        n_gen_samples_per_image,
    )
    print("Generated images: ", len(gen_filenames))

    filenames += gen_filenames
    labels += gen_labels

    # The HDF5 file cannot be created inside a missing directory, so make sure
    # the destination exists only once all inputs were gathered successfully.
    h5_path.parent.mkdir(parents=True, exist_ok=True)

    save_h5_dataset_file(h5_path, filenames, labels, split, target_size)
    
    
    
    
    
    
    

In [None]:

# Build the binary training set in which the "Megakariocyty" class is filled
# with generated images instead of the tiles from the source dataset.
DATA = r"C:\Users\MS\Desktop\IDEAS\Projekty\CD34\Dane\CD34_tiles_68x68_zestaw_1.2"
GEN = r"C:\Users\MS\Desktop\IDEAS\Projekty\CD34\Dane\CD34_stable_diffusion\images_megokaryocytes_1_2_retrained_b_1"
OUT = r"..\data\generated"

save_generated_dataset(
    data_dir=DATA,
    out_dir=OUT,
    generated_data_dir=GEN,
    generated_class_name="Megakariocyty",
    n_gen_images="all",  # use every generated-image directory
    n_gen_samples_per_image=1,
    split="train",
    # Only these two classes take part in the binary setup; the remaining
    # class folders (komorki, Naczynia, Artefakty, ...) are left out on purpose.
    class_map={"Megakariocyty": 0, "Mieloblasty": 1},
    target_size=(68, 68),
    prefix="binary",
)

In [None]:
# Print how many PNG tiles every class contains in each split of the source dataset.
DATA = r"C:\Users\MS\Desktop\IDEAS\Projekty\CD34\Dane\CD34_tiles_68x68_zestaw_1.2"
split = ["train", "test"]
class_map = {"komorki": 0, "Megakariocyty": 1, "Mieloblasty": 2}

for sp in split:
    for class_name in class_map:
        class_dir = Path(DATA, sp, class_name)
        files = [entry for entry in class_dir.iterdir() if entry.suffix == ".png"]
        print(sp, class_name, len(files))

In [None]:
# One-off environment setup: adds the nbformat package to the uv-managed project.
# NOTE(review): presumably required so that plotly's fig.show() can render inside
# the notebook - confirm, and drop this cell once the dependency is recorded.
!uv add nbformat

In [None]:
import plotly.graph_objects as go

# Number of images per class in the train and test splits (hard-coded).
# NOTE(review): presumably copied from the output of the counting cell -
# update by hand if the dataset changes.
categories = ["komorki", "Megakariocyty", "Mieloblasty"]
values_train = [7125, 59, 15831]
values_test = [3540, 22, 2291]

# One bar series per split; the legend names say which split a bar belongs to
# (the previous "Model 1"/"Model 2" names were leftovers from a template).
fig = go.Figure(data=[
    go.Bar(name="train", x=categories, y=values_train),
    go.Bar(name="test", x=categories, y=values_test),
])

# Draw the two splits side by side for every class.
fig.update_layout(
    barmode="group",
    title="Number of images per class and split",
    xaxis_title="Class",
    yaxis_title="Number of images",
    plot_bgcolor="white",
)

fig.show()



In [None]:
import pandas as pd
import plotly.express as px

# Long-format table of class counts: one row per (subset, class) pair,
# the three training rows first, followed by the three test rows.
df = pd.DataFrame(
    {
        "Klasa": [
            "Komórki", "Megakariocyty", "Mieloblasty",
            "Komórki", "Megakariocyty", "Mieloblasty",
        ],
        "Liczność": [7125, 59, 15831, 3540, 22, 2291],
        "Podzbiór": [
            "treningowy", "treningowy", "treningowy",
            "testowy", "testowy", "testowy",
        ],
    }
)

# Grouped bars: one group per subset, one coloured bar per class,
# with the count printed on each bar.
fig = px.bar(
    df,
    x="Podzbiór",
    y="Liczność",
    color="Klasa",
    barmode="group",
    text="Liczność",
)
fig.show()

In [None]:
import torch

# Scratch check: L2-normalise every row of a random (B, E) matrix
# whose entries are scaled by a factor of 10.
B, E = 2, 5
test = 10.0 * torch.rand(B, E)

# Last expression of the cell, so the normalised tensor is shown as its output.
torch.nn.functional.normalize(test, p=2.0, dim=1)