In [14]:
import os
import numpy as np
import tensorflow as tf
from PIL import Image
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../utils')

import os
import sys

# Dynamically add 'src' to path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(os.path.join(project_root, 'src'))

#from utils.data import select_tomo_ids
#from utils.model_plot import plot_history_regression
#from ml_logic.preprocess import batches_images_ram,selection_images_labels,read_img_jpg
#from utils.image_padding import padd_picture


### 1. Pad and Save 2D Slices
def pad_image_to_square(image_path, save_path, target_size=960):
    img = Image.open(image_path).convert('RGB')
    padded_img = Image.new('RGB', (target_size, target_size), (255, 255, 255))
    padded_img.paste(img, (0, 0))
    padded_img.save(save_path)


def pad_all_tomos(raw_root, processed_root, tomo_ids, target_size=960):
    for tid in tomo_ids:
        in_dir = os.path.join(raw_root, tid)
        out_dir = os.path.join(processed_root, tid)
        os.makedirs(out_dir, exist_ok=True)
        for fname in os.listdir(in_dir):
            if fname.endswith('.jpg') or fname.endswith('.png'):
                pad_image_to_square(os.path.join(in_dir, fname), os.path.join(out_dir, fname), target_size)


### 2. Load Slices for a Single Tomogram
def load_stack(tomo_folder, n_slices=300, img_size=(960, 960)):
    slice_files = sorted([f for f in os.listdir(tomo_folder) if f.endswith('.jpg') or f.endswith('.png')])[:n_slices]
    stack = []
    for fname in slice_files:
        img = Image.open(os.path.join(tomo_folder, fname)).resize(img_size).convert('RGB')
        stack.append(np.array(img, dtype=np.float32) / 255.0)

    if len(stack) < n_slices:
        pad_img = np.ones((img_size[0], img_size[1], 3), dtype=np.float32)
        stack += [pad_img] * (n_slices - len(stack))

    return np.stack(stack, axis=0)  # Shape: (n_slices, H, W, 3)


### 3. Load All Tomograms
def load_dataset(tomo_paths, labels, n_slices=300, img_size=(960, 960)):
    X, y = [], []
    for path, label in zip(tomo_paths, labels):
        volume = load_stack(path, n_slices, img_size)
        X.append(volume)
        y.append(label)
    return np.array(X), np.array(y)


### 4. Create Train/Val/Test Sets
#def create_tf_datasets(X, y, batch_size=2, val_fraction=0.1, test_fraction=0.1):
#    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=test_fraction, random_state=42)
#    val_size = val_fraction / (1 - test_fraction)
#    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=val_size,  random_state=42)

#    def to_dataset(X, y):
#        return tf.data.Dataset.from_tensor_slices((X, y)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
#
#    return to_dataset(X_train, y_train), to_dataset(X_val, y_val), to_dataset(X_test, y_test)

from math import ceil

def create_tf_datasets(X, y, batch_size=2, val_fraction=0.2, test_fraction=0.2):
    total = len(X)
    n_test = max(1, int(total * test_fraction))
    n_val = max(1, int(total * val_fraction))
    n_train = total - n_val - n_test

    # Manual split for small dataset
    X, y = np.array(X), np.array(y)
    indices = np.arange(total)
    np.random.seed(42)
    np.random.shuffle(indices)

    X_train, y_train = X[indices[:n_train]], y[indices[:n_train]]
    X_val, y_val = X[indices[n_train:n_train + n_val]], y[indices[n_train:n_train + n_val]]
    X_test, y_test = X[indices[n_train + n_val:]], y[indices[n_train + n_val:]]

    def to_dataset(X, y):
        return tf.data.Dataset.from_tensor_slices((X, y)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return to_dataset(X_train, y_train), to_dataset(X_val, y_val), to_dataset(X_test, y_test)


In [15]:
import pandas as pd

path_train_csv= '../data/csv_raw/train_labels.csv'
path_image= '../data/pictures_process/train_process'
df = pd.read_csv(path_train_csv).copy()
df.head(10)

Unnamed: 0.1,Unnamed: 0,row_id,tomo_id,Motor_axis_0,Motor_axis_1,Motor_axis_2,Array_shape_axis_0,Array_shape_axis_1,Array_shape_axis_2,Voxel_spacing,Number_of_motors
0,0,16,tomo_049310,-1.0,-1.0,-1.0,500,924,956,19.7,0
1,1,30,tomo_098751,-1.0,-1.0,-1.0,500,924,956,16.1,0
2,2,54,tomo_136c8d,-1.0,-1.0,-1.0,500,924,956,19.7,0
3,3,59,tomo_146de2,-1.0,-1.0,-1.0,500,924,956,16.1,0
4,4,84,tomo_1dc5f9,-1.0,-1.0,-1.0,500,924,956,19.7,0
5,5,121,tomo_28f9c1,-1.0,-1.0,-1.0,500,924,956,16.1,0
6,6,173,tomo_39b15b,-1.0,-1.0,-1.0,500,924,956,16.1,0
7,7,181,tomo_3b8291,-1.0,-1.0,-1.0,500,924,956,19.7,0
8,8,193,tomo_40b215,-1.0,-1.0,-1.0,500,924,956,16.1,0
9,9,219,tomo_4baff0,-1.0,-1.0,-1.0,500,924,956,16.1,0


In [16]:
import os
import numpy as np
from PIL import Image
from pathlib import Path
def compute_mean_slices(input_dir, output_dir, group_size=10):
    os.makedirs(output_dir, exist_ok=True)

    # Get and sort all slice file names
    slices = sorted(f for f in os.listdir(input_dir) if f.endswith('.jpg'))

    for i in range(0, len(slices), group_size):
        group = slices[i:i + group_size]
        imgs = [np.array(Image.open(os.path.join(input_dir, f)), dtype=np.float32) for f in group]
        mean_img = np.mean(imgs, axis=0).astype(np.uint8)

        out_name = f'mean_{i//group_size + 1:04d}.jpg'
        Image.fromarray(mean_img).save(os.path.join(output_dir, out_name))

In [38]:
from pathlib import Path
import os

#dir_image = '../data/pictures_process/train_process/'
dir_image = '../data/pictures_raw/train/'
dir_mean_par = '../data/pictures_process/train_process'
parent_dir = Path(dir_image)
subdirs = [p for p in parent_dir.iterdir() if p.is_dir()]

dir_mean = Path(dir_mean_par) / 'tomo_mean'
os.makedirs(dir_mean, exist_ok=True)

for dirr in subdirs:
    tomo_id = dirr.name
    output_dir = dir_mean / tomo_id
    os.makedirs(output_dir, exist_ok=True)
    compute_mean_slices(str(dirr), str(output_dir), group_size=10)

In [None]:
### count how many tomo_ids have 300 slices

root_folder = '../data/pictures_raw/train'
target_count = 300
valid_tomos = []

# Loop through each tomo_id directory
for tomo_dir_name in os.listdir(root_folder):
    tomo_dir = os.path.join(root_folder, tomo_dir_name)

    if not os.path.isdir(tomo_dir):
        continue

    # Count only .jpg files
    jpg_count = len([
        f for f in os.listdir(tomo_dir)
        if f.lower().endswith('.jpg')
    ])

    if jpg_count == target_count:
        valid_tomos.append(tomo_dir_name)

len(valid_tomos)

428

In [40]:
from pathlib import Path
from PIL import Image
import numpy as np
import skimage.io as img
from skimage.exposure import equalize_adapthist
import os

# Paths and parameters
train_folder = Path('../data/pictures_process/train_process')
root_folder = train_folder / 'tomo_mean'
clip_sizes = [1]  # clip limits

#for tomo_dir_name in os.listdir(root_folder):
for tomo_dir_name in valid_tomos:
    tomo_dir = root_folder / tomo_dir_name

    if not tomo_dir.is_dir():
        continue

    for mean_picture_name in os.listdir(tomo_dir):
        mean_picture_path = tomo_dir / mean_picture_name

        # Skip non-images
        if not mean_picture_name.lower().endswith('.jpg'):
            continue

        print(f"Reading: {mean_picture_path}")
        tomo_mean = img.imread(mean_picture_path)

        for clip_size in clip_sizes:
            # Prepare paths
            safe_clip = str(clip_size).replace('.', '')
            output_root = train_folder / f'tomo_adaptequal_{safe_clip}'
            output_dir = output_root / tomo_dir_name

            output_dir.mkdir(parents=True, exist_ok=True)

            # Apply contrast adaptation
            adapt_equalized_picture = equalize_adapthist(tomo_mean, clip_limit=clip_size)
            adapt_equalized_uint8 = (adapt_equalized_picture * 255).astype(np.uint8)

            # Save image with original filename
            output_path = output_dir / mean_picture_name
            im = Image.fromarray(adapt_equalized_uint8)
            im.save(output_path)

            print(f"Saved: {output_path}")

Reading: ../data/pictures_process/train_process/tomo_mean/tomo_c11e12/mean_0006.jpg
Saved: ../data/pictures_process/train_process/tomo_adaptequal_1/tomo_c11e12/mean_0006.jpg
Reading: ../data/pictures_process/train_process/tomo_mean/tomo_c11e12/mean_0012.jpg
Saved: ../data/pictures_process/train_process/tomo_adaptequal_1/tomo_c11e12/mean_0012.jpg
Reading: ../data/pictures_process/train_process/tomo_mean/tomo_c11e12/mean_0013.jpg
Saved: ../data/pictures_process/train_process/tomo_adaptequal_1/tomo_c11e12/mean_0013.jpg
Reading: ../data/pictures_process/train_process/tomo_mean/tomo_c11e12/mean_0007.jpg
Saved: ../data/pictures_process/train_process/tomo_adaptequal_1/tomo_c11e12/mean_0007.jpg
Reading: ../data/pictures_process/train_process/tomo_mean/tomo_c11e12/mean_0011.jpg
Saved: ../data/pictures_process/train_process/tomo_adaptequal_1/tomo_c11e12/mean_0011.jpg
Reading: ../data/pictures_process/train_process/tomo_mean/tomo_c11e12/mean_0005.jpg
Saved: ../data/pictures_process/train_process/

In [26]:
df_select = df[df['tomo_id'].isin(subdirs)]
df_select

Unnamed: 0.1,Unnamed: 0,row_id,tomo_id,Motor_axis_0,Motor_axis_1,Motor_axis_2,Array_shape_axis_0,Array_shape_axis_1,Array_shape_axis_2,Voxel_spacing,Number_of_motors


In [10]:

# Define your paths and labels
raw_root = "../data/pictures_raw/train"
processed_root = "../data/pictures_process/train_process"
tomo_ids = select_tomo_ids(df, number_of_slices=[300], number_of_motors=[0,1])
labels = [0, 1]  # Example corresponding labels


# Step 1: Pad slices
#pad_all_tomos(raw_root, processed_root, tomo_ids)

# Step 2: Build full paths
tomo_paths = [os.path.join(processed_root, tid) for tid in tomo_ids]
print(tomo_paths)


['../data/pictures_process/train_process/tomo_2dd6bd', '../data/pictures_process/train_process/tomo_3264bc', '../data/pictures_process/train_process/tomo_54e1a7', '../data/pictures_process/train_process/tomo_974fd4', '../data/pictures_process/train_process/tomo_b8595d', '../data/pictures_process/train_process/tomo_c3619a', '../data/pictures_process/train_process/tomo_d6e3c7', '../data/pictures_process/train_process/tomo_db2a10', '../data/pictures_process/train_process/tomo_e96200', '../data/pictures_process/train_process/tomo_fd41c4', '../data/pictures_process/train_process/tomo_285454', '../data/pictures_process/train_process/tomo_0308c5', '../data/pictures_process/train_process/tomo_0333fa', '../data/pictures_process/train_process/tomo_066095', '../data/pictures_process/train_process/tomo_06e11e', '../data/pictures_process/train_process/tomo_072a16', '../data/pictures_process/train_process/tomo_0c2749', '../data/pictures_process/train_process/tomo_0d4c9e', '../data/pictures_process/t

In [13]:
# Step 3: Load volumes
X, y = load_dataset(tomo_paths, labels)


In [None]:
# Step 4: Create train/val/test datasets
train_ds, val_ds, test_ds = create_tf_datasets(X, y)


2025-06-06 17:37:28.929596: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# Step 5: Print a batch shape
for volume, label in train_ds.take(1):
    print("Volume shape:", volume.shape)
    print("Label shape:", label.shape)