In [2]:
%load_ext autoreload
%autoreload 2

In [4]:
!pip install tensorflow-addons

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-macosx_10_14_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting typeguard<3.0.0,>=2.7
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.23.0 typeguard-2.13.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [40]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import tensorflow as tf
import numpy as np
from pathlib import Path
import os
import glob
import sys


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, cross_val_score
from tensorflow.keras import Sequential, Input, layers
from tensorflow_addons.metrics import FBetaScore
from tensorflow.keras.metrics import Recall, Precision

sys.path.append('../src')
from src.utils.data import get_csv_from_bq,select_tomo_ids
from src.utils.model_plot import plot_history_regression
from src.ml_logic.preprocess import batches_images_ram,selection_images_labels,read_img_jpg

In [None]:


def selection_images_labels(df, num_slices=[300], num_motors=[0,1]):

   # Step 3: Load parent directory and finding all subdirs
    dir_mean_image = '../data/pictures_process/train_process/'
    parent_dir = Path(dir_mean_image)
    subdirs = [str(p) for p in parent_dir.iterdir() if p.is_dir()]
    names = [p.name for p in parent_dir.iterdir() if p.is_dir()]

   # Step 1: Filter tomos
    #tomo_ids = 'tomo_0a8f05', 'tomo_0a180f', 'tomo_0c3a99', 'tomo_0c3d78'
    tomo_ids = select_tomo_ids(df, number_of_slices=num_slices, number_of_motors=num_motors)
    df_select = df[df['tomo_id'].isin(names)].copy()


    # Step 4: Match subdirectories and labels
    filtered_image_paths = []
    labels = []

    for _, row in df_select.iterrows():
        tomo_id = row['tomo_id']
        matched = [p for p in subdirs if tomo_id in os.path.basename(p)]

        if matched:
            filtered_image_paths.append(matched[0])  # If multiple, take the first
            labels.append(row['Number_of_motors'])
        else:
            print(f"⚠️ No image found for tomo_id: {tomo_id}")

    print(f"Matched {len(filtered_image_paths)} image-label pairs")

    labels = np.array(labels, dtype=np.float32)
    return filtered_image_paths, labels

# Define image reading function
def read_img_jpg(path, label):
    print(path)
    img = tf.io.read_file(path)

    #plt.imshow(img.numpy())
    img = tf.image.decode_jpeg(img, channels=3)
    print(img.shape)
   # plt.imshow(img.numpy())
    img = tf.cast(img, tf.float32) / 255.0  # normalize to [0, 1]
    return img, label

def batches_stack_images_ram(
    filtered_image_paths,
    labels,
    shuffle=True,
    batch_size=2,
    split=True,
    val_fraction=0.2,
    test_fraction=0.2,
    seed=42,
    xy_size=960,
    target_depth=800
):
    dataset_size = len(filtered_image_paths)
    data = list(zip(filtered_image_paths, labels))
    if shuffle:
        rng = np.random.default_rng(seed)
        rng.shuffle(data)

    filtered_image_paths, labels = zip(*data)
    filtered_image_paths = list(filtered_image_paths)
    labels = list(labels)

    if split:
        val_size = 2#int(val_fraction * dataset_size)
        test_size = 2#int(test_fraction * dataset_size)
        train_size = dataset_size - val_size - test_size

        test_paths = filtered_image_paths[:test_size]
        test_labels = labels[:test_size]
        val_paths = filtered_image_paths[test_size:test_size + val_size]
        val_labels = labels[test_size:test_size + val_size]
        train_paths = filtered_image_paths[test_size + val_size:]
        train_labels = labels[test_size + val_size:]

        train_ds = tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
        val_ds = tf.data.Dataset.from_tensor_slices((val_paths, val_labels))
        test_ds = tf.data.Dataset.from_tensor_slices((test_paths, test_labels))

        train_ds = train_ds.map(
            lambda x, y: load_stack_and_label(x, y, n_slices=target_depth, img_size=(xy_size, xy_size)),
            num_parallel_calls=tf.data.AUTOTUNE).batch(batch_size)

        val_ds = val_ds.map(
            lambda x, y: load_stack_and_label(x, y, n_slices=target_depth, img_size=(xy_size, xy_size)),
            num_parallel_calls=tf.data.AUTOTUNE).batch(batch_size)

        test_ds = test_ds.map(
            lambda x, y: load_stack_and_label(x, y, n_slices=target_depth, img_size=(xy_size, xy_size)),
            num_parallel_calls=tf.data.AUTOTUNE).batch(batch_size)

        return train_ds, val_ds, test_ds, test_paths, test_labels

    else:
        dataset = tf.data.Dataset.from_tensor_slices((filtered_image_paths, labels))
        dataset = dataset.map(lambda x, y: load_stack_and_label(x, y,n_slices=target_depth, img_size=(xy_size, xy_size)).batch(batch_size)
        return dataset, filtered_image_paths, labels


In [22]:
tomo_ids = select_tomo_ids(df, number_of_slices=[300], number_of_motors=[0,1])
print(tomo_ids.head(10))
df_select = df[df['tomo_id'].isin(tomo_ids)].copy()
df_select.sample(frac=1, random_state=42).head(10)

58     tomo_2dd6bd
60     tomo_3264bc
77     tomo_54e1a7
98     tomo_974fd4
114    tomo_b8595d
120    tomo_c3619a
131    tomo_d6e3c7
136    tomo_db2a10
142    tomo_e96200
154    tomo_fd41c4
Name: tomo_id, dtype: object


Unnamed: 0.1,Unnamed: 0,row_id,tomo_id,Motor_axis_0,Motor_axis_1,Motor_axis_2,Array_shape_axis_0,Array_shape_axis_1,Array_shape_axis_2,Voxel_spacing,Number_of_motors
335,335,717,tomo_f8b46e,123.0,728.0,352.0,300,928,928,13.1,1
477,477,297,tomo_651ecd,152.0,455.0,337.0,300,960,928,13.1,1
593,593,712,tomo_f76529,31.0,586.0,683.0,300,960,928,15.6,1
246,246,645,tomo_dbc66d,-1.0,-1.0,-1.0,300,960,928,16.8,0
222,222,412,tomo_935f8a,-1.0,-1.0,-1.0,300,960,928,16.8,0
580,580,647,tomo_dcb9b4,153.0,493.0,359.0,300,960,928,15.6,1
551,551,554,tomo_bebadf,143.0,360.0,410.0,300,960,928,13.1,1
402,402,732,tomo_fe050c,138.0,134.0,542.0,300,959,928,15.6,1
189,189,179,tomo_3b1cc9,-1.0,-1.0,-1.0,300,960,928,13.1,0
206,206,338,tomo_72b187,-1.0,-1.0,-1.0,300,960,928,13.1,0


In [None]:
tomo_ids

pandas.core.series.Series

In [28]:
import os
import re
import numpy as np
from PIL import Image
from src.utils.image_padding import padd_picture

def pad_stack_z_with_2d_padding(
    folder_path,
    padded_folder_path=None,
    xy_size=960,
    target_depth=300,
    pad_value=255
):
    """
    Pads 2D slices in a folder to (xy_size, xy_size) and stacks them into a 3D volume.
    Optionally saves padded images to disk if `padded_folder_path` is given.

    Returns:
    --------
    np.ndarray: padded stack of shape (target_depth, xy_size, xy_size, 3)
    """

    def padd_picture(image_path, size):
        img = Image.open(image_path).convert('RGB')
        new_img = Image.new('RGB', (size, size), (255, 255, 255))
        new_img.paste(img, (0, 0))
        return new_img

    def numeric_sort_key(filename):
        numbers = re.findall(r'\d+', filename)
        return int(numbers[0]) if numbers else -1

    # Get sorted image filenames
    image_files = sorted(
        [f for f in os.listdir(folder_path) if f.endswith(('.jpg', '.png'))],
        key=numeric_sort_key
    )

    padded_images = []

    for file in image_files:
        source_path = os.path.join(folder_path, file)
        padded_img = padd_picture(source_path, xy_size)

        # Save if a destination path is given
        if padded_folder_path:
            dest_path = os.path.join(padded_folder_path, file)
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            padded_img.save(dest_path)

        padded_images.append(np.array(padded_img))

    stack = np.stack(padded_images, axis=0)  # (Z, xy_size, xy_size, 3)
    current_depth = stack.shape[0]

    if current_depth > target_depth:
        raise ValueError(f"Stack depth {current_depth} exceeds target {target_depth}")

    # Pad in Z-axis
    pad_slices = target_depth - current_depth
    if pad_slices > 0:
        padding = np.full(
            (pad_slices, xy_size, xy_size, 3),
            fill_value=pad_value,
            dtype=stack.dtype
        )
        stack = np.concatenate([stack, padding], axis=0)

    return stack

In [38]:
padded_stack = pad_stack_z_with_2d_padding(
    folder_path='../data/pictures_raw/train/tomo_fd41c4',
    padded_folder_path='../data/pictures_process/train_process/tomo_fd41c4',
    xy_size=960,
    #target_depth=800,
    #pad_value=255  # white background
)

print(padded_stack.shape)  # (800, 960, 960, 3)

'''
tomo_2dd6bd
tomo_3264bc
tomo_54e1a7
tomo_974fd4
tomo_b8595d
tomo_c3619a
tomo_d6e3c7
tomo_db2a10
tomo_e96200
tomo_fd41c4
'''

(300, 960, 960, 3)


'\ntomo_2dd6bd\ntomo_3264bc\ntomo_54e1a7\ntomo_974fd4\ntomo_b8595d\ntomo_c3619a\ntomo_d6e3c7\ntomo_db2a10\ntomo_e96200\ntomo_fd41c4\n'

In [None]:
import os
import numpy as np
from PIL import Image

def load_n_slices_from_tomo(tomo_folder, label,  img_size=(960, 960)):
    """
    Load first n_slices images from a tomo folder, stack into a numpy array.

    Args:
        tomo_folder (str): Path to the tomo directory containing slices.

        img_size (tuple): Desired (H, W) size to resize images.

    Returns:
        np.ndarray: Array shape (n_slices, H, W, 3), dtype float32 normalized [0,1].
    """
    slice_files = sorted([
        f for f in os.listdir(tomo_folder)
        if f.endswith('.jpg') or f.endswith('.png')
    ])
    # Take only first n_slices or all if fewer
    slice_files = slice_files[:n_slices]

    slices = []
    for fname in slice_files:
        path = os.path.join(tomo_folder, fname)
        img = Image.open(path).convert('RGB').resize(img_size)
        arr = np.array(img, dtype=np.float32) / 255.0  # normalize to [0,1]
        slices.append(arr)
    # If fewer slices than n_slices, pad with white images
    if len(slices) < n_slices:
        n_pad = n_slices - len(slices)
        pad_img = np.ones((img_size[0], img_size[1], 3), dtype=np.float32)
        slices.extend([pad_img] * n_pad)

    stack = np.stack(slices, axis=0)  # shape: (n_slices, H, W, 3)
    return stack

In [None]:
load_n_slices_from_tomo(tomo_folder, 300, img_size=(960, 960))

In [138]:
def load_stack_and_label(tomo_folder, label, xy_size=960, target_depth=800):
    def _load(tomo_folder, label):
        stack = pad_stack_z_with_2d_padding(
            folder_path=tomo_folder.numpy().decode(),
            padded_folder_path=None,  # You can keep this None to avoid saving to disk
            xy_size=xy_size,
            target_depth=target_depth
        )
        return stack.astype(np.float32), np.float32(label)

    volume, label = tf.py_function(_load, [tomo_folder, label], [tf.float32, tf.float32])
    volume.set_shape((target_depth, xy_size, xy_size, 3))
    label.set_shape(())
    return volume, label


In [127]:
buffer_size = 8
batch_size = 2

dataset = tf.data.Dataset.from_tensor_slices((filtered_image_paths, labels))
dataset = dataset.map(load_stack_and_label, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.shuffle(buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [130]:
filtered_image_paths,labels = selection_images_labels(df, num_slices=[300,500,800], num_motors=[0,1])
filtered_image_paths,labels


Matched 8 image-label pairs


(['../data/pictures_process/train_process/tomo_0a180f',
  '../data/pictures_process/train_process/tomo_0c3d78',
  '../data/pictures_process/train_process/tomo_0c2749',
  '../data/pictures_process/train_process/tomo_0d4c9e',
  '../data/pictures_process/train_process/tomo_0a8f05',
  '../data/pictures_process/train_process/tomo_0da370',
  '../data/pictures_process/train_process/tomo_0c3a99',
  '../data/pictures_process/train_process/tomo_2a6ca2'],
 array([0., 0., 0., 0., 1., 1., 1., 1.], dtype=float32))

In [142]:
train_ds, val_ds, test_ds, test_paths, test_labels = batches_stack_images_ram(filtered_image_paths,
    labels,
    shuffle=True,
    batch_size=2,
    split=True,
    val_fraction=0.2,
    test_fraction=0.2,
    seed=42,
    xy_size=960,
    target_depth=800)


In [None]:
for volume, label in train_ds.take(1):
    print("Volume batch shape:", volume.shape)
    print("Label batch shape:", label.shape)

: 

In [13]:
path_train_csv= '../data/csv_raw/train_labels.csv'
path_image= '../data/pictures_process/train_process'

In [14]:
df = pd.read_csv('../data/csv_raw/train_labels.csv').copy()
df.head(10)

Unnamed: 0.1,Unnamed: 0,row_id,tomo_id,Motor_axis_0,Motor_axis_1,Motor_axis_2,Array_shape_axis_0,Array_shape_axis_1,Array_shape_axis_2,Voxel_spacing,Number_of_motors
0,0,16,tomo_049310,-1.0,-1.0,-1.0,500,924,956,19.7,0
1,1,30,tomo_098751,-1.0,-1.0,-1.0,500,924,956,16.1,0
2,2,54,tomo_136c8d,-1.0,-1.0,-1.0,500,924,956,19.7,0
3,3,59,tomo_146de2,-1.0,-1.0,-1.0,500,924,956,16.1,0
4,4,84,tomo_1dc5f9,-1.0,-1.0,-1.0,500,924,956,19.7,0
5,5,121,tomo_28f9c1,-1.0,-1.0,-1.0,500,924,956,16.1,0
6,6,173,tomo_39b15b,-1.0,-1.0,-1.0,500,924,956,16.1,0
7,7,181,tomo_3b8291,-1.0,-1.0,-1.0,500,924,956,19.7,0
8,8,193,tomo_40b215,-1.0,-1.0,-1.0,500,924,956,16.1,0
9,9,219,tomo_4baff0,-1.0,-1.0,-1.0,500,924,956,16.1,0


In [48]:
df_select = df[df['tomo_id'].isin(['tomo_0a8f05', 'tomo_0a180f', 'tomo_0c3a99', 'tomo_0c3d78'])].copy()
df_select

Unnamed: 0.1,Unnamed: 0,row_id,tomo_id,Motor_axis_0,Motor_axis_1,Motor_axis_2,Array_shape_axis_0,Array_shape_axis_1,Array_shape_axis_2,Voxel_spacing,Number_of_motors
49,49,31,tomo_0a180f,-1.0,-1.0,-1.0,800,928,960,13.1,0
50,50,35,tomo_0c3d78,-1.0,-1.0,-1.0,800,928,960,13.1,0
292,292,32,tomo_0a8f05,52.0,587.0,575.0,300,928,928,13.1,1
343,343,34,tomo_0c3a99,125.0,645.0,227.0,300,959,928,15.6,1


In [1]:
df[df['Array_shape_axis_0']==300]

NameError: name 'df' is not defined

In [103]:
def initiate_model3D():

    model = Sequential()
     ### First Convolution & MaxPooling
    model.add(Input(shape=(train_ds.shape)))
    model.add(layers.Conv3D(32, (3,3,3), activation = 'relu',padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPool2D(pool_size=(3, 3, 3)))
    model.add(layers.Conv3D(64, (4,4,4), activation = 'relu',padding='same'))
    model.add(layers.BatchNormalization())
    model.add(layers.MaxPool2D(pool_size=(3, 3, 3)))

    ## Flattening
    model.add(layers.Flatten())

    ### One Fully Connected layer - "Fully Connected" is equivalent to saying "Dense"
    model.add(layers.Dense(10,activation='relu'))

    model.add(layers.Dropout(0.3))  # Optional
    model.add(layers.Dense(1,activation='sigmoid'))

    return model

In [104]:
model = initiate_model3D()
model.summary()


AttributeError: 'BatchDataset' object has no attribute 'shape'

In [49]:
import tensorflow as tf
def euclidean_loss(y_true, y_pred):
    return tf.reduce_mean(tf.norm(y_pred - y_true, axis=1))
def compile_model3D_regress(model):
    ### Model compilation
    #model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['FBetaScore',Recall(),'precision','accuracy'], run_eagerly=True)

    model.compile(loss= 'binary_crossentropy',
                  optimizer = 'adam',
                  metrics=['accuracy',
                           FBetaScore(num_classes=2,
                                      average='micro',
                                      beta=2.0),
                           Recall(),
                        #    'precision',
                           ],)

    return model


In [12]:
from src.ml_logic.interface import train
model = initiate_model3D()
model = compile_model3D_regress(model)

In [None]:
def model_train (model, X_train_processed, y_train, batch_size=16 ,epochs=100):
    verbose=1
    validation_split=0.1
    es = EarlyStopping(patience=3,
                       restore_best_weights=True)

    history = model.fit(X_train_processed,
                        y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        # validation_split=validation_split,
                        validation_data=(X_val, y_val),
                        callbacks=[es],
                        verbose=verbose)
    return history


NameError: name 'train_ds' is not defined