In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as img
import pandas as pd
import tensorflow as tf
import numpy as np
import pandas as pd

import os
import glob
import sys

In [None]:
# Mount Google Drive in the Colab runtime; later cells read the labels and
# images through paths under ./drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the label table (tomo_id, motor coordinates, array shape, voxel spacing,
# number of motors) that every later cell filters through the global `df`.
df = pd.read_csv('./drive/MyDrive/train_labels.csv')
# Show three random rows as a quick sanity check of the columns.
df.sample(3)

Unnamed: 0.1,Unnamed: 0,row_id,tomo_id,Motor_axis_0,Motor_axis_1,Motor_axis_2,Array_shape_axis_0,Array_shape_axis_1,Array_shape_axis_2,Voxel_spacing,Number_of_motors
470,470,275,tomo_5f235a,119.0,656.0,372.0,300,960,928,13.1,1
335,335,717,tomo_f8b46e,123.0,728.0,352.0,300,928,928,13.1,1
646,646,469,tomo_a5ac23,169.0,574.0,895.0,300,959,928,15.6,2


In [None]:
def select_tomo_ids(df, number_of_slices=[300], number_of_motors=[0, 1], y_shape_range=(924, 960), x_shape_range=(924, 960)) -> pd.Series:
    '''
    Return the tomo_ids of the rows of `df` that satisfy every filter below.

    Array_shape_axis_0 is treated as the number of slices, Array_shape_axis_1
    as the y size and Array_shape_axis_2 as the x size of a tomogram.

            Parameters:
                    df (pd.DataFrame): the dataset to filter; it is not modified
                    number_of_slices (list:int): accepted values of Array_shape_axis_0
                    number_of_motors (list:int): accepted values of Number_of_motors
                    y_shape_range (tuple:int): inclusive (min, max) bounds on Array_shape_axis_1
                    x_shape_range (tuple:int): inclusive (min, max) bounds on Array_shape_axis_2

            Returns:
                    pd.Series: the tomo_id column of the matching rows (original index kept)
    '''
    y_min, y_max = y_shape_range
    x_min, x_max = x_shape_range

    # Each axis is checked against BOTH bounds of its own range. The previous
    # version only applied a lower bound to axis 1 and an upper bound to axis 2,
    # and mixed the y and x ranges across the two axes.
    keep = (
        df['Array_shape_axis_1'].between(y_min, y_max)
        & df['Array_shape_axis_2'].between(x_min, x_max)
        & df['Array_shape_axis_0'].isin(number_of_slices)
        & df['Number_of_motors'].isin(number_of_motors)
    )

    return df.loc[keep, 'tomo_id']


def selection_images_labels(df, dir_images, num_slices=[300], num_motors=[1]):
    '''
    Pair each selected tomogram with one image file and its motor count.

    Parameters:
    ----------
    df (pd.DataFrame): label table (train); it is not modified
    dir_images (str): folder name under ./drive/MyDrive that is searched
        recursively for *.jpg files
    num_slices, num_motors (list:int): forwarded to select_tomo_ids as
        number_of_slices / number_of_motors (its shape ranges keep their defaults)

    Returns:
    -------
    filtered_image_paths (list of str): one image path per matched row of df.

    labels (np.ndarray, float32): the Number_of_motors value of each matched
        row, in the same order as filtered_image_paths.
    '''

    # Step 1: Filter tomos
    tomo_ids = select_tomo_ids(df, number_of_slices=num_slices, number_of_motors=num_motors)
    df_select = df[df['tomo_id'].isin(tomo_ids)]

    # Step 2: List all candidate images
    dir_mean_image = f'./drive/MyDrive/{dir_images}'
    all_images = glob.glob(os.path.join(dir_mean_image, '**', '*.jpg'), recursive=True)

    print(f"Found {len(all_images)} images in {dir_mean_image}")

    # File names are extracted once instead of once per (row, image) pair.
    named_images = [(os.path.basename(path), path) for path in all_images]

    # Step 3: Match images using substring matching on the file name
    filtered_image_paths = []
    labels = []

    for tomo_id, n_motors in zip(df_select['tomo_id'], df_select['Number_of_motors']):
        # If several files contain the tomo_id, the first one found is used.
        match = next((path for name, path in named_images if tomo_id in name), None)

        if match is None:
            print(f"⚠️ No image found for tomo_id: {tomo_id}")
        else:
            filtered_image_paths.append(match)
            labels.append(n_motors)

    print(f"Matched {len(filtered_image_paths)} image-label pairs")

    labels = np.array(labels, dtype=np.float32)
    return filtered_image_paths, labels


# Define image reading function
def read_img_jpg(path, label):
    """
    Load one JPEG file as a single-channel float image scaled by 1/255.

    Parameters:
    ----------
    path : tf.Tensor
        Scalar string tensor holding the path of the JPEG file.

    label : tf.Tensor or any
        Label that belongs to the image; it is passed through untouched.

    Returns:
    -------
    img : tf.Tensor
        float32 tensor of shape (height, width, 1): the decoded pixels
        divided by 255.

    label : same as input
        The label that was passed in.
    """
    raw_bytes = tf.io.read_file(path)
    # channels=1 forces a grayscale decode whatever the file stores.
    pixels = tf.image.decode_jpeg(raw_bytes, channels=1)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return scaled, label



def batches_images_ram(
    read_img_jpg,
    filtered_image_paths,
    labels,
    shuffle=True,
    batch_size=32,
    split=False,
    val_fraction=0.2,
    test_fraction=0.2,
    seed=42
):
    """
    Build batched tf.data.Dataset objects from image paths and labels,
    optionally shuffling the pairs once and splitting them into
    train/val/test.

    Parameters:
    -----------
    read_img_jpg : function
        Called through Dataset.map as read_img_jpg(path, label); loads and
        preprocesses one image.

    filtered_image_paths : list or np.array
        Image paths.

    labels : np.array or list
        Labels, one per path and in the same order.

    shuffle : bool, default=True
        Whether to shuffle the (path, label) pairs once, before any split.
        The returned datasets are not reshuffled between epochs.

    batch_size : int, default=32
        Batch size.

    split : bool, default=False
        Whether to split the data into train/val/test.

    val_fraction : float, default=0.2
        Fraction of the data used for validation (only when split=True).

    test_fraction : float, default=0.2
        Fraction of the data used for testing (only when split=True).

    seed : int, default=42
        Seed of the NumPy generator used for shuffling.

    Returns:
    --------
    If split=False:
        dataset, filtered_image_paths, labels
            The batched dataset plus the (possibly shuffled) paths and labels
            as lists, in the order used by the dataset.

    If split=True:
        train_ds, val_ds, test_ds : tf.data.Dataset
            The three batched splits. The test split takes the first
            int(test_fraction * n) pairs, validation the next
            int(val_fraction * n), and training the remainder.

    Raises:
    -------
    ValueError
        If there are no image paths, if paths and labels differ in length, or
        if split=True with negative fractions or fractions summing to more
        than 1.
    """
    filtered_image_paths = list(filtered_image_paths)
    labels = list(labels)
    dataset_size = len(filtered_image_paths)

    if dataset_size == 0:
        raise ValueError("batches_images_ram needs at least one image path")
    if len(labels) != dataset_size:
        raise ValueError(
            f"Got {dataset_size} image paths but {len(labels)} labels"
        )
    if split and (val_fraction < 0 or test_fraction < 0 or val_fraction + test_fraction > 1):
        raise ValueError(
            "val_fraction and test_fraction must be non-negative and sum to at most 1"
        )

    if shuffle:
        # Shuffle paths and labels together so every pair stays aligned.
        data = list(zip(filtered_image_paths, labels))
        rng = np.random.default_rng(seed)
        rng.shuffle(data)
        filtered_image_paths = [path for path, _ in data]
        labels = [label for _, label in data]

    def _to_dataset(paths, path_labels):
        # One batched dataset of (image, label) pairs for the given slice.
        ds = tf.data.Dataset.from_tensor_slices((paths, path_labels))
        return ds.map(read_img_jpg).batch(batch_size)

    if not split:
        return _to_dataset(filtered_image_paths, labels), filtered_image_paths, labels

    val_size = int(val_fraction * dataset_size)
    test_size = int(test_fraction * dataset_size)
    val_end = test_size + val_size

    test_ds = _to_dataset(filtered_image_paths[:test_size], labels[:test_size])
    val_ds = _to_dataset(filtered_image_paths[test_size:val_end], labels[test_size:val_end])
    train_ds = _to_dataset(filtered_image_paths[val_end:], labels[val_end:])

    return train_ds, val_ds, test_ds


def plot_history_regression(history, xlims=None, ylims=None, metric="recall", metric_ylims=None):
    '''
    Plot the learning curves of a training run: loss on the left panel and one
    tracked metric on the right panel.

    Parameters:
        history: output from a model.fit; must expose `epoch` and a `history`
                 dict holding "loss" and the chosen metric. "val_loss" and
                 "val_<metric>" are plotted when present.
        xlims (tuple of float, optional): x limits of both panels, defaults to
                 (0, last epoch)
        ylims (tuple of float, optional): y limits of the loss panel, defaults
                 to (0, highest train/validation loss)
        metric (str, optional): key of the metric shown on the right panel,
                 defaults to "recall"
        metric_ylims (tuple of float, optional): y limits of the metric panel,
                 defaults to (0, highest train/validation metric value)

    The metric panel has its own y limits: reusing the loss limits squashes a
    metric such as recall whenever the loss exceeds 1.
    '''
    records = history.history
    val_metric = f"val_{metric}"

    def _peak(*keys):
        # Highest recorded value across the given curves; 1 if nothing positive.
        values = [value for key in keys for value in records.get(key, [])]
        top = max(values, default=0)
        return top if top > 0 else 1

    if xlims is None:
        # A single-epoch run would otherwise give the degenerate range (0, 0).
        xlims = (0, max(max(history.epoch, default=0), 1))
    if ylims is None:
        ylims = (0, _peak("loss", "val_loss"))
    if metric_ylims is None:
        metric_ylims = (0, _peak(metric, val_metric))

    fig, ax = plt.subplots(1, 2, figsize=(15,5))
    loss_ax, metric_ax = ax[0], ax[1]

    loss_ax.set_title('loss')
    loss_ax.plot(history.epoch, records["loss"], label="Train loss")
    if "val_loss" in records:
        loss_ax.plot(history.epoch, records["val_loss"], label="Validation loss")
    loss_ax.set_ylim(ylims)
    loss_ax.set_xlim(xlims)
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')

    metric_ax.set_title(metric)
    metric_ax.plot(history.epoch, records[metric], label=f"Train {metric}")
    if val_metric in records:
        metric_ax.plot(history.epoch, records[val_metric], label=f"Validation {metric}")
    metric_ax.set_ylim(metric_ylims)
    metric_ax.set_xlim(xlims)
    metric_ax.set_xlabel('Epochs')
    metric_ax.set_ylabel(metric)

    loss_ax.legend()
    metric_ax.legend()



In [None]:
def generate_base_data():
    """
    Build the train/validation/test datasets from the 'adaptequal_1_padded'
    image folder.

    Reads the notebook-level `df`, keeps tomograms of any slice count with
    0 or 1 motor, and splits the shuffled pairs 60/20/20 in batches of 32.

    Returns:
        train_ds, val_ds, test_ds: the three datasets returned by
        batches_images_ram.
    """
    # Accept every slice count present in the label table.
    slice_counts = list(df['Array_shape_axis_0'].unique())

    image_paths, motor_labels = selection_images_labels(
        df,
        'adaptequal_1_padded',
        num_slices=slice_counts,
        num_motors=[0, 1],
    )

    loader_options = dict(
        shuffle=True,
        batch_size=32,
        split=True,
        val_fraction=0.2,
        test_fraction=0.2,
        seed=42,
    )
    train_split, val_split, test_split = batches_images_ram(
        read_img_jpg, image_paths, motor_labels, **loader_options
    )

    return train_split, val_split, test_split

In [None]:
# Build the three splits once; train_model() and the evaluation cells below
# read train_ds / val_ds / test_ds as notebook globals.
train_ds, val_ds, test_ds = generate_base_data()

Found 648 images in ./drive/MyDrive/adaptequal_1_padded
Matched 578 image-label pairs
['./drive/MyDrive/adaptequal_1_padded/tomo_dae195.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_f2fa4a.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_cabaa0.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_f7f28b.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_ed1c97.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_ff505c.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_8f4d60.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_2aeb29.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_651ecd.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_e96200.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_0d4c9e.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_2dcd5c.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_983fce.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_7b1ee3.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_8b6795.jpg', './drive/MyDrive/adaptequal_1_padded/tomo_dcb9b4.jpg', './drive/MyDrive/adaptequal_1_pad

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.applications import Xception
from tensorflow.keras.callbacks import EarlyStopping

def train_model():
    """
    Build, compile and fit a binary classifier on top of a frozen Xception backbone.

    Reads the notebook-level `train_ds` and `val_ds`, trains for up to 200
    epochs with early stopping on the validation loss, plots the learning
    curves with plot_history_regression and returns (model, history).
    """
    IMG_SIZE = 224

    input_tensor = Input(shape=(960, 960, 1), name="grayscale_input")

    # Downscale to the backbone resolution, then map the single grayscale
    # channel to 3 channels with a 3x3 convolution.
    x = layers.Resizing(IMG_SIZE, IMG_SIZE)(input_tensor)
    x = layers.Conv2D(3, (3, 3), padding='same')(x)

    backbone = Xception(include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
    backbone.trainable = False  # freeze the backbone

    # Disabled alternative kept from the original: freeze all but the last two layers.
    #for layer in backbone.layers[:-2]:
    #    layer.trainable = False

    x = backbone(x)

    # Global average pooling followed by the dense head (L2-regularised, with dropout)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(128, activation='relu', kernel_regularizer='l2', bias_regularizer='l2')(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(5, activation='relu', kernel_regularizer='l2', bias_regularizer='l2')(x)
    x = layers.Dropout(0.5)(x)

    # Output: a single sigmoid unit (binary classification, not keypoints)
    out = layers.Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_tensor, outputs=out)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'recall'])

    # Early stopping: stop after 20 epochs without val_loss improvement and
    # restore the best weights
    es = EarlyStopping(monitor="val_loss", patience=20, restore_best_weights=True, verbose=1)

    # Training
    history = model.fit(train_ds, validation_data=val_ds, epochs=200, callbacks=[es])

    plot_history_regression(history)

    return model, history

In [None]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Downloading mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading databricks_sdk-0.56.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentele

In [None]:
# Pull the whole test split into memory as NumPy arrays so that scikit-learn
# metrics and SHAP can work on it directly.
image_batches, label_batches = [], []

for batch_x, batch_y in test_ds:
    image_batches.append(batch_x.numpy())
    label_batches.append(batch_y.numpy())

# Merge the batches along the sample axis
X_test = np.concatenate(image_batches, axis=0)
y_test = np.concatenate(label_batches, axis=0)

In [None]:
import mlflow
from sklearn.metrics import fbeta_score

mlflow.set_tracking_uri("http://34.79.119.176:5000")

mlflow.set_experiment("test_experiment_colab")  # created if it does not exist

with mlflow.start_run():
    model, history = train_model()

    # Log the trained model
    mlflow.tensorflow.log_model(model, artifact_path="model")

    # Log the test metrics; evaluate() returns them in compile order:
    # loss, accuracy, recall
    loss, acc, recall = model.evaluate(X_test, y_test, verbose=0)
    mlflow.log_metric("test_loss", loss)
    mlflow.log_metric("test_accuracy", acc)
    # Was logging `acc` under the recall key.
    mlflow.log_metric("test_recall", recall)

    mlflow.log_param("model_architecture", "Xception")
    mlflow.log_param("epochs", 200)
    mlflow.log_param("batch_size", 32)
    mlflow.log_param("early_stopping", True)
    mlflow.log_param("loss_function", model.loss)
    mlflow.log_param("optimizer", type(model.optimizer).__name__)
    mlflow.log_param("learning_rate", model.optimizer.learning_rate.numpy())

    y_pred = model.predict(X_test)

    # y_pred holds probabilities: threshold at 0.5 to get class labels
    y_pred_labels = (y_pred > 0.5).astype(int)

    fbeta = fbeta_score(y_test, y_pred_labels, beta=2)

    mlflow.log_metric("test_fbeta", fbeta)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m83683744/83683744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step
Epoch 1/200
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 21s/step - accuracy: 0.5800 - loss: 2.9656 - recall: 0.6298 - val_accuracy: 0.6783 - val_loss: 2.2536 - val_recall: 0.9833
Epoch 2/200
[1m 1/11[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m3:23[0m 20s/step - accuracy: 0.6562 - loss: 2.2658 - recall: 1.0000

In [None]:
from sklearn.metrics import fbeta_score

# Probabilities predicted for the in-memory test set
y_pred = model.predict(X_test)

# Threshold the probabilities at 0.5 to obtain binary labels
y_pred_labels = (y_pred > 0.5).astype(int)

# F-beta with beta=2 weights recall higher than precision
sklearn_score = fbeta_score(y_test, y_pred_labels, beta=2)

# Display the Keras metrics next to the scikit-learn score
model.evaluate(X_test, y_test, verbose=0), sklearn_score

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, y_pred_labels))

In [None]:
import os

# Folder with the individual slices of one tomogram; scored one at a time.
slice_dir = './drive/MyDrive/tomo_00e047'

X_pred = []

for picture in sorted(os.listdir(slice_dir)):
    # NOTE(review): assumes each file decodes to a 2-D 8-bit grayscale array — confirm.
    t = img.imread(f'{slice_dir}/{picture}')/255
    t = tf.expand_dims(t, -1)
    t = tf.image.resize(t, [960,960])
    t = tf.expand_dims(t, axis=0)
    # predict() returns a (1, 1) array; pick the scalar explicitly rather than
    # calling float() on a non-0-d array (deprecated in recent NumPy).
    X_pred.append(float(model.predict(t, verbose=0)[0, 0]))

In [None]:
# Index of the slice with the highest predicted score in X_pred.
best_slice = np.argmax(X_pred)


# Display that index together with its score.
best_slice, X_pred[best_slice]

In [None]:
# Score predicted for slice index 169.
# NOTE(review): why this index is singled out is not visible here — confirm.
X_pred[169]

In [None]:
!pip install shap

In [None]:
# Shape of the in-memory test images; the per-image shape X_test[0].shape is
# what the SHAP masker below is built from.
X_test.shape

In [None]:
import shap

# A masker that will mask out partitions of the input image
masker = shap.maskers.Image("blur(960,960)", X_test[0].shape)

# Finally create the explainer
explainer = shap.Explainer(model, masker)

# Explain some images using 500 evaluations of the model
# to estimate the SHAP values.
# (The cell body used to be pasted twice, so the explainer ran twice.)
shap_values = explainer(X_test[15:20], max_evals=500, batch_size=50,
                        outputs=shap.Explanation.argsort.flip[:4],
                        silent=True)

In [None]:
# Plot the SHAP values over the images they were computed for (X_test[15:20]);
# the previous X_test[1:5] showed four unrelated images for five explanations.
shap.image_plot(shap_values, pixel_values=X_test[15:20], width=15)

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns are exchanged.
print(classification_report(y_test, [1 if pred >=0.5 else 0 for pred in y_pred]))

In [None]:
# NOTE(review): displays the raw test image array; X_test.shape or a small
# slice is usually more informative than the full array repr.
X_test