In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm.keras import TqdmCallback

import pandas as pd
import numpy as np
import seaborn as sns
from pandas.core.indexing import _IndexSlice
import matplotlib.pyplot as plt
from functools import partial
import umap
from sklearn.preprocessing import StandardScaler
from pandas.core.dtypes.common import is_numeric_dtype, is_object_dtype

from pathlib import Path

import common_functions as fnc

# Short alias for pandas' IndexSlice helper, which builds slicers for
# MultiIndex selections with `.loc`
idx: _IndexSlice = pd.IndexSlice

In [None]:
from data_import import df, samples, didx, DATA_PATH

## Example plots

In [None]:
# Example plots: one figure of raw traces per fluorometer

# MULTI-COLOR-PAM traces, selected by CO2 level and strain
_mcpam_columns = didx(
    fluorometer="MULTI-COLOR-PAM",
    CO2_level="Air",
    strain="Chlorella vulgaris",
    # SP_color=455
)
_mcpam_traces = df.loc[:, _mcpam_columns].dropna()
ax = _mcpam_traces.plot(legend=False)
ax.set_title("MCPAM - Example")
ax.set_xlabel("Time [ms]")
ax.set_ylabel("Fluorescence [Detector V]")
ax.set_xscale("log")

# AquaPen traces, selected by CO2 level only
_aquapen_columns = didx(fluorometer="AquaPen", CO2_level="Air")
_aquapen_traces = df.loc[:, _aquapen_columns].dropna()
ax = _aquapen_traces.plot(legend=False)
ax.set_xlabel("Time [ms]")
ax.set_ylabel("Fluorescence [AU]")
ax.set_xscale("log")
ax.set_title("AquaPen - Example")

# Model training

## Select the data to be trained on

In [4]:
# Training data: all columns of `df`, restricted to index values from 0.01
# onwards (exclude data before the light pulse). Rows that contain a missing
# value are dropped.
# To train on a subset only, swap the column selector `:` for e.g.
#     didx(
#         fluorometer="MULTI-COLOR-PAM", # Only use MCPAM data
#         strain='Synechocystis sp. PCC 6803', # Only use Synechocystis data
#     )
_from_light_pulse = slice(0.01, None)
dat = df.loc[_from_light_pulse, :].dropna()

### Map the treatment effects

In [5]:
# Read the map of effects transformed into one-hot encoding.
# The file has two header rows and two index columns; empty cells are treated
# as "no effect" (NaN -> 0 -> False).
effects_map = pd.read_csv(
    DATA_PATH / "effects_map.csv",
    header=[0,1],
    index_col=[0,1],
    ).astype(float).fillna(0).astype(bool)

# Exclude Light intensity and temperature from targets
# (only the five effect columns listed here are kept)
effects_map = effects_map[[
    'control_measurement',
    'PSII_closed',
    'CBB_inhibited',
    'TOX_inhibited',
    'electron_drain'
]]

# Get the effects and map them to the targets:
# look up "Effect in PSET" and "Treatment" for every sample in `dat`
# (first column level of `dat` is used as the key into `samples`)
effects = samples.loc[dat.columns.get_level_values(0), ["Effect in PSET", "Treatment"]]

# Use each (effect, treatment) pair as a row key into the effects map, drop the
# second column level and store the flags as 0/1 integers
targets = effects_map.loc[pd.MultiIndex.from_frame(effects)].droplevel(1, axis=1)
targets = targets.astype(int)
# Rows are in the same order as the columns of `dat`, so its column index can
# be reused as the row index
targets.index = dat.columns

# Make a MultiIndex with duplicated entries (both levels hold the target name);
# keep the plain target names for later column selection
target_names = targets.columns
targets.columns = pd.MultiIndex.from_arrays([targets.columns, targets.columns])

# Select features

## Get experimental conditions

In [6]:
# Get the conditions from the MultiIndex columns of `dat` (one row per sample)
conditions = dat.columns.to_frame()
# conditions.index = dat.columns.get_level_values(0)

# Select the relevant columns and declare how each one is treated by the model
# ("string" -> categorical encoding, "numeric" -> normalisation)
condition_types = pd.Series({
    'Strain': "string",
    'CO2 level': "numeric", # There is a meaning to a higher CO2 concentration (maybe make categorical?)
    'Cultivation + experiment temperature': "numeric",
    'Cultivation light intensity': "numeric",
    'Dark or light acclimated': "string",
    'Growth light color (nm)': "string",
    'Fluorometer': "string",
    'SP color (nm)': "string", # There is no linear relationship between wavelength and effect
    'SP intensity': "numeric",
    'OD680 MC-1000': "numeric",
    'OD720 MC-1000': "numeric",
})

# Take an explicit copy of the column subset: its columns are modified below,
# and assigning into a subset of another frame raises pandas'
# SettingWithCopyWarning
conditions = conditions[condition_types.index].copy()

# Replace certain column values

# Replace the CO2 level labels with the (assumed) numerical CO2 fraction,
# i.e. 0.0004 corresponds to 400 ppm and 0.05 to 5 %
conditions["CO2 level"] = conditions["CO2 level"].replace({
    "Air": "0.0004",
    "High CO2": "0.05"
}).astype(float)

# Replace SP color with categorical value because the numerical gradient is not meaningful
conditions["SP color (nm)"] = conditions["SP color (nm)"].astype(str)

# Make a MultiIndex with duplicated entries (both levels hold the condition name)
conditions.columns = pd.MultiIndex.from_arrays([conditions.columns, conditions.columns])

# Split the condition names by declared type; nothing is encoded here
categorical_conditions = condition_types[condition_types == "string"].index.to_numpy()
numerical_conditions = condition_types[condition_types != "string"].index.to_numpy()

## Sample OJIP

In [None]:
# Number of points sampled from every OJIP curve
n_points = 40

# Sampling times: evenly spaced in log10(time) between the first and the last
# index value of `dat`
_log_start = np.log10(dat.index[0])
_log_stop = np.log10(dat.index[-1])
log_time_points = np.linspace(_log_start, _log_stop, n_points)
time_points = 10 ** log_time_points


def _interp(trace):
    """Linearly interpolate one trace of `dat` at `time_points`."""
    return np.interp(time_points, dat.index, trace)


# Interpolate every column, then turn the result so that each row is one
# sample and each column one sampled point
ojip_sampled = dat.apply(_interp).T

# Label the sampled points by their rounded log10 time, grouped under "ojip"
ojip_sampled.columns = pd.MultiIndex.from_product([
    ["ojip"],
    ["ojip_" + label for label in log_time_points.round(2).astype(str)],
])

# Visual check: all traces with a vertical line at every sampled time point
ax = dat.plot(legend=False)

for t in time_points:
    ax.axvline(t)

ax.set_xscale("log")

# Add data types
ojip_types = pd.Series(["time-series-gradients"], index=["ojip"])

## Collect dataset

In [8]:
# Assemble the full data set: conditions, sampled OJIP points and targets are
# joined side by side (all three are indexed by the columns of `dat`)
dat_sets = [conditions, ojip_sampled, targets]
dat_full = pd.concat(dat_sets, axis="columns")

In [None]:
# First split: 80 % of the rows for training, 20 % held out.
# The split is stratified by the target columns.
dat_train, _dat_trainval = train_test_split(
    dat_full,
    stratify=dat_full[target_names],
    test_size=0.2,
    random_state=42,
)

# Second split: halve the held-out rows into test and validation set
dat_test, dat_val = train_test_split(
    _dat_trainval,
    stratify=_dat_trainval[target_names],
    test_size=0.5,
    random_state=42,
)

print(f"train: {dat_train.shape}, test: {dat_test.shape}, val: {dat_val.shape}")


# Sanity check on the distinct target combinations in the training set:
# every target column has to sum to exactly one across them
_train_target_sums = dat_train[target_names].drop_duplicates().sum(axis=0)
if (_train_target_sums != 1).any():
    raise RuntimeError("Not all targets are in the training set")

# Make into dataset (same helper and target columns for all three splits)
train_ds, test_ds, val_ds = (
    fnc.df_to_dataset(split, targets=target_names)
    for split in (dat_train, dat_test, dat_val)
)

## Prepare data

In [10]:
# Declared type of every model feature: the experimental conditions followed
# by the sampled OJIP curve
feature_types = pd.concat([condition_types, ojip_types])

# Containers that are filled while the preprocessing layers are built:
# model inputs by feature name, encoded outputs by feature name, and the
# encoded outputs as a plain list
all_inputs = dict()
encoded_features_dict = dict()
encoded_features = list()

In [None]:
# Build one Keras input and one encoding branch per feature; the branch is
# chosen by the feature's declared type in `feature_types`.
# NOTE: this cell appends to `all_inputs`, `encoded_features` and
# `encoded_features_dict`, so re-run the cell that creates these containers
# before running this one again.
for col_name, col_dtype in feature_types.items():

    # Progress output: name of the feature being processed
    print(col_name)
    # Create a numeric normalisation layer
    # (the helper receives the training dataset; presumably it adapts the
    # normalisation on it — TODO confirm in common_functions)
    if col_dtype == "numeric":
        col = layers.Input(shape=(1,), name=col_name)
        normalization_layer = fnc.get_normalization_layer(col_name, train_ds)
        encoded_col = normalization_layer(col)
    
    # Create a string encoding layer, could also work for integer encoding
    elif col_dtype == "string":
        col = layers.Input(shape=(1,), name=col_name, dtype='string')
        encoding_layer = fnc.get_category_encoding_layer(name=col_name,
                                                    dataset=train_ds,
                                                    dtype='string',
                                                    max_tokens=5)
        encoded_col = encoding_layer(col)

    # Create a layer to normalise time series and calculate gradients
    elif col_dtype == "time-series-gradients":
        # The input width is the number of sampled points stored under this
        # feature name in `dat_full`
        col = layers.Input(shape=(dat_full[col_name].shape[1],), name=col_name)
        # Add a trailing axis of size 1 before the custom layer, then flatten
        # its output again so it can be concatenated with the other features
        reshaped_col = layers.Reshape((dat_full[col_name].shape[1], 1))(col)
        normalization_layer = fnc.NormalizedTimeSeriesWithDerivatives()
        encoded_col = normalization_layer(reshaped_col)
        encoded_col = layers.Flatten()(encoded_col)
        
    else:
        # Unknown feature type: fail loudly instead of skipping the feature
        raise KeyError(f"No handling for col_dtype {col_dtype} defined")

    # Register the input and its encoded output under the feature name
    all_inputs[col_name] = col
    encoded_features.append(encoded_col)
    encoded_features_dict[col_name] = encoded_col

# Define the preprocessing layer: a model mapping the dict of raw inputs to
# the dict of encoded features
preprocessing_layer = keras.Model(
    all_inputs,
    encoded_features_dict,
    name="preprocessing_layer"
)

In [None]:
# Render the graph of the preprocessing model to an image file.
# Create the output folder first so the cell also runs on a fresh checkout
# where "figures/" does not exist yet.
Path("figures").mkdir(parents=True, exist_ok=True)

keras.utils.plot_model(
    preprocessing_layer,
    show_shapes=True,
    show_layer_names=True,
    # rankdir="LR",
    to_file="figures/test_preprocessing.png",
    dpi=100
)

## Create UMAP mapping

In [13]:
# Make dataframe into model input dict
def get_inputdict_from_df(df, all_inputs):
    """Collect the model inputs from a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that can be indexed with every key of `all_inputs`.
    all_inputs : iterable
        Input names; iterating a dict yields its keys.

    Returns
    -------
    dict
        Maps every name to `df[name]` converted to a numpy array.
    """
    input_dict = {}
    for key in all_inputs:
        input_dict[key] = df[key].to_numpy()
    return input_dict

In [14]:
# Model that joins all outputs of the preprocessing layer into a single
# tensor, used as the feature matrix for UMAP
_encoded_by_feature = preprocessing_layer(all_inputs)
concatenated_preprocess = keras.Model(
    inputs=all_inputs,
    outputs=layers.concatenate(list(_encoded_by_feature.values())),
    name="concatenation_layer",
)

In [None]:
# Fixed random seed for UMAP
UMAP_seed = 2025

# Run the full data set through the preprocessing model to get the
# encoded feature matrix
_model_inputs = get_inputdict_from_df(dat_full, all_inputs)
df_features_scaled = concatenated_preprocess(_model_inputs).numpy()

# Fit UMAP and store the two embedding coordinates; reset_index turns the
# index levels of `dat_full` into ordinary columns
reducer = umap.UMAP(random_state=UMAP_seed)
_umap_coordinates = reducer.fit_transform(df_features_scaled)
embedding = pd.DataFrame(
    _umap_coordinates,
    columns=["UMAP_1", "UMAP_2"],
    index=dat_full.index,
).reset_index()

# One scatter panel per column-index level of `df` (all except the first),
# three panels per row
categories = df.columns.names[1:]
fig, axes = plt.subplots(
    nrows=int(np.ceil(len(categories)/3)),
    ncols=3,
    sharex=True,
    sharey=True,
    figsize=(7,15),
)

for ax, category in zip(axes.flatten(), categories):
    sns.scatterplot(
        data=embedding,
        x="UMAP_1",
        y="UMAP_2",
        hue=category,
        legend=False,
        ax=ax,
    )
    ax.set_title(category)

    # Mark panels in which all points share the same value
    if embedding[category].nunique() == 1:
        ax.text(s="one category",x=0.98, y=0.98, ha="right", va="top", transform=ax.transAxes, size=7)

fig.tight_layout()

In [None]:
# Plot targets on UMAP

# Add UMAP to targets: combine both in one frame.
# Only the first row-index level of `targets` is kept so that the rows line up
# with `embedding` indexed by "Label" (presumably the name of that first
# level — TODO confirm). The levels to drop are derived from the index itself
# instead of hard-coding how many levels it has.
_extra_index_levels = list(range(1, targets.index.nlevels))
embedding_targets = pd.concat([
    targets.droplevel(-1, axis=1).droplevel(_extra_index_levels, axis=0),
    embedding.set_index("Label").loc[:, ["UMAP_1", "UMAP_2"]],
], axis=1)

# Plot: one scatter panel per target, three panels per row
categories = effects_map.columns.get_level_values(0)
fig, axes = plt.subplots(
    int(np.ceil(len(categories)/3)),
    3,
    figsize=(7,5),
    sharex=True,
    sharey=True
)

for category, ax in zip(categories, axes.flatten()):
    sns.scatterplot(
        embedding_targets,
        x="UMAP_1",
        y="UMAP_2",
        hue=category,
        ax=ax,
        legend=False
    )
    ax.set_title(category)

    # Mark panels in which all points share the same value
    if len(embedding_targets[category].value_counts()) == 1:
        ax.text(s="one category",x=0.98, y=0.98, ha="right", va="top", transform=ax.transAxes, size=7)

fig.tight_layout()

## Create test model

In [18]:
# Test model: a single hidden layer on top of the concatenated, encoded features
all_features = preprocessing_layer(all_inputs)
all_features = layers.concatenate(list(all_features.values()))
x = layers.Dense(32, activation="relu")(all_features)
x = layers.Dropout(0.5)(x)

# One sigmoid unit per target
dense_output = layers.Dense(len(target_names), activation="sigmoid")(x)

# Split the dense output into one named output of shape (batch, 1) per target.
# Names and column positions are taken from `target_names`, so they cannot get
# out of sync with the target columns. `column=column` binds the current index
# as a default argument; a plain closure would see only the last loop value.
outputs = {
    name: layers.Lambda(
        lambda t, column=column: tf.expand_dims(t[:, column], axis=-1),
        name=name,
    )(dense_output)
    for column, name in enumerate(target_names)
}

model = keras.Model(all_inputs, outputs)

In [None]:
def _binary_metrics():
    """Return a fresh list of binary classification metrics for one output."""
    return [
        keras.metrics.BinaryAccuracy(threshold=0.5),
        keras.metrics.Recall(thresholds=0.5),
        keras.metrics.Precision(thresholds=0.5),
        keras.metrics.F1Score(threshold=0.5),
    ]


# Compile test model: binary cross-entropy loss, Adam optimiser and a separate
# set of metric objects for every named output
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics={output_name: _binary_metrics() for output_name in target_names},
)

In [None]:
# Render the graph of the model to an image file.
# Create the output folder first so the cell also runs on a fresh checkout
# where "figures/" does not exist yet.
Path("figures").mkdir(parents=True, exist_ok=True)

# Use `rankdir='LR'` to make the graph horizontal.
keras.utils.plot_model(
    model,
    show_shapes=True,
    show_layer_names=True,
    # rankdir="LR",
    to_file="figures/model.png",
    dpi=100
)

In [None]:
# Train the model.
# No `batch_size` is passed: `train_ds` is a tf.data.Dataset, which yields its
# own batches, and Keras documents that `batch_size` should not be specified
# for dataset inputs.
# verbose=0 silences the built-in Keras log; progress is shown by TqdmCallback.
history = model.fit(
    train_ds,
    epochs=300,
    verbose=0,
    validation_data=val_ds,
    callbacks=[TqdmCallback(verbose=1)]
)

# Plot the loss over the Epochs
fnc.plot_loss_development(history)

In [None]:
# Evaluate the model on the test dataset; return_dict=True returns the loss
# and metric values as a dict keyed by name
model.evaluate(test_ds, return_dict=True)

In [None]:
# Predict on the inputs of the test dataset (first tuple element only) and
# collect the flattened prediction of every named output as one column
_test_inputs = test_ds.map(lambda inputs, _targets: inputs)
_raw_predictions = model.predict(_test_inputs)
test = pd.DataFrame(
    {name: values.flatten() for name, values in _raw_predictions.items()}
)

In [None]:
# Build a numpy iterator over the second tuple element of every training
# dataset element.
# NOTE(review): the iterator is only created and displayed here, it is never
# consumed - wrap it in next(...) or list(...) to inspect actual values.
train_ds.map(lambda x,y : y).as_numpy_iterator()

In [None]:
# Plot the predictions on the test set, one subplot per column of `test`
test.plot(subplots=True)

In [None]:
# Summary statistics of the predictions, per column of `test`
test.describe()

In [None]:
# Predictions on the test set as a DataFrame with one column per output.
# `model.predict` returns a dict with one array per named output; every output
# is built with a trailing axis of size 1, and pandas cannot build a DataFrame
# from a dict of 2-D arrays, so each array is flattened to 1-D first.
_predictions = model.predict(test_ds.map(lambda x,y : x))
pd.DataFrame({name: values.flatten() for name, values in _predictions.items()})

In [None]:
# NOTE(review): the helper is called without arguments and its result is not
# stored. A later cell reads `models_metrics`, which is never assigned in the
# visible part of this notebook - presumably the result of this call was meant
# to be kept; confirm against common_functions.
fnc.get_model_metrics()

In [None]:
# not_working = []
# outputs = {}
# for i in range(len(input_dict)):
#     try:
#         test_feature = list(input_dict.keys())[i]

#         test_model = keras.Model({test_feature:all_inputs[test_feature]}, encoded_features_dict[test_feature])
#         test_model.compile(loss='mae', optimizer='adam')

#         outputs[test_feature] = test_model.predict({test_feature:input_dict[test_feature]})
#     except Exception:
#         not_working.append(test_feature)

## Create machine learning model

In [None]:
## Define the model
# NOTE(review): `X_train_scaled` and `Y_train_scaled` are not defined in the
# visible part of this notebook; they have to be created before this cell
# can run on a fresh kernel.
# NOTE: this cell rebinds `model` and `history`, replacing the test model and
# its training history from the cells above.

# Define the feature inputs
# (layer classes are accessed through `layers`, the only Keras layer namespace
# imported by this notebook)
ojip_input = layers.Input(shape=(X_train_scaled.shape[1],), name="ojip_input")

# Add a trailing feature axis of size 1 so the LSTM receives a 3-D tensor
x = layers.Reshape((X_train_scaled.shape[1],1), name="LSTM_1_reshape")(ojip_input)

# Hidden layer
x = layers.LSTM(64, activation="tanh", name="LSTM_1")(x)
x = layers.Dropout(0.3, name="LSTM_1_Dropout")(x)

# Output layer
output = layers.Dense(Y_train_scaled.shape[1], activation="relu", name="prediction")(x)

model = keras.Model(
    inputs=[ojip_input],
    outputs=[output],
)


##  Compile the model
# The metric is passed as an instance, not as the bare class
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.MeanAbsoluteError()]
)

##  Train the model
# 10 % of the training data is split off for validation; progress is shown by
# TqdmCallback instead of the built-in Keras log
history = model.fit(
    [X_train_scaled],
    [Y_train_scaled],
    validation_split=0.1,
    epochs=500,
    verbose=0,
    batch_size=10,
    callbacks=[TqdmCallback(verbose=1)]
)

# Same helper as used for the test model above
fnc.plot_loss_development(history)

In [None]:
# Print the layer summary of whichever model is currently bound to `model`
model.summary()

# Look at all models

In [None]:
# One bar-chart panel per entry of `models_metrics`, sharing the x axis.
# NOTE(review): `models_metrics` is not defined in the visible part of this
# notebook; presumably a mapping from model name to a pandas object holding
# its metrics - TODO confirm.
fig, axes = plt.subplots(len(models_metrics), sharex=True)

# `plt.subplots` returns a single Axes instead of an array when only one panel
# is requested; np.atleast_1d makes the loop work for any number of models.
# The loop variable is called `model_name` so that it does not overwrite the
# Keras model bound to `model`.
for model_name, ax in zip(models_metrics, np.atleast_1d(axes).flatten()):
    # Plot the model metrics
    models_metrics[model_name].plot(kind="bar", ax=ax)
    ax.set_title(model_name)

In [None]:

# model.compile(loss='mae', optimizer='adam')
# model.fit(X_train, y_train, batch_size=32, epochs=10, verbose=0)

# print(model.evaluate(X_test, y_test))
# # 10.704551696777344

# # normalize the inputs outside the model
# normalizer = Normalization()
# normalizer.adapt(X_train)

# X_train_normalized = normalizer(X_train)
# X_test_normalized = normalizer(X_test)

# inputs = Input(shape=[None, 1])
# x = LSTM(4, return_sequences=True)(inputs)
# x = LSTM(2, return_sequences=True)(x)
# x = LSTM(2, return_sequences=True)(x)
# x = LSTM(4, return_sequences=True)(x)
# x = TimeDistributed((Dense(1)))(x)
# model = Model(inputs, x)

# model.compile(loss='mae', optimizer='adam')
# model.fit(X_train_normalized, y_train, batch_size=32, epochs=10, verbose=0)

# print(model.evaluate(X_test_normalized, y_test))
# # 10.748750686645508

In [64]:
# import tensorflow as tf

# class TimeSeriesNormalization(layers.Layer):
#     def __init__(self, epsilon=1e-6):
#         super(TimeSeriesNormalization, self).__init__()
#         self.epsilon = epsilon  # To prevent division by zero

#     def call(self, inputs):
#         """
#         Normalize each time series independently to zero mean and unit variance.

#         Args:
#             inputs: Tensor of shape (batch_size, time_steps, features)

#         Returns:
#             Normalized tensor of the same shape
#         """
#         mean = tf.reduce_mean(inputs, axis=1, keepdims=True)  # Compute mean along time axis
#         std = tf.math.reduce_std(inputs, axis=1, keepdims=True)  # Compute std along time axis

#         return (inputs - mean) / (std + self.epsilon)  # Normalize

# # Example usage
# batch_size, time_steps, features = 32, 100, 5
# input_data = tf.random.normal((batch_size, time_steps, features))  # Simulated time series data

# normalization_layer = TimeSeriesNormalization()
# normalized_data = normalization_layer(input_data)

# print("Input shape:", input_data.shape)
# print("Normalized shape:", normalized_data.shape)


In [78]:
# fig,ax = plt.subplots()
# ax.plot(normalized_data.numpy().std(axis=1))
# ax.set_ylim(-1,3)