In [71]:
import pandas as pd
import numpy as np
from pandas.core.indexing import _IndexSlice
import matplotlib.pyplot as plt
from functools import partial

from pathlib import Path

# Shorthand for pandas' IndexSlice helper, used to write MultiIndex selectors
# for `.loc`, e.g. `df.loc[:, idx[:, "x"]]`.
idx: _IndexSlice = pd.IndexSlice

In [3]:
# Root folder of the data set, relative to the working directory the notebook is run from.
# The cells below read `experiments.csv` (sample table) and the raw curves in `ojip_data/` from it.
DATA_PATH = Path("data/01_known_effects")

In [4]:
# Sample (experiment) table: one row per sample, indexed by the first CSV column (the label)
samples = pd.read_csv(DATA_PATH / "experiments.csv", index_col=0)

# Metadata columns that become the levels of the sample MultiIndex, in level order.
#   key   = column name in `experiments.csv` (also the name of the index level)
#   value = name of the matching keyword argument of `didx` (defined in the next cell)
# 'Effect in PSET' is currently left out of the index.
samples_index_fields = {
    'Label': "label",
    'Treatment': "treatment",
    'Experimenter, location': "experimenter_location",
    'Strain': "strain",
    'CO2 level': "CO2_level",
    'Cultivation + experiment temperature': "temperature",
    'Cultivation light intensity': "Light_intensity",
    'Dark or light acclimated': "light_acclimation",
    'Growth light color (nm)': "light_color",
    'Cultivator': "cultivator",
    'Medium': "medium",
    'Fluorometer': "fluorometer",
    'SP color (nm)': "SP_color",
    'SP intensity': "SP_intensity",
    'Measuring vessel': "vessel",
    'OD680 MC-1000': "OD680",
    'OD720 MC-1000': "OD720",
    'ΔOD': "deltaOD",
    'OD680/720 raw': "OD680_720",
}

# One MultiIndex entry per sample (same order as the rows of `samples`);
# used below as the column index of the data table
samples_index = pd.MultiIndex.from_frame(
    samples.reset_index()[list(samples_index_fields)]
)

In [5]:
# for i in samples_index_fields.values():
#     print(f"{i},")# : str | list[str | int] | slice | None = slice(None),")

# Create a function for easy indexing
def didx(
    label: str | list[str | int] | slice | None = slice(None),
    treatment: str | list[str | int] | slice | None = slice(None),
    experimenter_location: str | list[str | int] | slice | None = slice(None),
    strain: str | list[str | int] | slice | None = slice(None),
    CO2_level: str | list[str | int] | slice | None = slice(None),
    temperature: str | list[str | int] | slice | None = slice(None),
    Light_intensity: str | list[str | int] | slice | None = slice(None),
    light_acclimation: str | list[str | int] | slice | None = slice(None),
    light_color: str | list[str | int] | slice | None = slice(None),
    cultivator: str | list[str | int] | slice | None = slice(None),
    medium: str | list[str | int] | slice | None = slice(None),
    fluorometer: str | list[str | int] | slice | None = slice(None),
    SP_color: str | list[str | int] | slice | None = slice(None),
    SP_intensity: str | list[str | int] | slice | None = slice(None),
    vessel: str | list[str | int] | slice | None = slice(None),
    OD680: str | list[str | int] | slice | None = slice(None),
    OD720: str | list[str | int] | slice | None = slice(None),
    deltaOD: str | list[str | int] | slice | None = slice(None),
    OD680_720: str | list[str | int] | slice | None = slice(None),
) -> _IndexSlice:
    res: list[str | list[str | int] | slice | None] = [
        x
        for x in [
            label,
            treatment,
            experimenter_location,
            strain,
            CO2_level,
            temperature,
            Light_intensity,
            light_acclimation,
            light_color,
            cultivator,
            medium,
            fluorometer,
            SP_color,
            SP_intensity,
            vessel,
            OD680,
            OD720,
            deltaOD,
            OD680_720,
        ]
        if x is not None
    ]
    return idx[*res]

## Load the data

In [38]:
# Reader settings per fluorometer type (value of the "Fluorometer" column of `samples`).
#   skiprows, skipfooter, sep : forwarded to `pd.read_table`
#   select_col                : name of the column that holds the fluorescence signal
#   time_to_ms                : factor that converts the file's time column to milliseconds
# NOTE(review): the AquaPen separator r"\s" matches exactly ONE whitespace character;
# if the export pads columns with several spaces, r"\s+" would be needed - confirm against a file.
READ_OPTIONS = {
    "MULTI-COLOR-PAM": {
        "skiprows": 0,
        "skipfooter": 0,
        "sep": ";",
        "select_col": "Fluo, V",
        "time_to_ms": 1,
    },
    "AquaPen": {
        "skiprows": 7,
        "skipfooter": 38,
        "sep": r"\s",
        "select_col": "OJIP",
        "time_to_ms": 1e-3,
    },
}


def read_ojip(file: Path, fluorometer: str) -> pd.Series:
    """Read one raw OJIP file and return its fluorescence signal indexed by time [ms].

    Parameters
    ----------
    file : Path
        Path of the raw measurement file.
    fluorometer : str
        Fluorometer type; must be a key of `READ_OPTIONS`.

    Returns
    -------
    pd.Series
        The signal column selected for this fluorometer. The index is the
        file's first column, converted to milliseconds and rounded to 0.01 ms.

    Raises
    ------
    ValueError
        If `fluorometer` has no entry in `READ_OPTIONS`.
    """
    if fluorometer not in READ_OPTIONS:
        raise ValueError(
            f"Unsupported fluorometer {fluorometer!r}; expected one of {sorted(READ_OPTIONS)}"
        )
    opts = READ_OPTIONS[fluorometer]

    signal = pd.read_table(
        file,
        skiprows=opts["skiprows"],
        skipfooter=opts["skipfooter"],
        sep=opts["sep"],
        index_col=0,
        # `skipfooter` is not supported by the C parser
        engine="c" if opts["skipfooter"] == 0 else "python",
    )[opts["select_col"]]

    # NOTE(review): rounding to 0.01 ms merges time points that are closer than
    # 10 µs into the same index value - confirm this cannot happen in the raw files.
    signal.index = pd.Index(np.round(signal.index * opts["time_to_ms"], 2))
    return signal


# Each label must identify exactly one sample: it is used as a dict key and for
# scalar look-ups in `samples` below.
if not samples.index.is_unique:
    raise ValueError("Sample labels in experiments.csv are not unique")

# Read the curve of every sample, keyed by sample label (in the order of `samples_index`)
files = {}

for label in samples_index.get_level_values(0):
    # Raw files are named after the zero-padded label, e.g. 0007*
    pattern = f"ojip_data/{label:04}*"
    matches = sorted(DATA_PATH.glob(pattern))  # sorted: deterministic pick if several files match
    if not matches:
        raise FileNotFoundError(f"No OJIP file matching '{pattern}' in {DATA_PATH}")

    try:
        files[label] = read_ojip(matches[0], samples.loc[label, "Fluorometer"])
    except ValueError as err:
        # Fail here instead of silently stopping the loop: a partially filled
        # table cannot be matched to `samples_index` afterwards.
        raise ValueError(f"Sample {label}: {err}") from err

# Concatenate the data: one column per sample, rows are the union of all time points [ms].
# The dict preserves insertion order, so the columns are in the order of `samples_index`
# and the metadata index can be attached position by position ...
df = pd.DataFrame(files)
df.columns = samples_index
# ... and only then are the columns sorted by label. Sorting before attaching the
# metadata would mis-assign it whenever `experiments.csv` is not sorted by label.
df = df.sort_index(axis=1, level=0, sort_remaining=False)

## Example plots

In [None]:
# One example figure per fluorometer, air-grown samples only.
# Each entry: figure title -> (curves to plot, y-axis label). The two instruments
# report fluorescence in different units, hence separate figures and labels.
example_curves = {
    "MCPAM - Example": (
        # only the first five matching curves
        df.loc[:, didx(fluorometer="MULTI-COLOR-PAM", CO2_level="Air")].iloc[:, :5],
        "Fluorescence [Detector V]",
    ),
    "AquaPen - Example": (
        df.loc[:, didx(fluorometer="AquaPen", CO2_level="Air")],
        "Fluorescence [AU]",
    ),
}

for title, (curves, ylabel) in example_curves.items():
    # Rows with a missing value in any of the plotted curves are dropped first
    ax = curves.dropna().plot(legend=False)
    ax.set(xscale="log", xlabel="Time [ms]", ylabel=ylabel, title=title)

## Training with Feature selection
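AquaPen curves are excluded for now (only MULTI-COLOR-PAM curves are used below), until a conversion between the signals of the two fluorometers is found.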

In [98]:
# Analysis subset, step 1: keep only the MULTI-COLOR-PAM curves
mcpam_curves = df.loc[:, didx(fluorometer="MULTI-COLOR-PAM")]

# Step 2: keep the time points from 0.01 ms onwards and drop every time point
# at which at least one of the selected curves has no value
dat = mcpam_curves.loc[0.01:].dropna()

In Tran2024, nine features were extracted:
- F0
- Fm
- F(50 µs)
- F(100 µs)
- F(300 µs)
- F(2 ms)
- F(30 ms)
- Timing of Fm
- Area above the curve between F0 and Fm

In [None]:
# Plot the analysis subset `dat` (defined in the previous code cell) and mark the
# fixed sampling times of the features listed above:
# F(50 µs), F(100 µs), F(300 µs), F(2 ms) and F(30 ms), expressed in ms.
FEATURE_TIMES_MS = [0.05, 0.1, 0.3, 2, 30]

ax = dat.plot(legend=False)

for t in FEATURE_TIMES_MS:
    ax.axvline(t)

ax.set_xscale("log")
ax.set_xlabel("Time [ms]")
ax.set_ylabel("Fluorescence [Detector V]")
ax.set_title("MCPAM - curves used for feature extraction");

### Potential other features:
- Log-spaced subsampling
- Timing of inflection points (I, J)

In [127]:
# Feature table: one row per sample (the index equals `dat.columns`).
# The columns form a two-level index (feature name, time point [ms]); the summary
# features below are not tied to a time point, so their second level is NaN.
# The table is built in one step from a dict of per-sample Series.
df_features = pd.DataFrame(
    {
        # F0 [AU]: mean of the first three time points of `dat`
        ("F0 [AU]", np.nan): dat.iloc[:3].mean(),
        # Fm [AU]: maximum of each curve
        ("Fm [AU]", np.nan): dat.max(),
        # Fm timing [ms]: time point at which each curve reaches its maximum
        ("Fm time [ms]", np.nan): dat.idxmax(),
    }
)

In [130]:
# Number of log-spaced grid points spanned between the first and the last time
# point of `dat`. The two end points are dropped below, which leaves
# N_LOG_POINTS - 2 sampled values per curve.
N_LOG_POINTS = 10

# Time points [ms], log-spaced, without the first and last grid point
time_points = np.logspace(
    np.log10(dat.index[0]),
    np.log10(dat.index[-1]),
    N_LOG_POINTS,
)[1:-1]

# Pre-populate the interp function: np.interp(x=time_points, xp=measured times, fp=<one curve>)
_interp = partial(np.interp, time_points, dat.index)

# Linearly interpolate every curve at the selected time points
F_sampled = dat.apply(_interp)
F_sampled.index = pd.MultiIndex.from_product([["Fsampled"], time_points])

# Add the sampled points to the features. "Fsampled" columns left over from an
# earlier run of this cell are removed first, so that re-running the cell does
# not append the same columns a second time.
_is_sampled = df_features.columns.get_level_values(0) == "Fsampled"
df_features = pd.concat([df_features.loc[:, ~_is_sampled], F_sampled.T], axis=1)

In [None]:
# Show the feature table: one row per sample, columns = (feature name, time point [ms])
df_features