# Cell Velocity and Density Analysis Pipeline

This notebook implements the cell velocity calculations and temporal analysis described in the paper. The pipeline performs:

1. **Cell Velocity Calculation** - Computes individual cell velocities from tracking data
2. **Moving Average Smoothing** - Applies temporal smoothing to reduce noise
3. **Unit Conversion** - Converts pixel-based measurements to biological units (μm, hours)
4. **Temporal Analysis** - Analyzes cellular properties at specific timepoints
5. **Visualization** - Creates publication-ready plots showing cellular dynamics

The analysis focuses on how cellular properties (velocity, density, area, ERK activity) change over time in different spatial regions (outer ERK band, inner ERK band, center) as shown in the appendix figures.

In [None]:
# Import required libraries
import os
import glob
import natsort
import pandas as pd
import numpy as np


def get_project_path():
    """
    Resolve the root directory of the project for the current platform.

    Returns:
        str: The UNC network share when ``os.name`` is ``"nt"``, otherwise
            the mount-point path.
    """
    windows_share = (
        "\\\\izbkingston.izb.unibe.ch\\imaging.data\\PertzLab\\StemCellProject\\"
    )
    posix_mount = "/mnt/imaging.data/PertzLab/StemCellProject"
    # os.name is "nt" on Windows; every other platform gets the mount path
    return windows_share if os.name == "nt" else posix_mount


def get_output_path():
    """
    Build the path of the analysed-data directory for this experiment.

    Returns:
        str: The project root from ``get_project_path()`` joined with the
            experiment, acquisition and results directory names.
    """
    experiment_dir = "20240609_20xConf_Colonies_E6_bFGF_BMP4_10minInterval_ExpPAOLO"
    acquisition_dir = "1stPart_liveImaging_Confocal1plane"
    results_dir = "Analysed_Data"
    return os.path.join(
        get_project_path(), experiment_dir, acquisition_dir, results_dir
    )


def get_fovs(output_path: "str | None" = None):
    """
    Return the naturally sorted names of the FOV directories in a folder.

    Args:
        output_path: Directory to search for ``FOV_*`` entries. When omitted
            (``None``), ``get_output_path()`` is used. The default is
            resolved at call time rather than once at function definition,
            so defining this function never touches the path helpers.

    Returns:
        list: Naturally sorted list of FOV names (e.g., ['FOV_0', 'FOV_1', ...])
    """
    if output_path is None:
        output_path = get_output_path()

    # Only directories count as FOVs; plain files matching the pattern are skipped
    candidates = glob.glob(os.path.join(output_path, "FOV_*"))
    fov_names = [
        os.path.basename(candidate)
        for candidate in candidates
        if os.path.isdir(candidate)
    ]

    # Natural sorting keeps numeric order (FOV_1, FOV_2, ..., FOV_10)
    return natsort.natsorted(fov_names)


# --- Acquisition constants --------------------------------------------------
PIXEL_SIZE_UM = 0.645  # Pixel size in micrometers
# NOTE(review): 6 also equals 3600 / FRAME_INTERVAL_SECONDS (frames per hour);
# confirm this value really is an acquisition binning factor.
BINNING_FACTOR = 6  # Binning factor applied during acquisition
FRAME_INTERVAL_SECONDS = 600  # Time between frames in seconds

# --- Field of view under analysis -------------------------------------------
fovs = get_fovs()
fov = fovs[7]  # Element at index 7 of the naturally sorted FOV list

# --- Timepoints of interest --------------------------------------------------
# Expressed in frames; converted to hours just below
times = [36, 72, 144, 216, 252]

# One frame lasts FRAME_INTERVAL_SECONDS, i.e. this many hours
conversion_factor_frame_hours = FRAME_INTERVAL_SECONDS / 3600
time_hours = [frame * conversion_factor_frame_hours for frame in times]

# --- Tracking table for the selected FOV ------------------------------------
parquet_path = os.path.join(get_output_path(), f"{fov}_df.parquet")
df = pd.read_parquet(parquet_path)

print(f"Loaded data for {fov} with {len(df)} rows")
print(f"Analysis timepoints: {times} frames = {time_hours} hours")
print(f"Data columns: {list(df.columns)}")

# Bare expression: displayed as the notebook cell output
time_hours

## Step 1: Calculate Cell Velocities

This section computes individual cell velocities from tracking data using an optimized approach:

1. **Displacement Calculation**: For each cell, calculate the distance moved between consecutive timepoints
2. **Velocity Computation**: Divide displacement by time interval to get instantaneous velocity
3. **Moving Average**: Apply temporal smoothing to reduce noise and reveal trends
4. **Data Integration**: Merge velocity data back with the main tracking dataset

The algorithm uses vectorized operations with pandas for efficient computation across all cells and timepoints.

In [None]:
import numpy as np
import pandas as pd


def calculate_cell_velocities_optimized(df):
    """
    Calculate per-cell velocities from tracking data using vectorized operations.

    For every row that has an earlier observation of the same ``label``:
    1. The displacement to the previous observation is computed from x/y.
    2. It is divided by the actual time difference ``dt`` between the two
       observations, so gaps in a track are accounted for.
    Rows without a previous observation (the first row of each label) are
    dropped. No special handling beyond that is applied; in particular,
    duplicate (label, t) rows would give ``dt == 0``.

    The input frame is not modified and the returned frame is an independent
    copy, so callers can sort or extend it in place without pandas
    chained-assignment warnings.

    Args:
        df: DataFrame with tracking data containing 'label', 't', 'x', 'y', 'erk_band' columns

    Returns:
        DataFrame with columns 'label', 't', 'dt', 'displacement', 'velocity'
        and 'erk_band', ordered by label and time. Velocity is in position
        units per time unit of the input.
    """
    # Order by cell and time so shift(1) yields the previous observation
    df_sorted = df.sort_values(["label", "t"])
    by_cell = df_sorted.groupby("label")

    # Previous position/time per cell; NaN for each cell's first row
    df_shifted = df_sorted.assign(
        x_prev=by_cell["x"].shift(1),
        y_prev=by_cell["y"].shift(1),
        t_prev=by_cell["t"].shift(1),
    )

    # Explicit copy: columns are added below, and assigning onto the direct
    # result of dropna() can raise SettingWithCopyWarning
    df_valid = df_shifted.dropna(subset=["x_prev", "y_prev", "t_prev"]).copy()

    # Displacement components and elapsed time
    df_valid["dx"] = df_valid["x"] - df_valid["x_prev"]
    df_valid["dy"] = df_valid["y"] - df_valid["y_prev"]
    df_valid["dt"] = df_valid["t"] - df_valid["t_prev"]

    # Euclidean distance and velocity over the elapsed time
    df_valid["displacement"] = np.sqrt(df_valid["dx"] ** 2 + df_valid["dy"] ** 2)
    df_valid["velocity"] = df_valid["displacement"] / df_valid["dt"]

    # Copy the column subset so the caller owns the result outright
    result_columns = ["label", "t", "dt", "displacement", "velocity", "erk_band"]
    return df_valid[result_columns].copy()


# Compute the per-cell velocities and order them by cell and time.
# The sorted frame is reassigned instead of sorted in place: an in-place sort
# on a frame that was derived by column selection can trigger pandas'
# SettingWithCopyWarning.
velocity_df = calculate_cell_velocities_optimized(df).sort_values(["label", "t"])


def apply_moving_average(df, window_size=5):
    """
    Add a per-cell, centered rolling mean of the velocity.

    Args:
        df: DataFrame with 'label', 't' and 'velocity' columns
        window_size: Number of rows in the rolling window

    Returns:
        A copy of ``df`` sorted by label and time with an extra
        'velocity_ma' column; the input frame is left untouched.
    """
    ordered = df.sort_values(["label", "t"])

    def _centered_mean(series):
        # min_periods=1 keeps a value at track edges where the window is cut
        roller = series.rolling(window=window_size, min_periods=1, center=True)
        return roller.mean()

    velocity_per_cell = ordered.groupby("label")["velocity"]
    ordered["velocity_ma"] = velocity_per_cell.transform(_centered_mean)
    return ordered


# Smooth the velocities with a 10-frame window
velocity_df_with_ma = apply_moving_average(velocity_df, window_size=10)

# Attach raw and smoothed velocity to the full tracking table. The left join
# keeps every tracking row; rows without a velocity entry get NaN.
velocity_columns = velocity_df_with_ma[["label", "t", "velocity", "velocity_ma"]]
df_with_velocity = pd.merge(df, velocity_columns, how="left", on=["label", "t"])
df_with_velocity = df_with_velocity.sort_values(["label", "t"])

In [None]:
# Smooth further per-cell properties over time
columns_for_ma = ["area", "cell_density", "CNr"]  # CNr = C/N ratio (ERK activity)

print("Applying moving average to cellular properties...")
df_with_velocity.sort_values(by=["label", "t"], inplace=True)

# Centered 5-row rolling mean within each cell's track; result stored as "<col>_ma"
for col in columns_for_ma:
    grouped_col = df_with_velocity.groupby("label")[col]
    df_with_velocity[col + "_ma"] = grouped_col.transform(
        lambda series: series.rolling(window=5, min_periods=1, center=True).mean()
    )

# Rescale the pixel-based measurements
print("Converting units from pixels to micrometers...")

# Velocity: intended conversion is pixels/frame → μm/hour.
# NOTE(review): the factor is PIXEL_SIZE_UM * BINNING_FACTOR and contains no
# explicit frames → hours term. BINNING_FACTOR (6) equals
# 3600 / FRAME_INTERVAL_SECONDS, so the value is numerically μm/h only if the
# constant effectively stands for frames per hour — confirm which is intended.
velocity_conversion_factor = PIXEL_SIZE_UM * BINNING_FACTOR
for velocity_col in ("velocity_ma", "velocity"):
    df_with_velocity[velocity_col] = (
        df_with_velocity[velocity_col] * velocity_conversion_factor
    )

# Area: pixels² → μm² via the squared pixel size.
# NOTE(review): BINNING_FACTOR is not applied here although it is applied to
# the velocity — confirm the two conversions are meant to differ.
area_conversion_factor = PIXEL_SIZE_UM**2
for area_col in ("area", "area_ma"):
    df_with_velocity[area_col] = df_with_velocity[area_col] * area_conversion_factor

# Density: scaled by the inverse pixel size.
# NOTE(review): this is a linear factor (1 / PIXEL_SIZE_UM); a density per
# unit area would need 1 / PIXEL_SIZE_UM**2 — confirm against how
# cell_density is defined upstream.
density_conversion_factor = 1 / PIXEL_SIZE_UM
for density_col in ("cell_density", "cell_density_ma"):
    df_with_velocity[density_col] = (
        df_with_velocity[density_col] * density_conversion_factor
    )

print("Unit conversion complete:")
print(f"- Velocity: pixels/frame → μm/h (factor: {velocity_conversion_factor:.3f})")
print(f"- Area: pixels² → μm² (factor: {area_conversion_factor:.6f})")
print(f"- Density: cells/pixel → cells/μm (factor: {density_conversion_factor:.3f})")

In [None]:
# Filter data for specific timepoints and remove outliers
print("Filtering data for analysis...")

# Select only the specified timepoints and exclude background regions (erk_band = 0)
df_with_velocity_t = df_with_velocity.query("t in @times and erk_band != 0")
df_filtered = df_with_velocity_t.copy()

print(f"After timepoint and region filtering: {len(df_filtered)} rows")

# Percentile-based trimming of extreme values, one column after another
columns_to_filter = ["CNr", "cell_density_ma", "velocity_ma", "area_ma"]

print("Removing outliers based on percentile thresholds...")
for col in columns_to_filter:
    # Bounds are the 0.5th and 99.5th percentiles of the rows still present,
    # so each column trims roughly 1% of what is left; the columns are
    # processed sequentially, so the total removed can exceed 1%.
    lower_bound = df_filtered[col].quantile(0.005)
    upper_bound = df_filtered[col].quantile(0.995)

    # NaN compares False on both sides, so rows with a missing value in this
    # column are dropped here too and counted as removed
    mask = (df_filtered[col] >= lower_bound) & (df_filtered[col] <= upper_bound)

    # Apply filter and report results
    n_before = len(df_filtered)
    df_filtered = df_filtered[mask]
    n_after = len(df_filtered)

    print(
        f"  {col}: removed {n_before - n_after} outliers "
        f"(bounds: {lower_bound:.3f} - {upper_bound:.3f})"
    )

# Reset index after filtering
df_filtered = df_filtered.reset_index(drop=True)

# Final summary. The percentage is guarded so that an empty selection (no row
# matching the timepoint/region filter) reports 0.0% instead of raising
# ZeroDivisionError.
n_selected = len(df_with_velocity_t)
n_removed = n_selected - len(df_filtered)
pct_removed = 100 * n_removed / n_selected if n_selected else 0.0

print("\nFiltering summary:")
print(f"- Original dataset: {len(df_with_velocity)} rows")
print(f"- After timepoint/region filter: {n_selected} rows")
print(f"- After outlier removal: {len(df_filtered)} rows")
print(f"- Total removed: {n_removed} rows ({pct_removed:.1f}%)")

# Check ERK band distribution
print("\nERK band distribution in filtered data:")
print(df_filtered["erk_band"].value_counts().sort_index())

## Step 2: Temporal Analysis and Visualization

The visualization uses notched box plots to show the distribution of each property across timepoints and spatial regions, as shown in the appendix figures of the paper.

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

# Data source, band ordering and palette shared by all panels
df_plot = df_filtered
erk_bands = sorted(df_plot["erk_band"].unique())
colors = plt.get_cmap("tab10").colors

# 2x2 grid of panels with a shared x-axis
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 10), sharex=True)


def boxplot_notch(ax, y, x, hue, data, ylabel):
    """
    Draw notched box plots of ``y``, grouped by timepoint and ERK band.

    One cluster of boxes is drawn per entry of the module-level ``times``
    list, centered on the entry's index; within a cluster there is one box
    per entry of the module-level ``erk_bands``, colored from the
    module-level ``colors``. Combinations without data are skipped.

    Args:
        ax: Matplotlib axis object to draw on
        y: Column name of the plotted values
        x: Column name compared against the entries of ``times``
        hue: Column name compared against the entries of ``erk_bands``
        data: DataFrame holding the three columns
        ylabel: Label for the y-axis
    """
    box_width = 0.18  # Width of a single box
    gap = 0.03  # Spacing between neighbouring bands in a cluster
    step = box_width + gap
    centre = (len(erk_bands) - 1) / 2  # Band index that sits on the integer position

    # Gather (position, values, color) for every non-empty combination
    boxes = []
    for slot, timepoint in enumerate(times):
        at_timepoint = data[x] == timepoint
        for band_idx, band in enumerate(erk_bands):
            samples = data[at_timepoint & (data[hue] == band)][y].dropna().values
            if len(samples) == 0:
                continue
            offset = (band_idx - centre) * step
            boxes.append((slot + offset, samples, colors[band_idx % len(colors)]))

    # Nothing is drawn when no combination has data
    if boxes:
        positions, box_data, color_list = (list(part) for part in zip(*boxes))
        bp = ax.boxplot(
            box_data,
            positions=positions,
            widths=box_width,
            notch=True,
            patch_artist=True,  # Needed so the boxes can be filled
            showfliers=False,
            medianprops={"color": "black", "linewidth": 1.5},
        )

        # Fill each box with its band color, semi-transparent
        for patch, face in zip(bp["boxes"], color_list):
            patch.set_facecolor(face)
            patch.set_alpha(0.7)

    ax.set_ylabel(ylabel, fontsize=12)
    ax.grid(True, alpha=0.3)


# Create individual subplots for each cellular property
print("Creating publication plots...")

# Plot 1: ERK Activity (CNr)
boxplot_notch(axes[0, 0], "CNr", "t", "erk_band", df_plot, "ERK Activity (CNr) [a.u.]")
axes[0, 0].set_title("ERK Activity Over Time", fontsize=14, fontweight="bold")
axes[0, 0].set_xticks([])  # Remove x-ticks for top row

# Plot 2: Cell Density
boxplot_notch(
    axes[0, 1], "cell_density_ma", "t", "erk_band", df_plot, "Cell Density [cells/μm]"
)
axes[0, 1].set_title("Cell Density Over Time", fontsize=14, fontweight="bold")
axes[0, 1].set_xticks([])  # Remove x-ticks for top row

# Plot 3: Cell Velocity
boxplot_notch(
    axes[1, 0], "velocity_ma", "t", "erk_band", df_plot, "Cell Velocity [μm/h]"
)
axes[1, 0].set_title("Cell Velocity Over Time", fontsize=14, fontweight="bold")

# Plot 4: Nucleus Area
boxplot_notch(axes[1, 1], "area_ma", "t", "erk_band", df_plot, "Nucleus Area [μm²]")
axes[1, 1].set_title("Nucleus Area Over Time", fontsize=14, fontweight="bold")

# Set x-axis labels for bottom row.
# Labels are derived from time_hours so they always match `times` in both
# count and value; ":g" drops the trailing ".0" (e.g. "6 hours").
time_labels = [f"{hours:g} hours" for hours in time_hours]
for ax in axes[1, :]:
    ax.set_xticks(range(len(times)))
    ax.set_xticklabels(time_labels, rotation=45, ha="right")
    ax.set_xlabel("Time post-plating", fontsize=12)

# Create legend: one handle per ERK band actually present in the data.
# Names are assigned by position in the sorted band list.
# NOTE(review): assumes the sorted erk_band values correspond, in order, to
# outer band, inner band and center — confirm against the band encoding.
labels_for_erk_band = ["Outer ERK Band", "Inner ERK Band", "Center"]
legend_handles = [
    Patch(
        facecolor=colors[i % len(colors)],
        # Fall back to a generic name instead of failing with IndexError
        # when more bands exist than names
        label=(
            labels_for_erk_band[i]
            if i < len(labels_for_erk_band)
            else f"ERK band {band}"
        ),
        alpha=0.7,
    )
    for i, band in enumerate(erk_bands)
]

# Add legend below the plots. The handles carry their own labels, so no
# separate labels list is passed (it could disagree in length with the handles).
fig.legend(
    handles=legend_handles,
    loc="lower center",
    bbox_to_anchor=(0.5, -0.02),
    ncol=3,
    frameon=False,
    fontsize=12,
)

# Adjust layout and save
plt.tight_layout(rect=[0, 0.05, 1, 0.95])  # Leave space for legend
plt.suptitle(
    f"Cellular Dynamics Analysis - {fov.upper()}",
    fontsize=16,
    fontweight="bold",
    y=0.98,
)

# Save in multiple formats for publication
output_name = f"cellular_dynamics_{fov}_temporal_analysis"
plt.savefig(f"{output_name}.svg", format="svg", bbox_inches="tight", dpi=300)
plt.savefig(f"{output_name}.png", format="png", bbox_inches="tight", dpi=300)
plt.show()