This notebook reproduces the **quantification of cell rearrangement events** described in the manuscript.  

## Analysis Workflow

- **Event Identification**
  - Junctional changes were manually tracked in FIJI from time-lapse membrane imaging (≥100 min).
  - Events were classified into three categories:
    - T1 (Neighbor exchange): one junction shrinks while an orthogonal junction expands, leading to a swap of neighbors.  
    - T2 (Cell extrusion): junctions contract around a cell as it exits the basal layer.  
    - T3 (Cell insertion): junctions open to accommodate a new cell intercalating into the basal layer.  

- **Normalization**
  - Rearrangement rates were normalized to:
    1. The total number of cells in the analyzed region at $t=0$.  
    2. The imaging duration.  

- **Cumulative Quantification**
  - Cumulative rearrangements over time were calculated by summing events up to each time point.  
  - Values were normalized to the initial cell number.  

## Outputs
- Counts of T1, T2, and T3 events per imaging sequence  
- Rearrangement rates (events per cell per unit time)  
- Cumulative rearrangement curves normalized to initial cell number  

In [None]:
# imports
import pandas as pd
import numpy as np
import os
import itertools
import openpyxl
import math
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.cm as cm
from pathlib import Path
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

### Step1: load data

In [None]:
# Folder prefix used by the later cells when writing tables and figures.
output_folder = "output-folder-path"

# Input workbook; the summary table is read from the sheet named "Sheet1".
file_path = "input-file.xlsx"
df = pd.read_excel(file_path, sheet_name="Sheet1")

### Step2: data processing

In [None]:
# === CLEAN COLUMN NAMES === (just in case)
# regex=False forces literal replacement: '(' and ')' are regex metacharacters,
# so the result must not depend on the installed pandas version's `regex` default.
df.columns = (
    df.columns.str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
print("✅ Columns:", df.columns.tolist())

# === Add duration in minutes ===
# N time points span (N - 1) intervals; each interval is 10 min.
df['duration_min'] = (df['time_points'] - 1) * 10  # time in minutes

# === Compute normalised rates ===
# Output-name prefix -> column holding the raw event count.
event_columns = {
    'total': 'total_rearrange_count',
    'T1': 't1_count',
    'T2': 't2_count',
    'T3': 't3_count',
}

# Per cell → divide by (duration × number of cells).
# Non-positive denominators (single-frame sequence, zero cells) are masked to
# NaN so the rate is reported as missing instead of ±inf.
cell_exposure = df['duration_min'] * df['cell_number']
cell_exposure = cell_exposure.where(cell_exposure > 0)
for label, count_col in event_columns.items():
    df[f'{label}_rate_per_cell_per_min'] = df[count_col] / cell_exposure

# Per area → divide by (duration × cropped_region (µm²)), masked the same way.
area_exposure = df['duration_min'] * df['cropped_region']
area_exposure = area_exposure.where(area_exposure > 0)
for label, count_col in event_columns.items():
    df[f'{label}_rate_per_um2_per_min'] = df[count_col] / area_exposure

# === Compute ratios over total_rearrange_count ===
for label in ('T1', 'T2', 'T3'):
    df[f'{label}_ratio'] = df[event_columns[label]] / df['total_rearrange_count']

display(df)

# Join folder and file name with a real path separator; plain string
# concatenation would write "<folder><name>" next to the folder instead of in it.
if output_folder:
    os.makedirs(output_folder, exist_ok=True)
rates_file = os.path.join(output_folder, 'total_rates_output-20250703.xlsx')
df.to_excel(rates_file, index=False)

print(f"✅ Computed normalised rates PER MINUTE and saved to {rates_file}")

### Step3: calculate and visualize the proportion of transition events

In [None]:
# === GROUP BY 'Condition' AND COMPUTE RATIOS ===
# Pool the raw counts per condition first, then divide by the pooled total, so
# each ratio is a share of all events in that condition (not a mean of
# per-sequence ratios).
grouped = df.groupby('Condition')[['total_rearrange_count', 't1_count', 't2_count', 't3_count']].sum()
ratios_df = grouped[['t1_count', 't2_count', 't3_count']].div(grouped['total_rearrange_count'], axis=0)
ratios_df.columns = ['T1_ratio', 'T2_ratio', 'T3_ratio']
print(ratios_df)

# === STACKED BAR PLOT ===
ax = ratios_df.plot(
    kind='bar',
    stacked=True,
    figsize=(3, 4),
    color=['#ff7f0e', '#2ca02c', '#1f77b4'],
    alpha=0.8
)

# The data are fractions in [0, 1] but the axis label is in percent: render the
# ticks as 0–100 (no symbol, since the label already carries "%").
ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1, symbol=None))
plt.ylabel('Ratio of total rearrangements (%)', fontsize=10)
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.legend(['T1', 'T2', 'T3'], title='Transition Type')
plt.tight_layout()

# Build the path with a separator so the PDF lands inside output_folder.
plot_file = os.path.join(output_folder, "t1-2-3ratio.pdf")
plt.savefig(plot_file, format='pdf')
plt.show()
print(f"✅ Stacked bar plot saved → {plot_file}")


### Step4: cumulative transition events analysis

In [None]:
# Expected layout: base_folder >> one subfolder per condition >> CSV files
# containing counted dots at certain time points.
base_dir = Path("base_folder")

# Subfolder names, one per condition setting (edit to match the experiment).
conditions = [
    "condition1",
    "condition2",
    "condition3",
    "condition4",
    "condition5",
]

# File suffixes (lower-case) that are treated as data files.
extensions = {".csv"}

In [None]:
# === Build a blank metadata template listing every data file per condition ===
# The template has one row per file. `cell_count` and `time_points` are left
# empty to be filled in by hand; both are required by `load_mapping`, which
# reads the completed table from `cell_counts.xlsx`.
template_columns = ["filename", "cell_count", "time_points", "condition", "rel_path", "abs_path"]

rows = []
for cond in conditions:
    folder = base_dir / cond
    if not folder.is_dir():
        print(f"⚠️ Condition folder not found, skipped: {folder}")
        continue
    for p in folder.glob("*"):
        if p.is_file() and p.suffix.lower() in extensions:
            rows.append({
                "filename": p.name,
                "cell_count": "",
                "time_points": "",
                "condition": cond,
                # POSIX separators keep the table usable on any OS when it is
                # later joined back onto base_dir.
                "rel_path": p.relative_to(base_dir).as_posix(),
                "abs_path": str(p.resolve()),
            })

# Passing the column list keeps the frame well-formed (and sortable) even when
# no files were found. A dedicated name avoids overwriting the rates table `df`.
template_df = pd.DataFrame(rows, columns=template_columns).sort_values(["condition", "filename"])
if template_df.empty:
    print("⚠️ No data files found; the template will contain headers only.")

excel_out = base_dir / "cell_counts_template.xlsx"
csv_out   = base_dir / "cell_counts_template.csv"
with pd.ExcelWriter(excel_out, engine="openpyxl") as w:
    template_df.to_excel(w, index=False, sheet_name="cell_counts")
template_df.to_csv(csv_out, index=False)

print(f"Made:\n- {excel_out}\n- {csv_out}")


In [None]:
# === Plotting configuration
frame_interval_min = 10                       # minutes per frame; scales frame indices into the time axis
target_time_points = 10                       # for mean±SD panel only
data_ext = ".csv"
mapping_path = base_dir / "cell_counts.xlsx"  # or .csv with same columns

# Figures are written into a "plots" subfolder of base_dir.
out_dir = base_dir / "plots"
out_dir.mkdir(exist_ok=True)
# ------------------------------------------------

# Fixed colour per condition name.
fixed_colors = dict(
    condition1="#0072B2",
    condition2="#E69F00",
    condition3="#D55E00",
    condition4="#009E73",
    condition5="#CC79A7",
)

# ---------- helpers ----------
def load_mapping(p: Path) -> pd.DataFrame:
    """Read the per-file mapping table and check that it has the required columns.

    A ``.csv`` suffix (case-insensitive) is read with ``pd.read_csv``; any other
    suffix is read as an Excel workbook, first sheet.

    Args:
        p: Path to the mapping table.

    Returns:
        The table, with ``rel_path`` converted to ``str``.

    Raises:
        ValueError: If any of ``rel_path``, ``condition``, ``cell_count`` or
            ``time_points`` is missing.
    """
    is_csv = p.suffix.lower() == ".csv"
    table = pd.read_csv(p) if is_csv else pd.read_excel(p, sheet_name=0)

    needed = {"rel_path", "condition", "cell_count", "time_points"}
    missing = needed.difference(table.columns)
    if missing:
        raise ValueError(f"Mapping file missing columns: {missing}")

    # Paths may have been parsed as non-string values; force text.
    table["rel_path"] = table["rel_path"].astype(str)
    return table

def insert_zero_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with a leading baseline row (t=0, count=0) if none exists.

    Args:
        df: Table with at least the columns ``t`` and ``count``.

    Returns:
        ``df`` itself when some row already has ``t == 0``; otherwise a new
        frame with the baseline row prepended and the index renumbered.

    Raises:
        ValueError: If ``t`` or ``count`` is missing.
    """
    required = ("t", "count")
    if any(col not in df.columns for col in required):
        raise ValueError("Dataframe must have columns 't' and 'count'.")

    if 0.0 in df["t"].values:
        return df

    baseline = pd.DataFrame({"t": [0.0], "count": [0]})
    return pd.concat([baseline, df], ignore_index=True)

def build_trace_individual(file_path: Path, cell_count: int, time_points: int):
    """Build one file's normalised cumulative trace, truncated at ``time_points``.

    The whole CSV is read and a (t=0, count=0) row is inserted if absent. Each
    ``t`` is rounded and shifted by +1 to give a frame index, which leaves
    index 0 as an empty baseline. Counts are summed per frame; rows whose
    shifted index falls outside ``[0, time_points)`` are ignored. Nothing is
    padded beyond ``time_points``.

    Args:
        file_path: CSV with columns ``t`` and ``count``.
        cell_count: Divisor applied to the cumulative counts.
        time_points: Number of frames in the returned trace.

    Returns:
        ``(time_min, norm_cum)``, both of length ``time_points``. ``time_min``
        is the frame index times the module-level ``frame_interval_min``.
    """
    events = insert_zero_frame(pd.read_csv(file_path))

    frame_idx = events["t"].round().astype(int).to_numpy() + 1   # shift
    event_counts = events["count"].to_numpy()

    n_frames = int(time_points)
    in_window = (frame_idx >= 0) & (frame_idx < n_frames)

    # Unbuffered add, so several rows hitting the same frame all accumulate.
    per_frame = np.zeros(n_frames, dtype=float)
    np.add.at(per_frame, frame_idx[in_window], event_counts[in_window])

    normalised = np.cumsum(per_frame) / float(cell_count)
    minutes = np.arange(n_frames) * frame_interval_min
    return minutes, normalised

def build_trace_meanstd(file_path: Path, cell_count: int, target_T: int = 10):
    """Build one file's fixed-length normalised cumulative trace for the mean±SD panel.

    Only the first ``target_T`` rows of the CSV are read. A (t=0, count=0) row
    is inserted if absent, then each ``t`` is rounded and shifted by +1 to give
    a frame index. Rows whose shifted index falls outside ``[0, target_T)`` are
    ignored.

    Args:
        file_path: CSV with columns ``t`` and ``count``.
        cell_count: Divisor applied to the cumulative counts.
        target_T: Number of rows read and length of the returned arrays.

    Returns:
        ``(time_min, norm_cum)``, both of length ``target_T``. ``time_min`` is
        the frame index times the module-level ``frame_interval_min``.
    """
    head = pd.read_csv(file_path, nrows=target_T)
    head = insert_zero_frame(head)

    shifted = head["t"].round().astype(int).to_numpy() + 1   # shift
    values = head["count"].to_numpy()

    keep = (shifted >= 0) & (shifted < target_T)

    # Unbuffered add, so several rows hitting the same frame all accumulate.
    binned = np.zeros(target_T, dtype=float)
    np.add.at(binned, shifted[keep], values[keep])

    running_total = np.cumsum(binned)
    return np.arange(target_T) * frame_interval_min, running_total / float(cell_count)

def fit_through_origin(x: np.ndarray, y: np.ndarray):
    """Least-squares fit of ``y = slope * x`` (no intercept).

    Args:
        x: Predictor values.
        y: Response values, same length as ``x``.

    Returns:
        ``(slope, r2, yhat)``. ``slope`` is ``sum(x*y) / sum(x*x)``, or NaN when
        ``sum(x*x)`` is zero. ``r2`` is ``1 - SS_res / SS_tot``, with ``SS_tot``
        taken about the mean of ``y``, or NaN when ``SS_tot`` is zero. ``yhat``
        is ``slope * x``.
    """
    xs = x.astype(float)
    ys = y.astype(float)

    denom = np.dot(xs, xs)
    numer = np.dot(xs, ys)
    slope = numer / denom if denom != 0 else np.nan
    fitted = slope * xs

    residual_ss = np.sum((ys - fitted) ** 2)
    total_ss = np.sum((ys - ys.mean()) ** 2)
    if total_ss != 0:
        r2 = 1.0 - residual_ss / total_ss
    else:
        r2 = np.nan
    return slope, r2, fitted


In [None]:
# === Build per-file traces from the mapping table
mapping = load_mapping(mapping_path)
conditions_in_data = list(pd.unique(mapping["condition"]))
# Conditions without a fixed colour fall back to a tab20 colour by position.
color_map = {c: fixed_colors.get(c, plt.cm.tab20(i % 20)) for i, c in enumerate(conditions_in_data)}

# Storage
raw_traces = {c: [] for c in conditions_in_data}   # (time, trace) with actual time_points
per_condition_fixed = {c: [] for c in conditions_in_data}  # fixed length (target_time_points) for mean±SD
rep_counts = {c: 0 for c in conditions_in_data}
time_axis_fixed = np.arange(target_time_points) * frame_interval_min

# Mapping rows that do not resolve to a data file are reported below; dropping
# them silently would shrink n for a condition without any trace of why.
skipped_files = []

for _, row in mapping.iterrows():
    fpath = base_dir / row["rel_path"]
    if not (fpath.is_file() and fpath.suffix.lower() == data_ext):
        skipped_files.append(str(fpath))
        continue
    cond = row["condition"]
    cells = int(row["cell_count"])
    tpts = int(row["time_points"])

    # individual (stop at time_points)
    t_ind, tr_ind = build_trace_individual(fpath, cells, time_points=tpts)
    raw_traces[cond].append((t_ind, tr_ind))

    # mean±SD (first target_time_points rows, t->t+1)
    _, tr_fix = build_trace_meanstd(fpath, cells, target_T=target_time_points)
    per_condition_fixed[cond].append(tr_fix)

    rep_counts[cond] += 1

if skipped_files:
    print(f"⚠️ Skipped {len(skipped_files)} mapping row(s): not an existing '{data_ext}' file")
    for skipped in skipped_files:
        print(f"   - {skipped}")

### Step5: cumulative transition events visualization

In [None]:
# -------------------- PLOT 1: individual traces — one PDF per condition --------------------
# Fixed axis ranges, identical for every condition's panel.
x_limits = (0, 90)     # minutes
y_limits = (0, 0.4)    # cumulative transitions / cell

# Data-driven range (5% headroom). The panels use the fixed range above; this
# value is only used to warn when traces would be clipped by it.
global_ymax = 0.0
for cond in conditions:
    # .get(): a configured condition may be absent from the mapping table.
    for _, tr in raw_traces.get(cond, []):
        if tr.size:
            global_ymax = max(global_ymax, float(tr.max()))
ylims = (0, global_ymax * 1.05)  # 5% headroom
if global_ymax > y_limits[1]:
    print(f"⚠️ Largest trace value {global_ymax:.3f} exceeds the fixed y-limit {y_limits[1]}; "
          f"traces will be clipped (data-driven range would be {ylims}).")

for cond in conditions:
    cond_traces = raw_traces.get(cond, [])
    n_traces = len(cond_traces)

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.set_title(f"{cond} (n={rep_counts.get(cond, 0)})")
    ax.set_xlabel("Time [min]")
    ax.set_ylabel("Cumulative transitions / cell")

    # plt.get_cmap replaces matplotlib.cm.get_cmap, which was removed in
    # Matplotlib 3.9; max(..., 1) avoids resampling to zero colours.
    cmap = plt.get_cmap("tab20", max(n_traces, 1))

    for idx, (tmin, tr) in enumerate(cond_traces):
        ax.plot(tmin, tr, color=cmap(idx), alpha=0.8, lw=2, label=f"trace {idx+1}")

    ax.grid(True, alpha=0.25)
    ax.set_xlim(*x_limits)
    ax.set_ylim(*y_limits)

    outfile = out_dir / f"individual_{cond}.pdf"
    fig.tight_layout()
    fig.savefig(outfile)
    plt.show()        # show() takes no figure argument
    plt.close(fig)    # release the figure so they do not pile up across the loop

# -------------------- PLOT 2: mean ± SD + linear fit (through origin) --------------------
plt.figure(figsize=(6, 5))
for cond in conditions:
    # .get(): a configured condition may be absent from the mapping table.
    traces = per_condition_fixed.get(cond)
    if not traces:
        continue
    M = np.vstack(traces)     # n_rep x target_time_points
    mean = M.mean(axis=0)
    # NOTE(review): ndarray.std defaults to ddof=0 (population SD); confirm this
    # is the SD intended for the shaded band.
    std  = M.std(axis=0)

    # The fit is made to the mean trace, not to the individual replicates.
    slope, r2, yhat = fit_through_origin(time_axis_fixed, mean)

    # Print slopes
    print(f"{cond}: slope={slope:.4f} per min | {slope*frame_interval_min:.4f} per 10-min frame | R^2={r2:.3f}")

    c = color_map[cond]
    plt.plot(time_axis_fixed, mean,
             label=f"{cond} (n={rep_counts[cond]}) — slope={slope:.3f}, R²={r2:.3f}",
             color=c, lw=2.6)
    plt.fill_between(time_axis_fixed, mean - std, mean + std, color=c, alpha=0.20, linewidth=0)
    plt.plot(time_axis_fixed, yhat, color=c, linestyle="--", lw=1.6, alpha=0.85)

plt.xlabel("Time [min]")
plt.ylabel("Cumulative transitions / cell")
# The row count in the title follows target_time_points rather than a fixed "10".
plt.title(f"Mean ± SD with linear fit through origin (first {target_time_points} rows; t → t+1; t=0 inserted)")
plt.grid(True, alpha=0.25)
plt.xlim(0, time_axis_fixed[-1])
plt.legend(frameon=False, fontsize=9, ncol=2)
plt.tight_layout()
plt.savefig(out_dir / "mean_sd_with_regression_origin.pdf")
plt.show()
