This notebook analyzes the TrackMate tracking output for the artificial condensates:

Needs: 
    1. Normalize the condensate mean intensities to the intensities at the dilute phase

    2. Radius growth is normalized to nucleation time

        For this, look at what particles are present at end time, then look for the earliest timeframe with the most in common
        
    3. Normalize total intensities to nucleation time (same as radius)

First: load in the requirements,
then load in the file locations and names


In [None]:
#REQUIREMENTS

import os
import re
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde
import tifffile

In [None]:
#DEFINITIONS:
# User-specific folders and names read by the cells below; edit before running.
# Folder scanned for the '*_spots.csv' and '*_tracks.csv' files.
input_folder = '/Volumes/holtl02lab/holtl02labspace/Holt_Lab_Members/Nora_Holt/Live_Imaging_Other/250413_HeLa-2112-2113_2712-2713_6h-TFH/Results_FirstTFH'  # Replace with your input folder path
# Folder that receives the CSV and PNG files written by the cells below.
output_folder = '/Volumes/holtl02lab/holtl02labspace/Holt_Lab_Members/Nora_Holt/Live_Imaging_Other/250413_HeLa-2112-2113_2712-2713_6h-TFH/Results_FirstTFH/output'  # Replace with your output folder path
# Folder searched for '<name>.tif' and '<name>_MASK.tif' when computing Mean_Intensity_t0.
masks_folder = '/Volumes/holtl02lab/holtl02labspace/Holt_Lab_Members/Nora_Holt/Live_Imaging_Other/250413_HeLa-2112-2113_2712-2713_6h-TFH/First_TFH_Masks' # Replace with the folder for your masks on the 1st timepoint/pre-TFH image
output_filename_spots = 'all_data_spots'  # Filename prefix of the concatenated spots CSVs
output_filename_tracks = 'all_data_tracks'  # Filename prefix of the concatenated tracks CSVs
group_names = ["HeLa-2113_2712-2714_Dox", "HeLa-2112_2712-2714_Dox", "Untreated"] # Names of groups to sort; also used as the suffix of the per-group CSV filenames


In [None]:
# Concatenate the '*_spots.csv' and '*_tracks.csv' files exported by TrackMate,
# one experimental group at a time.
# The output is one big .csv with the data from all spots files (and another for
# all tracks files) for condition 1, another pair for condition 2, and so on.


def concatenate_and_sort_csv(input_folder, output_folder, output_filename_spots,
                             output_filename_tracks, masks_folder, group_names=None):
    """Merge per-image spots/tracks CSVs into one sorted CSV per group.

    For every group name, each file in ``input_folder`` whose name contains the
    group name and ends in ``_spots.csv`` (or ``_tracks.csv``) is read, tagged
    with ``original_filename`` (the filename without that suffix),
    ``unique_name`` (``original_filename`` + "_" + ``TRACK_ID``) and
    ``Mean_Intensity_t0``, then concatenated. Every other column is coerced to
    numeric (unparseable cells become NaN) and the rows are sorted before being
    written to ``<output_folder>/<prefix>_<group>.csv``.

    Args:
        input_folder: Folder holding the ``*_spots.csv`` / ``*_tracks.csv`` files.
        output_folder: Destination folder; created if it does not exist.
        output_filename_spots: Filename prefix of the combined spots CSVs.
        output_filename_tracks: Filename prefix of the combined tracks CSVs.
        masks_folder: Folder searched for ``<original_filename>.tif`` and
            ``<original_filename>_MASK.tif``.
        group_names: Iterable of group names to process. Defaults to the three
            groups this function previously hard-coded.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    if group_names is None:
        group_names = ("HeLa-2113_2712-2714_Dox", "HeLa-2112_2712-2714_Dox", "Untreated")

    # Spots and tracks files of one image share the same image/mask pair, so
    # each mean intensity is computed once and reused.
    intensity_cache = {}

    def mean_intensity_in_mask(core_filename):
        """Return the mean image value where the mask is > 0, or NaN on any problem."""
        if core_filename in intensity_cache:
            return intensity_cache[core_filename]
        try:
            img_path = os.path.join(masks_folder, f"{core_filename}.tif")
            mask_path = os.path.join(masks_folder, f"{core_filename}_MASK.tif")

            image = tifffile.imread(img_path)
            mask = tifffile.imread(mask_path)

            if image.shape != mask.shape:
                print(f"Shape mismatch in {core_filename}: image {image.shape}, mask {mask.shape}")
                mean_intensity = np.nan
            else:
                roi_pixels = image[mask > 0]
                mean_intensity = roi_pixels.mean() if roi_pixels.size > 0 else np.nan
        except Exception as e:
            # Deliberately best-effort: a missing or unreadable image must not
            # abort the concatenation; the intensity is recorded as NaN instead.
            print(f"Error reading image or mask for {core_filename}: {e}")
            mean_intensity = np.nan
        intensity_cache[core_filename] = mean_intensity
        return mean_intensity

    def process_files(files, suffix, group_name, sort_columns, ascending_order):
        """Read, tag and concatenate the files of one group; return a sorted DataFrame."""
        dfs = []
        for file in files:
            if group_name not in file:
                continue

            file_path = os.path.join(input_folder, file)
            try:
                df = pd.read_csv(file_path, header=0)
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                continue

            core_filename = file[:-len(suffix)]

            # These columns are (re)created below; drop stale copies so the
            # positional inserts cannot collide with an existing column.
            df = df.drop(columns=['unique_name', 'Mean_Intensity_t0'], errors='ignore')
            df.insert(0, 'original_filename', core_filename)

            if 'TRACK_ID' in df.columns:
                unique_name = df['original_filename'] + "_" + df['TRACK_ID'].astype(str)
            else:
                print(f"Warning: 'TRACK_ID' column missing in {file}")
                unique_name = df['original_filename']
            df.insert(1, 'unique_name', unique_name)
            df.insert(2, 'Mean_Intensity_t0', mean_intensity_in_mask(core_filename))

            if 'LABEL' in df.columns:
                df = df.drop(columns=['LABEL'])

            dfs.append(df)

        if not dfs:
            print(f"No matching files for group '{group_name}' and suffix '{suffix}'")
            return pd.DataFrame()  # return empty dataframe

        result = pd.concat(dfs, ignore_index=True)
        for col in result.columns:
            if col not in ['original_filename', 'unique_name']:
                result[col] = pd.to_numeric(result[col], errors='coerce')

        return result.sort_values(by=sort_columns, ascending=ascending_order)

    # The directory listing does not depend on the group: read it once, sorted
    # so the concatenation order is deterministic.
    all_files = sorted(os.listdir(input_folder))
    spots_files = [f for f in all_files if f.endswith('_spots.csv')]
    tracks_files = [f for f in all_files if f.endswith('_tracks.csv')]

    # to_csv does not create missing directories.
    os.makedirs(output_folder, exist_ok=True)

    for group_name in group_names:
        if spots_files:
            spots_result = process_files(spots_files, '_spots.csv', group_name, ['unique_name', 'FRAME'], [True, True])
            if not spots_result.empty:
                output_file_spots = os.path.join(output_folder, f"{output_filename_spots}_{group_name}.csv")
                spots_result.to_csv(output_file_spots, index=False, header=True)
                print(f"✅ Saved spots CSV: {output_file_spots}")

        if tracks_files:
            tracks_result = process_files(tracks_files, '_tracks.csv', group_name, ['unique_name'], [True])
            if not tracks_result.empty:
                output_file_tracks = os.path.join(output_folder, f"{output_filename_tracks}_{group_name}.csv")
                tracks_result.to_csv(output_file_tracks, index=False, header=True)
                print(f"✅ Saved tracks CSV: {output_file_tracks}")

# Run the concatenation with the folders and filename prefixes from the DEFINITIONS cell.
concatenate_and_sort_csv(input_folder, output_folder, output_filename_spots, output_filename_tracks, masks_folder)


Now that I have the all_data .csv files, the next cell reloads them from disk so that I can skip the concatenate_and_sort_csv step when I re-analyze existing data in the future:

In [None]:
# Reload the per-group spots/tracks CSVs from `output_folder` and list the
# columns of each table as a quick check.
def _read_group_csv(prefix, group):
    """Read '<output_folder>/<prefix>_<group>.csv' with the first row as header."""
    return pd.read_csv(os.path.join(output_folder, f"{prefix}_{group}.csv"), header=0)


df_spots_dox_2113 = _read_group_csv(output_filename_spots, "HeLa-2113_2712-2714_Dox")
df_tracks_dox_2113 = _read_group_csv(output_filename_tracks, "HeLa-2113_2712-2714_Dox")
df_spots_dox_2112 = _read_group_csv(output_filename_spots, "HeLa-2112_2712-2714_Dox")
df_tracks_dox_2112 = _read_group_csv(output_filename_tracks, "HeLa-2112_2712-2714_Dox")
df_spots_un = _read_group_csv(output_filename_spots, "Untreated")
df_tracks_un = _read_group_csv(output_filename_tracks, "Untreated")

for _loaded in (
    df_spots_dox_2113,
    df_tracks_dox_2113,
    df_spots_dox_2112,
    df_tracks_dox_2112,
    df_spots_un,
    df_tracks_un,
):
    print(_loaded.columns.tolist())


In [None]:
# Assign one comparison at a time:
# `df_tracks_dox` is the Dox tracks table that the nucleation cell below compares
# against `df_tracks_un`; point it at `df_tracks_dox_2113` to analyze the other line.

df_tracks_dox = df_tracks_dox_2112

Here I want to graph the nucleation points for all tracks identified in the .csv files in each group

In [None]:
# Remove rows where 'TRACK_INDEX' is blank.
# .copy() gives independent frames, so columns can be added to them later
# without pandas raising SettingWithCopyWarning.
df_tracks_dox_filtered = df_tracks_dox.dropna(subset=['TRACK_INDEX']).copy()
df_tracks_un_filtered = df_tracks_un.dropna(subset=['TRACK_INDEX']).copy()

# Save the filtered data to new CSV files
df_tracks_dox_filtered.to_csv(os.path.join(output_folder, f"{output_filename_tracks}_Dox-filtered.csv"), index=False)
df_tracks_un_filtered.to_csv(os.path.join(output_folder, f"{output_filename_tracks}_Untreated-filtered.csv"), index=False)

# Optional unit conversion (disabled): divide 'TRACK_START' by 60, round down, keep 0 as 0
#df_tracks_dox_filtered['TRACK_START'] = df_tracks_dox_filtered['TRACK_START'].apply(lambda x: max(0, np.floor(x / 60)))
#df_tracks_un_filtered['TRACK_START'] = df_tracks_un_filtered['TRACK_START'].apply(lambda x: max(0, np.floor(x / 60)))

# Define bins of width `bin_width` (in TRACK_START units; presumably minutes, see the axis labels below).
bin_width = 1
latest_start = max(df_tracks_dox_filtered['TRACK_START'].max(), df_tracks_un_filtered['TRACK_START'].max())
# np.histogram closes only the LAST bin on the right, so the final edge must lie
# strictly beyond the latest start. Otherwise, when the latest start is an exact
# multiple of bin_width, its events are merged into the previous bin.
max_bin = (np.floor(latest_start / bin_width) + 2) * bin_width
bins = np.arange(0, max_bin, bin_width)

# Compute histogram counts
dox_counts, _ = np.histogram(df_tracks_dox_filtered['TRACK_START'], bins=bins)
un_counts, _ = np.histogram(df_tracks_un_filtered['TRACK_START'], bins=bins)

# Create DataFrame for output (the duplicated "TRACK_START" key was removed: a
# dict keeps only one entry per key, so the second one never produced a column).
histogram_df = pd.DataFrame({
    "TRACK_START": bins[:-1],  # Bin start values
    "NUCLEATION_EVENTS_Dox": dox_counts,
    "NUCLEATION_EVENTS_Untreated": un_counts
})

# Save to CSV
output_csv_path = os.path.join(output_folder, "nucleation_events.csv")
histogram_df.to_csv(output_csv_path, index=False)

print(f"Saved nucleation event counts to {output_csv_path}")

# Plot the density of 'TRACK_START' for Dox and Untreated groups
plt.figure(figsize=(10, 6))

# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
# Plot density for Dox group
sns.kdeplot(df_tracks_dox_filtered['TRACK_START'], color='darkblue', label='Dox', fill=True, alpha=0.5)

# Plot density for Untreated group
sns.kdeplot(df_tracks_un_filtered['TRACK_START'], color='darkred', label='Untreated', fill=True, alpha=0.5)

# Labels and title
plt.xlabel("Nucleation time (in minutes)")
plt.ylabel("Density")
plt.title("Density Plot of TRACK_START for Dox and Untreated Groups")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.5)

plt.savefig(f"{output_folder}/nucleation_density.png", bbox_inches="tight")
# Show the plot
plt.show()

plt.figure(figsize=(10, 6))

# Define bins of width `bin_width` (in TRACK_START units).
bin_width = 1
latest_start = max(df_tracks_dox_filtered['TRACK_START'].max(), df_tracks_un_filtered['TRACK_START'].max())
# Only the last histogram bin is closed on the right, so the final edge is placed
# strictly beyond the latest start; otherwise events at an integer-valued latest
# start would be counted in the previous bin.
bins = np.arange(0, (np.floor(latest_start / bin_width) + 2) * bin_width, bin_width)

# Plot histogram for Dox
sns.histplot(df_tracks_dox_filtered['TRACK_START'], bins=bins, color='blue', label='Dox', alpha=0.6, kde=False, stat='count')

# Plot histogram for Untreated
sns.histplot(df_tracks_un_filtered['TRACK_START'], bins=bins, color='red', label='Untreated', alpha=0.6, kde=False, stat='count')

# Labels and title
plt.xlabel("Nucleation time (in minutes)")
plt.ylabel("Track count")
plt.title("Histogram of Track Nucleation Times")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.5)

# Save and show
plt.savefig(f"{output_folder}/nucleation_histogram.png", bbox_inches="tight")
plt.show()

plt.figure(figsize=(10, 6))

# Define bins of width `bin_width` (in TRACK_START units).
bin_width = 1
latest_start = max(df_tracks_dox_filtered['TRACK_START'].max(), df_tracks_un_filtered['TRACK_START'].max())
# Only the last histogram bin is closed on the right, so the final edge is placed
# strictly beyond the latest start; otherwise events at an integer-valued latest
# start would be counted in the previous bin.
bins = np.arange(0, (np.floor(latest_start / bin_width) + 2) * bin_width, bin_width)

# Histogram: Dox
sns.histplot(df_tracks_dox_filtered['TRACK_START'], bins=bins, color='blue', label='Dox (count)', alpha=0.4, kde=False, stat='count')

# Histogram: Untreated
sns.histplot(df_tracks_un_filtered['TRACK_START'], bins=bins, color='red', label='Untreated (count)', alpha=0.4, kde=False, stat='count')

# KDE: manually compute the density and scale it ×100 so that it is visible on
# the count axis. `gaussian_kde` comes from the REQUIREMENTS cell.
x_vals = np.linspace(0, bins[-1], 500)

# KDE for Dox
dox_kde = gaussian_kde(df_tracks_dox_filtered['TRACK_START'])
dox_density = dox_kde(x_vals) * 100  # scale KDE

# KDE for Untreated
un_kde = gaussian_kde(df_tracks_un_filtered['TRACK_START'])
un_density = un_kde(x_vals) * 100  # scale KDE

# Plot scaled KDE curves
plt.plot(x_vals, dox_density, color='darkblue', label='Dox (density ×100)', linewidth=2)
plt.plot(x_vals, un_density, color='darkred', label='Untreated (density ×100)', linewidth=2)

# Labels and title
plt.xlabel("Nucleation time (in minutes)")
plt.ylabel("Track count / Scaled density")
plt.title("Track Nucleation Time: Histogram + KDE Overlay (Density ×100)")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.5)

# Save and show
plt.savefig(f"{output_folder}/nucleation_histogram_kde_scaled.png", bbox_inches="tight")
plt.show()

plt.figure(figsize=(10, 6))

# One scatter layer per group (Dox first, then Untreated):
# TRACK_START on x, Mean_Intensity_t0 on y.
for _tracks, _color, _label in (
    (df_tracks_dox_filtered, "blue", "Dox"),
    (df_tracks_un_filtered, "red", "Untreated"),
):
    sns.scatterplot(data=_tracks, x="TRACK_START", y="Mean_Intensity_t0",
                    color=_color, label=_label, alpha=0.6)

# Axis labels, title, legend and grid
plt.xlabel("Nucleation time (in minutes)")
plt.ylabel("Mean Intensity Dilute Phase")
plt.title("Nucleation time vs. Mean Intensity Dilute Phase")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.5)

# Write the figure to disk, then display it
plt.savefig(f"{output_folder}/mean-intensity-t0_vs_time.png", bbox_inches="tight")
plt.show()

plt.figure(figsize=(8, 6))

# Add a group column and combine the two filtered DataFrames.
# .assign returns a new frame, so the column is not written into a filtered
# slice (which raises SettingWithCopyWarning); rebinding the names keeps the
# "Group" column available on both tables afterwards.
df_tracks_dox_filtered = df_tracks_dox_filtered.assign(Group="Dox")
df_tracks_un_filtered = df_tracks_un_filtered.assign(Group="Untreated")
df_combined = pd.concat([df_tracks_dox_filtered, df_tracks_un_filtered])

# Boxplot
sns.boxplot(
    data=df_combined,
    x="Group",
    y="Mean_Intensity_t0",
    palette={"Dox": "blue", "Untreated": "red"},
    showcaps=True,
    boxprops={"facecolor": "none", "edgecolor": "black"},
    whiskerprops={"color": "black"},
    medianprops={"color": "black"}
)

# Overlay points
sns.stripplot(
    data=df_combined,
    x="Group",
    y="Mean_Intensity_t0",
    jitter=True,
    alpha=0.5,
    palette={"Dox": "blue", "Untreated": "red"}
)

# Labels and title
plt.ylabel("Mean Intensity t0")
plt.title("Mean Intensity of Dilute Phase (t0) by Group")
plt.grid(True, linestyle="--", alpha=0.5)

# Save and show
plt.savefig(f"{output_folder}/mean_intensity_t0_boxplot.png", bbox_inches="tight")
plt.show()



For the next set of cells, I want to be able to run these scripts without running the nucleation points.

TO FILTER BASED ON WHAT SPOTS ARE PRESENT WITHIN A CERTAIN RANGE (ex: 11-30, 0-15, etc):

1. I have column "FRAME". I need to look for each instance where FRAME = {lst}, then identify the "unique_name" for each instance of {lst}. 

2. Then I need to create a subset df (df_subset_lst_{group}) that has all rows & columns for each "unique_name" pulled from (1).

3. Then I need to create two additional subsets with the following conditions:

    ONE:
    
    4. Starting from df_subset_lst_{group} I need a subset for all "unique_name" values with "FRAME" in the range {first} to {lst}
    
    5. I need to graph the number of spots vs. time and the total area of spots vs. time in range {first}-{lst}


    TWO:
    
    4. Starting from df_subset_lst_{group} I need to find the value counts to determine the mode frame number before {lst}. This is {nuc_frame}. 
    
    5. I need to run a check for value and frame counts in case I want to manually alter the {nuc_frame}
    
    6. Then I need to normalize the values by the values at frame {nuc_frame}

In [None]:
# Keep only the tracks that have a spot at frame `lst`, so the analysis can go
# forward with the tracks that are present at the end of the chosen range.
first = 0
lst =  100 #this is the last frame number for the range normalization

# Groups compared below. Any other entry of `group_names` is skipped; previously
# every group other than the Dox one was stored as "Untreated", which was only
# correct because "Untreated" happened to be last in `group_names`.
dox_group_name = "HeLa-2112_2712-2714_Dox"
untreated_group_name = "Untreated"

df_subset_lst_dox = None
df_subset_lst_un = None

for group_name in group_names:
    if group_name not in (dox_group_name, untreated_group_name):
        continue

    # Read the concatenated spots CSV of this group (the tracks CSV is not needed here)
    df_spots = pd.read_csv(os.path.join(output_folder, f"{output_filename_spots}_{group_name}.csv"), header=0)

    # Get the unique names for the specified frame (lst)
    unique_names_lst = df_spots[df_spots['FRAME'] == lst]['unique_name']
    print(f"Unique names for {group_name} at frame {lst}:", unique_names_lst)

    # All rows (every frame) of the tracks found above; .copy() so that later
    # column assignments do not act on a slice of df_spots.
    df_subset_lst = df_spots[df_spots['unique_name'].isin(unique_names_lst)].copy()

    # Store the subset result in the corresponding variable
    if group_name == dox_group_name:
        df_subset_lst_dox = df_subset_lst
    else:
        df_subset_lst_un = df_subset_lst

In [None]:
# Define a function to filter and save the dataframe, and define the max intensity for t0 dilute phase
def process_and_save_subset(df_subset_lst, group_name, first, lst, output_folder, max_intensity=400):
    """Filter a spots table to a frame range, add COUNT, save it and return it.

    Args:
        df_subset_lst: Spots table with "FRAME" and "Mean_Intensity_t0" columns.
            It is not modified.
        group_name: Label used in the output filename.
        first: First frame of the range. Rows from frame ``first - 1`` onwards
            are kept; COUNT is ``FRAME - first``.
        lst: Last frame kept (inclusive).
        output_folder: Folder receiving ``subset_spots_<group>_<first>-<lst>.csv``.
        max_intensity: Rows whose "Mean_Intensity_t0" exceeds this value (or is
            NaN) are dropped. Pass ``None`` to disable the filter.

    Returns:
        The filtered copy with the added "COUNT" column.
    """
    # Frames within [first - 1, lst]
    keep = (df_subset_lst["FRAME"] >= (first - 1)) & (df_subset_lst["FRAME"] <= lst)

    # Optional filter for maximum Mean_Intensity_t0
    if max_intensity is not None:
        keep = keep & (df_subset_lst["Mean_Intensity_t0"] <= max_intensity)

    # .copy() so the COUNT column is written to an independent frame and not to
    # a slice of the caller's table (avoids SettingWithCopyWarning).
    df_subset_lst = df_subset_lst[keep].copy()

    # COUNT is the frame number relative to `first`
    df_subset_lst["COUNT"] = df_subset_lst["FRAME"] - first

    # Save to CSV
    filename = f"subset_spots_{group_name}_{first}-{lst}.csv"
    output_path = os.path.join(output_folder, filename)
    df_subset_lst.to_csv(output_path, index=False)

    # Return the processed dataframe
    return df_subset_lst

# Process for Dox
df_subset_range_dox = process_and_save_subset(df_subset_lst_dox, "Dox", first, lst, output_folder)

# Process for Untreated
df_subset_range_un = process_and_save_subset(df_subset_lst_un, "Untreated", first, lst, output_folder)


#sanity check for filtration:
plt.figure(figsize=(8, 6))

# Add a group column and combine the two filtered DataFrames.
# .assign returns a new frame instead of writing into a filtered slice (which
# raises SettingWithCopyWarning); the names are rebound so both tables still
# carry the "Group" column afterwards.
df_subset_range_dox = df_subset_range_dox.assign(Group="Dox")
df_subset_range_un = df_subset_range_un.assign(Group="Untreated")
df_combined = pd.concat([df_subset_range_dox, df_subset_range_un])

# Boxplot
sns.boxplot(
    data=df_combined,
    x="Group",
    y="Mean_Intensity_t0",
    palette={"Dox": "blue", "Untreated": "red"},
    showcaps=True,
    boxprops={"facecolor": "none", "edgecolor": "black"},
    whiskerprops={"color": "black"},
    medianprops={"color": "black"}
)

# Overlay points
sns.stripplot(
    data=df_combined,
    x="Group",
    y="Mean_Intensity_t0",
    jitter=True,
    alpha=0.5,
    palette={"Dox": "blue", "Untreated": "red"}
)

# Labels and title
plt.ylabel("Mean Intensity t0")
plt.title("Mean Intensity of Dilute Phase (t0) by Group")
plt.grid(True, linestyle="--", alpha=0.5)

# Show only (this sanity-check figure is not saved to disk)
plt.show()


In [None]:
#Graph the number of spots vs. time and the total area of spots vs. time in range {first}-{lst}

# Define colors for Dox and Untreated groups
dox_light = "lightblue"
dox_dark = "darkblue"
untreated_light = "lightcoral"
untreated_dark = "darkred"

# Define output CSV file path
filename = f"spots_area_{first}-{lst}.csv"
csv_output_path = os.path.join(output_folder, filename)

# Define variables and their shorthand names for plotting
variables = {
    "unique_track_ids": "spots-count",
    "total_area": "total-area",
}
y_labels = {
    "unique_track_ids": "Number of Condensates",
    "total_area": "Total Condensate Area",
}


def _per_image_frame_metrics(df_subset):
    """Return distinct-track count and summed AREA per (original_filename, FRAME)."""
    return df_subset.groupby(['original_filename', 'FRAME']).agg(
        unique_track_ids=('TRACK_ID', 'nunique'),
        total_area=('AREA', 'sum')
    ).reset_index()


def _per_frame_mean_sem(grouped, column):
    """Return the per-FRAME mean (avg_y) and SEM (sem_y) of `column` across images."""
    # NOTE: np.std defaults to ddof=0, so this SEM uses the population standard
    # deviation, unlike pandas' .sem() (ddof=1).
    return grouped.groupby("FRAME").agg(
        avg_y=(column, 'mean'),
        sem_y=(column, lambda x: np.std(x) / np.sqrt(len(x)))
    ).reset_index()


# The per-image metrics do not depend on the plotted variable: compute them once.
grouped_dox = _per_image_frame_metrics(df_subset_range_dox)
grouped_un = _per_image_frame_metrics(df_subset_range_un)

# Loop over each variable (Unique TRACK_ID count and Total Area)
for var, shorthand in variables.items():
    dox_avg = _per_frame_mean_sem(grouped_dox, var)
    untreated_avg = _per_frame_mean_sem(grouped_un, var)

    ## **Plot 1: Individual Traces + Group Averages**
    plt.figure(figsize=(8, 6))

    # Plot individual traces for Dox (light color)
    for _, group in grouped_dox.groupby("original_filename"):
        plt.plot(group["FRAME"], group[var], color=dox_light, alpha=0.3, linewidth=1)

    # Plot group averages for Dox (dark color)
    plt.plot(dox_avg["FRAME"], dox_avg["avg_y"], color=dox_dark, marker="o", linestyle="-", linewidth=2, label="Dox (Mean)")

    # Plot individual traces for Untreated (light color)
    for _, group in grouped_un.groupby("original_filename"):
        plt.plot(group["FRAME"], group[var], color=untreated_light, alpha=0.3, linewidth=1)

    # Plot group averages for Untreated (dark color)
    plt.plot(untreated_avg["FRAME"], untreated_avg["avg_y"], color=untreated_dark, marker="o", linestyle="-", linewidth=2, label="Untreated (Mean)")

    # Labels and legend
    plt.xlabel("t/t0")
    plt.ylabel(y_labels[var])
    plt.title(f"Individual Traces and Group Averages ({y_labels[var]})")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.5)

    # Save plot
    plt.savefig(f"{output_folder}/{shorthand}-all_{first}-{lst}.png", bbox_inches="tight")
    plt.show()

    ## **Plot 2: Mean with SEM Error Bars**
    plt.figure(figsize=(8, 6))

    # Plot group means with error bars for Dox
    plt.errorbar(dox_avg["FRAME"], dox_avg["avg_y"], yerr=dox_avg["sem_y"], fmt="o-", color=dox_dark, capsize=5, label="Dox (Mean ± SEM)")

    # Plot group means with error bars for Untreated
    plt.errorbar(untreated_avg["FRAME"], untreated_avg["avg_y"], yerr=untreated_avg["sem_y"], fmt="o-", color=untreated_dark, capsize=5, label="Untreated (Mean ± SEM)")

    # Labels and legend
    plt.xlabel("t/t0")
    plt.ylabel(y_labels[var])
    plt.title(f"Group Averages with SEM Error Bars ({y_labels[var]})")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.5)

    # Save plot
    plt.savefig(f"{output_folder}/{shorthand}-avg_{first}-{lst}.png", bbox_inches="tight")
    plt.show()


# Build the CSV table once. It was previously appended once per plotted
# variable with identical content, so every row appeared twice in the file.
df_combined = pd.concat([grouped_dox, grouped_un], ignore_index=True)

# Compute overall averages per FRAME across both groups
df_avg = df_combined.groupby("FRAME").agg(
    avg_spots_count=('unique_track_ids', 'mean'),
    avg_total_area=('total_area', 'mean')
).reset_index()

# Merge the averages into the combined dataframe
df_combined = df_combined.merge(df_avg, on="FRAME", how="left")

# Select and reorder columns
df_combined = df_combined[[
    "original_filename", "FRAME", "unique_track_ids", "total_area",
    "avg_spots_count", "avg_total_area"
]]

# Rename columns for clarity
df_combined.columns = [
    "original_filename", "FRAME", "spots_count", "total_area",
    "avg_spots_count-perframe", "avg_total_area-perframe"
]

df_final = df_combined

# Save to CSV
df_final.to_csv(csv_output_path, index=False)

print(f"CSV file saved: {csv_output_path}")


NOW I NEED TO NORMALIZE:

Normalize to the lowest frame number that has at least 2/3 of the total number of spots present at the endpoint.

With frame 30 as the endpoint in this dataset, that is frame 11, because frame 11 has 394 spots and 394 > (2/3) × 581 ≈ 387.

In [None]:
# Number of rows per FRAME value in the Dox subset (most frequent first).
df_subset_lst_dox['FRAME'].value_counts()

In [None]:
# Number of rows per FRAME value in the Untreated subset (most frequent first).
df_subset_lst_un['FRAME'].value_counts()

In [None]:
# Get the value counts of FRAME for Dox and Untreated, ordered by frame number
frame_counts_dox = df_subset_lst_dox['FRAME'].value_counts().sort_index()
frame_counts_un = df_subset_lst_un['FRAME'].value_counts().sort_index()

# Find the highest frame count for both groups
max_count_dox = frame_counts_dox.max()
max_count_un = frame_counts_un.max()

# Determine the lower and the higher max count between the two groups
lower_max_count = min(max_count_dox, max_count_un)
higher_max_count =  max(max_count_dox, max_count_un)

# Compute 2/3 of each max count. Only `threshold` (from the lower max count)
# is used to pick the frame; `threshhold_2` is printed for reference.
threshold = lower_max_count * (2 / 3)
threshhold_2 = higher_max_count * (2 / 3)

# Find the lowest FRAME number where count >= threshold for both groups
lowest_frame_dox = frame_counts_dox[frame_counts_dox >= threshold].index.min()
lowest_frame_un = frame_counts_un[frame_counts_un >= threshold].index.min()

# The final lowest frame is the minimum of the two lowest frames
nuc_frame = min(int(lowest_frame_dox), int(lowest_frame_un))

# Print results (the second threshold line was previously mislabelled "lower")
print(f"Highest frame count for Dox: {max_count_dox}")
print(f"Highest frame count for Untreated: {max_count_un}")
print(f"Threshold (2/3 of the lower max count): {threshold:.2f}")
print(f"Threshold (2/3 of the higher max count): {threshhold_2:.2f}")
print(f"Lowest frame with count above threshold (for both groups): {nuc_frame}")

In [None]:
# Manual override of the automatically determined values from the previous cell.
# NOTE(review): df_subset_lst_dox / df_subset_lst_un were built from the tracks
# present at the earlier `lst`; changing `lst` here does not rebuild them.
nuc_frame = 10 #use this to manually assign the frame range if needed
lst = 50 #reassign last

In [None]:
# Define a function to filter and save the dataframe
def process_and_save_subset(df_subset_lst, group_name, nuc_frame, lst, output_folder):
    """Keep frames [nuc_frame - 1, lst], add COUNT, save to CSV and return the result.

    This redefines the earlier ``process_and_save_subset`` of the notebook with
    a different signature (no ``max_intensity`` filter).

    Args:
        df_subset_lst: Spots table with a "FRAME" column. It is not modified.
        group_name: Label used in messages and in the output filename.
        nuc_frame: Reference frame; COUNT is ``FRAME - nuc_frame + 1``, so
            ``nuc_frame`` itself gets COUNT 1 and the frame before it COUNT 0.
        lst: Last frame kept (inclusive).
        output_folder: Folder receiving ``subset_spots_<group>_<nuc_frame>-<lst>.csv``.

    Returns:
        The filtered copy with the added "COUNT" column.

    Raises:
        ValueError: If ``df_subset_lst`` is None.
    """
    # Ensure the dataframe is not None
    if df_subset_lst is None:
        raise ValueError(f"Dataframe for {group_name} is None, please check data loading.")

    # Print debug information to verify dataframe contents before processing
    print(f"Processing {group_name} dataframe:")
    print(df_subset_lst.head())  # Show first few rows for verification

    # Filter for frames within the range [nuc_frame - 1, lst].
    # .copy() so that COUNT is written to an independent frame and not to a
    # slice of the caller's table (avoids SettingWithCopyWarning).
    in_range = (df_subset_lst["FRAME"] >= (nuc_frame - 1)) & (df_subset_lst["FRAME"] <= lst)
    df_subset_lst = df_subset_lst[in_range].copy()

    # Compute COUNT relative to nuc_frame
    df_subset_lst["COUNT"] = df_subset_lst["FRAME"] - nuc_frame + 1

    # Save to CSV
    filename = f"subset_spots_{group_name}_{nuc_frame}-{lst}.csv"
    output_path = os.path.join(output_folder, filename)
    df_subset_lst.to_csv(output_path, index=False)

    # Return the processed dataframe
    return df_subset_lst

# NOTE: both calls overwrite df_subset_lst_dox / df_subset_lst_un with their
# filtered versions (frames nuc_frame - 1 .. lst), so re-running this cell
# filters the already-filtered tables.
# Process for Dox
df_subset_lst_dox = process_and_save_subset(df_subset_lst_dox, "Dox", nuc_frame, lst, output_folder)

# Process for Untreated
df_subset_lst_un = process_and_save_subset(df_subset_lst_un, "Untreated", nuc_frame, lst, output_folder)


In [None]:
# Define a function to normalize data for a given group (Dox or Untreated)
def normalize_group(df_subset_lst, group_name, nuc_frame, lst, output_folder):
    """Normalize spot measurements of one group and save the result.

    Steps:
      1. ``<col>_t0NORM`` = intensity column / ``Mean_Intensity_t0``.
      2. Keep only tracks (``unique_name``) that have a row at ``nuc_frame``;
         ``RADIUS_NORM``, ``AREA_NORM``, ``PERIMETER_NORM`` = value / the
         track's value at ``nuc_frame``.
      3. ``<col>_NORM`` = ``<col>_t0NORM`` / the track's ``<col>_t0NORM`` at
         ``nuc_frame``.
      4. Keep frames in [nuc_frame - 1, lst]; COUNT = FRAME - nuc_frame + 1.
      5. Write ``normalized_spots_<group>_<nuc_frame>-<lst>.csv``.

    The per-track reference values are looked up with a vectorised ``map``
    instead of ``groupby.apply``, whose callback read the grouping column
    (deprecated in recent pandas). If a track has several rows at
    ``nuc_frame``, the first one is used as reference.

    Args:
        df_subset_lst: Spots table; it is copied, not modified.
        group_name: Label used in the output filename.
        nuc_frame: Reference frame for the normalization.
        lst: Last frame kept (inclusive).
        output_folder: Folder receiving the CSV.

    Returns:
        The normalized DataFrame, ordered by ``unique_name`` (row order within
        a track is preserved).

    Raises:
        KeyError: If a required column is missing.
    """
    intensity_cols = ["MEAN_INTENSITY_CH1", "MEDIAN_INTENSITY_CH1", "TOTAL_INTENSITY_CH1"]
    norm_cols_direct = ["RADIUS", "AREA", "PERIMETER"]

    # Validate up front (the previous check ran only after the column had
    # already been used, so it could never trigger).
    required = ["unique_name", "FRAME", "Mean_Intensity_t0"] + intensity_cols + norm_cols_direct
    missing = [col for col in required if col not in df_subset_lst.columns]
    if missing:
        raise KeyError(f"Missing required columns {missing}; available:\n{list(df_subset_lst.columns)}")

    df_subset_lst = df_subset_lst.copy()
    print("Before apply:", df_subset_lst.shape)

    # --- STEP 1: Normalize intensity columns to Mean_Intensity_t0 ---
    for col in intensity_cols:
        df_subset_lst[f"{col}_t0NORM"] = df_subset_lst[col] / df_subset_lst["Mean_Intensity_t0"]

    # --- STEP 2: Normalize RADIUS, AREA, PERIMETER to their value at nuc_frame ---
    # Keep only tracks that have a row at nuc_frame to use as reference.
    valid_names = df_subset_lst.loc[df_subset_lst["FRAME"] == nuc_frame, "unique_name"].unique()
    filtered_df = df_subset_lst[df_subset_lst["unique_name"].isin(valid_names)]
    # Stable sort: tracks ordered by name, original row order kept within a track.
    filtered_df = filtered_df.sort_values("unique_name", kind="stable").reset_index(drop=True)

    # One reference row per track (its row at nuc_frame), indexed by track name.
    reference = (
        filtered_df[filtered_df["FRAME"] == nuc_frame]
        .drop_duplicates(subset="unique_name")
        .set_index("unique_name")
    )
    track_names = filtered_df["unique_name"]

    for col in norm_cols_direct:
        filtered_df[f"{col}_NORM"] = filtered_df[col] / track_names.map(reference[col])
    print("After normalization:", filtered_df.shape)

    # --- STEP 3: Normalize *_t0NORM columns to their value at nuc_frame ---
    for col in intensity_cols:
        t0_col = f"{col}_t0NORM"
        filtered_df[f"{col}_NORM"] = filtered_df[t0_col] / track_names.map(reference[t0_col])
    print("After t0 normalization:", filtered_df.shape)

    # --- STEP 4: Filter to [nuc_frame - 1, lst] and add COUNT ---
    in_range = (filtered_df["FRAME"] >= (nuc_frame - 1)) & (filtered_df["FRAME"] <= lst)
    filtered_df = filtered_df[in_range].copy()  # copy: COUNT must not be set on a slice
    filtered_df["COUNT"] = filtered_df["FRAME"] - nuc_frame + 1

    # --- STEP 5: Save ---
    filename = f"normalized_spots_{group_name}_{nuc_frame}-{lst}.csv"
    filtered_df.to_csv(os.path.join(output_folder, filename), index=False)

    return filtered_df


# The second argument is only used in the output filename
# ("normalized_spots_<group_name>_<nuc_frame>-<lst>.csv").
# Normalize for Dox group
df_norm_dox = normalize_group(df_subset_lst_dox, "HeLa-2112_2712-2714_Dox", nuc_frame, lst, output_folder)

# Normalize for Untreated group
df_norm_un = normalize_group(df_subset_lst_un, "Untreated", nuc_frame, lst, output_folder)

In [None]:
# List the columns of both normalized tables (Dox first, then Untreated).
for _normalized in (df_norm_dox, df_norm_un):
    print(_normalized.columns.tolist())

In [None]:
# Per-COUNT mean of the normalized mean intensity, one table per group.
for _heading, _normalized in (
    ("DOX Averages:", df_norm_dox),
    ("Untreated Averages:", df_norm_un),
):
    print(_heading)
    print(_normalized.groupby("COUNT")["MEAN_INTENSITY_CH1_NORM"].mean())

In [None]:
# Colors: light shades for single tracks, dark shades for group means
dox_light = "lightblue"
dox_dark = "darkblue"
untreated_light = "lightcoral"
untreated_dark = "darkred"

# Column to plot -> shorthand used in the output filenames
variables = {
    "RADIUS_NORM": "radius",
    "RADIUS": "radius-NO-NORM",
    "MEAN_INTENSITY_CH1_NORM": "mean-intensity",
    "MEAN_INTENSITY_CH1_t0NORM": "mean-intensity-t0",
    "MEAN_INTENSITY_CH1": "mean-intensity-NO-NORM",
    "TOTAL_INTENSITY_CH1_NORM": "total-intensity",
    "AREA_NORM": "area",
    "PERIMETER_NORM": "perimeter",
}
# Column to plot -> y-axis label
y_labels = {
    "RADIUS_NORM": "R/R0",
    "RADIUS": "R",
    "MEAN_INTENSITY_CH1_NORM": "Mean Intensity (Norm)",
    "MEAN_INTENSITY_CH1_t0NORM": "Mean Intensity/Diffuse Phase",
    "MEAN_INTENSITY_CH1": "Mean Intensity",
    "TOTAL_INTENSITY_CH1_NORM": "Total Intensity (Norm)",
    "AREA_NORM": "A/A0",
    "PERIMETER_NORM": "P/P0",
}

# Two figures per variable: (1) every track plus the group means,
# (2) the group means with SEM error bars.
for var, shorthand in variables.items():
    axis_label = y_labels[var]
    by_count_dox = df_norm_dox.groupby("COUNT")[var]
    by_count_un = df_norm_un.groupby("COUNT")[var]

    ## **Plot 1: Individual Traces + Group Averages**
    plt.figure(figsize=(8, 6))

    # Single-track traces: all Dox tracks first, then all Untreated tracks
    for _tracks, _trace_color in ((df_norm_dox, dox_light), (df_norm_un, untreated_light)):
        for _, group in _tracks.groupby("unique_name"):
            plt.plot(group["COUNT"], group[var], color=_trace_color, alpha=0.3, linewidth=1)

    # Per-COUNT group means, drawn on top in the dark colors
    dox_avg = by_count_dox.mean()
    untreated_avg = by_count_un.mean()
    for _avg, _mean_color, _mean_label in (
        (dox_avg, dox_dark, "Dox (Mean)"),
        (untreated_avg, untreated_dark, "Untreated (Mean)"),
    ):
        plt.plot(_avg.index, _avg.values, color=_mean_color, marker="o",
                 linestyle="-", linewidth=2, label=_mean_label)

    plt.xlabel("t/t0")
    plt.ylabel(axis_label)
    plt.title(f"Individual Traces and Group Averages ({axis_label})")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.5)

    # Write to disk, then display
    plt.savefig(f"{output_folder}/{shorthand}-all_norm_spots_{nuc_frame}-{lst}.png", bbox_inches="tight")
    plt.show()

    ## **Plot 2: Mean with SEM Error Bars**
    plt.figure(figsize=(8, 6))

    # Standard error of the mean per COUNT
    dox_sem = by_count_dox.sem()
    untreated_sem = by_count_un.sem()

    for _avg, _sem, _mean_color, _mean_label in (
        (dox_avg, dox_sem, dox_dark, "Dox (Mean ± SEM)"),
        (untreated_avg, untreated_sem, untreated_dark, "Untreated (Mean ± SEM)"),
    ):
        plt.errorbar(_avg.index, _avg.values, yerr=_sem.values, fmt="o-",
                     color=_mean_color, capsize=5, label=_mean_label)

    plt.xlabel("t/t0")
    plt.ylabel(axis_label)
    plt.title(f"Group Averages with SEM Error Bars ({axis_label})")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.5)

    # Write to disk, then display
    plt.savefig(f"{output_folder}/{shorthand}-avg_norm_spots_{nuc_frame}-{lst}.png", bbox_inches="tight")
    plt.show()
