In [None]:
import sys
sys.path.append("..")
import anndata as ad
import pandas as pd

In [None]:
sys.path.append("..")
from analysis_functions.plotting import *
from analysis_functions.sampling import *
from analysis_functions.utils import *

In [None]:
import polars as pl

In [None]:
import os
from pathlib import Path

def find_highest_numbered_subfolder_with_file(root_folder, target_file = 'featICF_nuclei.parquet'):
    """
    Locate `target_file` inside the integer-named folder with the largest number.

    Every directory visited by `os.walk(root_folder)` (i.e. at any depth) whose
    name parses as an integer is a candidate, provided it directly contains
    `target_file`. Among the candidates the one with the largest number wins;
    on equal numbers the first one visited is kept.

    Parameters:
    root_folder (str): Path to the root folder containing numbered subfolders.
    target_file (str): Name of the file to search for in subfolders.

    Returns:
    pathlib.Path or None: Path of the file in the winning folder, or None if no
    numbered folder (with a non-negative number) contains the file.
    """
    best_number = -1
    best_path = None

    for parent, child_dirs, _files in os.walk(root_folder):
        for child in child_dirs:
            try:
                number = int(child)
            except ValueError:
                # Folder name is not an integer, so it is not a candidate
                continue

            # Only a strictly larger number can replace the current best
            if number <= best_number:
                continue

            candidate = Path(parent) / child / target_file
            if candidate.exists():
                best_number, best_path = number, candidate

    return best_path



def add_suffix_to_column_names(df, suffix):
    """
    Return `df` with an underscore and `suffix` appended to every column name.

    Parameters:
    df (pl.DataFrame): The original Polars DataFrame.
    suffix (str): The string appended (after an underscore) to each column name.

    Returns:
    pl.DataFrame: The result of `df.rename(...)` with the updated column names.
    """
    # Old name -> new name for every column, applied in a single rename call
    return df.rename({name: f"{name}_{suffix}" for name in df.columns})


def load_and_stack_dataframes(df_list):
    """
    Casts every DataFrame to the column dtypes of the first one and stacks them.

    The first DataFrame is the dtype reference. Columns are matched to the
    reference by name, so a frame whose columns come in a different order is
    never cast to the dtype of an unrelated column. Columns that do not exist
    in the reference are left untouched.

    Parameters:
    df_list (list): A non-empty list of Polars DataFrames to be stacked.

    Returns:
    pl.DataFrame: The result of `pl.concat` over the dtype-aligned DataFrames.

    Raises:
    ValueError: If `df_list` is empty.
    """
    if not df_list:
        raise ValueError("df_list must contain at least one DataFrame")

    # Target dtype of every column, keyed by column name
    reference = df_list[0]
    reference_dtypes = dict(zip(reference.columns, reference.dtypes))

    aligned_dfs = []
    for df in df_list:
        # Collect all required casts and apply them in one pass over the frame
        casts = [
            pl.col(col).cast(reference_dtypes[col])
            for col, dtype in zip(df.columns, df.dtypes)
            if col in reference_dtypes and dtype != reference_dtypes[col]
        ]
        if casts:
            df = df.with_columns(casts)
        aligned_dfs.append(df)

    # Stack all the aligned DataFrames
    return pl.concat(aligned_dfs)



import re
def is_meta_column(c):
    """
    Tell whether column name `c` looks like a metadata (non-feature) column.

    The name is searched (`re.search`, so anywhere in the name unless the
    pattern is anchored) for each of the regular expressions listed below.

    Parameters:
    c (str): Column name to classify.

    Returns:
    bool: True if at least one pattern matches, otherwise False.
    """
    # One regular expression per whitespace-separated token
    meta_patterns = '''
        Metadata
        ^Count
        ImageNumber
        Object
        Parent
        Children
        Plate
        compound
        Well
        location
        Location
        _[XYZ]_
        _[XYZ]$
        Phase
        Scale
        Scaling
        Width
        Height
        Group
        FileName
        PathName
        BoundingBox
        URL
        Execution
        ModuleError
        LargeBrightArtefact
    '''.split()
    return any(re.search(pattern, c) for pattern in meta_patterns)

def split_data_meta(df):
    """
    Split a DataFrame into its numeric feature columns and everything else.

    A column counts as data when its dtype name contains 'float' or 'int',
    its name starts with an upper-case character and `is_meta_column` does
    not flag it. Uses `df.drop(columns=...)`, i.e. the pandas-style API.

    Parameters:
    df: DataFrame to split.

    Returns:
    tuple: (data, meta), where `data` holds the feature columns and `meta`
    holds all remaining columns of `df`.
    """
    def _is_data_column(name):
        dtype_name = str(df[name].dtype)
        if 'float' not in dtype_name and 'int' not in dtype_name:
            return False
        return name[0].isupper() and not is_meta_column(name)

    data_columns = [name for name in df.columns if _is_data_column(name)]
    data = df[data_columns]
    meta = df.drop(columns=data.columns)
    return data, meta

## Load data

In [None]:
# Project root: the current working directory at the time this cell runs.
PROJECT_PATH = os.getcwd()
# Directory that is searched for the per-plate CellProfiler result folders.
CELLPROFILER_ROOT = "/home/jovyan/share/data/cellprofiler/automation/results"

In [None]:
# Plate / compound / well / site and nucleus centre coordinates of the sampled
# single-cell profiles; the other columns of the parquet file are not kept.
cell_locations = pl.read_parquet(
    os.path.join(PROJECT_PATH, "deepprofiler/Results/sc_profiles_all_sampled_5%_BEACTICA.parquet")
).select(
    [
        "Metadata_Plate",
        "Metadata_cmpdName",
        "Metadata_Well",
        "Metadata_Site",
        "Nuclei_Location_Center_X",
        "Nuclei_Location_Center_Y",
    ]
)

In [None]:
# Read the metadata CSV, remove duplicated rows and order the rows by well and site.
meta = (
    pd.read_csv(
        os.path.join(
            "/home/jovyan/share/data/analyses/benjamin/Single_cell_project/DP_BEACTICA",
            "inputs",
            "metadata",
            "metadata_deepprofiler_beactica.csv",
        )
    )
    .drop_duplicates(inplace=False)
    .sort_values(by=['Metadata_Well', 'Metadata_Site'])
)
# Upper-case compound names and derive a combined "<name> <concentration>" label.
meta['Metadata_cmpdName'] = meta['Metadata_cmpdName'].str.upper()
meta["Metadata_cmpdNameConc"] = meta["Metadata_cmpdName"] + " " + meta["Metadata_cmpdConc"].astype(str)
# Polars copy without the listed index/channel columns, reduced to unique rows.
meta_pl = (
    pl.DataFrame(meta)
    .drop('Unnamed: 0.1', 'Unnamed: 0', "AR", "ER", "RNA", "AGP", "DNA", "Mito")
    .unique()
)

In [None]:
# Keep only the metadata rows whose (plate, site, well) occurs in `cell_locations`.
meta_filtered = meta_pl.join(
    cell_locations.select(["Metadata_Plate", "Metadata_Well", "Metadata_Site"]).unique(),
    how="inner",
    on=["Metadata_Plate", "Metadata_Site", "Metadata_Well"],
)

## Prepare normalization

In [None]:
def load_cellprofiler(meta, PROJECT_PATH, CELLPROFILER_ROOT):
    """
    Build the per-plate single-cell CellProfiler table (nuclei + cells + cytoplasm + metadata).

    For each hard-coded plate barcode the nuclei, cytoplasm and cells parquet
    files are located below the plate's result folder, their columns get a
    compartment suffix, and cells and cytoplasm are left-joined onto nuclei via
    barcode / site / well and the nucleus' parent cell number. Coordinates are
    cast to Int64, the site gets an "s" prefix, the barcode is cut at the first
    "-", and the compound metadata from `meta` is left-joined.

    NOTE: the resulting table is currently neither written to disk nor
    returned (both statements are commented out below), so a call only
    exercises the pipeline.

    Parameters:
    meta (pl.DataFrame): Metadata with the columns Metadata_Plate, Metadata_Site,
        Metadata_cmpdName, compound_id, Metadata_cmpdConc and Metadata_Well.
    PROJECT_PATH (str): Only referenced by the commented-out write below.
    CELLPROFILER_ROOT (str): Directory searched for the per-plate result folders.

    Returns:
    None

    Raises:
    FileNotFoundError: If a plate folder or one of its feature files is not found.
    """
    # Imported here because the notebook imports tqdm and gc only in later cells
    import gc
    import tqdm

    plates = [
        'PB000051',
        'PB000047',
        'PB000049',
        'PB000053',
        'PB000046',
        'PB000048',
        'PB000050',
        'PB000052',
    ]
    join_keys_nuclei = ['Metadata_Barcode_nuclei', 'Metadata_Site_nuclei', 'Metadata_Well_nuclei', 'Parent_cells_nuclei']

    for p in tqdm.tqdm(plates):
        print("Importing plate:", p)

        # Resolve the plate's result folder once instead of once per compartment
        plate_folder = find_file_with_string(CELLPROFILER_ROOT, p)
        if plate_folder is None:
            raise FileNotFoundError(f"No CellProfiler results found for plate {p} in {CELLPROFILER_ROOT}")

        feats = {}
        for compartment in ("nuclei", "cytoplasm", "cells"):
            target_file = f"featICF_{compartment}.parquet"
            parquet_path = find_highest_numbered_subfolder_with_file(plate_folder, target_file=target_file)
            if parquet_path is None:
                raise FileNotFoundError(f"{target_file} not found below {plate_folder}")
            feats[compartment] = add_suffix_to_column_names(pl.read_parquet(parquet_path), compartment)

        # Attach the parent cell, then the cytoplasm sharing that cell's object number
        df = feats["nuclei"].join(
            feats["cells"],
            left_on=join_keys_nuclei,
            right_on=['Metadata_Barcode_cells', 'Metadata_Site_cells', 'Metadata_Well_cells', 'ObjectNumber_cells'],
            how='left',
        )
        df = df.join(
            feats["cytoplasm"],
            left_on=join_keys_nuclei,
            right_on=['Metadata_Barcode_cytoplasm', 'Metadata_Site_cytoplasm', 'Metadata_Well_cytoplasm', 'ObjectNumber_cytoplasm'],
            how='left',
        )

        df = df.with_columns(df["Location_Center_X_nuclei"].cast(pl.Int64))
        df = df.with_columns(df["Location_Center_Y_nuclei"].cast(pl.Int64))
        # Site becomes "s<number>" and the barcode keeps only the part before the first "-"
        df = df.with_columns((pl.lit("s") + df["Metadata_Site_nuclei"].cast(pl.Utf8)).alias("Metadata_Site_nuclei"))
        df = df.with_columns(df['Metadata_Barcode_nuclei'].apply(lambda s: s.split('-')[0]).alias('Metadata_Barcode_nuclei'))

        temp = df.join(
            meta.select(["Metadata_Plate", "Metadata_Site", "Metadata_cmpdName", "compound_id", "Metadata_cmpdConc", "Metadata_Well"]).unique(),
            left_on=["Metadata_Barcode_nuclei", "Metadata_Site_nuclei", "Metadata_Well_nuclei"],
            right_on=["Metadata_Plate", "Metadata_Site", "Metadata_Well"],
            how="left",
        )
        temp = temp.rename({
            "Metadata_Barcode_nuclei": "Metadata_Plate",
            "Metadata_Well_nuclei": "Metadata_Well",
            "Metadata_Site_nuclei": "Metadata_Site",
        })
        #temp.write_parquet(os.path.join(PROJECT_PATH, "cellprofiler/feature_parquets", f"sc_profiles_cellprofiler_{p}.parquet"))
        gc.collect()
        #return temp

In [None]:
import tqdm
def find_file_with_string(directory, string):
    """
    Finds an entry in the specified directory that contains the given string in its name.

    Both files and sub-directories are considered. Entries are examined in
    sorted name order, so the result is deterministic when several names match
    (`os.listdir` itself returns entries in arbitrary order).

    Args:
    directory (str): The directory to search in.
    string (str): The string to look for in the entry names.

    Returns:
    str: The path to the first (in sorted order) entry whose name contains the
    string. None if `directory` is not an existing directory or nothing matches;
    a message is printed in both cases.
    """
    # isdir also rejects paths that exist but are files, which os.listdir cannot list
    if not os.path.isdir(directory):
        print(f"The directory {directory} does not exist.")
        return None

    for entry in sorted(os.listdir(directory)):
        if string in entry:
            return os.path.join(directory, entry)

    print(f"No file found with {string}")
    return None

def main():
    """
    Load a raw per-plate CellProfiler table, drop bookkeeping columns and downcast features.

    The feature parquet directory below the hard-coded project root is searched
    for a file whose name contains the plate barcode. The columns listed in
    `cols_to_drop` are removed and every Float64 column that `is_meta_column`
    does not flag as metadata is cast to Float32.

    Returns:
    pl.DataFrame or None: The processed table of the first plate in `plates`
    for which a file was found; None if no file was found for any plate.
    """
    PROJECT_ROOT = '/share/data/analyses/benjamin/Single_cell_project_rapids/Beactica'

    cols_to_drop = ['Children_cytoplasm_Count_nuclei',
 'Location_Center_Z_nuclei',
 'Neighbors_FirstClosestObjectNumber_Adjacent_nuclei',
 'Neighbors_SecondClosestObjectNumber_Adjacent_nuclei',
 'Number_Object_Number_nuclei',
 'Parent_cells_nuclei',
 'ImageNumber_cells',
 'Metadata_AcqID_cells',
 'FileName_CONC_cells',
 'FileName_HOECHST_cells',
 'FileName_ICF_CONC_cells',
 'FileName_ICF_HOECHST_cells',
 'FileName_ICF_MITO_cells',
 'FileName_ICF_PHAandWGA_cells',
 'FileName_ICF_SYTO_cells',
 'FileName_MITO_cells',
 'FileName_PHAandWGA_cells',
 'FileName_SYTO_cells',
 'PathName_CONC_cells',
 'PathName_HOECHST_cells',
 'PathName_ICF_CONC_cells',
 'PathName_ICF_HOECHST_cells',
 'PathName_ICF_MITO_cells',
 'PathName_ICF_PHAandWGA_cells',
 'PathName_ICF_SYTO_cells',
 'PathName_MITO_cells',
 'PathName_PHAandWGA_cells',
 'PathName_SYTO_cells',
 'Children_cytoplasm_Count_cells',
 'Children_nuclei_Count_cells',
 'Location_Center_Z_cells',
 'Neighbors_FirstClosestObjectNumber_Adjacent_cells',
 'Neighbors_SecondClosestObjectNumber_Adjacent_cells',
 'Number_Object_Number_cells',
 'Parent_precells_cells',
 'ImageNumber_cytoplasm',
 'Metadata_AcqID_cytoplasm',
 'FileName_CONC_cytoplasm',
 'FileName_HOECHST_cytoplasm',
 'FileName_ICF_CONC_cytoplasm',
 'FileName_ICF_HOECHST_cytoplasm',
 'FileName_ICF_MITO_cytoplasm',
 'FileName_ICF_PHAandWGA_cytoplasm',
 'FileName_ICF_SYTO_cytoplasm',
 'FileName_MITO_cytoplasm',
 'FileName_PHAandWGA_cytoplasm',
 'FileName_SYTO_cytoplasm',
 'PathName_CONC_cytoplasm',
 'PathName_HOECHST_cytoplasm',
 'PathName_ICF_CONC_cytoplasm',
 'PathName_ICF_HOECHST_cytoplasm',
 'PathName_ICF_MITO_cytoplasm',
 'PathName_ICF_PHAandWGA_cytoplasm',
 'PathName_ICF_SYTO_cytoplasm',
 'PathName_MITO_cytoplasm',
 'PathName_PHAandWGA_cytoplasm',
 'PathName_SYTO_cytoplasm',
 'Number_Object_Number_cytoplasm',
 'Parent_cells_cytoplasm',
 'Parent_nuclei_cytoplasm']

    plates = ['PB000051']

    for p in tqdm.tqdm(plates):
        # Construct the file path using a function that finds the correct file
        file_path = find_file_with_string(os.path.join(PROJECT_ROOT, "cellprofiler/feature_parquets/"), p)
        if file_path is None:
            continue

        print(f"Reading in plate {p}")
        feature_df = pl.read_parquet(file_path).drop(cols_to_drop)

        # Metadata columns keep their dtype; only feature columns are downcast.
        # (The predicate used to be negated here, which downcast the metadata
        # columns and left the features as Float64.)
        meta_features = {feat for feat in feature_df.columns if is_meta_column(feat)}
        feature_df = feature_df.with_columns(
            [
                pl.col(column).cast(pl.Float32)
                for column in feature_df.columns
                if column not in meta_features and feature_df[column].dtype == pl.Float64
            ]
        )
        # Only the first plate with an available file is returned
        return feature_df

    return None
            

In [None]:
# Call main() and keep its return value in `test`.
test = main()

## Load normalized features

In [None]:
import gc
# Read the normalized table of plate PB000051 to derive the column lists below.
col_df = pl.read_parquet(
    os.path.join(PROJECT_PATH, "cellprofiler/feature_parquets", "sc_profiles_normalized_cellprofiler_PB000051.parquet")
)
meta_features_before = [col for col in col_df.columns if is_meta_column(col)]
# Nuclei columns of the listed measurement families, grouped by family in the order given.
blocklist_features = [
    col
    for keyword in ("Correlation_Manders", "Correlation_RWC", "Granularity_14", "Granularity_15", "Granularity_16")
    for col in col_df.columns
    if keyword in col and "_nuclei" in col
]
# Float64 columns that are not blocklisted.
float64_features = [
    feat
    for feat in col_df.columns
    if col_df[feat].dtype == pl.Float64 and feat not in blocklist_features
]

def load_and_merge_cellprofiler(cell_locations, feats, blocklist):
    """
    Join the normalized CellProfiler features of every plate onto the sampled cell locations.

    For each hard-coded plate the normalized parquet file is read from below
    the module-level PROJECT_PATH, the `blocklist` columns are dropped and the
    `feats` columns are cast to Float32. Rows are kept only if the nucleus
    centre lies strictly between 150 and 2850 in both X and Y and the compound
    name contains "[". The result is inner-joined onto `cell_locations` by
    plate, site, well and nucleus centre, and written to
    sc_profiles_joined_cellprofiler_<plate>.parquet in the same directory.

    Parameters:
    cell_locations (pl.DataFrame): Table with Metadata_Plate, Metadata_Site,
        Metadata_Well, Nuclei_Location_Center_X and Nuclei_Location_Center_Y.
    feats (list): Column names to cast to Float32.
    blocklist (list): Column names to drop before casting.

    Returns:
    None: The joined tables are only written to disk.
    """
    plates = [
        'PB000051',
        'PB000047',
        'PB000049',
        'PB000053',
        'PB000046',
        'PB000048',
        'PB000050',
        'PB000052',
    ]

    for p in tqdm.tqdm(plates):
        df = pl.read_parquet(os.path.join(PROJECT_PATH, "cellprofiler/feature_parquets", f"sc_profiles_normalized_cellprofiler_{p}.parquet"))
        df = df.drop(blocklist)
        df = df.with_columns([pl.col(col).cast(pl.Float32) for col in feats])
        df = df.filter(
            (pl.col("Location_Center_X_nuclei") > 150) &
            (pl.col("Location_Center_X_nuclei") < 2850) &
            (pl.col("Location_Center_Y_nuclei") > 150) &
            (pl.col("Location_Center_Y_nuclei") < 2850)
        # Raw string: "\[" in a normal string literal is an invalid escape sequence
        ).filter(pl.col('Metadata_cmpdName').str.contains(r"\["))
        temp = cell_locations.join(
            df,
            left_on=["Metadata_Plate", "Metadata_Site", "Metadata_Well", "Nuclei_Location_Center_X", "Nuclei_Location_Center_Y"],
            right_on=["Metadata_Plate", "Metadata_Site", "Metadata_Well", "Location_Center_X_nuclei", "Location_Center_Y_nuclei"],
            how="inner",
        )
        temp.write_parquet(os.path.join(PROJECT_PATH, "cellprofiler/feature_parquets", f"sc_profiles_joined_cellprofiler_{p}.parquet"))
        # Release the per-plate frames before the next plate is read
        gc.collect()

## Load joined + normalized features

In [None]:
import gc
def load_grit_cellprofiler(plates):
    """
    Read the joined per-plate parquet files and stack them into one table.

    For every plate, sc_profiles_joined_cellprofiler_<plate>.parquet is read
    from below the module-level PROJECT_PATH and its "Metadata_cmpdName_right"
    column is dropped. The frames are stacked with `load_and_stack_dataframes`
    and reduced to unique rows.

    Parameters:
    plates (iterable): Plate barcodes to load.

    Returns:
    pl.DataFrame: Unique rows of all plates stacked together.
    """
    per_plate = []
    for plate in tqdm.tqdm(plates):
        plate_df = pl.read_parquet(
            os.path.join(PROJECT_PATH, "cellprofiler/feature_parquets", f"sc_profiles_joined_cellprofiler_{plate}.parquet")
        ).drop("Metadata_cmpdName_right")
        per_plate.append(plate_df)
        gc.collect()
    return load_and_stack_dataframes(per_plate).unique()

In [None]:
# `plates` is otherwise only defined as a local variable inside the loader
# functions, so without this list the call raises NameError on a fresh kernel.
# These are the plates for which load_and_merge_cellprofiler writes the joined files.
plates = [
    'PB000051',
    'PB000047',
    'PB000049',
    'PB000053',
    'PB000046',
    'PB000048',
    'PB000050',
    'PB000052',
]
normalized_profiles = load_grit_cellprofiler(plates)


In [None]:
# Columns of the stacked table that is_meta_column flags as metadata.
meta_features = [col for col in normalized_profiles.columns if is_meta_column(col)]

In [None]:
import pycytominer as pm 
def feature_selection_cellprofiler(normalized_profiles):
    """
    Select features from the normalized single-cell profiles.

    Steps: rows containing any null are removed; pycytominer's `feature_select`
    is run with the 'variance_threshold' operation on all columns that are
    neither metadata nor blocklisted nuclei measurements; rows of the compound
    "[SORB]" are removed; finally every non-metadata column whose maximum
    exceeds 1000 is dropped.

    Parameters:
    normalized_profiles (pl.DataFrame): Normalized profiles incl. metadata columns.

    Returns:
    pl.DataFrame: The filtered table with the selected columns.
    """
    meta_columns = [name for name in normalized_profiles.columns if is_meta_column(name)]

    # Discard every row that has a null in at least one column
    complete_rows = normalized_profiles.filter(~pl.any_horizontal(pl.all().is_null()))

    # Nuclei columns of the listed measurement families are excluded from selection
    blocked = [
        name
        for keyword in ("Correlation_Manders", "Correlation_RWC", "Granularity_14", "Granularity_15", "Granularity_16")
        for name in complete_rows.columns
        if keyword in name and "_nuclei" in name
    ]
    candidate_features = [
        name
        for name in complete_rows.columns
        if name not in meta_columns and name not in blocked
    ]

    selected_pd = pm.feature_select(
        complete_rows.to_pandas(),
        features=candidate_features,
        operation=['variance_threshold'],
        outlier_cutoff=1000,
    )
    selected = pl.DataFrame(selected_pd).filter(pl.col("Metadata_cmpdName") != "[SORB]")

    # Drop outlier features: any remaining non-metadata column with a maximum above 1000
    remaining = [name for name in selected.columns if name not in meta_columns]
    too_large = [name for name in selected[remaining].columns if selected[name].max() > 1000]
    return selected.drop(too_large)

In [None]:
# Run the feature selection on the stacked normalized profiles.
final_features = feature_selection_cellprofiler(normalized_profiles) 

In [None]:
# Columns of the selected table that are not in the module-level `meta_features` list.
features_fixed = [feat for feat in final_features.columns if feat not in meta_features]

In [None]:
# Persist the selected features next to the per-plate parquet files.
final_features.write_parquet(
    os.path.join(PROJECT_PATH, "cellprofiler/feature_parquets", "sc_profiles_cellprofiler_final.parquet")
)

In [None]:
# Number of rows per compound name in the selected table.
final_features.group_by("Metadata_cmpdName").count()

## Aggregated analysis

In [None]:
# Mean of every selected feature per (plate, well, compound).
# `group_by` is used, as in the per-compound count cell of this notebook;
# `groupby` is the deprecated spelling in Polars releases that provide `group_by`.
aggregated_df_norm = (
    final_features
    .group_by(['Metadata_Plate', 'Metadata_Well', 'Metadata_cmpdName'])
    .agg([pl.col(feature).mean().alias(feature) for feature in features_fixed])
)

In [None]:
# Descriptive statistics of every aggregated feature, with the 5th, 50th and 95th percentiles.
summary_features = (
    aggregated_df_norm
    .to_pandas()[features_fixed]
    .describe(percentiles=[0.05, 0.95, 0.5])
)

# Extremes of the per-feature statistics across all features
max_of_max = summary_features.loc['max'].max()
min_of_min = summary_features.loc['min'].min()
max_of_95th = summary_features.loc['95%'].max()
min_of_5th = summary_features.loc['5%'].min()

print("Minimum of 'min' values:", min_of_min)
print("Maximum of 'max' values:", max_of_max)
print("Maximum of '95th percentile' values:", max_of_95th)
print("Minimum of '5th percentile' values:", min_of_5th)
summary_features

In [None]:
import cuml
import math
def run_umap_and_merge(df, features, option = 'cuml', n_neigh = None, min_dist=0.1, n_components=2, metric='cosine', aggregate=False):
    """
    Fit a cuML UMAP on the feature columns and attach the embedding to the metadata.

    Parameters:
    df (pl.DataFrame): Table containing the feature columns and metadata columns.
    features (list): Names of the columns used as UMAP input; all other columns
        of `df` are treated as metadata and carried over.
    option (str): UMAP backend. Only 'cuml' is implemented.
    n_neigh (int or None): Number of neighbours; defaults to ceil(sqrt(number of rows)).
    min_dist (float): Passed to cuml.UMAP.
    n_components (int): Passed to cuml.UMAP. Only the first two embedding
        columns are renamed (to UMAP1 / UMAP2).
    metric (str): Passed to cuml.UMAP.
    aggregate (bool): If True, additionally average the features per
        (Metadata_Plate, Metadata_Well, Metadata_cmpdName) and project these
        averages with the fitted model.

    Returns:
    pl.DataFrame: Metadata columns plus embedding, when `aggregate` is False.
    tuple: (merged_df, merged_agg) when `aggregate` is True, where `merged_agg`
    holds the grouping columns plus the embedding of the averaged profiles.

    Raises:
    ValueError: If `option` is not 'cuml'.
    """
    # Fail early: without a fitted model nothing below can run
    if option != "cuml":
        raise ValueError(f"Option {option!r} not available. Only 'cuml' is implemented.")

    # Split the DataFrame into features and metadata
    feature_data = df.select(features).to_pandas()
    meta_features = [col for col in df.columns if col not in features]
    meta_data = df.select(meta_features)

    if n_neigh is None:
        n_neigh = math.ceil(math.sqrt(len(feature_data)))

    print(f"Starting UMAP with {n_neigh} neighbors")
    umap_model = cuml.UMAP(n_neighbors=n_neigh,  min_dist=min_dist, n_components=n_components, metric=metric).fit(feature_data)
    umap_embedding = umap_model.transform(feature_data)

    # Convert UMAP results to a DataFrame and give the first two columns stable names
    umap_df = pl.DataFrame(umap_embedding)
    column_names = {umap_df.columns[0]: "UMAP1", umap_df.columns[1]: "UMAP2"}
    umap_df = umap_df.rename(column_names)

    merged_df = pl.concat([meta_data, umap_df], how="horizontal")

    if not aggregate:
        return merged_df

    print("Aggregating data")
    group_columns = ['Metadata_Plate', 'Metadata_Well', 'Metadata_cmpdName']
    aggregated_data = (
        df.group_by(group_columns)
        .agg([pl.col(feature).mean().alias(feature) for feature in features])
        .to_pandas()
    )
    # Project the averaged profiles with the model fitted on the individual rows
    aggregated_umap_embedding = umap_model.transform(aggregated_data[features])
    umap_agg = pl.DataFrame(aggregated_umap_embedding).rename(column_names)

    aggregated_meta_data = pl.DataFrame(aggregated_data[group_columns])
    merged_agg = pl.concat([aggregated_meta_data, umap_agg], how="horizontal")
    return merged_df, merged_agg

In [None]:
# UMAP embedding of the aggregated profiles (min_dist 0.2, no additional aggregation step).
aggregated_umap = run_umap_and_merge(aggregated_df_norm, features_fixed, min_dist = 0.2, aggregate= False)

In [None]:
# Joint plot of the aggregated embedding, coloured by compound name, with an empty title.
# NOTE(review): make_jointplot is defined in a later cell of this notebook; unless one of the
# star imports at the top also provides it, this call fails on a fresh top-to-bottom run.
make_jointplot(aggregated_umap.to_pandas(), colouring= "Metadata_cmpdName", cmpd = "")

## Single cells

In [None]:
def make_jointplot(embedding, colouring, cmpd, save_path=None):
    """
    Draw a UMAP scatter plot with marginal KDE curves, one colour per category.

    Parameters:
    embedding (pd.DataFrame): Table with the columns 'UMAP1', 'UMAP2' and
        `colouring`. The caller's frame is not modified.
    colouring (str): Name of the column whose unique values define the colours.
        For 'Metadata_cmpdName' the '[DMSO]' category is drawn in light grey
        and with a lower alpha.
    cmpd (str): Title of the joint axes.
    save_path (str or None): If given, the figure is saved to "<save_path>.png".

    Returns:
    None: The figure is shown with `plt.show()`.
    """
    control = '[DMSO]'
    point_size = 5

    # Work on a copy so the helper columns below do not leak into the caller's frame
    embedding = embedding.copy()

    # Generate a color palette based on unique values in the colouring column
    unique_treatments = list(embedding[colouring].unique())
    palette = sns.color_palette("Set2", len(unique_treatments))
    color_map = dict(zip(unique_treatments, palette))

    # Adjust the control colour if colouring is 'Metadata_cmpdName'
    if colouring == 'Metadata_cmpdName' and control in color_map:
        color_map[control] = 'lightgrey'

    embedding['color'] = embedding[colouring].map(color_map)
    embedding['size'] = point_size

    # Increase the DPI for displaying
    plt.rcParams['figure.dpi'] = 300

    # Create the base joint plot
    g = sns.JointGrid(x='UMAP1', y='UMAP2', data=embedding, height=10)

    # Move the control to the front so it is drawn first. It is only re-inserted
    # when it is actually present; otherwise the colour lookup below would fail.
    if control in unique_treatments:
        unique_treatments.remove(control)
        unique_treatments.insert(0, control)

    # Plot KDE plots for each category
    for treatment in unique_treatments:
        subset = embedding[embedding[colouring] == treatment]
        sns.kdeplot(x=subset["UMAP1"], ax=g.ax_marg_x, fill=True, color=color_map[treatment], legend=False)
        sns.kdeplot(y=subset["UMAP2"], ax=g.ax_marg_y, fill=True, color=color_map[treatment], legend=False)

    # Plot the scatter plots
    for treatment in unique_treatments:
        subset = embedding[embedding[colouring] == treatment]
        # The control is drawn more transparently than the treatments
        alpha_val = 0.3 if treatment == control and colouring == 'Metadata_cmpdName' else 0.5
        g.ax_joint.scatter(subset["UMAP1"], subset["UMAP2"], c=subset['color'], s=subset['size'], label=treatment, alpha=alpha_val, edgecolor='white', linewidth=0.5)

    g.ax_joint.set_title(cmpd)
    legend = g.ax_joint.legend(fontsize=10)
    legend.get_frame().set_facecolor('white')

    if save_path is not None:
        g.savefig(f"{save_path}.png", dpi=300)

    # Display the plot
    plt.show()

In [None]:
# UMAP embedding of the non-aggregated selected profiles (min_dist 0.4).
sc_umap = run_umap_and_merge(final_features, features_fixed, min_dist = 0.4, aggregate= False)

In [None]:
# Joint plot of the single-cell embedding, coloured by compound name, with an empty title.
make_jointplot(sc_umap.to_pandas(), colouring= "Metadata_cmpdName", cmpd = "")