In [1]:
import os 
os.getcwd()
import pandas as pd
import time
import numpy as np
import tqdm
import click
import shutil
import sys
import math
import subprocess
import polars as pl
import pharmbio

## Function definitions

In [2]:
def find_deeper_paths(root_dir, string_list):
    """
    For each string in string_list, join it with root_dir to generate a path,
    and then find directories or files that are one step deeper within that path.

    :param root_dir: The root directory as a string.
    :param string_list: A list of strings which will be appended to the root_dir to form paths.
    :return: A list of paths one level deeper for each string.
    """
    deeper_paths = []
    subdirs = next(os.walk(root_dir))[1]
    for string in string_list:
        matching_dirs = [d for d in subdirs if string in d][0]

        full_path = os.path.join(root_dir, matching_dirs)
        #full_path = os.path.join(root_dir, string)
        # Check if the path exists and is a directory
        if os.path.isdir(full_path):
            # Get all entries in the directory
            entries = next(os.walk(full_path))[1] + next(os.walk(full_path))[2]
            # Form full paths to these entries and extend the deeper_paths list
            deeper_paths.extend([os.path.join(full_path, entry) for entry in entries])
    
    return deeper_paths

def find_latest_parquet(folders, filename= "featICF_nuclei.parquet"):
    "Takes list of parent folders for plates and finds most recent parquet files, set to nucleu features to extract Nuclei Locations"
    
    paths = []

    for folder in folders:
        # Initialize the highest subfolder number and path to the Parquet file
        highest_num = -1
        path_to_file = ""
        
        # Check if folder path exists
        if not os.path.exists(folder):
            print(f"Folder {folder} does not exist.")
            continue
        
        # List all subfolders in the current folder
        for subfolder in os.listdir(folder):
            subfolder_path = os.path.join(folder, subfolder)
            
            # Check if it's a directory and the name is an integer
            if os.path.isdir(subfolder_path) and subfolder.isdigit():
                subfolder_num = int(subfolder)
                
                # Check if the subfolder number is greater than the current highest
                if subfolder_num > highest_num:
                    # Check if the specified file exists in this subfolder
                    potential_file_path = os.path.join(subfolder_path, filename)
                    if os.path.isfile(potential_file_path):
                        highest_num = subfolder_num
                        path_to_file = potential_file_path
        
        # If a valid path was found, add it to the lis
        if path_to_file:
            paths.append(path_to_file)
        else:
            print(f"No '{filename}' found in the folder {folder}.")
    
    return paths


def read_combine_parquets(metadata, root_dir = "/share/data/cellprofiler/automation/results/"):
    """
    Reads and merges Parquet files corresponding to unique plate metadata.

    This function takes metadata that includes unique plate identifiers and locates the 
    corresponding Parquet files within a specified root directory by leveraging the 
    'find_latest_parquets' function. It then reads these Parquet files and combines them into 
    a single DataFrame. Only selected columns are retained during the read operation.

    Parameters:
    :param metadata: A pandas DataFrame containing at least the 'Metadata_Plate' column to identify unique plates.
    :param root_dir : str, The root directory where Parquet files are located. The function will search for Parquet files within this directory. Defaults to "/share/data/cellprofiler/automation/results/".

    :return: combined_df : A pandas DataFrame containing combined data from all the Parquet files associated with the unique plates.

    Raises:
    - Exception: If a Parquet file cannot be read, it prints an error message with the file path and exception.
    """
    file_paths = find_deeper_paths(root_dir, metadata["barcode"].unique())
    combined_df = pd.DataFrame()
    columns = ["Metadata_Barcode", "Metadata_Site", "Metadata_AcqID", "Metadata_Well", "FileName_CONC", "FileName_HOECHST", "FileName_PHAandWGA", "FileName_SYTO", "FileName_MITO", "PathName_MITO", "PathName_HOECHST", "PathName_PHAandWGA", "PathName_SYTO", "Location_Center_X", "Location_Center_Y", "AreaShape_MajorAxisLength"]
    parquet_paths = find_latest_parquet(file_paths)
    for file_path in parquet_paths:
        try:
            # Read the Parquet file with selected columns only
            df = pd.read_parquet(file_path, columns=columns)
            combined_df = pd.concat([combined_df, df], ignore_index=True)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    return combined_df

def generate_metadata_main(metadata, location_df, project_folder: str, project_name: str):
    
    """
    Main function to generate structure of Metadata required by DeepProfiler
    IMPORTANT: Requires these specific columns, if not existent, check your input data!

    :param metadata: Initial metadata input df with compound informations
    :param location_df: Dataframe with location and plate information as generated by previous functions
    :return: Metadata file for DeepProfiler
    """
    if any('moa' in column.lower() for column in metadata.columns):
        metadata_filt = metadata[["barcode", "well", "Metadata_Site", "cbkid", "[moa]", "compound_name", "cmpd_conc"]]
    else: 
        metadata_filt = metadata[["barcode", "well", "Metadata_Site", "cbkid", "compound_name", "cmpd_conc"]]
    new_column_names_metadata = {
    'barcode': 'Metadata_Plate',
    'well': 'Metadata_Well',
    'cbkid': 'Metadata_cmpdName',
    'cmpd_conc': "Metadata_cmpdConc"}
    metadata_filt = metadata_filt.rename(columns=new_column_names_metadata)

    location_filt = location_df[["Metadata_Barcode", "Metadata_Site", "Metadata_Well", "FileName_CONC", "FileName_HOECHST", "FileName_PHAandWGA", "FileName_SYTO", "FileName_MITO"]]
    location_filt = location_filt.drop_duplicates().reset_index(drop = True)
    new_column_names_locations = {
    'Metadata_Barcode': 'Metadata_Plate',
    'FileName_CONC': 'ER',
    'FileName_HOECHST': 'DNA',
    'FileName_PHAandWGA': "AGP",
    "FileName_SYTO": "RNA", 
    "FileName_MITO": "Mito"}
    location_filt = location_filt.rename(columns=new_column_names_locations)
    result = pd.merge(metadata_filt, location_filt, on=['Metadata_Plate', 'Metadata_Well', "Metadata_Site"], how='left')
    result["DNA"] = result["Metadata_Plate"] + "/" + result["DNA"] 
    result["ER"] = result["Metadata_Plate"] + "/" + result["ER"] 
    result["RNA"] = result["Metadata_Plate"] + "/" + result["RNA"] 
    result["AGP"] = result["Metadata_Plate"] + "/" + result["AGP"] 
    result["Mito"] = result["Metadata_Plate"] + "/" + result["Mito"] 
    
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)

    path_to_metadata = project_folder + "/inputs/metadata/metadata_deepprofiler" + project_name + ".csv"
    result.to_csv(path_to_metadata)
    #return result

def generate_locations_deepprofiler(location_df, root_folder):

    """
    Function to generate and write .csv for each site as expected by DeepProfiler. 
    Automatically saves files to pre-defined root folder: Needs to be locations folder in project

    :param meta_centers: Dataframe containing locations of cells as generated by read_combine_parquets
    :param root_folder: Folder with DeepProfiler folder structure 
    :return: None
    """
    #if "locations" not in root_folder:
    #    sys.exit("Error: 'locations' is not part of the root_folder. Please include 'locations' in the path.")

    plates = location_df["Metadata_Barcode"].unique()
    for plate in tqdm.tqdm(plates):
        output_folder = root_folder +  "/inputs/locations/" + plate
        os.makedirs(output_folder, exist_ok=True)
        plate_data = location_df[location_df["Metadata_Barcode"] == plate]
        # Group the data by 'well' and 'site' and save each group as a separate CSV file
        grouped = plate_data.groupby(['Metadata_Well', 'Metadata_Site'])
        for group_name, group_data in grouped:
            well, site = group_name
            filename = f"{well}-s{site}-Nuclei.csv"
            file_path = os.path.join(output_folder, filename)
            if os.path.exists(file_path):
                print(f"File {filename} already exists. Skipping to the next.")
                continue
            group_data['Nuclei_Location_Center_X'] = group_data['Location_Center_X'].astype(int)
            group_data['Nuclei_Location_Center_Y'] = group_data['Location_Center_Y'].astype(int)
            group_data[['Nuclei_Location_Center_X','Nuclei_Location_Center_Y']].to_csv(file_path, index=False)


def create_image_data_symlinks(location_df, output_root_folder: str):
    """
    Creates symbolic links for image files in their respective original directories to reduce run/ copy time.
    Images in folder required by DeepProfiler

    :param feat_df: A pandas DataFrame containing the columns 'PathName_HOECHST' and 'Metadata_Barcode',
                    which include the paths to the source image files and the barcode metadata, respectively.
    :param output_root_folder: The root directory path where the destination folders will be created
                               and where the symbolic links will point to.
    :return: None. The function performs file operations and does not return any value.
    """
    source_folders = location_df["PathName_HOECHST"].unique()
    for folder in tqdm.tqdm(source_folders):
        output_folder = output_root_folder + "/inputs/images"
        destination_folder = os.path.join(output_folder, location_df[location_df["PathName_HOECHST"] == folder]["Metadata_Barcode"].unique()[0])
        os.makedirs(destination_folder, exist_ok=True)
        print("Linking plate:", location_df[location_df["PathName_HOECHST"] == folder]["Metadata_Barcode"].unique()[0])
        if "mikro" in folder:
            folder = folder.replace("mikro", "mikro2")
        for filename in os.listdir(folder):

            if filename.lower().endswith('.tiff') and 'thumb' not in filename.lower():
                source_file_path = os.path.join(folder, filename)
                destination_file_path = os.path.join(destination_folder, filename)
                if os.path.islink(destination_file_path) or os.path.exists(destination_file_path):
                    print(f"Symlink for {filename} already exists. Skipping to the next.")
                    continue
                if not os.path.islink(destination_file_path) and not os.path.exists(destination_file_path):
                    os.symlink(source_file_path, destination_file_path)


def create_project_folder(folder_path):
    """
    Creates a main folder and subfolders as specified.

    The structure will be:
    - <folder_path>/
      - inputs/
        - config/
        - images/
        - locations/
        - metadata/
      - outputs/
        - checkpoint/

    :param folder_path: The main directory path where the folder structure will be created.
    """
    # Main folder
    os.makedirs(folder_path, exist_ok=True)
    print(f"Main folder created at: {folder_path}")

    # Subfolders for Inputs
    inputs_subfolders = ['config', 'images', 'locations', 'metadata']
    for subfolder in inputs_subfolders:
        os.makedirs(os.path.join(folder_path, 'inputs', subfolder), exist_ok=True)
    
    # Subfolder for Outputs
    os.makedirs(os.path.join(folder_path, 'outputs', 'checkpoint'), exist_ok=True)
    #Copy generic config file to folder
    shutil.copy("/home/jovyan/share/data/analyses/benjamin/Single_cell_project/deepprofiler_config_example.json", os.path.join(folder_path, 'inputs', 'config', 'deepprofiler_config_example.json'))

def copy_checkpoint_to_subfolder(output_folder, checkpoint_file_path ):
    """
    Copies the specified checkpoint file to the checkpoint subfolder in the given output folder structure.

    :param output_folder: The main output folder where the subfolders are located.
    :param checkpoint_file_path: The path to the checkpoint file to be copied.
    """
    # Path to the destination checkpoint subfolder    
    checkpoint_dest_folder = os.path.join(output_folder, 'outputs', 'checkpoint')

    # Copy the checkpoint file to the 'checkpoint' subfolder
    checkpoint_dest_path = os.path.join(checkpoint_dest_folder, os.path.basename(checkpoint_file_path))
    shutil.copy2(checkpoint_file_path, checkpoint_dest_path)
    

## Run prep functions

In [60]:
ref_metadata = pd.read_csv("specs2k_cmpd.csv", sep = ",")
ref_metadata = ref_metadata.rename(columns = {"cmpd_id": "compound_id", "well_id" : "well"})



In [61]:
specs2k_plates = list(ref_metadata["barcode"].unique())

In [22]:
from pharmbio.data_processing.quality_control import get_qc_module, get_channels, flag_outlier_images
from pharmbio.dataset.image_quality import get_image_quality_ref, get_image_quality_data

def run_quality_control(project_name,  metadata, qc_plates: list, sd: float):
    "Function runs quality control on plates for project and filters metadata based on selected"
    # Set the environment variable
    
    qc_ref_df = get_image_quality_ref(project_name, filter={"plate_barcode": qc_plates})
    qc_df = get_image_quality_data(qc_ref_df, force_merging_columns="drop")
    flagged_images = flag_outlier_images(qc_df, default_sd_step=(-sd, sd)).select(['image_id','Metadata_AcqID','Metadata_Barcode','Metadata_Well','Metadata_Site','ImageNumber','outlier_flag']).filter(pl.col('outlier_flag') == 0).to_pandas()
    flagged_images.rename(columns={'Metadata_Barcode': 'barcode', 'Metadata_Well': 'well'}, inplace=True)
    specs_meta_full_flags = pd.merge(flagged_images, metadata, on = ["barcode", "well"], how = "left")

    return specs_meta_full_flags

In [23]:
run_quality_control("specs2k", ref_metadata, specs2k_plates, sd = 3)

INFO: Quering the db for specs2k found 1 study: ['specs2k']
__________________________________________________
INFO: 1
INFO: 	specs2k
INFO: 	['P102785', 'P102785', 'P103589', 'P103590', 'P103591', 'P103592', 'P103593', 'P103594', 'P103595', 'P103596', 'P103597', 'P103598', 'P103599', 'P103600', 'P103601', 'P103602', 'P103603', 'P103604', 'P103605', 'P103606', 'P103607', 'P103608', 'P103609', 'P103610', 'P103611', 'P103612', 'P103613', 'P103614', 'P103615', 'P103616', 'P103617', 'P103618', 'P103619', 'P103620', 'P103621']
INFO: 
__________________________________________________
INFO: Successfully imported (2772, 591): /share/data/cellprofiler/automation/results/P103589/4319/6068/qcRAW_images_P103589.parquet
INFO: Successfully imported (2772, 591): /share/data/cellprofiler/automation/results/P103593/4321/6073/qcRAW_images_P103593.parquet
INFO: Successfully imported (2772, 591): /share/data/cellprofiler/automation/results/P103595/4322/6075/qcRAW_images_P103595.parquet
INFO: Successfully 

OutlierSD_FocusScore_-3_3,OutlierSD_MaxIntensity_-3_3,OutlierSD_MeanIntensity_-3_3,OutlierSD_PowerLogLogSlope_-3_3,OutlierSD_StdIntensity_-3_3,outlier_flag
i64,i64,i64,i64,i64,i64
5643,2286,1751,2648,3921,8582


Unnamed: 0,image_id,Metadata_AcqID,barcode,well,Metadata_Site,ImageNumber,outlier_flag,layout_id,solvent,stock_conc,...,cmpd_vol,cmpd_vol_unit,well_vol,well_vol_unit,cmpd_conc,cmpd_conc_unit,cell_line,cells_per_well,treatment,treatment_units
0,4314_P103619_B02_1,4314,P103619,B02,1,1,0,P103619-SPECS-2K-U2OS-48h-P01-L1,DMSO,100.0,...,280.0,nL,40.0,uL,0.7,perc,U2OS,750.0,48.0,h
1,4314_P103619_B02_2,4314,P103619,B02,2,2,0,P103619-SPECS-2K-U2OS-48h-P01-L1,DMSO,100.0,...,280.0,nL,40.0,uL,0.7,perc,U2OS,750.0,48.0,h
2,4314_P103619_B02_3,4314,P103619,B02,3,3,0,P103619-SPECS-2K-U2OS-48h-P01-L1,DMSO,100.0,...,280.0,nL,40.0,uL,0.7,perc,U2OS,750.0,48.0,h
3,4314_P103619_B02_4,4314,P103619,B02,4,1,0,P103619-SPECS-2K-U2OS-48h-P01-L1,DMSO,100.0,...,280.0,nL,40.0,uL,0.7,perc,U2OS,750.0,48.0,h
4,4314_P103619_B02_5,4314,P103619,B02,5,2,0,P103619-SPECS-2K-U2OS-48h-P01-L1,DMSO,100.0,...,280.0,nL,40.0,uL,0.7,perc,U2OS,750.0,48.0,h
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82889,4350_P103618_O23_5,4350,P103618,O23,5,2,0,P103618-SPECS-2K-U2OS-48h-P15-L2,DMSO,10.0,...,40.0,nL,40.0,uL,10.0,uM,U2OS,750.0,48.0,h
82890,4350_P103618_O23_6,4350,P103618,O23,6,3,0,P103618-SPECS-2K-U2OS-48h-P15-L2,DMSO,10.0,...,40.0,nL,40.0,uL,10.0,uM,U2OS,750.0,48.0,h
82891,4350_P103618_O23_7,4350,P103618,O23,7,1,0,P103618-SPECS-2K-U2OS-48h-P15-L2,DMSO,10.0,...,40.0,nL,40.0,uL,10.0,uM,U2OS,750.0,48.0,h
82892,4350_P103618_O23_8,4350,P103618,O23,8,2,0,P103618-SPECS-2K-U2OS-48h-P15-L2,DMSO,10.0,...,40.0,nL,40.0,uL,10.0,uM,U2OS,750.0,48.0,h


In [57]:
def find_deeper_paths(root_dir, string_list):
    """
    For each string in string_list, join it with root_dir to generate a path,
    and then find directories or files that are one step deeper within that path.

    :param root_dir: The root directory as a string.
    :param string_list: A list of strings which will be appended to the root_dir to form paths.
    :return: A list of paths one level deeper for each string.
    """
    deeper_paths = []
    subdirs = next(os.walk(root_dir))[1]
    for string in string_list:
        matching_dirs = [d for d in subdirs if string in d]
        full_path = os.path.join(root_dir, matching_dirs[0])
        #full_path = os.path.join(root_dir, string)
        # Check if the path exists and is a directory
        if os.path.isdir(full_path):
            # Get all entries in the directory
            entries = next(os.walk(full_path))[1] + next(os.walk(full_path))[2]
            # Form full paths to these entries and extend the deeper_paths list
            deeper_paths.extend([os.path.join(full_path, entry) for entry in entries])
    
    return deeper_paths


In [58]:
find_deeper_paths("/home/jovyan/share/data/cellprofiler/automation/results/", list(ref_metadata["barcode"].unique()))

IndexError: list index out of range

In [35]:
def read_combine_parquets(metadata, root_dir = "/home/jovyan/share/data/cellprofiler/automation/results/"):
    """
    Reads and merges Parquet files corresponding to unique plate metadata.

    This function takes metadata that includes unique plate identifiers and locates the 
    corresponding Parquet files within a specified root directory by leveraging the 
    'find_latest_parquets' function. It then reads these Parquet files and combines them into 
    a single DataFrame. Only selected columns are retained during the read operation.

    Parameters:
    :param metadata: A pandas DataFrame containing at least the 'Metadata_Plate' column to identify unique plates.
    :param root_dir : str, The root directory where Parquet files are located. The function will search for Parquet files within this directory. Defaults to "/share/data/cellprofiler/automation/results/".

    :return: combined_df : A pandas DataFrame containing combined data from all the Parquet files associated with the unique plates.

    Raises:
    - Exception: If a Parquet file cannot be read, it prints an error message with the file path and exception.
    """
    file_paths = find_deeper_paths(root_dir, list(metadata["barcode"].unique()))
    combined_df = pd.DataFrame()
    columns = ["Metadata_Barcode", "Metadata_Site", "Metadata_AcqID", "Metadata_Well", "FileName_CONC", "FileName_HOECHST", "FileName_PHAandWGA", "FileName_SYTO", "FileName_MITO", "PathName_MITO", "PathName_HOECHST", "PathName_PHAandWGA", "PathName_SYTO", "Location_Center_X", "Location_Center_Y", "AreaShape_MajorAxisLength"]
    parquet_paths = find_latest_parquet(file_paths)
    for file_path in parquet_paths:
        try:
            # Read the Parquet file with selected columns only
            df = pd.read_parquet(file_path, columns=columns)
            combined_df = pd.concat([combined_df, df], ignore_index=True)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    return combined_df

In [62]:
read_combine_parquets(ref_metadata)

['/home/jovyan/share/data/cellprofiler/automation/results/P103589/4319', '/home/jovyan/share/data/cellprofiler/automation/results/P103590/4324', '/home/jovyan/share/data/cellprofiler/automation/results/P103591/4320', '/home/jovyan/share/data/cellprofiler/automation/results/P103592/4325', '/home/jovyan/share/data/cellprofiler/automation/results/P103593/4321', '/home/jovyan/share/data/cellprofiler/automation/results/P103594/4326', '/home/jovyan/share/data/cellprofiler/automation/results/P103595/4322', '/home/jovyan/share/data/cellprofiler/automation/results/P103596/4327', '/home/jovyan/share/data/cellprofiler/automation/results/P103597/4323', '/home/jovyan/share/data/cellprofiler/automation/results/P103598/4328', '/home/jovyan/share/data/cellprofiler/automation/results/P103599/4330', '/home/jovyan/share/data/cellprofiler/automation/results/P103600/4335', '/home/jovyan/share/data/cellprofiler/automation/results/P103601/4331', '/home/jovyan/share/data/cellprofiler/automation/results/P10360

Unnamed: 0,Metadata_Barcode,Metadata_Site,Metadata_AcqID,Metadata_Well,FileName_CONC,FileName_HOECHST,FileName_PHAandWGA,FileName_SYTO,FileName_MITO,PathName_MITO,PathName_HOECHST,PathName_PHAandWGA,PathName_SYTO,Location_Center_X,Location_Center_Y,AreaShape_MajorAxisLength
0,P103589,2,4319,E12,E12_s2_x1_y0_Fluorescence_730_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_405_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_561_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_488_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,926.797119,11.438328,44.990871
1,P103589,2,4319,E12,E12_s2_x1_y0_Fluorescence_730_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_405_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_561_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_488_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,1044.115845,30.758276,63.868637
2,P103589,2,4319,E12,E12_s2_x1_y0_Fluorescence_730_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_405_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_561_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_488_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,1136.627197,19.286873,58.901974
3,P103589,2,4319,E12,E12_s2_x1_y0_Fluorescence_730_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_405_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_561_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_488_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,1610.732910,2.509317,28.983194
4,P103589,2,4319,E12,E12_s2_x1_y0_Fluorescence_730_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_405_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_561_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_488_nm_Ex.tiff,E12_s2_x1_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,/share/mikro2/squid/specs2k/P103589_specs2k_U2...,1723.228027,11.532456,51.302822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14743716,P103621,7,4340,C12,C12_s7_x0_y2_Fluorescence_730_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_405_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_561_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_488_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_638_nm_Ex.tiff,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,721.762268,2426.577393,92.734200
14743717,P103621,7,4340,C12,C12_s7_x0_y2_Fluorescence_730_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_405_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_561_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_488_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_638_nm_Ex.tiff,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,862.927979,2445.719482,80.762726
14743718,P103621,7,4340,C12,C12_s7_x0_y2_Fluorescence_730_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_405_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_561_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_488_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_638_nm_Ex.tiff,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,1438.423828,2461.214355,88.921585
14743719,P103621,7,4340,C12,C12_s7_x0_y2_Fluorescence_730_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_405_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_561_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_488_nm_Ex.tiff,C12_s7_x0_y2_Fluorescence_638_nm_Ex.tiff,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,/share/mikro2/squid/specs2k/P103621_specs2k_U2...,2488.612305,2488.196777,32.064964


In [3]:
projectfolder = "/home/jovyan/share/data/analyses/benjamin/Single_cell_project/DP_specs2k"

In [19]:
location_df = read_combine_parquets(beactica_plate_info_flag)

In [24]:
location_df["Metadata_Barcode"] = location_df['Metadata_Barcode'].str.extract(regex_pattern2)
location_df

Unnamed: 0,Metadata_Barcode,Metadata_Site,Metadata_AcqID,Metadata_Well,FileName_CONC,FileName_HOECHST,FileName_PHAandWGA,FileName_SYTO,FileName_MITO,PathName_MITO,PathName_HOECHST,PathName_PHAandWGA,PathName_SYTO,Location_Center_X,Location_Center_Y,AreaShape_MajorAxisLength
0,PB000046,1,3061,O10,O10_s1_x0_y0_Fluorescence_730_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_405_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_561_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_488_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,58.388790,13.499105,65.448700
1,PB000046,1,3061,O10,O10_s1_x0_y0_Fluorescence_730_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_405_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_561_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_488_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,2223.653320,27.078863,77.365685
2,PB000046,1,3061,O10,O10_s1_x0_y0_Fluorescence_730_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_405_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_561_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_488_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,2885.796875,32.591450,98.777878
3,PB000046,1,3061,O10,O10_s1_x0_y0_Fluorescence_730_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_405_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_561_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_488_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,2407.544434,66.699623,91.876404
4,PB000046,1,3061,O10,O10_s1_x0_y0_Fluorescence_730_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_405_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_561_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_488_nm_Ex.tiff,O10_s1_x0_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,/share/mikro/squid/beactica/PB000046-P1-L1-v2-...,579.246216,71.594727,73.029297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13517180,PB000053,2,3046,F02,F02_s2_x1_y0_Fluorescence_730_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_405_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_561_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_488_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,1256.488281,2993.072754,39.377937
13517181,PB000053,2,3046,F02,F02_s2_x1_y0_Fluorescence_730_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_405_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_561_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_488_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,102.966904,2993.146484,36.687019
13517182,PB000053,2,3046,F02,F02_s2_x1_y0_Fluorescence_730_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_405_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_561_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_488_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,267.837738,2994.735107,34.771751
13517183,PB000053,2,3046,F02,F02_s2_x1_y0_Fluorescence_730_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_405_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_561_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_488_nm_Ex.tiff,F02_s2_x1_y0_Fluorescence_638_nm_Ex.tiff,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,/share/mikro/squid/beactica/PB000053-P4-L2-con...,1200.466064,2994.746582,25.113779


In [29]:
def generate_metadata_beactica(metadata, location_df, project_folder: str, project_name: str):
    
    """
    Main function to generate structure of Metadata required by DeepProfiler
    IMPORTANT: Requires these specific columns, if not existent, check your input data!

    :param metadata: Initial metadata input df with compound informations
    :param location_df: Dataframe with location and plate information as generated by previous functions
    :return: Metadata file for DeepProfiler
    """
    new_column_names_metadata = {
    'Metadata_Barcode': 'Metadata_Plate',
    'well': 'Metadata_Well',
    'cmpd_name': 'Metadata_cmpdName',
    'cmpd_conc': "Metadata_cmpdConc"}
    metadata_filt = metadata.rename(columns=new_column_names_metadata)
    metadata_filt['Metadata_Plate'] = metadata_filt['Metadata_Plate'].str.extract(regex_pattern2)


    location_filt = location_df[["Metadata_Barcode", "Metadata_Site", "Metadata_Well", "FileName_CONC", "FileName_HOECHST", "FileName_PHAandWGA", "FileName_SYTO", "FileName_MITO"]]
    location_filt = location_filt.drop_duplicates().reset_index(drop = True)
    new_column_names_locations = {
    'Metadata_Barcode': 'Metadata_Plate',
    'FileName_CONC': 'ER',
    'FileName_HOECHST': 'DNA',
    'FileName_PHAandWGA': "AGP",
    "FileName_SYTO": "RNA", 
    "FileName_MITO": "Mito"}
    location_filt = location_filt.rename(columns=new_column_names_locations)
    result = pd.merge(metadata_filt, location_filt, on=['Metadata_Plate', 'Metadata_Well', "Metadata_Site"], how='left')
    result["DNA"] = result["Metadata_Plate"] + "/" + result["DNA"] 
    result["ER"] = result["Metadata_Plate"] + "/" + result["ER"] 
    result["RNA"] = result["Metadata_Plate"] + "/" + result["RNA"] 
    result["AGP"] = result["Metadata_Plate"] + "/" + result["AGP"] 
    result["Mito"] = result["Metadata_Plate"] + "/" + result["Mito"] 
    result["Metadata_Site"] = "s" + result["Metadata_Site"].astype(str)
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)

    path_to_metadata = project_folder + "/inputs/metadata/metadata_deepprofiler" + project_name + ".csv"
    result.to_csv(path_to_metadata)
    #return result

In [30]:
generate_metadata_beactica(beactica_plate_info_flag, location_df, projectfolder, "beactica")

In [31]:
create_image_data_symlinks(location_df, projectfolder)

  0%|          | 0/8 [00:00<?, ?it/s]

Linking plate: PB000046


 12%|█▎        | 1/8 [00:08<00:57,  8.29s/it]

Linking plate: PB000047


 25%|██▌       | 2/8 [00:16<00:50,  8.40s/it]

Linking plate: PB000048


 38%|███▊      | 3/8 [00:25<00:42,  8.51s/it]

Linking plate: PB000049


 50%|█████     | 4/8 [00:34<00:34,  8.59s/it]

Linking plate: PB000050


 62%|██████▎   | 5/8 [00:42<00:25,  8.65s/it]

Linking plate: PB000051


 75%|███████▌  | 6/8 [00:51<00:17,  8.68s/it]

Linking plate: PB000052


 88%|████████▊ | 7/8 [01:00<00:08,  8.76s/it]

Linking plate: PB000053


100%|██████████| 8/8 [01:09<00:00,  8.70s/it]


In [None]:
def main(projectname, projectfolder, metadata, mode, checkpoints):
    start_time = time.time()
    if mode == 'metadata':
        try:
            print("Starting the script...")
            metadata_in = pd.read_csv(metadata)
            metadata_in = metadata_in.dropna(subset=['barcode'])
            print(metadata_in.columns)
            print("Creating project folder...")
            create_project_folder(projectfolder)

            if checkpoints is None:
                # Use the default checkpoint parameter value here
                default_checkpoint = "/home/jovyan/share/data/analyses/benjamin/Single_cell_project/Cell_Painting_CNN_v1.hdf5"
                print(f"Using default checkpoint: {default_checkpoint}")
                copy_checkpoint_to_subfolder(projectfolder, default_checkpoint)
            else:
                print("Copying checkpoint to subfolder...")
                copy_checkpoint_to_subfolder(projectfolder, checkpoints)
            
            print("Running quality control...")
            with tqdm.tqdm(total=1, desc="Quality Control") as progress:
                
                qc_df = run_quality_control(str(projectname), metadata=metadata_in, qc_plates=metadata_in["barcode"].unique(), sd=3)
                progress.update()

            with tqdm.tqdm(total=1, desc="Reading and Combining Parquet Files") as progress:
                location_df = read_combine_parquets(metadata_in)
                progress.update()

            with tqdm.tqdm(total=1, desc="Generating Metadata File") as progress:
                generate_metadata_main(qc_df, location_df, projectfolder, projectname)
                progress.update()

            with tqdm.tqdm(total=1, desc="Generating Locations File for DeepProfiler") as progress:
                generate_locations_deepprofiler(location_df, projectfolder)
                progress.update()

            with tqdm.tqdm(total=1, desc="Creating Image Data Symlinks") as progress:
                create_image_data_symlinks(location_df, projectfolder)
                progress.update()

            print("Script execution completed successfully. Ready for profiling with --mode profile.")
            end_time = time.time()
            execution_time = end_time - start_time
            minutes = execution_time // 60
            seconds = execution_time % 60
            print(f"The script executed in {int(minutes)} minutes and {seconds:.2f} seconds.")
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            sys.exit(1)
    elif mode == 'profile':
        profile(projectfolder)
        end_time = time.time()
        execution_time = end_time - start_time
        minutes = execution_time // 60
        seconds = execution_time % 60
        print(f"The script executed in {int(minutes)} minutes and {seconds:.2f} seconds.")

    else:
        print("Invalid command. Use either 'metadata' or 'profile'.")

In [25]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_kde_colored_by_group(df, column_to_plot, color_column):
    plt.figure(figsize=(10, 6))
    
    # Generate a color palette with as many colors as there are unique treatments
    palette = sns.color_palette("husl", len(df[color_column].unique()))
    
    # Loop through the unique values of the color column and plot the KDE for each
    for idx, (value, color) in enumerate(zip(df[color_column].unique(), palette)):
        subset = df[df[color_column] == value]
        sns.kdeplot(subset[column_to_plot], color=color, label=str(value), shade=True, alpha=0.3)
    
    plt.title(f'KDE of {column_to_plot} colored by {color_column}')
    plt.legend(title=color_column, loc='upper left', bbox_to_anchor=(1, 1), frameon=False)
    plt.xlabel(column_to_plot)
    plt.ylabel('Density')
    plt.savefig("beactica_cellsize_dist.png", dpi=300, bbox_inches='tight')
    plt.close()

plot_kde_colored_by_group(location_df, "AreaShape_MajorAxisLength", "Metadata_Barcode")


`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(subset[column_to_plot], color=color, label=str(value), shade=True, alpha=0.3)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(subset[column_to_plot], color=color, label=str(value), shade=True, alpha=0.3)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(subset[column_to_plot], color=color, label=str(value), shade=True, alpha=0.3)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(subset[column_to_plot], color=color, label=str(value), shade=True, alpha=0.3)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This

In [37]:
import imageio
image_path = 'DP_BEACTICA/inputs/images/PB000046/B02_s1_x0_y0_Fluorescence_405_nm_Ex.tiff'
img = imageio.imread(image_path)

# Get image dimensions
height, width = img.shape
print(height, width)

3000 3000


  img = imageio.imread(image_path)
