# Generate min/max representative single-cell images per top two features with highest coefficients  

The top two features are selected based on the absolute values of their coefficients.

## Import libraries

In [1]:
import pathlib
from pprint import pprint

import cv2
import pandas as pd
from typing import List, Dict

## Define functions

In [2]:
# Function for formatting min/max row data frames into dictionaries
def create_sc_dict(dfs: List[pd.DataFrame], names: List[str]) -> dict:
    """Format lists of data frames and names into a dictionary with the metadata needed to find single-cell images.

    Every row of every data frame becomes one entry keyed as ``"<name>_<n>"``, where ``<name>`` is the
    name paired with that data frame and ``<n>`` is the 1-based position of the row within it.

    Args:
        dfs (List[pd.DataFrame]): List of data frames whose rows are single cells. Each must contain the columns
            Metadata_Plate, Metadata_Well, Metadata_Site, Metadata_Nuclei_Location_Center_X and
            Metadata_Nuclei_Location_Center_Y.
        names (List[str]): List of names corresponding (by position) to the data frames.

    Raises:
        ValueError: If `dfs` and `names` do not have the same length.

    Returns:
        dict: Dictionary mapping each key to a dict with "plate", "well", "site", "location_center_x"
            and "location_center_y".
    """
    # zip() stops at the shorter input, which would silently drop cells on a mismatch
    if len(dfs) != len(names):
        raise ValueError(
            f"Expected one name per data frame, got {len(dfs)} data frames and {len(names)} names."
        )

    sc_dict = {}
    for df, name in zip(dfs, names):
        # Number rows from 1 so keys read as <name>_1, <name>_2, ...
        for i, (_, row) in enumerate(df.iterrows(), start=1):
            sc_dict[f"{name}_{i}"] = {
                "plate": row["Metadata_Plate"],
                "well": row["Metadata_Well"],
                "site": row["Metadata_Site"],
                "location_center_x": row["Metadata_Nuclei_Location_Center_X"],
                "location_center_y": row["Metadata_Nuclei_Location_Center_Y"],
            }
    return sc_dict

In [3]:
# Function for generating and saving single-cell crops per channel as PNGs
def generate_sc_crops(
    sc_dict: Dict,
    channel_mapping: Dict[int, str],
    images_dir: pathlib.Path,
    output_img_dir: pathlib.Path,
    crop_size: int,
) -> None:
    """Generate single-cell crops per channel and save them as PNGs in one folder per image set.

    For every entry in `sc_dict` and every channel in `channel_mapping`, the image
    ``<well>_01_<channel number>_<site>_<channel name>_001_illumcorrect.tiff`` is read from `images_dir`,
    cropped around the cell's center coordinates, resized to `crop_size` x `crop_size`, and written to
    ``<output_img_dir>/<key>/<key>_d<channel number>_cropped.png``. Existing output files are never
    overwritten; a message is printed and the channel is skipped.

    Args:
        sc_dict (Dict): Dictionary containing info relevant for finding single-cell crops. Each value must hold
            "well" and "site"; entries without "location_center_x"/"location_center_y" are skipped.
        channel_mapping (Dict[int, str]): Dictionary mapping channel number to channel name for generating paths.
            One crop is produced per entry.
        images_dir (pathlib.Path): Directory where illumination corrected images are found.
        output_img_dir (pathlib.Path): Main directory to save each image set's single-cell crops.
        crop_size (int): Size of the box in pixels (example: setting crop_size as 250 will make a 250x250 pixel crop
            around the single-cell center coordinates).

    Raises:
        FileNotFoundError: If a channel image is missing or cannot be decoded.
        OSError: If a crop cannot be written to disk.
    """
    # Half the box edge; the crop spans center +/- half_crop on each axis
    half_crop = crop_size // 2

    for key, info in sc_dict.items():
        center_x = info.get("location_center_x")
        center_y = info.get("location_center_y")

        # Without a center there is nothing to crop around, so skip before touching any image
        if center_x is None or center_y is None:
            continue

        # All channels for an image set are kept together in one folder named after the key
        key_dir = pathlib.Path(output_img_dir) / key

        for i, channel in channel_mapping.items():
            output_filename = key_dir / f"{key}_d{i}_cropped.png"

            # Check for an existing crop first so the source image is not read needlessly
            if output_filename.exists():
                print(f"File {output_filename} already exists. Skipping.")
                continue

            # Build the source path from well, channel number, site, and channel name
            filename = f"{images_dir}/{info['well']}_01_{i}_{info['site']}_{channel}_001_illumcorrect.tiff"

            # cv2.imread signals failure by returning None instead of raising
            channel_image = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
            if channel_image is None:
                raise FileNotFoundError(
                    f"Could not read image for {key} (channel {channel}): {filename}"
                )

            # Clamp the box to the image so cells near a border do not index out of bounds
            top_left_x = max(int(center_x - half_crop), 0)
            top_left_y = max(int(center_y - half_crop), 0)
            bottom_right_x = min(int(center_x + half_crop), channel_image.shape[1])
            bottom_right_y = min(int(center_y + half_crop), channel_image.shape[0])

            cropped_channel = channel_image[
                top_left_y:bottom_right_y, top_left_x:bottom_right_x
            ]

            # A clamped box (or an odd crop_size) is smaller than requested; resizing rescales it
            # so every saved crop is exactly crop_size x crop_size
            cropped_channel = cv2.resize(cropped_channel, (crop_size, crop_size))

            key_dir.mkdir(exist_ok=True, parents=True)

            # cv2.imwrite reports failure through its return value
            if not cv2.imwrite(str(output_filename), cropped_channel):
                raise OSError(f"Failed to write cropped image: {output_filename}")

## Set paths and variables

In [4]:
# Root of the cell painting data repository
cell_painting_dir = pathlib.Path(
    "/media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data"
)

# Corrected Plate 5 images used to locate single cells (strict: fails if the folder is missing)
images_dir = (
    cell_painting_dir / "1.cellprofiler_ic/Corrected_Images/Corrected_Plate_5"
).resolve(strict=True)

# Folder that receives the cropped images
output_img_dir = pathlib.Path("./sc_crops")
output_img_dir.mkdir(exist_ok=True)

# Edge length of the square cropping box in pixels (NxN)
crop_size = 300

# Channel number -> channel name, as used in the image file names
channel_mapping = {
    1: "DAPI",
    2: "GFP",
    3: "CY5",
    4: "RFP",
}

# Parallel accumulators: each selected data frame and the name it is stored under
list_of_dfs, list_of_names = [], []

## Load in Plate 5 data to generate representative images from

In [5]:
# Locations of the feature-selected and annotated single-cell profiles
selected_profiles_path = pathlib.Path(
    f"{cell_painting_dir}/3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_5_sc_feature_selected.parquet"
)
annotated_profiles_path = pathlib.Path(
    f"{cell_painting_dir}/3.processing_features/data/single_cell_profiles/cleaned_sc_profiles/Plate_5_sc_annotated.parquet"
)

# Columns that identify a single cell in both tables, plus the neighbor count to bring over
merge_keys = ["Metadata_Well", "Metadata_Site", "Metadata_Nuclei_Number_Object_Number"]
neighbors_column = "Cells_Neighbors_NumberOfNeighbors_Adjacent"

# Feature-selected profiles
plate5_df = pd.read_parquet(selected_profiles_path)

# Annotated profiles, reduced to the merge keys and the adjacent-neighbor count
annot_df = pd.read_parquet(
    annotated_profiles_path,
    columns=[*merge_keys, neighbors_column],
)

# Attach the neighbor count to every cell and expose it under a Metadata_ name
plate5_df = plate5_df.merge(annot_df, on=merge_keys, how="inner").rename(
    columns={neighbors_column: "Metadata_Number_of_Cells_Neighbors_Adjacent"}
)

print(plate5_df.shape)
plate5_df.head()

(5348, 1160)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumVariance_CY5_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_01_256,Metadata_Number_of_Cells_Neighbors_Adjacent
0,B,1,B1,11,76,NF1,WT,64,Plate_5,1,...,-0.716575,0.27685,-1.330407,-1.362612,-0.988199,-0.678416,-0.845844,-0.426886,-0.256321,0
1,B,1,B1,14,76,NF1,WT,67,Plate_5,1,...,-1.137427,-1.359737,-0.763721,-0.527832,-1.206265,-0.306204,1.872514,-0.197194,0.396954,2
2,B,1,B1,15,76,NF1,WT,68,Plate_5,1,...,-0.36376,0.404264,-1.484791,-1.732719,-1.561797,-0.621552,0.015391,-0.422749,-0.083205,0
3,B,1,B1,17,76,NF1,WT,70,Plate_5,1,...,1.374772,2.029471,2.650493,1.198457,1.298047,-0.567581,-0.546843,-0.421867,-0.356696,0
4,B,1,B1,19,76,NF1,WT,71,Plate_5,1,...,0.393491,1.768716,0.692973,0.029321,0.259599,-0.694014,-0.594088,-0.552046,-0.41981,1


## Load in feature importance data and determine the top two highest coefficients 

We will be creating image montages for the two features with the highest coefficients, both of which are positive and therefore important for predicting the WT genotype.
The feature with the third highest absolute coefficient is the one most important for predicting the Null genotype, but we do not montage it here.

**Note:** The most positive coefficient marks the feature most important for predicting the WT genotype; the most negative coefficient marks the feature most important for predicting the Null genotype.

In [6]:
# Feature importances (coefficients) from the QC model
importances_path = pathlib.Path(
    "../../2.evaluate_model/model_evaluation_data/feature_importances_qc.parquet"
)
feat_import_df = pd.read_parquet(importances_path)

# Feature names ordered from the largest coefficient to the smallest
ranked_feature_names = feat_import_df.sort_values(
    by="feature_importances", ascending=False
)["feature_names"]

# Highest coefficient (positive, so related to predicting WT)
top_coeff_feature = ranked_feature_names.iloc[0]

# Second highest coefficient (positive, so related to predicting WT)
second_top_coeff_feature = ranked_feature_names.iloc[1]

# Most negative coefficient (predicting Null) [NOT INCLUDED AS MONTAGE]
most_negative_idx = feat_import_df["feature_importances"].idxmin()
top_Null_feature = feat_import_df.loc[most_negative_idx, "feature_names"]

# Show the three features, one per line
print(top_coeff_feature, second_top_coeff_feature, top_Null_feature, sep="\n")

Nuclei_RadialDistribution_FracAtD_DAPI_4of4
Nuclei_RadialDistribution_FracAtD_CY5_2of4
Cytoplasm_RadialDistribution_FracAtD_RFP_4of4


## Filter plate 5 single-cells to only include isolated cells that are not near the edge of the FOV

In [7]:
# Margin a nucleus center must keep from each border for a full crop box to fit
edge_margin = crop_size // 2

nuclei_x = plate5_df["Metadata_Nuclei_Location_Center_X"]
nuclei_y = plate5_df["Metadata_Nuclei_Location_Center_Y"]

# Isolated cells: no adjacent neighbors
is_isolated = plate5_df["Metadata_Number_of_Cells_Neighbors_Adjacent"].isin([0])

# NOTE: the upper bounds use the largest nucleus center observed in plate5_df,
# not the actual image width/height
within_x = (nuclei_x > edge_margin) & (nuclei_x < (nuclei_x.max() - edge_margin))
within_y = (nuclei_y > edge_margin) & (nuclei_y < (nuclei_y.max() - edge_margin))

# Keep only isolated cells that sit away from every border
filtered_plate5_df = plate5_df[is_isolated & within_x & within_y]

print(filtered_plate5_df.shape)
filtered_plate5_df.head()

(465, 1160)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_Site,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Cells_Number_Object_Number,...,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumVariance_CY5_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_01_256,Metadata_Number_of_Cells_Neighbors_Adjacent
0,B,1,B1,11,76,NF1,WT,64,Plate_5,1,...,-0.716575,0.27685,-1.330407,-1.362612,-0.988199,-0.678416,-0.845844,-0.426886,-0.256321,0
6,B,1,B1,5,76,NF1,WT,78,Plate_5,1,...,0.860125,2.670197,1.450978,0.273231,0.225417,-0.562113,-0.264053,-0.271858,-0.392302,0
7,B,1,B1,6,76,NF1,WT,79,Plate_5,1,...,-1.511567,-0.394214,-0.469294,-0.135764,0.262205,0.462,1.534657,2.920023,0.202942,0
16,B,1,B1,15,76,NF1,WT,68,Plate_5,2,...,-0.212915,0.915853,0.571409,-0.006105,0.431098,0.531071,0.621426,1.550741,-0.282232,0
71,C,1,C1,11,63,NF1,WT,305,Plate_5,1,...,0.997985,1.478442,1.571396,0.371741,0.250382,-0.236012,-0.439775,-0.473115,-0.327449,0


### Max single-cells for top highest feature

In [8]:
# Feature value plus the metadata needed to locate each cell in the images
max_top_columns = [
    top_coeff_feature,
    "Metadata_genotype",
    "Metadata_Well",
    "Metadata_Plate",
    "Metadata_Site",
    "Metadata_Number_of_Cells_Neighbors_Adjacent",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
]

# The 6 WT single-cells with the largest values of the top feature
wt_cells_for_top = filtered_plate5_df.loc[
    filtered_plate5_df["Metadata_genotype"] == "WT"
]
max_top_feature = wt_cells_for_top.nlargest(6, top_coeff_feature)[max_top_columns]

# Register the data frame together with its name
list_of_dfs.append(max_top_feature)
list_of_names.append("max_top_feature")

print(max_top_feature.shape)
max_top_feature

(6, 8)


Unnamed: 0,Nuclei_RadialDistribution_FracAtD_DAPI_4of4,Metadata_genotype,Metadata_Well,Metadata_Plate,Metadata_Site,Metadata_Number_of_Cells_Neighbors_Adjacent,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y
1491,3.986482,WT,F3,Plate_5,4,0,741.653865,585.318291
2124,3.245599,WT,G4,Plate_5,10,0,576.356057,397.71893
1328,2.29106,WT,E3,Plate_5,12,0,606.209362,225.688272
941,2.212648,WT,G2,Plate_5,4,0,1000.324752,451.423366
1966,2.179773,WT,E4,Plate_5,7,0,786.50885,675.59271
1256,2.148056,WT,D3,Plate_5,2,0,615.165603,275.279078


### Min single-cells for top highest feature

In [9]:
# Feature value plus the metadata needed to locate each cell in the images
min_top_columns = [
    top_coeff_feature,
    "Metadata_genotype",
    "Metadata_Well",
    "Metadata_Plate",
    "Metadata_Site",
    "Metadata_Number_of_Cells_Neighbors_Adjacent",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
]

# The 6 Null single-cells with the smallest values of the top feature
null_cells_for_top = filtered_plate5_df.loc[
    filtered_plate5_df["Metadata_genotype"] == "Null"
]
min_top_feature = null_cells_for_top.nsmallest(6, top_coeff_feature)[min_top_columns]

# Register the data frame together with its name
list_of_dfs.append(min_top_feature)
list_of_names.append("min_top_feature")

print(min_top_feature.shape)
min_top_feature

(6, 8)


Unnamed: 0,Nuclei_RadialDistribution_FracAtD_DAPI_4of4,Metadata_genotype,Metadata_Well,Metadata_Plate,Metadata_Site,Metadata_Number_of_Cells_Neighbors_Adjacent,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y
3440,-2.51042,Null,E10,Plate_5,8,0,373.985706,616.345444
3526,-2.295683,Null,F10,Plate_5,19,0,159.873446,230.457937
4589,-2.033815,Null,G11,Plate_5,9,0,852.071086,482.028754
5261,-1.792618,Null,G12,Plate_5,6,0,431.031715,254.927651
2912,-1.65427,Null,G9,Plate_5,15,0,154.278204,431.9069
5163,-1.646701,Null,F12,Plate_5,21,0,153.563893,468.904903


### Max single-cells for the second highest feature

In [10]:
# Feature value plus the metadata needed to locate each cell in the images
max_second_columns = [
    second_top_coeff_feature,
    "Metadata_genotype",
    "Metadata_Well",
    "Metadata_Plate",
    "Metadata_Site",
    "Metadata_Number_of_Cells_Neighbors_Adjacent",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
]

# The 6 WT single-cells with the largest values of the second highest feature
wt_cells_for_second = filtered_plate5_df.loc[
    filtered_plate5_df["Metadata_genotype"] == "WT"
]
max_second_top_feature = wt_cells_for_second.nlargest(6, second_top_coeff_feature)[
    max_second_columns
]

# Register the data frame together with its name
list_of_dfs.append(max_second_top_feature)
list_of_names.append("max_second_top_feature")

print(max_second_top_feature.shape)
max_second_top_feature

(6, 8)


Unnamed: 0,Nuclei_RadialDistribution_FracAtD_CY5_2of4,Metadata_genotype,Metadata_Well,Metadata_Plate,Metadata_Site,Metadata_Number_of_Cells_Neighbors_Adjacent,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y
1227,3.092037,WT,D3,Plate_5,20,0,499.100499,434.482551
251,2.628396,WT,E1,Plate_5,3,0,743.786273,674.774182
1951,2.232429,WT,E4,Plate_5,15,0,445.509153,330.021235
2152,2.225137,WT,G4,Plate_5,2,0,373.653567,406.300484
1096,2.195927,WT,B3,Plate_5,6,0,762.652365,596.915482
1180,1.845778,WT,C3,Plate_5,20,0,934.929842,634.242501


### Min single-cells for the second highest feature

In [11]:
# Feature value plus the metadata needed to locate each cell in the images
min_second_columns = [
    second_top_coeff_feature,
    "Metadata_genotype",
    "Metadata_Well",
    "Metadata_Plate",
    "Metadata_Site",
    "Metadata_Number_of_Cells_Neighbors_Adjacent",
    "Metadata_Nuclei_Location_Center_X",
    "Metadata_Nuclei_Location_Center_Y",
]

# The 6 Null single-cells with the smallest values of the second highest feature
null_cells_for_second = filtered_plate5_df.loc[
    filtered_plate5_df["Metadata_genotype"] == "Null"
]
min_second_top_feature = null_cells_for_second.nsmallest(
    6, second_top_coeff_feature
)[min_second_columns]

# Register the data frame together with its name
list_of_dfs.append(min_second_top_feature)
list_of_names.append("min_second_top_feature")

print(min_second_top_feature.shape)
min_second_top_feature

(6, 8)


Unnamed: 0,Nuclei_RadialDistribution_FracAtD_CY5_2of4,Metadata_genotype,Metadata_Well,Metadata_Plate,Metadata_Site,Metadata_Number_of_Cells_Neighbors_Adjacent,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y
3674,-3.062115,Null,G10,Plate_5,19,0,808.566934,222.580321
4261,-3.053204,Null,E11,Plate_5,5,0,278.493498,255.725618
3086,-2.603603,Null,B10,Plate_5,17,0,1035.746981,574.287047
4091,-2.556759,Null,D11,Plate_5,17,0,485.472995,217.110747
4414,-2.496828,Null,F11,Plate_5,11,0,782.646797,303.754875
4902,-2.484714,Null,D12,Plate_5,8,0,951.382083,263.449541


## Merge feature info into dictionary for processing

In [12]:
# Collapse the selected data frames into one lookup with an entry per single cell
sc_dict = create_sc_dict(list_of_dfs, list_of_names)

# Preview the first two entries of the dictionary as a sanity check
first_two_entries = list(sc_dict.items())[:2]
pprint(first_two_entries, indent=4)

[   (   'max_top_feature_1',
        {   'location_center_x': 741.653864618339,
            'location_center_y': 585.3182909265482,
            'plate': 'Plate_5',
            'site': '4',
            'well': 'F3'}),
    (   'max_top_feature_2',
        {   'location_center_x': 576.3560566334556,
            'location_center_y': 397.7189302569481,
            'plate': 'Plate_5',
            'site': '10',
            'well': 'G4'})]


## Generate single-cell crops 

In [13]:
# Write one PNG crop per channel for every single cell in the dictionary
generate_sc_crops(
    sc_dict,
    channel_mapping,
    images_dir=images_dir,
    output_img_dir=output_img_dir,
    crop_size=crop_size,
)