# Well-Aggregated Plate and Genotype Correlation Analysis
Correlations between groups defined by genotype and plate are determined to understand the similarities between group morphologies.
There are two genotypes {WT, Null}, and three plates {Plate 3, Plate 3 prime, Plate 5} explored in this correlation analysis.
These correlations are computed between cell morphologies aggregated to the well level after feature selection.

In [1]:
import pathlib
import sys

import pandas as pd

# Path to correlation class
sys.path.append("../utils")

# Class for calculating correlations
from CorrelateData import CorrelateData

## Find the root of the git repo on the host system

In [2]:
# Start the search from the directory the notebook is executed in
cwd = pathlib.Path.cwd()

# Check the working directory first, then each ancestor from nearest to
# farthest, and keep the first one that contains a ".git" directory
# (None when no such directory exists)
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# Fail early when the notebook is not run from inside a Git repository
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

# Inputs

In [3]:
# Choose which profiles to correlate: "cleaned" selects the QC'd profiles,
# any other value selects the profiles without QC applied
data_type = "cleaned"

# Set data path based on whether QC'd (cleaned) or non-QC'd data is used.
# resolve(strict=True) raises FileNotFoundError if the directory is missing.
if data_type == "cleaned":
    data_path = pathlib.Path(
        "/media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/3.processing_features/data/single_cell_profiles/cleaned_sc_profiles"
    ).resolve(strict=True)
else:
    data_path = pathlib.Path(
        "/media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/3.processing_features/data/single_cell_profiles"
    ).resolve(strict=True)

# Set paths for each plate.
# data_path is already absolute, so it is used directly: joining an absolute
# path onto root_dir would silently discard root_dir.
# resolve(strict=True) raises FileNotFoundError if a plate file is missing.
plate3df_path = (data_path / "Plate_3_bulk_camerons_method.parquet").resolve(
    strict=True
)
plate3pdf_path = (
    data_path / "Plate_3_prime_bulk_camerons_method.parquet"
).resolve(strict=True)
plate5df_path = (data_path / "Plate_5_bulk_camerons_method.parquet").resolve(
    strict=True
)

# Load each plate's bulk profiles into memory
plate3df = pd.read_parquet(plate3df_path)
plate3pdf = pd.read_parquet(plate3pdf_path)
plate5df = pd.read_parquet(plate5df_path)

# Outputs

In [4]:
# Directory that receives the correlation results (relative to the working directory)
plate_correlation_path = pathlib.Path("construct_correlation_data")

# Create it along with any missing parents; no error if it already exists
plate_correlation_path.mkdir(exist_ok=True, parents=True)

# Process Bulk Plate Data

## Combine data
Concatenate the data from all plates and retain only the columns common to every plate.

In [5]:
# Columns shared by Plate 3 and Plate 3 prime ...
shared_3_and_3p = plate3df.columns.intersection(plate3pdf.columns)
# ... narrowed to those that Plate 5 has as well
plates_cols = shared_3_and_3p.intersection(plate5df.columns)

# Stack the plates row-wise and keep only the columns common to all of them
platesdf = pd.concat([plate3df, plate3pdf, plate5df], axis=0)[plates_cols]

In [6]:
# Preview the first rows of the combined plate dataframe
platesdf.head()

Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_Plate,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,...,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DAPI_3_00_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_03_256,Nuclei_Texture_SumVariance_GFP_3_03_256
0,B,1,B1,43,NF1,WT,Plate_3,0.755509,0.200855,0.502924,...,1.063103,1.092884,-0.190159,-1.335928,-0.959558,-0.92424,-0.656993,-0.862443,0.141373,0.589769
1,B,2,B2,139,NF1,WT,Plate_3,0.419793,0.087161,0.567983,...,0.089232,-0.039208,-0.271067,0.237891,-0.168739,-0.128372,-0.06683,-0.016232,-0.255805,-0.330456
2,B,3,B3,297,NF1,WT,Plate_3,0.324369,-0.030554,0.192781,...,0.03618,0.032251,0.225088,0.6258,-0.055407,-0.03318,0.042812,0.182603,-0.752411,-0.386896
3,B,4,B4,559,NF1,WT,Plate_3,0.137954,-0.586517,-0.211653,...,-0.101324,-0.022847,-0.269345,0.437411,-0.419326,-0.349633,-0.307728,-0.182943,-0.162226,-0.379651
4,B,9,B9,68,NF1,Null,Plate_3,0.424452,0.797895,0.280133,...,0.916729,0.981627,-0.328436,-1.322182,-0.520135,-0.447786,-0.53612,-0.480293,0.253832,1.137389


In [7]:
# Morphology features: every column whose name does not contain "Metadata"
morph_cols = [
    column_name
    for column_name in platesdf.columns
    if "Metadata" not in column_name
]

# Metadata columns: whatever remains once the morphology features are removed
meta_cols = platesdf.columns.difference(morph_cols)

# Correlate wells
Wells are correlated with one another across plates and genotypes.

In [8]:
# Instance of the CorrelateData class (imported from ../utils) whose
# inter_correlations / intra_correlations methods are called in the cells below
cd = CorrelateData()
# Collects the result of each correlation call; the entries are concatenated
# into a single dataframe before saving
correlationsdf = []

## Well Correlations (same genotype and different plates)

In [9]:
# Correlate wells that share a genotype but come from different plates,
# handling one genotype at a time
for genotype in platesdf["Metadata_genotype"].unique():
    # Keep only the wells of the current genotype; copy so the helper
    # receives its own dataframe rather than a slice of platesdf
    genotype_df = platesdf.loc[platesdf["Metadata_genotype"] == genotype].copy()

    # NOTE(review): presumably inter_correlations correlates wells between
    # the groups defined by _antehoc_group_cols (here, plates) — confirm
    # against CorrelateData
    correlationsdf.append(
        cd.inter_correlations(
            _df=genotype_df,
            _antehoc_group_cols=["Metadata_Plate"],
            _feat_cols=morph_cols,
            _posthoc_group_cols=["Metadata_Well", "Metadata_genotype"],
            _drop_cols=["Metadata_Well"],
        )
    )

## Well Correlations (different genotypes and all possible plates)
Correlations are computed between wells of different genotypes, regardless of which plate each well comes from.

In [10]:
# Correlate wells of different genotypes across all plates.
# NOTE(review): presumably inter_correlations compares wells between the
# groups defined by _antehoc_group_cols (here, genotypes) — confirm against
# CorrelateData
cross_genotype_correlations = cd.inter_correlations(
    _df=platesdf.copy(),
    _antehoc_group_cols=["Metadata_genotype"],
    _feat_cols=morph_cols,
    _posthoc_group_cols=["Metadata_Plate", "Metadata_Well"],
    _drop_cols=["Metadata_Well"],
)

# Add the result to the collection of correlation results
correlationsdf.append(cross_genotype_correlations)

## Well Correlations (same genotype and same plate)

In [11]:
# Correlate wells that share both plate and genotype.
# NOTE(review): presumably intra_correlations compares wells within each
# group defined by _antehoc_group_cols (here, plate and genotype) — confirm
# against CorrelateData
within_group_correlations = cd.intra_correlations(
    _df=platesdf.copy(),
    _antehoc_group_cols=["Metadata_Plate", "Metadata_genotype"],
    _feat_cols=morph_cols,
    _posthoc_group_cols=["Metadata_Well"],
    _drop_cols=["Metadata_Well"],
)

# Add the result to the collection of correlation results
correlationsdf.append(within_group_correlations)

# Save Plate Correlations

In [12]:
# Stack every collected correlation result into a single dataframe
correlationsdf = pd.concat(correlationsdf, axis=0)

# Pick the output file name; the "_qc" variant is used when the cleaned data was analyzed
correlations_filename = (
    "well_agg_plate_genotype_correlations_qc.parquet"
    if data_type == "cleaned"
    else "well_agg_plate_genotype_correlations.parquet"
)
correlations_file = plate_correlation_path / correlations_filename

# Write the combined correlations to disk
correlationsdf.to_parquet(correlations_file)

In [13]:
# Preview the first rows of the combined correlations dataframe
correlationsdf.head()

Unnamed: 0,correlation,Metadata_Plate__group0,Metadata_Plate__group1,Metadata_genotype__group0,Metadata_genotype__group1
0,0.166052,Plate_3,Plate_3_prime,WT,WT
1,0.203561,Plate_3,Plate_3_prime,WT,WT
2,0.336235,Plate_3,Plate_3_prime,WT,WT
3,-0.20909,Plate_3,Plate_3_prime,WT,WT
4,0.267993,Plate_3,Plate_3_prime,WT,WT
