# Well-Aggregated Plate and Genotype Correlation Analysis
Correlations between groups defined by genotype and plate are computed to understand how similar the morphologies of those groups are.
There are two genotypes {WT, Null}, and three plates {Plate 3, Plate 3 prime, Plate 5} explored in this correlation analysis.
These correlations are computed between cell morphologies aggregated to the well level after feature selection.

In [18]:
import pathlib
import sys

import pandas as pd

# Path to correlation class
sys.path.append("../utils")

# Class for calculating correlations
from CorrelateData import CorrelateData

## Find the root of the git repo on the host system

In [19]:
# Start the search at the directory the notebook is executed from
cwd = pathlib.Path.cwd()

# Check the working directory first, then each ancestor from nearest to
# farthest, and keep the first one that contains a ".git" directory.
# `None` is the fallback when no candidate qualifies.
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# Stop immediately when the notebook is not run from inside a Git checkout
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

# Inputs

In [20]:
# Select which profiles are correlated: "cleaned" uses the QC'd single-cell
# profile directory, any other value uses the non-QC'd directory.
# NOTE(review): the original comment called this the data type "for the model
# evaluation", which does not match this correlation notebook - confirm wording.
data_type = "cleaned"

# NOTE(review): both locations are absolute, user-specific paths. Because they
# are absolute, `root_dir / data_path` below evaluates to `data_path` itself
# (pathlib drops the left operand when the right one is absolute), so the Git
# root found earlier has no effect here. Consider a repo-relative path.
data_path = pathlib.Path(
    "/Users/marktalbot/Documents/VC Studio Homework Folders/HighRisk/nf1_schwann_cell_painting_data/3.processing_features/data/single_cell_profiles/cleaned_sc_profiles"
    if data_type == "cleaned"
    else "/Users/marktalbot/Documents/VC Studio Homework Folders/HighRisk/nf1_schwann_cell_painting_data/3.processing_features/data/single_cell_profiles"
)

# Parquet file for each plate, in the order Plate 3, Plate 3 prime, Plate 5
plate_files = (
    "Plate_3_bulk_camerons_method.parquet",
    "Plate_3_prime_bulk_camerons_method.parquet",
    "Plate_5_bulk_camerons_method.parquet",
)

# Resolve every path strictly so a missing file raises FileNotFoundError
# before any data is read
plate3df_path, plate3pdf_path, plate5df_path = (
    pathlib.Path(root_dir / data_path / file_name).resolve(strict=True)
    for file_name in plate_files
)

# Load each plate into memory
plate3df, plate3pdf, plate5df = (
    pd.read_parquet(plate_path)
    for plate_path in (plate3df_path, plate3pdf_path, plate5df_path)
)

# Outputs

In [21]:
# Directory (relative to the current working directory) that receives the
# correlation results
plate_correlation_path = pathlib.Path("construct_correlation_data")
# Create it if needed; parents=True / exist_ok=True make this safe to re-run
plate_correlation_path.mkdir(parents=True, exist_ok=True)

# Process Bulk Plate Data

## Combine data
Concatenate the plate data and retain only the columns common to all plates.

In [22]:
# Plates to combine, in the order their rows are stacked
plate_frames = [plate3df, plate3pdf, plate5df]

# Narrow the first plate's columns down to those present in every plate
plates_cols = plate_frames[0].columns
for other_plate in plate_frames[1:]:
    plates_cols = plates_cols.intersection(other_plate.columns)

# Stack the plates row-wise, then keep only the shared columns
platesdf = pd.concat(plate_frames, axis=0)[plates_cols]

In [23]:
# Preview the first rows of the combined plate data
platesdf.head()

Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_Plate,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,...,Nuclei_Texture_InfoMeas1_RFP_3_02_256,Nuclei_Texture_InfoMeas1_RFP_3_03_256,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256
0,B,1,B1,43,NF1,WT,Plate_3,0.739089,0.261315,0.461323,...,-1.324857,-1.122318,0.958232,1.054596,1.068769,1.04193,-1.083991,-1.017262,-0.713182,-1.002642
1,B,2,B2,139,NF1,WT,Plate_3,0.402931,0.139085,0.527398,...,0.262917,0.35305,0.090754,0.119581,0.132985,0.007156,-0.205224,-0.174515,-0.09384,-0.051587
2,B,3,B3,297,NF1,WT,Plate_3,0.370624,-0.068101,0.224139,...,0.442119,0.345724,-0.055711,-0.173884,-0.111095,-0.021318,-0.024655,0.013474,0.139199,0.285847
3,B,4,B4,559,NF1,WT,Plate_3,0.153483,-0.569034,-0.26817,...,0.437658,0.369961,-0.152767,-0.163421,-0.077296,0.00617,-0.476109,-0.398129,-0.342581,-0.197362
4,B,9,B9,68,NF1,Null,Plate_3,0.525913,0.768425,0.49479,...,-0.873028,-0.921089,0.935452,0.981821,0.945359,0.94246,-0.617901,-0.551435,-0.63519,-0.587984


In [24]:
# Columns are split by name: any column whose name contains this marker is
# treated as metadata, every other column as a morphology feature.
metadata_marker = "Metadata"
morph_cols = [
    column_name
    for column_name in platesdf.columns
    if metadata_marker not in column_name
]

# Metadata columns are whatever remains once the morphology columns are removed
meta_cols = platesdf.columns.difference(morph_cols)

# Correlate wells
Wells are correlated between plate and genotype.

In [25]:
# Correlation helper imported from ../utils (CorrelateData)
cd = CorrelateData()
# Accumulates one correlation result per comparison below; despite the "df"
# suffix this is a plain list until it is concatenated before saving
correlationsdf = []

## Well Correlations (same genotypes and different plates)

In [26]:
# Same genotype, different plates (per the section heading): the data are
# restricted to one genotype at a time and passed to `inter_correlations`
# with the plate as the ante-hoc grouping column.
# NOTE(review): the exact semantics of the ante-hoc / post-hoc group columns
# are defined in CorrelateData, which is not visible here - confirm there.
for genotype in platesdf["Metadata_genotype"].unique():
    correlationsdf.append(
        cd.inter_correlations(
            # Copy so the helper never operates on a view of `platesdf`
            _df=platesdf.loc[platesdf["Metadata_genotype"] == genotype].copy(),
            _antehoc_group_cols=["Metadata_Plate"],
            _feat_cols=morph_cols,
            _posthoc_group_cols=["Metadata_Well", "Metadata_genotype"],
            _drop_cols=["Metadata_Well"],
        )
    )

## Well Correlations (different genotypes and all possible plates)
Well correlations between different genotypes are computed across all plates, regardless of which plate each well comes from.

In [27]:
# Different genotypes, any plate (per the section heading): all plates are
# passed in together with the genotype as the ante-hoc grouping column.
# NOTE(review): grouping semantics live in CorrelateData - confirm there.
between_genotype_correlations = cd.inter_correlations(
    _df=platesdf.copy(),
    _antehoc_group_cols=["Metadata_genotype"],
    _feat_cols=morph_cols,
    _posthoc_group_cols=["Metadata_Plate", "Metadata_Well"],
    _drop_cols=["Metadata_Well"],
)
correlationsdf.append(between_genotype_correlations)

## Well Correlations (same genotype and same plate)

In [28]:
# Same genotype and same plate (per the section heading): `intra_correlations`
# is called with plate and genotype together as the ante-hoc grouping columns.
# NOTE(review): grouping semantics live in CorrelateData - confirm there.
within_group_correlations = cd.intra_correlations(
    _df=platesdf.copy(),
    _antehoc_group_cols=["Metadata_Plate", "Metadata_genotype"],
    _feat_cols=morph_cols,
    _posthoc_group_cols=["Metadata_Well"],
    _drop_cols=["Metadata_Well"],
)
correlationsdf.append(within_group_correlations)

# Save Plate Correlations

In [29]:
# Stack every collected correlation result row-wise into one dataframe.
# NOTE: this rebinds `correlationsdf` from a list to a DataFrame, so the cell
# cannot simply be re-run without first re-running the cells that build the list.
correlationsdf = pd.concat(correlationsdf, axis=0)

# QC'd ("cleaned") input is written under a file name with a "_qc" suffix
output_name = (
    "well_agg_plate_genotype_correlations_qc.parquet"
    if data_type == "cleaned"
    else "well_agg_plate_genotype_correlations.parquet"
)
correlations_file = plate_correlation_path / output_name

correlationsdf.to_parquet(correlations_file)

In [30]:
# Preview the first rows of the saved correlation table
correlationsdf.head()

Unnamed: 0,correlation,Metadata_Plate__group0,Metadata_Plate__group1,Metadata_genotype__group0,Metadata_genotype__group1
0,0.132592,Plate_3,Plate_3_prime,WT,WT
1,0.251257,Plate_3,Plate_3_prime,WT,WT
2,0.409582,Plate_3,Plate_3_prime,WT,WT
3,-0.172047,Plate_3,Plate_3_prime,WT,WT
4,0.245909,Plate_3,Plate_3_prime,WT,WT
