# Compute KS-test results per feature between the two different derivatives of the `ipn02.3 2λ` cell line

Plate 6 contains two derivatives of the cell line acquired from `iNFixion` and `MGH`.

## Import libraries

In [1]:
import pathlib
import pandas as pd
from scipy import stats
from joblib import load

## Set results directory and load in model to get list of the features used

In [2]:
# Directory that receives this notebook's outputs; created if it does not exist yet
results_dir = pathlib.Path("./results")
results_dir.mkdir(exist_ok=True)

# Load the trained model and keep the names of the features it uses
model_path = pathlib.Path("../1.train_models/data/trained_nf1_model.joblib")
model = load(model_path)
model_features = list(model.feature_names_in_)

# Number of model features
len(model_features)

907

## Load in Plate 6 normalized data

In [3]:
# Location of the normalized single-cell profiles for Plate 6
# NOTE(review): this is an absolute path; it must exist on the machine running the notebook
plate_6_norm_path = pathlib.Path(
    "/media/18tbdrive/1.Github_Repositories/nf1_schwann_cell_painting_data/3.processing_features/data/single_cell_profiles/Plate_6_sc_normalized.parquet"
)

# Read the normalized profiles into a DataFrame
plate_6_norm = pd.read_parquet(plate_6_norm_path)

## Perform KS-test comparing the features between the two cell line derivatives

In [4]:
# Separate the rows by the institution recorded in the metadata
is_institution_1 = plate_6_norm["Metadata_Institution"] == "iNFixion"
is_institution_2 = plate_6_norm["Metadata_Institution"] == "MGH"
institution_1_norm = plate_6_norm[is_institution_1]
institution_2_norm = plate_6_norm[is_institution_2]

# Run the KS-test on every non-metadata column, keyed by column name
ks_test_results_norm = {}

for column in plate_6_norm.columns:
    if not column.startswith("Metadata_"):
        # Compare the distribution of this feature between the two institutions
        ks_stat, p_value = stats.kstest(
            institution_1_norm[column], institution_2_norm[column]
        )
        ks_test_results_norm[column] = {"ks_stat": ks_stat, "p_value": p_value}

# One row per feature: transpose the dict-of-dicts frame and expose the
# feature name as a regular "feature" column
ks_results_by_feature = pd.DataFrame(ks_test_results_norm).T
ks_test_results_norm_df = ks_results_by_feature.reset_index().rename(
    columns={"index": "feature"}
)

In [5]:
# Label the preview, then show the first five rows of the KS-test table
print("\nKS-test results for normalized data:")
ks_test_results_norm_df.head(n=5)


KS-test results for normalized data:


Unnamed: 0,feature,ks_stat,p_value
0,Cytoplasm_AreaShape_Area,0.674023,5.3e-322
1,Cytoplasm_AreaShape_BoundingBoxArea,0.590938,2.4e-321
2,Cytoplasm_AreaShape_BoundingBoxMaximum_X,0.082142,3.439385e-10
3,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,0.109208,1.094445e-17
4,Cytoplasm_AreaShape_BoundingBoxMinimum_X,0.090834,2.296492e-12


## Add absolute value coefficients per feature to the results

In [6]:
# Read the feature importances table
feat_import_path = pathlib.Path(
    "../2.evaluate_model/model_evaluation_data/feature_importances.parquet"
)
feat_import_df = pd.read_parquet(feat_import_path)

# Keep only the magnitude of each importance, and rename the name column to
# "feature" so it lines up with the KS-test table
feat_import_df = feat_import_df.assign(
    feature_importances=feat_import_df["feature_importances"].abs()
).rename(columns={"feature_names": "feature"})

# Attach the importances to the KS-test results, matching rows on "feature"
ks_test_results_norm_df = pd.merge(
    ks_test_results_norm_df, feat_import_df, on="feature"
)

ks_test_results_norm_df.head()

Unnamed: 0,feature,ks_stat,p_value,feature_importances
0,Cytoplasm_AreaShape_Eccentricity,0.049263,0.0006084972,0.020474
1,Cytoplasm_AreaShape_FormFactor,0.283127,1.081349e-117,1.13896
2,Cytoplasm_AreaShape_MajorAxisLength,0.457394,5.382836e-314,0.169308
3,Cytoplasm_AreaShape_MinorAxisLength,0.530065,3.735e-321,0.014426
4,Cytoplasm_AreaShape_Orientation,0.021219,0.4361176,0.000416


## Split feature names into parts and save results

In [7]:
# Column names for the underscore-separated pieces of a feature name, in order
feature_part_names = [
    "compartment",
    "feature_group",
    "measurement",
    "channel",
    "parameter1",
    "parameter2",
    "parameter3",
]

# Break each feature name on "_" and force exactly one column per part name;
# feature names with fewer pieces get missing values in the trailing columns
feature_parts = ks_test_results_norm_df["feature"].str.split("_", expand=True)
feature_parts = feature_parts.reindex(
    columns=range(len(feature_part_names)), fill_value=pd.NA
)
ks_test_results_norm_df[feature_part_names] = feature_parts

# Keep only the rows whose feature is one of the model's features
is_model_feature = ks_test_results_norm_df["feature"].isin(model_features)
ks_test_results_norm_df = ks_test_results_norm_df[is_model_feature]

# Write the table into the results directory
output_path = pathlib.Path(f"{results_dir}/ks_test_derivatives_results.parquet")
ks_test_results_norm_df.to_parquet(output_path)

# Report the final shape and preview the first rows
print(ks_test_results_norm_df.shape)
ks_test_results_norm_df.head()

(907, 11)


Unnamed: 0,feature,ks_stat,p_value,feature_importances,compartment,feature_group,measurement,channel,parameter1,parameter2,parameter3
0,Cytoplasm_AreaShape_Eccentricity,0.049263,0.0006084972,0.020474,Cytoplasm,AreaShape,Eccentricity,,,,
1,Cytoplasm_AreaShape_FormFactor,0.283127,1.081349e-117,1.13896,Cytoplasm,AreaShape,FormFactor,,,,
2,Cytoplasm_AreaShape_MajorAxisLength,0.457394,5.382836e-314,0.169308,Cytoplasm,AreaShape,MajorAxisLength,,,,
3,Cytoplasm_AreaShape_MinorAxisLength,0.530065,3.735e-321,0.014426,Cytoplasm,AreaShape,MinorAxisLength,,,,
4,Cytoplasm_AreaShape_Orientation,0.021219,0.4361176,0.000416,Cytoplasm,AreaShape,Orientation,,,,


## Print rows from the top five feature importances

In [8]:
# Order the table from the largest to the smallest feature importance,
# then show the first five rows
ks_test_results_norm_df = ks_test_results_norm_df.sort_values(
    "feature_importances", ascending=False
)
ks_test_results_norm_df.head(n=5)

Unnamed: 0,feature,ks_stat,p_value,feature_importances,compartment,feature_group,measurement,channel,parameter1,parameter2,parameter3
64,Cytoplasm_RadialDistribution_FracAtD_RFP_4of4,0.041397,0.006538166,2.338295,Cytoplasm,RadialDistribution,FracAtD,RFP,4of4,,
291,Cells_Correlation_Correlation_DAPI_GFP,0.087697,1.484263e-11,1.991228,Cells,Correlation,Correlation,DAPI,GFP,,
54,Cytoplasm_Intensity_MeanIntensityEdge_GFP,0.034079,0.04116397,1.917576,Cytoplasm,Intensity,MeanIntensityEdge,GFP,,,
495,Cells_Texture_Correlation_DAPI_3_02_256,0.43032,1.157851e-276,1.729736,Cells,Texture,Correlation,DAPI,3,2.0,256.0
60,Cytoplasm_RadialDistribution_FracAtD_DAPI_4of4,0.069893,1.697322e-07,1.661875,Cytoplasm,RadialDistribution,FracAtD,DAPI,4of4,,
