In [87]:
import os
# import pandas as pd 
import polars as pl
# import scanpy as sc
import numpy as np
import typing as tp
import re
import umap as umap
from sklearn.decomposition import PCA
# import cytominer_eval as cm
from scperturb import *
from scipy.stats import pearsonr
from tqdm import tqdm
import pandas as pd

2024-05-03 18:48:10.409859: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pl.read_parquet('output/bfmoalive_pilot/1_FeaturesImages/MedianCell/bfmoalive_A549_PE_48h_240312_L1_CP.parquet')

In [4]:
filenames = ['bfmoalive_A549_PE_48h_240312_L1_CP']
statmets = ['SingleCell', 'MedianCell', 'MeanCell']

dropcols = ['Nuclei_Location_Center_X',
            'Nuclei_Location_Center_Y',
            'Location',
            'ImageNumber_',
            'Parent',
            'Children',
            '_ObjectNumber',
            '_Object_Number',
            '_Y',
            '_X' 
            ]


cons = ['pos_con', 'neg_con']
float_columns=[pl.col(pl.Float32),pl.col(pl.Float64)]

In [91]:
def is_meta_column(
    c:str,
    allowlist:tp.List[str]=["Metadata_Well","Metadata_Barcode","Metadata_AcqID","Metadata_Site"],
    denylist:tp.List[str]=[],
)->bool:
    """
        allowlist:
            the function will return False for these, no matter if they are metadata or not
        denylist:
            the function will return True for these, no matter if they are metadata or not
    """
    if c in allowlist:
        return False
    if c in denylist:
        return True
    for ex in '''
        Metadata
        ^Count
        ImageNumber
        Object
        Parent
        Children
        Plate
        Well
        Location
        _[XYZ]_
        _[XYZ]$
        BoundingBox
        Phase
        Orientation
        Angle
        Scale
        Scaling
        Width
        Height
        Group
        FileName
        PathName
        URL
        Execution
        ModuleError
        LargeBrightArtefact
        MD5Digest
        RadialDistribution_Frac
        Intensity_
    '''.split():
        if re.search(ex, c):
            return True
    return False


def oneHot(row, mapping):
    return mapping.get(row, -1)

def makePCA(df, name='', statmet='SingleCell' , n_components=10):
    dataN=df.select([c for c in df.columns if not is_meta_column(c)]).select(float_columns).to_numpy()
    pca_model = PCA(n_components=n_components)
    pca_model = pca_model.fit(dataN)
    pcaOut = pca_model.transform(dataN)
    pca_columns = [f"umap{i+1}" for i in range(500)]
    df = pl.DataFrame(pcaOut, columns=pca_columns)
    return df

def makeUMAP(df, nn = 5, n_components=10, min_dist=0.2, spread= 5, n_epochs=None, metric='cosine'):
    dataN=df.select([c for c in df.columns if not is_meta_column(c)]).select(float_columns).to_numpy()
    umap_model = umap.UMAP(n_neighbors=nn
                        , min_dist=min_dist
                        , spread= spread
                        , n_epochs=n_epochs
                        , n_components=n_components
                        , metric=metric
                        , n_jobs=-1
                        )
  
    umapOut = umap_model.fit_transform(dataN)
    # df = df.with_columns([
    # pl.Series('umap1', umapOut[:, 0]),  
    # pl.Series('umap2', umapOut[:, 1])   
    # ])
    umap_columns = [f"umap{i+1}" for i in range(500)]
    df = pl.DataFrame(umapOut, columns=umap_columns)
    return df

In [73]:
float_columns=[pl.col(pl.Float32),pl.col(pl.Float64)]
tub_selected = ['Lexibulin', 'parbendazole', 'Nocodazole', 'ALBENDAZOLE', 'Colchicine', 'Vinorelbine ditartrate', 'Paclitaxel', 'plinabulin', 'MEBENDAZOLE', 'Vinblastine sulfate', 'Vincristine sulfate', 'Fenbendazole', 'ixabepilone']
cdk_selected = ['LY2857785', 'ON123300', 'DRF053-(R)', 'PHA-793887', 'Abemaciclib (Verzenio)', 'Roscovitine (Seliciclib, CYC202)', 'purvalanol-a', 'Palbociclib (Ibrance)', 'BMS-265246', 'Ribociclib (Kisqali)', 'TG-02', 'aminopurvalanol-a', 'RGB-286638']
parp_selected = ['E7449', 'Niraparib', 'Iniparib (BSI-201)', 'Rucaparib (phosphate)', 'IWR-1', 'AG14361', 'ME0328', 'EB-47', 'Veliparib', 'DR-2313', 'Talazoparib', 'AZD-2461', '(20S)-Protopanaxadiol']
akt_selected = ['Ipatasertib', 'triciribine', 'Buparlisib', 'Miransertib', 'AZD5363', 'MK-2206', 'PI-103', '10-DEBC', 'CCT128930', 'GSK690693', 'AR-12', 'GSK2110183', 'Uprosertib'] ## Kanske FPA-124
mapk_selected = ['RWJ-67657', 'Cobimetinib', 'Trametinib', 'Ulixertinib', 'EO-1428', 'SB-239063', 'SKF-86002', 'Binimetinib', 'TAK-715', 'PH-797804', 'SB-242235', 'LY2228820', 'Doramapimod']
hdac_selected = ['M344', 'Pimelic Diphenylamide 106', 'Abexinostat', 'Chidamide', 'Droxinostat', 'RG2833', 'Pyroxamide', 'Vorinostat', 'UF010', 'Resminostat (Hydrochloride)', 'Ricolinostat', 'SBHA', 'Tacedinaline', 'Scriptaid']

In [74]:
moas = {'TUB': tub_selected,
        'CDK': cdk_selected,
        'PARP': parp_selected,
        'AKT': akt_selected,
        'mapk': mapk_selected,
        'HDAC': hdac_selected
        }

# my_dict = {
#     "key1": "value1",
#     "key2": "value2",
#     "key3": "value3"
# }

In [75]:
for col in df.select([pl.col(pl.Float32),pl.col(pl.Float64)]).columns:
    before_drop=df.shape[0]
    df=df.filter(pl.col(col).is_not_null())
    after_drop=df.shape[0]



In [76]:
col_drop = ['_PathName_', '_FileName_']

columns_to_keep = [column for column in df.columns if not any(sub in column for sub in col_drop)]

# Select these columns from the DataFrame
cleaned_df = df.select(columns_to_keep)



cleaned_df.head()

Metadata_Barcode,Metadata_Well,Cytoplasm_RadialDistribution_RadialCV_illumPHAandWGA_2of4,Nuclei_Correlation_K_illumCONC_illumMITO,Nuclei_RadialDistribution_ZernikePhase_illumMITO_8_0,Nuclei_RadialDistribution_RadialCV_illumSYTO_4of4,Nuclei_AreaShape_Zernike_5_1,Cells_RadialDistribution_ZernikePhase_illumSYTO_1_1,Nuclei_RadialDistribution_ZernikePhase_illumPHAandWGA_7_7,Cytoplasm_RadialDistribution_RadialCV_illumPHAandWGA_Overflow,Cells_Granularity_9_illumHOECHST,Cells_Intensity_MADIntensity_illumHOECHST,Cells_RadialDistribution_FracAtD_illumSYTO_1of4,Nuclei_RadialDistribution_RadialCV_illumSYTO_2of4,Cells_RadialDistribution_ZernikePhase_illumPHAandWGA_5_1,Cytoplasm_RadialDistribution_ZernikePhase_illumHOECHST_2_0,Nuclei_AreaShape_Zernike_8_6,Cells_RadialDistribution_ZernikePhase_illumSYTO_8_0,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumCONC_4_4,Cells_Correlation_RWC_illumMITO_illumCONC,Cytoplasm_RadialDistribution_MeanFrac_illumHOECHST_3of4,Nuclei_RadialDistribution_ZernikeMagnitude_illumCONC_6_6,Nuclei_AreaShape_MaxFeretDiameter,Cells_AreaShape_Zernike_7_1,Cells_Granularity_12_illumSYTO,Cells_Granularity_15_illumHOECHST,Cytoplasm_Correlation_Overlap_illumMITO_illumPHAandWGA,Cytoplasm_RadialDistribution_ZernikePhase_illumSYTO_7_3,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumCONC_9_5,Cytoplasm_Correlation_K_illumPHAandWGA_illumSYTO,Cytoplasm_Intensity_MeanIntensity_illumCONC,Nuclei_RadialDistribution_ZernikePhase_illumMITO_6_4,Nuclei_Granularity_10_illumHOECHST,Nuclei_Granularity_13_illumCONC,Cells_RadialDistribution_ZernikeMagnitude_illumSYTO_8_4,Cells_RadialDistribution_ZernikeMagnitude_illumMITO_7_3,Nuclei_Granularity_2_illumCONC,…,Cytoplasm_RadialDistribution_MeanFrac_illumMITO_3of4,Cells_AreaShape_BoundingBoxArea,Nuclei_RadialDistribution_ZernikeMagnitude_illumCONC_8_6,Cells_Correlation_Manders_illumSYTO_illumCONC,Cells_Parent_nuclei,Cytoplasm_AreaShape_BoundingBoxMaximum_X,Cytoplasm_Neighbors_FirstClosestObjectNumber_Adjacent,Cytoplasm_Number_Object_Number,Cells_Metadata_AcqID,Cytoplasm_Location_Center_Z,Cytoplasm_ImageNumber,Cytoplasm_Children_cytoplasm_Count,Metadata_cmpd_Batch_ID,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Metadata_cmpd_moa,Cytoplasm_Parent_precells,Metadata_cmpd_moa_group,Cytoplasm_Children_nuclei_Count,Cells_Parent_cells,Metadata_cmpd_Plate_ID,Metadata_Site,Cytoplasm_Metadata_AcqID,Cells_ImageNumber,Metadata_cmpd_target,Cytoplasm_AreaShape_EulerNumber,Cytoplasm_Neighbors_SecondClosestObjectNumber_Adjacent,Metadata_cmpd_Compound_ID,Cells_Number_Object_Number,Metadata_cmpd_Form,Metadata_cmpd_Conc_mM,Cytoplasm_AreaShape_Area,Metadata_cmpd_cmpdname,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,Cytoplasm_AreaShape_ConvexArea,Cytoplasm_AreaShape_BoundingBoxArea,Cytoplasm_ObjectNumber,Cytoplasm_AreaShape_BoundingBoxMinimum_Y
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,i32,i32,i32,i32,i32,i32,i32,i32,str,i32,str,i32,str,i32,i32,str,i32,i32,i32,str,i32,i32,str,i32,str,i64,i32,str,i32,i32,i32,i32,i32
"""bfmoalive_A549…","""B03""",-0.068828,-0.826513,1.0,0.0,0.086781,0.13864,-0.095953,1.170463,-0.784504,-1.190277,-1.129475,-0.311303,-0.226543,0.0,-0.001174,0.0,-0.098691,0.526812,1.753993,0.277546,0.154568,-0.60335,-0.418361,0.703098,-0.261384,-0.030222,0.352222,0.256858,0.567149,-0.065819,0.073948,0.111635,-0.581359,-0.535193,-0.573127,…,1.437631,2.860429,0.567097,-0.788606,5,310,8,1,4677,0,1,1,"""BJ1898167""",171,"""HDAC inhibitor…",7,"""HDAC""",1,1,"""P104700""",3,4677,1,"""HDAC""",1,13,"""CBK290479""",1,"""DMSO""",10,13623,"""Abexinostat""",199,15951,23352,1,31
"""bfmoalive_A549…","""B17""",0.257859,-0.494009,1.0,0.0,-0.070144,-0.02632,0.001752,0.562204,0.000833,-0.00545,-0.20998,-0.123111,0.277236,0.0,0.061826,0.0,0.003278,0.408883,0.019225,-0.421162,-0.322932,0.072087,0.452128,0.403767,-0.947677,-0.014562,-0.286731,-0.022135,-0.035955,-0.084534,0.141456,0.435277,0.502934,-0.36261,-0.121294,…,0.107685,0.346826,-0.030213,-0.87797,5,149,12,1,4677,0,1,1,"""BJ1898265""",26,"""HDAC inhibitor…",6,"""HDAC""",1,1,"""P104700""",9,4677,1,"""HDAC6|HDAC8""",1,3,"""CBK290547""",1,"""DMSO""",10,17921,"""Droxinostat""",245,19214,26322,1,31
"""bfmoalive_A549…","""G14""",-0.477886,0.533709,-1.0,0.0,0.013693,0.090723,0.070456,-0.243346,0.036986,-0.071599,0.514161,0.089386,0.087222,0.0,-0.477958,0.0,-0.184669,-0.80749,0.229903,-0.281246,0.691968,0.041394,-0.429586,-0.958376,1.696636,0.087917,-0.303731,-1.261832,-1.396538,0.079941,0.010199,-0.442911,-0.14269,-0.102282,-0.090064,…,0.491208,-0.067905,-1.087246,1.002579,19,1280,5,1,4677,0,1,1,"""BJ1896286""",1076,"""p38 MAPK inhib…",16,"""MAPK""",1,1,"""P104700""",7,4677,1,"""MAPK14""",1,7,"""CBK308133""",1,"""DMSO""",10,13007,"""TAK-715""",197,14956,39168,1,5
"""bfmoalive_A549…","""I16""",0.087303,0.266359,0.0,0.0,0.021015,-0.051602,0.052439,-1.063609,0.260151,0.630888,0.602901,-0.115654,0.004503,0.0,-0.020746,0.0,-0.022192,0.037356,-0.536604,-0.297213,-0.395221,0.215739,0.175569,-0.889807,0.081975,0.069624,-0.069167,-0.419836,-0.118422,0.041553,-0.325115,-0.268101,0.186237,0.149192,0.167264,…,-0.541982,-0.687237,-0.261479,0.578775,11,310,4,1,4677,0,1,1,"""BJ1894454""",154,"""PARP inhibitor…",6,"""PARP""",1,1,"""P104700""",4,4677,1,"""PARP""",1,20,"""CBK289987""",1,"""DMSO""",10,22491,"""NU1025""",237,24355,34944,1,13
"""bfmoalive_A549…","""P12""",-0.506741,-1.528433,0.0,0.0,0.243367,-0.094423,-0.010318,0.713405,0.129391,0.339962,-0.590543,-0.85379,0.075257,0.0,-0.458247,0.0,0.560617,0.854043,0.169736,3.007963,-1.175715,-0.525398,-0.119895,2.99327,-0.166146,-0.107346,2.191429,0.573282,4.573414,-0.01713,2.055285,0.566345,-0.326669,0.167792,-0.04355,…,0.712576,0.702499,2.189545,-0.175563,7,2309,2,1,4677,0,1,1,"""[fenb]""",2096,"""[fenb]""",5,"""[fenb]""",2,1,"""P104700""",7,4677,1,"""pos_con""",1,8,"""[fenb]""",1,"""DMSO""",10,32249,"""fenb""",422,34278,44304,1,214


In [77]:
df = cleaned_df

In [78]:
def calculate_euclidean(dmso, treatment):
    return np.sqrt(np.sum((dmso - treatment) ** 2))
df_agg = df
df_float_columns=set(list(df_agg.select(float_columns).columns))
group_by_columns=['Metadata_cmpd_cmpdname']
other_columns=set(list(df_agg.columns))-df_float_columns-set(group_by_columns)
# # group by mean for all float features, and group by first for all non-float columns (indices and string metadata)
group_by_aggregates=[
    *[pl.mean(x) for x in list(df_float_columns)],
    *[pl.first(x) for x in list(other_columns)]
]
df_agg=df_agg.group_by(group_by_columns).agg(group_by_aggregates)

In [79]:
df_agg.head()

Metadata_cmpd_cmpdname,Cytoplasm_Correlation_Manders_illumPHAandWGA_illumSYTO,Cells_RadialDistribution_ZernikeMagnitude_illumCONC_6_6,Cells_RadialDistribution_RadialCV_illumHOECHST_3of4,Cells_Correlation_Manders_illumPHAandWGA_illumCONC,Cytoplasm_RadialDistribution_MeanFrac_illumHOECHST_2of4,Cells_Granularity_4_illumPHAandWGA,Cytoplasm_Intensity_LowerQuartileIntensity_illumHOECHST,Nuclei_Granularity_12_illumHOECHST,Cytoplasm_AreaShape_Zernike_8_8,Cells_RadialDistribution_FracAtD_illumMITO_Overflow,Cytoplasm_Intensity_MeanIntensityEdge_illumSYTO,Cytoplasm_Intensity_MinIntensityEdge_illumHOECHST,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumCONC_4_2,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumHOECHST_7_1,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumHOECHST_9_3,Nuclei_RadialDistribution_FracAtD_illumHOECHST_Overflow,Cytoplasm_Intensity_MinIntensity_illumMITO,Cytoplasm_Correlation_K_illumPHAandWGA_illumHOECHST,Nuclei_AreaShape_Zernike_5_3,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumHOECHST_6_6,Cells_RadialDistribution_ZernikePhase_illumMITO_0_0,Cells_RadialDistribution_ZernikePhase_illumMITO_3_3,Nuclei_AreaShape_Zernike_9_1,Cytoplasm_AreaShape_EquivalentDiameter,Cells_Granularity_15_illumCONC,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumCONC_9_3,Cytoplasm_RadialDistribution_ZernikePhase_illumMITO_4_0,Nuclei_RadialDistribution_RadialCV_illumSYTO_3of4,Cytoplasm_RadialDistribution_RadialCV_illumMITO_Overflow,Cytoplasm_Intensity_StdIntensityEdge_illumCONC,Nuclei_AreaShape_Zernike_8_2,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumHOECHST_8_2,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumCONC_2_2,Cytoplasm_Location_MaxIntensity_Z_illumPHAandWGA,Cytoplasm_RadialDistribution_MeanFrac_illumSYTO_2of4,Nuclei_RadialDistribution_ZernikeMagnitude_illumHOECHST_3_1,…,Nuclei_Intensity_MedianIntensity_illumSYTO,Nuclei_Granularity_12_illumCONC,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumMITO_7_5,Cytoplasm_Children_nuclei_Count,Cytoplasm_Location_Center_Z,Cytoplasm_AreaShape_BoundingBoxMinimum_Y,Cells_Parent_cells,Metadata_cmpd_Compound_ID,Cells_Number_Object_Number,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,Cytoplasm_AreaShape_Area,Cytoplasm_Neighbors_SecondClosestObjectNumber_Adjacent,Cytoplasm_AreaShape_EulerNumber,Metadata_cmpd_target,Metadata_Well,Metadata_cmpd_moa,Cells_ImageNumber,Cytoplasm_AreaShape_BoundingBoxArea,Metadata_cmpd_Batch_ID,Metadata_cmpd_moa_group,Metadata_Barcode,Cytoplasm_Neighbors_FirstClosestObjectNumber_Adjacent,Metadata_cmpd_Conc_mM,Cytoplasm_Parent_precells,Cytoplasm_Metadata_AcqID,Metadata_cmpd_Plate_ID,Metadata_cmpd_Form,Cells_Metadata_AcqID,Cytoplasm_AreaShape_ConvexArea,Cytoplasm_Children_cytoplasm_Count,Cells_Parent_nuclei,Metadata_Site,Cytoplasm_Number_Object_Number,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Cytoplasm_AreaShape_BoundingBoxMaximum_X,Cytoplasm_ImageNumber,Cytoplasm_ObjectNumber
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,i32,i32,i32,i32,str,i32,i32,i32,i32,i32,str,str,str,i32,i32,str,str,str,i32,i64,i32,i32,str,str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
"""IWR-1""",0.315562,-0.358418,-0.019717,-0.056131,-0.446342,0.326655,-0.483166,-0.456663,-0.110756,-0.723908,-0.40494,-0.443352,-0.325168,0.157439,0.235467,0.0,1.280951,-0.284162,-0.151856,-0.28586,0.0,0.028615,-0.105765,-0.437858,0.25334,-0.199608,0.0,0.0,-0.968742,-0.600153,-0.032377,0.214328,-0.349628,0.0,-0.258316,0.098518,…,0.288068,0.216604,-0.060064,1,0,35,1,"""CBK289918H""",1,173,12692,12,1,"""Wnt""","""K05""","""PARP inhibitor…",1,21252,"""BJ1896120""","""PARP""","""bfmoalive_A549…",9,10,13,4677,"""P104700""","""DMSO""",4677,14001,1,13,4,1,146,300,1,1
"""Vincristine su…",0.814964,1.100074,0.921509,0.63117,1.294013,-0.668151,0.27578,2.448105,-0.346173,1.292453,-0.015795,-0.235659,1.308132,-0.735544,-0.248873,0.0,-0.979357,-1.395679,0.149589,-0.377196,0.0,-0.105908,0.384849,1.432939,0.489489,2.139829,0.0,0.0,2.214508,2.757882,0.050609,-0.523528,1.651148,0.0,0.832296,0.21346,…,-0.040553,-0.40898,0.101315,1,0,194,2,"""CBK277916""",2,421,17472,5,1,"""Tubulin""","""F19""","""Inhibits micro…",1,37228,"""BJ1897809""","""TUB""","""bfmoalive_A549…",1,10,6,4677,"""P104700""","""DMSO""",4677,20803,1,3,4,2,705,869,1,2
"""ryuvidine""",-0.150411,-0.262653,0.019142,-0.550454,-0.289666,-0.070885,0.661573,-0.317816,0.022723,-0.274901,-0.148221,0.843887,-0.283635,0.328161,0.214626,0.0,1.75397,0.503676,0.158717,0.11425,0.0,-0.02706,0.056404,-0.323733,-0.288135,-0.283967,0.0,0.0,-0.778362,-0.434609,-0.001516,0.256747,-0.313828,0.0,-0.179749,0.30982,…,0.367879,-0.000907,0.179926,1,0,7,1,"""CBK290226""",1,189,17154,13,1,"""CDK4""","""L14""","""undefined""",1,24752,"""BJ1895932""","""CDK""","""bfmoalive_A549…",3,10,13,4677,"""P104700""","""DMSO""",4677,17765,1,15,1,1,1259,1395,1,1
"""TG-02""",1.665036,3.307765,0.283998,-0.081614,-0.29082,-0.133863,0.510449,1.744653,0.139854,-0.615391,-0.831175,-0.127117,4.462582,-1.078854,-0.204162,0.0,-0.655365,-1.017525,0.33084,0.704364,0.0,0.069154,-0.03206,-0.546989,2.118623,3.918256,0.0,0.0,-0.436178,4.31832,-0.103491,-0.581989,4.176915,0.0,-0.619182,0.024162,…,-1.744174,0.406342,1.231379,1,0,1774,12,"""CBK308919""",12,1958,10143,7,1,"""CDK9""","""F10""","""undefined""",1,19872,"""BJ1897361""","""CDK""","""bfmoalive_A549…",14,10,17,4677,"""P104700""","""DMSO""",4677,10512,1,16,4,12,1325,1433,1,12
"""Cobimetinib""",-0.948749,0.292533,-1.20822,-0.665158,1.228216,-0.553934,-0.714303,0.909783,0.211162,1.677152,-0.230119,-0.468155,0.25224,-1.31783,-1.026326,0.0,1.316988,-2.029647,-0.941699,0.215076,0.0,-0.024185,-0.592019,2.378866,0.825488,0.221992,0.0,0.0,2.315886,0.388693,0.467705,-1.126151,0.251982,0.0,0.940482,-0.813462,…,-1.159271,0.243951,0.241708,1,0,99,1,"""CBK303945_1""",1,434,72878,2,1,"""MAP2K1""","""A06""","""undefined""",1,110885,"""BJ1897191""","""MAPK""","""bfmoalive_A549…",6,10,9,4677,"""P104700""","""DMSO""",4677,75091,1,10,3,1,1850,2181,1,1


In [95]:
targets = ['dmso','MAPK']
sel_cmpds = ['dmso'] + mapk_selected 
df_2class = df_agg.filter(pl.col('Metadata_cmpd_moa_group').is_in(targets))
df_2class = df_2class.filter(pl.col('Metadata_cmpd_cmpdname').is_in(sel_cmpds))
df_treat = df_2class.filter(pl.col(['Metadata_cmpd_cmpdname']) != 'dmso')
df_dmso = df_2class.filter(pl.col(['Metadata_cmpd_cmpdname']) == 'dmso')

df_dmso = df_dmso.with_columns(pl.lit(1).alias("key"))
df_treat = df_treat.with_columns(pl.lit(1).alias("key"))

feature_columns=df_2class.select(float_columns)
dmso_features = np.array(df_dmso.select(feature_columns).to_numpy()[0])
treat_features = df_treat.select(feature_columns)
# Calculate Euclidean distance for each treatment
distances = []
for row in treat_features.iter_rows(named=False):
    treatment_features = np.array(row)
    distance = calculate_euclidean(dmso_features, treatment_features)
    print(distance)
    distances.append(distance)
distances = distances[1:]
df_treat = df_treat.with_columns(pl.Series("Euclidean_Distance", distances))
mapk_df = df_treat.select(pl.col(['Metadata_cmpd_cmpdname', 'Metadata_cmpd_moa_group', 'Euclidean_Distance']))
outdir = 'output/bfmoalive_pilot/3_perturbations'
if not os.path.exists(outdir): 
    os.makedirs(outdir)
mapk_df.write_csv(f'{outdir}/MAPK.csv')


0.0
1634.760345710443
407.24374273613074
1587.8790683938873
1630.2046076279632
315.2084727081119
1667.4794866818797
262.5226259181921
1654.775774860756
1590.38739012775
1592.7567237937674
1614.4324130033544
1628.687189436708
1566.0223726478075


In [83]:
for key, value in moas.items():
    targets = ['dmso',key]
    sel_cmpds = ['dmso'] + value 
    df_2class = df_agg.filter(pl.col('Metadata_cmpd_moa_group').is_in(targets))
    df_2class = df_2class.filter(pl.col('Metadata_cmpd_cmpdname').is_in(sel_cmpds))
    df_treat = df_2class.filter(pl.col(['Metadata_cmpd_cmpdname']) != 'dmso')
    df_dmso = df_2class.filter(pl.col(['Metadata_cmpd_cmpdname']) == 'dmso')

    feature_columns=df_2class.select(float_columns)
    dmso_features = np.array(df_dmso.select(feature_columns).to_numpy()[0])
    treat_features = df_treat.select(feature_columns)
    # Calculate Euclidean distance for each treatment
    distances = []
    for row in treat_features.iter_rows(named=False):
        treatment_features = np.array(row)
        distance = calculate_euclidean(dmso_features, treatment_features)
        print(distance)
        distances.append(distance)
    distances = distances[1:]
    df_treat = df_treat.with_columns(pl.Series("Euclidean_Distance", distances))
    df_out = df_treat.select(pl.col(['Metadata_cmpd_cmpdname', 'Metadata_cmpd_moa_group', 'Euclidean_Distance']))
    outdir = 'output/bfmoalive_pilot/3_perturbations'
    if not os.path.exists(outdir): 
        os.makedirs(outdir)
    df_out.write_csv(f'{outdir}/{key}.csv')

0.0
1541.32240366676
453.7233544627179
167.6356440511391
819.1048782775372
569.122558814582
472.515553345358
639.1682443241182
569.8206786374285
455.73813906634183
1727.7841206695316
287.2420709612252
497.941682338628
178.7964936760725
0.0
1716.1026319449502
1002.6601559387108
814.6404075876203
1654.1104870656268
1613.8735503746177
775.1430814677194
1725.2327936471406
509.91436327828114
1101.4235170620925
1637.6316701282196
702.6877803157246
754.2637812691481
1683.225472055833
0.0
391.4734760195839
319.7514603603027
502.22253645447176
493.03479623350046
1576.5228125413144
278.91083817666595
221.64453118536684
1698.0220849927566
1597.2531966888344
392.3537973353339
353.8539350801107
805.8940697685143
267.3173432576008
0.0
1836.55672799059
1666.70828020685
1756.3089497752524
1677.9269077276417
1929.9936738950462
1753.3955963896988
1751.7681605789778
882.9017380899578
1699.9172495309492
554.8887787708202
1781.8625575865963
1643.390890925162
1673.4970158220572
0.0
0.0
559.205111665078
487.

In [92]:

for key, value in moas.items():
    
    
    targets = ['dmso',key]
    sel_cmpds = ['dmso'] + value 
    df_2class = df_agg.filter(pl.col('Metadata_cmpd_moa_group').is_in(targets))
    df_2class = df_2class.filter(pl.col('Metadata_cmpd_cmpdname').is_in(sel_cmpds))
    df_umap = makePCA(df_2class)
    df_2class=df_2class.select([c for c in df.columns if is_meta_column(c)]).select(float_columns)
    df_2class = pl.concat([df_2class, df_umap], how='horizontal')
    df_treat = df_2class.filter(pl.col(['Metadata_cmpd_cmpdname']) != 'dmso')
    df_dmso = df_2class.filter(pl.col(['Metadata_cmpd_cmpdname']) == 'dmso')

    feature_columns=df_2class.select(float_columns)
    dmso_features = np.array(df_dmso.select(feature_columns).to_numpy()[0])
    treat_features = df_treat.select(feature_columns)
    # Calculate Euclidean distance for each treatment
    distances = []
    for row in treat_features.iter_rows(named=False):
        treatment_features = np.array(row)
        distance = calculate_euclidean(dmso_features, treatment_features)
        print(distance)
        distances.append(distance)
    distances = distances[1:]
    df_treat = df_treat.with_columns(pl.Series("Euclidean_Distance", distances))
    df_out = df_treat.select(pl.col(['Metadata_cmpd_cmpdname', 'Metadata_cmpd_moa_group', 'Euclidean_Distance']))
    outdir = 'output/bfmoalive_pilot/3_perturbations'
    if not os.path.exists(outdir): 
        os.makedirs(outdir)
    df_out.write_csv(f'{outdir}/{key}_pca.csv')

TypeError: DataFrame.__init__() got an unexpected keyword argument 'columns'

In [None]:
targets = ['dmso','MAPK']
sel_cmpds = ['dmso'] + mapk_selected 
df_2class = df_agg.filter(pl.col('Metadata_cmpd_moa_group').is_in(targets))
df_2class = df_2class.filter(pl.col('Metadata_cmpd_cmpdname').is_in(sel_cmpds))
df_treat = df_2class.filter(pl.col(['Metadata_cmpd_cmpdname']) != 'dmso')
df_dmso = df_2class.filter(pl.col(['Metadata_cmpd_cmpdname']) == 'dmso')

df_dmso = df_dmso.with_columns(pl.lit(1).alias("key"))
df_treat = df_treat.with_columns(pl.lit(1).alias("key"))

feature_columns=df_2class.select(float_columns)
dmso_features = np.array(df_dmso.select(feature_columns).to_numpy()[0])
treat_features = df_treat.select(feature_columns)
# Calculate Euclidean distance for each treatment
distances = []
for row in treat_features.iter_rows(named=False):
    treatment_features = np.array(row)
    distance = calculate_euclidean(dmso_features, treatment_features)
    print(distance)
    distances.append(distance)
distances = distances[1:]
df_treat = df_treat.with_columns(pl.Series("Euclidean_Distance", distances))
mapk_df = df_treat.select(pl.col(['Metadata_cmpd_cmpdname', 'Metadata_cmpd_moa_group', 'Euclidean_Distance']))
outdir = 'output/bfmoalive_pilot/3_perturbations'
if not os.path.exists(outdir): 
    os.makedirs(outdir)
mapk_df.write_csv(f'{outdir}/MAPK.csv')

In [96]:
df.head()

Metadata_Barcode,Metadata_Well,Cytoplasm_RadialDistribution_RadialCV_illumPHAandWGA_2of4,Nuclei_Correlation_K_illumCONC_illumMITO,Nuclei_RadialDistribution_ZernikePhase_illumMITO_8_0,Nuclei_RadialDistribution_RadialCV_illumSYTO_4of4,Nuclei_AreaShape_Zernike_5_1,Cells_RadialDistribution_ZernikePhase_illumSYTO_1_1,Nuclei_RadialDistribution_ZernikePhase_illumPHAandWGA_7_7,Cytoplasm_RadialDistribution_RadialCV_illumPHAandWGA_Overflow,Cells_Granularity_9_illumHOECHST,Cells_Intensity_MADIntensity_illumHOECHST,Cells_RadialDistribution_FracAtD_illumSYTO_1of4,Nuclei_RadialDistribution_RadialCV_illumSYTO_2of4,Cells_RadialDistribution_ZernikePhase_illumPHAandWGA_5_1,Cytoplasm_RadialDistribution_ZernikePhase_illumHOECHST_2_0,Nuclei_AreaShape_Zernike_8_6,Cells_RadialDistribution_ZernikePhase_illumSYTO_8_0,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumCONC_4_4,Cells_Correlation_RWC_illumMITO_illumCONC,Cytoplasm_RadialDistribution_MeanFrac_illumHOECHST_3of4,Nuclei_RadialDistribution_ZernikeMagnitude_illumCONC_6_6,Nuclei_AreaShape_MaxFeretDiameter,Cells_AreaShape_Zernike_7_1,Cells_Granularity_12_illumSYTO,Cells_Granularity_15_illumHOECHST,Cytoplasm_Correlation_Overlap_illumMITO_illumPHAandWGA,Cytoplasm_RadialDistribution_ZernikePhase_illumSYTO_7_3,Cytoplasm_RadialDistribution_ZernikeMagnitude_illumCONC_9_5,Cytoplasm_Correlation_K_illumPHAandWGA_illumSYTO,Cytoplasm_Intensity_MeanIntensity_illumCONC,Nuclei_RadialDistribution_ZernikePhase_illumMITO_6_4,Nuclei_Granularity_10_illumHOECHST,Nuclei_Granularity_13_illumCONC,Cells_RadialDistribution_ZernikeMagnitude_illumSYTO_8_4,Cells_RadialDistribution_ZernikeMagnitude_illumMITO_7_3,Nuclei_Granularity_2_illumCONC,…,Cytoplasm_RadialDistribution_MeanFrac_illumMITO_3of4,Cells_AreaShape_BoundingBoxArea,Nuclei_RadialDistribution_ZernikeMagnitude_illumCONC_8_6,Cells_Correlation_Manders_illumSYTO_illumCONC,Cells_Parent_nuclei,Cytoplasm_AreaShape_BoundingBoxMaximum_X,Cytoplasm_Neighbors_FirstClosestObjectNumber_Adjacent,Cytoplasm_Number_Object_Number,Cells_Metadata_AcqID,Cytoplasm_Location_Center_Z,Cytoplasm_ImageNumber,Cytoplasm_Children_cytoplasm_Count,Metadata_cmpd_Batch_ID,Cytoplasm_AreaShape_BoundingBoxMinimum_X,Metadata_cmpd_moa,Cytoplasm_Parent_precells,Metadata_cmpd_moa_group,Cytoplasm_Children_nuclei_Count,Cells_Parent_cells,Metadata_cmpd_Plate_ID,Metadata_Site,Cytoplasm_Metadata_AcqID,Cells_ImageNumber,Metadata_cmpd_target,Cytoplasm_AreaShape_EulerNumber,Cytoplasm_Neighbors_SecondClosestObjectNumber_Adjacent,Metadata_cmpd_Compound_ID,Cells_Number_Object_Number,Metadata_cmpd_Form,Metadata_cmpd_Conc_mM,Cytoplasm_AreaShape_Area,Metadata_cmpd_cmpdname,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,Cytoplasm_AreaShape_ConvexArea,Cytoplasm_AreaShape_BoundingBoxArea,Cytoplasm_ObjectNumber,Cytoplasm_AreaShape_BoundingBoxMinimum_Y
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,i32,i32,i32,i32,i32,i32,i32,i32,str,i32,str,i32,str,i32,i32,str,i32,i32,i32,str,i32,i32,str,i32,str,i64,i32,str,i32,i32,i32,i32,i32
"""bfmoalive_A549…","""B03""",-0.068828,-0.826513,1.0,0.0,0.086781,0.13864,-0.095953,1.170463,-0.784504,-1.190277,-1.129475,-0.311303,-0.226543,0.0,-0.001174,0.0,-0.098691,0.526812,1.753993,0.277546,0.154568,-0.60335,-0.418361,0.703098,-0.261384,-0.030222,0.352222,0.256858,0.567149,-0.065819,0.073948,0.111635,-0.581359,-0.535193,-0.573127,…,1.437631,2.860429,0.567097,-0.788606,5,310,8,1,4677,0,1,1,"""BJ1898167""",171,"""HDAC inhibitor…",7,"""HDAC""",1,1,"""P104700""",3,4677,1,"""HDAC""",1,13,"""CBK290479""",1,"""DMSO""",10,13623,"""Abexinostat""",199,15951,23352,1,31
"""bfmoalive_A549…","""B17""",0.257859,-0.494009,1.0,0.0,-0.070144,-0.02632,0.001752,0.562204,0.000833,-0.00545,-0.20998,-0.123111,0.277236,0.0,0.061826,0.0,0.003278,0.408883,0.019225,-0.421162,-0.322932,0.072087,0.452128,0.403767,-0.947677,-0.014562,-0.286731,-0.022135,-0.035955,-0.084534,0.141456,0.435277,0.502934,-0.36261,-0.121294,…,0.107685,0.346826,-0.030213,-0.87797,5,149,12,1,4677,0,1,1,"""BJ1898265""",26,"""HDAC inhibitor…",6,"""HDAC""",1,1,"""P104700""",9,4677,1,"""HDAC6|HDAC8""",1,3,"""CBK290547""",1,"""DMSO""",10,17921,"""Droxinostat""",245,19214,26322,1,31
"""bfmoalive_A549…","""G14""",-0.477886,0.533709,-1.0,0.0,0.013693,0.090723,0.070456,-0.243346,0.036986,-0.071599,0.514161,0.089386,0.087222,0.0,-0.477958,0.0,-0.184669,-0.80749,0.229903,-0.281246,0.691968,0.041394,-0.429586,-0.958376,1.696636,0.087917,-0.303731,-1.261832,-1.396538,0.079941,0.010199,-0.442911,-0.14269,-0.102282,-0.090064,…,0.491208,-0.067905,-1.087246,1.002579,19,1280,5,1,4677,0,1,1,"""BJ1896286""",1076,"""p38 MAPK inhib…",16,"""MAPK""",1,1,"""P104700""",7,4677,1,"""MAPK14""",1,7,"""CBK308133""",1,"""DMSO""",10,13007,"""TAK-715""",197,14956,39168,1,5
"""bfmoalive_A549…","""I16""",0.087303,0.266359,0.0,0.0,0.021015,-0.051602,0.052439,-1.063609,0.260151,0.630888,0.602901,-0.115654,0.004503,0.0,-0.020746,0.0,-0.022192,0.037356,-0.536604,-0.297213,-0.395221,0.215739,0.175569,-0.889807,0.081975,0.069624,-0.069167,-0.419836,-0.118422,0.041553,-0.325115,-0.268101,0.186237,0.149192,0.167264,…,-0.541982,-0.687237,-0.261479,0.578775,11,310,4,1,4677,0,1,1,"""BJ1894454""",154,"""PARP inhibitor…",6,"""PARP""",1,1,"""P104700""",4,4677,1,"""PARP""",1,20,"""CBK289987""",1,"""DMSO""",10,22491,"""NU1025""",237,24355,34944,1,13
"""bfmoalive_A549…","""P12""",-0.506741,-1.528433,0.0,0.0,0.243367,-0.094423,-0.010318,0.713405,0.129391,0.339962,-0.590543,-0.85379,0.075257,0.0,-0.458247,0.0,0.560617,0.854043,0.169736,3.007963,-1.175715,-0.525398,-0.119895,2.99327,-0.166146,-0.107346,2.191429,0.573282,4.573414,-0.01713,2.055285,0.566345,-0.326669,0.167792,-0.04355,…,0.712576,0.702499,2.189545,-0.175563,7,2309,2,1,4677,0,1,1,"""[fenb]""",2096,"""[fenb]""",5,"""[fenb]""",2,1,"""P104700""",7,4677,1,"""pos_con""",1,8,"""[fenb]""",1,"""DMSO""",10,32249,"""fenb""",422,34278,44304,1,214


In [None]:
targets = ['neg_con','pos_con']