In [1]:
import pandas as pd
import numpy as np
import os

# Pycytominer
from pycytominer import feature_select
from pycytominer import normalize
# from pycytominer import aggregate

# Move to the project folder so the relative data paths used below resolve;
# the working directory is echoed before and after the change.
project_root = '/share/data/analyses/christa/colopaint3D/spher_colo52_v1/'
print(os.getcwd())
os.chdir(project_root)
print(os.getcwd())

/share/data/analyses/christa/colopaint3D/spher_colo52_v1/2_Processing
/share/data/analyses/christa/colopaint3D/spher_colo52_v1


In [2]:
def list_features(df):
    """Split the column names of ``df`` into feature and metadata columns.

    A column counts as metadata when its name contains ``"Metadata_"``; every
    other column is treated as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are inspected (the data itself is not read).

    Returns
    -------
    tuple[list, list]
        ``(list_of_selected_features, list_of_metadata)``. Both lists follow
        the column order of ``df`` so that repeated runs give the same
        ordering; repeated feature names are reported once.
    """
    column_names = list(df.columns)

    # Non-string column labels can never carry the "Metadata_" tag, so they
    # are classified as features instead of raising.
    list_of_metadata = [
        name for name in column_names
        if isinstance(name, str) and "Metadata_" in name
    ]

    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # difference would return the features in an arbitrary order.
    metadata_names = set(list_of_metadata)
    list_of_selected_features = list(
        dict.fromkeys(name for name in column_names if name not in metadata_names)
    )

    return list_of_selected_features, list_of_metadata


def standardize_mean(df):
    """Standardize each site (slice) of ``df`` against that site's DMSO rows.

    For every site number from 0 up to and including the largest
    ``Metadata_Site`` value, the rows of that site are transformed column by
    column as ``(value - mean_dmso) / (std_dmso + 0.01)``, where the mean and
    standard deviation come from the site's rows with
    ``Metadata_cmpd_cmpdname == 'dmso'``. A standard deviation of exactly 0 is
    replaced by 1 before the division. Site numbers with no rows are skipped.

    NOTE(review): the function depends on names that the visible part of the
    notebook never defines -- ``pl`` (presumably ``import polars as pl``),
    ``float_columns`` (presumably the list of feature column names) and
    ``print_time``. It also filters on ``Metadata_cmpd_cmpdname`` while the
    other cells use ``Metadata_cmpdname``. Confirm all of these before use.

    Parameters
    ----------
    df : polars DataFrame (assumed -- the body only uses the ``pl`` API)
        Must contain ``Metadata_Site``, ``Metadata_cmpd_cmpdname`` and the
        columns listed in ``float_columns``.

    Returns
    -------
    polars DataFrame
        The standardized rows of all sites, stacked in ascending site order.
        An empty frame is returned when ``df`` has no site values.

    Raises
    ------
    AssertionError
        If a site has rows but none of them is a DMSO row.
    RuntimeError
        If a DMSO standard deviation is null, infinite or 0 after the
        replacement, or if a standardized column contains nulls.
    """
    max_site = df.select(pl.col('Metadata_Site')).max().item()
    if max_site is None:
        # No rows (or only null sites): nothing to standardize.
        return pl.DataFrame()

    standardized_slices = []
    # "+ 1" so that the highest-numbered site is processed as well.
    for site in range(max_site + 1):
        df_slice = df.filter(pl.col('Metadata_Site') == site)
        if df_slice.shape[0] == 0:
            # This site number does not occur in the data.
            continue

        df_slice_DMSO = df_slice.filter(pl.col('Metadata_cmpd_cmpdname') == 'dmso')
        assert df_slice_DMSO.shape[0]>0, "did not find any wells 'treated' with DMSO"

        mu = df_slice_DMSO.select(float_columns).mean()
        std = df_slice_DMSO.select(float_columns).std()
        # replace 0 with 1 (specifically not clip) to avoid div by zero
        std = std.select([pl.col(c).replace({0: 1}, default=pl.col(c)) for c in std.columns])
        for col_idx, col in enumerate(std.columns):
            if std[col].is_null().any():
                raise RuntimeError(f"some std value in column {col,col_idx} is nan?!")
            if std[col].is_infinite().any():
                raise RuntimeError(f"some std value in column {col,col_idx} is infinite?!")
            if (std[col]==0).any():
                raise RuntimeError(f"unexpected 0 in column {col}")
        print_time("calculated DMSO distribution for one slice")

        df_standardized_slice = df_slice.with_columns(
            [(pl.col(c) - mu[c]) / (std[c]+0.01) for c in mu.columns]
        )

        # Report every affected column before failing, not just the first one.
        found_nan = False
        for col_idx, col in enumerate(mu.columns):
            if df_standardized_slice[col].is_null().any():
                found_nan = True
                print(f"some value in column {col,col_idx} is nan")
        if found_nan:
            raise RuntimeError("found nan")

        standardized_slices.append(df_standardized_slice)

    if not standardized_slices:
        return pl.DataFrame()
    # Stack once at the end instead of re-concatenating on every iteration.
    return pl.concat(standardized_slices)

In [3]:
# Cell line to process; it selects the input files by name and is part of the
# output CSV file names.
cell_line = 'HCT116' # HT29 or HCT116
# Label for the kind of input data; not referenced by the cells visible here.
data_type = 'aggregates'

In [4]:
## Output folder for the normalized data
OutputDir = '1_Data/results/'
# exist_ok=True creates the folder (and parents) when needed and is a no-op
# when it already exists, without a separate check-then-create step.
os.makedirs(OutputDir, exist_ok=True)


In [5]:
# Folder holding the per-slice feature files
dir = '1_Data/FeaturesImages_150125_none/SingleSlice/'

name = dir

# Keep the files whose name contains both the selected cell line and
# 'MedianAgg'. os.listdir returns entries in arbitrary order, so the result is
# sorted to make the load order (and the row order downstream) reproducible.
files = sorted(
    file for file in os.listdir(dir)
    if cell_line in file and 'MedianAgg' in file
)

In [6]:
# Read every selected parquet file and stack the tables into one pandas dataframe
frames = [pd.read_parquet(dir + file) for file in files]
data = pd.concat(frames)

In [7]:
# Load the experiment metadata table. It provides the compound concentrations
# (column `cmpd_conc`, keyed by `plate_well`), which the feature tables lack.
metadata = pd.read_csv('1_Data/spher_colo52-metadata.csv')


In [8]:
## Prepare the metadata

# Attach the compound concentration to every row. The column is renamed to
# `Metadata_cmpd_conc` before merging: without the "Metadata_" prefix,
# list_features() classifies it as a feature, so it would be standardized,
# feature-selected and clipped like a measurement.
# NOTE(review): consumers that read a `cmpd_conc` column from the output CSVs
# need to switch to `Metadata_cmpd_conc`.
concentrations = metadata[['plate_well', 'cmpd_conc']].rename(
    columns={'cmpd_conc': 'Metadata_cmpd_conc'}
)
dataset = data.merge(concentrations, left_on='Metadata_PlateWell', right_on='plate_well')
dataset = dataset.drop(columns=['plate_well'])

# Add a short name for the compound (its first five characters)
dataset['Metadata_name'] = dataset['Metadata_cmpdname'].str[:5]

# Key identifying one slice (site) of one plate, "<barcode>_<site>"; used to
# normalize each slice of each plate separately.
dataset["Metadata_plate_slice"] = (
    dataset["Metadata_Barcode"] + "_" + dataset["Metadata_Site"].astype(str)
)

In [9]:
#
# Normalize each slice of each plate separately against its own DMSO rows,
# then select features, clip outliers and save.
#

units = dataset["Metadata_plate_slice"].unique() # Per slice in each plate

# The feature list is the same for every unit, so it is computed once.
feature_columns = list_features(dataset)[0]

# Collect the per-unit results and concatenate once after the loop. Growing a
# frame by concatenating onto an empty seed frame re-copies the data on every
# iteration, and pandas has deprecated ignoring the empty frame's dtypes.
normalized_parts = []
for unit in units:

    print(unit)
    annotated_temp = dataset[dataset['Metadata_plate_slice'] == unit]

    # Normalize: choose between standardize, robustize, mad_robustize, spherize
    normalized_temp = normalize(annotated_temp,
                                features=feature_columns, image_features=False,
                                meta_features="infer", samples="Metadata_cmpdname == 'dmso'",
                                method="standardize")
    normalized_parts.append(normalized_temp)

# Reindex so the columns follow the order of `dataset`.
normalized = pd.concat(normalized_parts, ignore_index=True).reindex(columns=dataset.columns)

to_clip_df = feature_select(normalized, features=list_features(normalized)[0],
                            operation=["variance_threshold", "correlation_threshold",
                                       "drop_na_columns", "blocklist"])

# Instead of removing the outliers, clip the feature values to [-40, 40];
# metadata columns are passed through unchanged.
selected_features, selected_metadata = list_features(to_clip_df)
selected_df = pd.concat([to_clip_df[selected_metadata],
                         to_clip_df[selected_features].clip(lower=-40, upper=40, axis=1)],
                        axis=1)
print(selected_df.shape)
selected_df.to_csv(('{}normalized_data_{}.csv').format(OutputDir, cell_line), index=False)
    


PB000140_0
PB000140_10
PB000140_11
PB000140_1
PB000140_2
PB000140_3
PB000140_4
PB000140_5
PB000140_6
PB000140_7
PB000140_8
PB000140_9
PB000137_0
PB000137_10
PB000137_11
PB000137_1
PB000137_2
PB000137_3
PB000137_4
PB000137_5
PB000137_6
PB000137_7
PB000137_8
PB000137_9
PB000138_0
PB000138_10
PB000138_11
PB000138_1
PB000138_2
PB000138_3
PB000138_4
PB000138_5
PB000138_6
PB000138_7
PB000138_8
PB000138_9
PB000139_0
PB000139_10
PB000139_11
PB000139_1
PB000139_2
PB000139_3
PB000139_4
PB000139_5
PB000139_6
PB000139_7
PB000139_8
PB000139_9
(9919, 813)


In [10]:
## Normalize per plate
#
# Normalize each plate as a whole (all slices together) against its own DMSO
# rows, then select features, clip outliers and save.
#

units = dataset["Metadata_Barcode"].unique() # Per plate

# The feature list is the same for every plate, so it is computed once.
feature_columns = list_features(dataset)[0]

# Collect the per-plate results and concatenate once after the loop. Growing a
# frame by concatenating onto an empty seed frame re-copies the data on every
# iteration, and pandas has deprecated ignoring the empty frame's dtypes.
normalized_parts = []
for unit in units:

    print(unit)
    annotated_temp = dataset[dataset['Metadata_Barcode'] == unit]

    # Normalize: choose between standardize, robustize, mad_robustize, spherize
    normalized_temp = normalize(annotated_temp,
                                features=feature_columns, image_features=False,
                                meta_features="infer", samples="Metadata_cmpdname == 'dmso'",
                                method="standardize")
    normalized_parts.append(normalized_temp)

# Reindex so the columns follow the order of `dataset`.
normalized_noslice = pd.concat(normalized_parts, ignore_index=True).reindex(columns=dataset.columns)

to_clip_df = feature_select(normalized_noslice, features=list_features(normalized_noslice)[0],
                            operation=["variance_threshold", "correlation_threshold",
                                       "drop_na_columns", "blocklist"])

# Instead of removing the outliers, clip the feature values to [-40, 40];
# metadata columns are passed through unchanged.
selected_features, selected_metadata = list_features(to_clip_df)
selected_df = pd.concat([to_clip_df[selected_metadata],
                         to_clip_df[selected_features].clip(lower=-40, upper=40, axis=1)],
                        axis=1)
print(selected_df.shape)
selected_df.to_csv(('{}normalized_data_no_slice_{}.csv').format(OutputDir, cell_line), index=False)
    




PB000140
PB000137
PB000138
PB000139
(9919, 707)
