In [1]:
import pandas as pd
import numpy as np
import os

# Pycytominer
from pycytominer import feature_select
from pycytominer import normalize
from pycytominer import aggregate

# Set current working directory
print(os.getcwd())
os.chdir('/share/data/analyses/christa/colopaint3D/spher_colo52_v1/')
print(os.getcwd())

/share/data/analyses/christa/colopaint3D/spher_colo52_v1
/share/data/analyses/christa/colopaint3D/spher_colo52_v1


In [2]:
# Some function definitions

def list_features(df):
    # List features
    list_of_selected_features = list(df.columns.values)
    list_of_metadata = list(df.columns[df.columns.str.contains("Metadata_")])
    list_of_selected_features = list(set(list_of_selected_features) - set(list_of_metadata))
    
    return list_of_selected_features, list_of_metadata

In [3]:
cell_line = 'HT29'

In [4]:
# List all files in directory
dir = '1_FeaturesImages_none/SingleSlice/'


files = os.listdir(dir)

name = dir

# Select all files with HCT116 in the name as well as MedianAgg_meanstd
files = [file for file in files if cell_line in file and 'MedianAgg' in file]

In [5]:
# Load the parquet file into a pandas dataframe

# Load all files
data = []
for file in files:
    data.append(pd.read_parquet(dir + file))

data = pd.concat(data)

# Load metadata (I am missing the concentrations)
metadata = pd.read_csv('spher_colo52-metadata.csv')

In [6]:
## Prepare the metadata

dataset = data.copy()

# Merge data with metadata to get the concentrations
dataset = dataset.merge(metadata[['plate_well', 'cmpd_conc']], left_on='Metadata_PlateWell', right_on = 'plate_well')
dataset = dataset.drop(columns=['plate_well'])
dataset['Metadata_cmpd_conc'] = dataset['cmpd_conc'].rename('Metadata_cmpd_conc')

# Remove all columns with 'FileName' or 'PathName' in the name
dataset = dataset.loc[:,~dataset.columns.str.contains('FileName|PathName|ObjectNumber|ImageNumber|AcqID')]
dataset['Metadata_PlateWell'] = dataset['Metadata_Well'].astype(str) + '_' + dataset['Metadata_Barcode']
# # Add a short name for the compound
dataset['Metadata_name'] = dataset['Metadata_cmpd_cmpdname'].str[:5] 


In [7]:
# Normalize each slice for each plate separately
dataset["Metadata_plate_slice"] = (
    dataset["Metadata_Barcode"] + "_" + dataset["Metadata_Site"].astype(str)
    )

In [None]:
#
# Normalize separately per 1) plate and 2) cell line
#

units = dataset["Metadata_plate_slice"].unique() # Per slice in each plate

# Itnitialize an empty dataframe
normalized = pd.DataFrame(columns=dataset.columns.values)

for unit in units:
    
    print(unit)
    annotated_temp = dataset[dataset['Metadata_plate_slice'] == unit]

    # Normalize: choose between standardize, robustize, mad_robustize, spherize 
    normalized_temp = normalize(annotated_temp, 
                                features=list_features(dataset)[0],image_features=False, 
                                meta_features="infer", samples="Metadata_cmpd_cmpdname == 'dmso'", 
                                method="standardize")
    normalized = pd.concat([normalized, normalized_temp], ignore_index=True)

    


PB000140_0
PB000140_1
PB000140_2
PB000140_3
PB000140_4
PB000140_5
PB000140_6
PB000140_7
PB000140_8
PB000140_9
PB000140_10
PB000140_11
PB000141_0
PB000141_1
PB000141_2
PB000141_3
PB000141_4
PB000141_5
PB000141_6
PB000141_7
PB000141_8
PB000141_9
PB000141_10
PB000141_11
PB000142_0
PB000142_1
PB000142_2
PB000142_3
PB000142_4
PB000142_5
PB000142_6
PB000142_7
PB000142_8
PB000142_9
PB000142_10
PB000142_11
PB000139_0
PB000139_1
PB000139_2
PB000139_3
PB000139_4
PB000139_5
PB000139_6
PB000139_7
PB000139_8
PB000139_9
PB000139_10
PB000139_11


In [9]:
# Aggregate profiles across z-slices

features = list_features(normalized)[0]
metadata_cols = [col for col in normalized.columns if col not in features + ['Metadata_Site', 'Metadata_PlateWell','Metadata_plate_slice']]

aggregated_df = normalized.groupby(['Metadata_PlateWell']).agg(
    {**{col: 'first' for col in metadata_cols},  # Keep the first occurrence of metadata columns
    **{col: 'median' for col in features}}  # Aggregate features by mean (or any other function)
).reset_index()

In [10]:
# Feature selection: "variance_threshold", "correlation_threshold", "drop_na_columns", "blocklist", "drop_outliers", "noise_removal",
to_clip_df = feature_select(aggregated_df, features=list_features(normalized)[0], operation=["variance_threshold", "correlation_threshold","drop_na_columns", "blocklist"])

# Instead of removing the outliers, we can clip them
selected_df = pd.concat([to_clip_df[list_features(to_clip_df)[1]], to_clip_df[list_features(to_clip_df)[0]].clip(lower=-40, upper=40, axis=1)], axis=1)
print(selected_df.shape)

(773, 670)


In [11]:
# Save the data
OutputDir = '1_FeaturesImages_none/'

selected_df.to_csv(('{}selected_data_{}.csv').format(OutputDir, cell_line), index=False)