## Pycytominer example pipeline
requires scipy=<1.7.3


### Set-up

In [1]:
import os
import numpy as np
import pandas as pd
import string

# pycytominer
# from pycytominer import aggregate
from pycytominer import annotate
# from pycytominer import consensus
from pycytominer import feature_select
from pycytominer import normalize


# Set current working directory
print(os.getcwd())
os.chdir('../maxproject_features/')
print(os.getcwd())

2024-07-31 09:27:58.224771: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


/share/data/analyses/christa/colopaint3D/maxproject_features
/share/data/analyses/christa/colopaint3D/maxproject_features


In [2]:
OutputDir = "results" # Where do you want to save the merged data csv? 
if not os.path.exists(OutputDir):
    os.makedirs(OutputDir)

cells = ['HCT116', 'HT29']

In [3]:
## Figure settings 

dpi = 300
figsize = (2.24, 2.24)

mpl.rcParams['pdf.fonttype'] = 42  
mpl.rcParams['ps.fonttype'] = 42  


# Font sizes
title_size = 12
axis_label_size = 10
label_size = 6


### Perfrom quality control



In [4]:
### Some settings

features = ['ImageQuality_PowerLogLogSlope_PHAandWGA', 'ImageQuality_PowerLogLogSlope_HOECHST', 'ImageQuality_PowerLogLogSlope_CONC','ImageQuality_PowerLogLogSlope_SYTO', 'ImageQuality_PowerLogLogSlope_MITO']
plates = ["PB000137", "PB000138", "PB000139", "PB000140", "PB000141", "PB000142"]

cutoff = 0.93

In [5]:
# Initialize the dataframe
QC = pd.DataFrame()

for plate in plates:
    # Import QC for all cell lines 
    QCfile = "/share/data/cellprofiler/automation/results/spher-colo52/{}/QC/qcRAW_images.csv".format(plate)

    # Import the data
    data = pd.read_csv(QCfile, index_col=0)
    data['barcode'] = plate

    data['flag'] = 0
    for feature in features: 
        data['flag'] = data['flag'] + (data[feature] > data[feature].quantile(cutoff)).astype(int)

    # Merge the data
    QC = pd.concat([QC, data])
    

# Update the source layout to match the new well assignments 
QC['well'] = QC['FileName_CONC'].str.split('-', expand=True)[1]
QC['well_number'] = QC['well'].str.split('(\d+)', expand=True)[1].str.lstrip('0').astype(int)
QC['well_letter'] = QC['well'].str.split('(\d+)', expand=True)[0].map(lambda x: ord(x) - 64)

QC['plate_well'] = QC['well'] + "_" + QC['barcode']

# Save the merged data
QC.to_csv("{}/QCFlags.csv".format(OutputDir), index=False)

In [6]:
## QC

for plate in plates:

    # define a 386-well plate
    cols = 24
    rows = 16

    heatmap = np.zeros((rows,cols))

    plot_df = QC[QC['barcode'] == plate]

    for i in range(len(plot_df)):
        heatmap[plot_df['well_letter'].iloc[i]-1,plot_df['well_number'].iloc[i]-1] = plot_df['flag'].iloc[i]

    fig = plt.figure(figsize=figsize, dpi=dpi)
   
    %matplotlib inline
    yticklabels= list(string.ascii_uppercase)[:16]
    xticklabels = range(1, 25)

    ax = sns.heatmap(heatmap, linewidths=1, cmap='Reds', yticklabels=yticklabels, xticklabels=xticklabels, cbar=False)
    ax.set_title('# Flags: {}'.format(plate), fontsize=title_size)
    ax.xaxis.tick_top()
    # Hide major ticks but keep the labels
    ax.tick_params(axis='both', which='both', length=0)

    # We change the fontsize of minor ticks label 
    ax.tick_params(axis='both', labelsize=6)

    # box tight
    plt.tight_layout()

    plt.savefig("{}/{}_QC.pdf".format(OutputDir, plate), dpi=dpi)
    plt.close()

    


### Preprocess with pycytominer

In [8]:
## Import data and remove QC flags

data = pd.DataFrame()

for plate in plates:
    # Import QC for all cell lines 
    data_file = "/share/data/cellprofiler/automation/results/spher-colo52/{}/results/featICF_spheroid.csv".format(plate)

    # Import the data
    tmp= pd.read_csv(data_file, index_col=0)
    tmp['barcode'] = plate

    print("Data", plate ,":\n", tmp.shape)

    # Merge the data
    data = pd.concat([data, tmp])
    print("XXX", data.shape)

# Rename the plate_well using Metadata_barcode and well_id
data['plate_well'] = data['Metadata_Well'] + "_" + data['barcode']

df = data.merge(QC[['plate_well', 'well', 'flag']], left_on=['plate_well'], right_on=['plate_well'])

print("Data before QC:\n", df.shape)

# Filter out wells with 2 or more flags
df = df[df['flag'] < 2]

print("Data after QC:\n", df.shape, "\n")


## Are there any NaNs? 
nans = df.isna().sum()
nans = nans[nans > 0]

print("There are so many nans:\n", nans)

Data PB000137 :
 (287, 738)
XXX (287, 738)
Data PB000138 :
 (287, 738)
XXX (574, 738)
Data PB000139 :
 (264, 738)
XXX (838, 738)
Data PB000140 :
 (264, 738)
XXX (1102, 738)
Data PB000141 :
 (254, 738)
XXX (1356, 738)
Data PB000142 :
 (260, 738)
XXX (1616, 738)
Data before QC:
 (1616, 741)
Data after QC:
 (1460, 741) 

There are so many nans:
 Metadata_Channel         1460
Metadata_Channel.1       1460
Metadata_FileLocation    1460
dtype: int64


In [9]:
# Remove non-data features from the list of features
ListOfFeatures = list(df.columns.values)
ListOfMetadata = list(df.columns[
    df.columns.str.contains("FileName_") |
    df.columns.str.contains("PathName_") |
    df.columns.str.contains("Metadata_")])
ListOfFeatures = list(set(ListOfFeatures) - set(ListOfMetadata) - set(['ObjectNumber', 'Number_Object_Number', 'well', 'flag']))

# ListOfMetadataNew = ["Metadata_Well", "cell_line"] 

# Remove all metadata, paths and filenames except for the well_id
df = df[ListOfFeatures +  ["Metadata_Well"]]

In [10]:
# Import Metadata
dfLayout = pd.read_csv('colo52-v1-import-inputFiles-and-PLAIDresults - 5-metadata.csv', sep=",")
# dfLayout = dfLayout.loc[~(dfLayout.layout_id == 'spher010-P1-L2')]
# Rename the plate_well using Metadata_barcode and well_id
dfLayout['plate_well'] = dfLayout['well_id'] + "_" + dfLayout['barcode']
dfLayout['name'] = dfLayout['cmpdname'].str[:5]

print(dfLayout.shape)
dfLayout.head()

(1848, 28)


Unnamed: 0,layout_id,well_id,image_id,cp_id,barcode,plate_well,cmpd_code,cmpdname,solvent,cmpd_conc,...,article_id,target,pathway,pubchemID,smiles,inkey,target_type,clinical_status,cell_line,name
0,spher-colo52-v1-ULA-PB000137-HCT116-48h-P1-L1,B02,4185,5532,PB000137,B02_PB000137,colo-006,PD0325901,dmso,3.0,...,S1036,MEK,MAPK,9826528.0,OCC(O)CONC(=O)C1=C(NC2=CC=C(I)C=C2F)C(=C(F)C=C1)F,SUDAHWBOROXANE-SECBINFHSA-N,Targeted,Phase 2,HCT116,PD032
1,spher-colo52-v1-ULA-PB000137-HCT116-48h-P1-L1,B03,4185,5532,PB000137,B03_PB000137,colo-018,Paclitaxel,dmso,0.1,...,S1150,"Autophagy,Microtubule Associated",Cytoskeletal Signaling,441276.0,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...,RCINICONZNJXQF-VAZQATRQSA-N,Cytotoxic,Preclinical,HCT116,Pacli
2,spher-colo52-v1-ULA-PB000137-HCT116-48h-P1-L1,B04,4185,5532,PB000137,B04_PB000137,colo-009,"Olaparib (AZD2281, Ku-0059436)",dmso,3.0,...,S1060,PARP,DNA Damage,23725625.0,FC1=C(C=C(CC2=NNC(=O)C3=C2C=CC=C3)C=C1)C(=O)N4...,FDLYAMZZIXQODN-UHFFFAOYSA-N,Targeted,Launched,HCT116,Olapa
3,spher-colo52-v1-ULA-PB000137-HCT116-48h-P1-L1,B05,4185,5532,PB000137,B05_PB000137,colo-012,SB216763,dmso,10.0,...,S1075,GSK-3,PI3K/Akt/mTOR,176158.0,C[N]1C=C(C2=C1C=CC=C2)C3=C(C(=O)NC3=O)C4=CC=C(...,JCSGFHVFHSKIJH-UHFFFAOYSA-N,Targeted,Preclinical,HCT116,SB216
4,spher-colo52-v1-ULA-PB000137-HCT116-48h-P1-L1,B06,4185,5532,PB000137,B06_PB000137,colo-008,"Vorinostat (SAHA, MK0683)",dmso,3.0,...,S1047,"Autophagy,HDAC",Epigenetics,5311.0,ONC(=O)CCCCCCC(=O)NC1=CC=CC=C1,WAEXFXRVDQXREF-UHFFFAOYSA-N,Targeted,Launched,HCT116,Vorin


In [11]:
# Annotate: connect metadata to the feature data
# OBS: metadata will be prefixed with 'Metadata_'
annotated = annotate(df, platemap=dfLayout, join_on=['Metadata_plate_well', 'plate_well'],add_metadata_id_to_platemap=True, format_broad_cmap=False, clean_cellprofiler=False)
# annotated.to_csv("{}/annotated_data_{}.csv".format(OutputDir, cell_line))

annotated.shape

(1460, 748)

In [12]:
# Remove non-data features from the list of features
ListOfFeatures = list(df.columns.values)
ListOfMetadata = list(df.columns[df.columns.str.contains("Metadata_")])
ListOfFeatures = list(set(ListOfFeatures) - set(ListOfMetadata) - set(["plate_well", "barcode", "cell_line"]))

In [13]:
annotated['Metadata_norm_unit'] = annotated['Metadata_barcode'] + "_" + annotated['Metadata_cell_line']
units = annotated['Metadata_norm_unit'].unique()

In [14]:
#
# Version 2: Normalize separately per 1) plate and 2) cell line
#

# # Normalize separately per cell line
# ListOfPlates = annotated['Metadata_layout_id'].unique()

# itnitialize an empty dataframe
normalized = pd.DataFrame(columns=annotated.columns.values)
normalized = normalized.drop(columns=['plate_well', 'barcode'])

for unit in units:
    
    annotated_temp = annotated[annotated['Metadata_norm_unit'] == unit]

    # Normalize: choose between standardize, robustize, mad_robustize, spherize 
    normalized_temp = normalize(annotated_temp, 
                                features=ListOfFeatures,image_features=False, 
                                meta_features="infer", samples="Metadata_pert_type == 'neg_con'", 
                                method="standardize")
    normalized = pd.concat([normalized, normalized_temp], ignore_index=True)

    print(unit)


PB000137_HCT116
PB000139_HCT116
PB000139_HT29
PB000141_HT29
PB000138_HCT116
PB000140_HCT116
PB000140_HT29
PB000142_HT29


In [15]:
# Feature selection: "variance_threshold", "correlation_threshold", "drop_na_columns", "blocklist", "drop_outliers", "noise_removal",
to_clip_df = feature_select(normalized, features=ListOfFeatures, operation=["variance_threshold", "correlation_threshold","drop_na_columns", "blocklist" ])

In [16]:
# Remove non-data features from the list
ListOfSelectedFeatures = list(to_clip_df.columns.values)
ListOfMetadata = list(to_clip_df.columns[to_clip_df.columns.str.contains("Metadata_")])
ListOfSelectedFeatures = list(set(ListOfSelectedFeatures) - set(ListOfMetadata))

In [17]:
# Instead of removing the outliers, we can clip them to the 1st and 99th percentile.
selected_df = pd.concat([to_clip_df[ListOfMetadata], to_clip_df[ListOfSelectedFeatures].clip(lower=-40, upper=40, axis=1)], axis=1)
print(selected_df.shape)

(1460, 503)


In [18]:
selected_df.to_parquet("{}/selected_data.csv".format(OutputDir))