In [1]:
import pandas as pd

import os

# Connection string for the image database.
# The IMAGEDB_URI environment variable, when set to a non-empty value, takes
# precedence so a different host or account can be used without editing the
# notebook. The read-only URI below is kept as the fallback so existing runs
# behave exactly as before.
db_uri = os.environ.get('IMAGEDB_URI') or 'postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb'

# One row per distinct project name, alphabetically ordered.
query = """
        SELECT project
        FROM image_analyses_per_plate
        GROUP BY project
        ORDER BY project 
        """

# Query database and store result in pandas dataframe
df_projects = pd.read_sql_query(query, db_uri)

# Show only the first ten projects to keep the output short.
display(df_projects.head(10))

Unnamed: 0,project
0,160621-Wash-Optimisation
1,2020_11_04_CPJUMP1
2,24OHC-v1
3,A549-VictorChildIMX
4,Aish
5,Aleksi
6,Anders_requested_test
7,anders-test
8,Anton-gpcr
9,AROS-CP


In [2]:
import pandas as pd

# Project-name prefix to search for (the SQL below appends a wildcard to it).
NameContains = 'AROS-Reproducibility-MoA'

# Select every finished 'cp-qc' analysis whose project name starts with the
# prefix above, ordered by plate barcode.
query = f"""
        SELECT *
        FROM image_analyses_per_plate
        WHERE project LIKE '{NameContains}%%'
        AND meta->>'type' = 'cp-qc'
        AND analysis_date IS NOT NULL
        ORDER BY plate_barcode 
        """

# Run the query and keep the result as a pandas dataframe.
df_cp_results = pd.read_sql_query(sql=query, con=db_uri)

display(df_cp_results)

Unnamed: 0,project,plate_barcode,plate_acq_name,plate_acq_id,analysis_id,analysis_date,analysis_error,meta,pipeline_name,results
0,AROS-Reproducibility-MoA-Full,P013725,P013725,3072,3248,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch9,/share/data/cellprofiler/automation/results/P0...
1,AROS-Reproducibility-MoA-Full,P013726,P013726,3073,3249,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch9,/share/data/cellprofiler/automation/results/P0...
2,AROS-Reproducibility-MoA-Full,P013726,P013726,3073,3241,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch3,/share/data/cellprofiler/automation/results/P0...


In [3]:
import pandas as pd

# Boolean mask marking every row whose plate_barcode occurs more than once
# (keep=False flags all members of a duplicated group, not just the repeats).
duplicates = df_cp_results.duplicated(subset='plate_barcode', keep=False)

# Number of flagged rows.
duplicates_count = duplicates.sum()

# Warn the user only when at least one row is flagged.
if duplicates.any():
    print(f"Warning: There are {duplicates_count} duplicated rows based on 'plate_barcode'. Please check your data.")

display(df_cp_results)



Unnamed: 0,project,plate_barcode,plate_acq_name,plate_acq_id,analysis_id,analysis_date,analysis_error,meta,pipeline_name,results
0,AROS-Reproducibility-MoA-Full,P013725,P013725,3072,3248,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch9,/share/data/cellprofiler/automation/results/P0...
1,AROS-Reproducibility-MoA-Full,P013726,P013726,3073,3249,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch9,/share/data/cellprofiler/automation/results/P0...
2,AROS-Reproducibility-MoA-Full,P013726,P013726,3073,3241,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch3,/share/data/cellprofiler/automation/results/P0...


In [4]:
import pandas as pd

# Project-name prefix to search for (the SQL below appends a wildcard to it).
NameContains = 'AROS-Reproducibility-MoA'

# Select every finished 'cp-qc' analysis whose project name starts with the
# prefix above, ordered by plate barcode.
query = f"""
        SELECT *
        FROM image_analyses_per_plate
        WHERE project LIKE '{NameContains}%%'
        AND meta->>'type' = 'cp-qc'
        AND analysis_date IS NOT NULL
        ORDER BY plate_barcode 
        """

# Run the query and keep the result as a pandas dataframe.
df_cp_results = pd.read_sql_query(sql=query, con=db_uri)

# Rows whose plate_barcode appears more than once (all members of each group).
duplicates = df_cp_results[df_cp_results.duplicated(subset='plate_barcode', keep=False)]

# Report each duplicated barcode with the analysis ids that share it.
# Grouping an empty frame yields no groups, so nothing is printed when there
# are no duplicates.
grouped_duplicates = duplicates.groupby('plate_barcode')
for barcode, barcode_rows in grouped_duplicates:
    analysis_ids = barcode_rows['analysis_id'].tolist()
    print(f"The plate with barcode {barcode} is replicated {len(barcode_rows)} times with analysis_id of {analysis_ids}")

display(df_cp_results)

The plate with barcode P013726 is replicated 2 times with analysis_id of [3249, 3241]


Unnamed: 0,project,plate_barcode,plate_acq_name,plate_acq_id,analysis_id,analysis_date,analysis_error,meta,pipeline_name,results
0,AROS-Reproducibility-MoA-Full,P013725,P013725,3072,3248,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch9,/share/data/cellprofiler/automation/results/P0...
1,AROS-Reproducibility-MoA-Full,P013726,P013726,3073,3249,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch9,/share/data/cellprofiler/automation/results/P0...
2,AROS-Reproducibility-MoA-Full,P013726,P013726,3073,3241,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch3,/share/data/cellprofiler/automation/results/P0...


In [5]:
# Drop analyses that should not be used, selected by analysis id.
# Original note: "These analyses were run on wrong channel map. 2641 contains
# 3456 rows instead of 2772 since it is a JUMP plate."
# NOTE(review): that note mentions analysis 2641 while the id dropped here is
# 3241 - confirm the note still applies.
df_cp_results = df_cp_results[~df_cp_results.analysis_id.isin([3241])]

# Re-index so the remaining rows are numbered 0..n-1.
df_cp_results = df_cp_results.reset_index(drop=True)

display(df_cp_results)

# Add a column with the path of each plate's QC parquet file
# ('results' is used as a directory prefix and is concatenated as-is).
df_cp_results['qc-file'] = df_cp_results['results'] + 'qcRAW_images_' + df_cp_results['plate_barcode'] + '.parquet'

# Read every parquet file, collecting the frames in a list so they can be
# concatenated once at the end. Concatenating inside the loop would copy the
# accumulated data again on every iteration.
frames_per_file = []
for index, row in df_cp_results.iterrows():

    df_data_from_one_file = pd.read_parquet(row['qc-file'])

    # Add the acquisition id and overwrite the barcode with the database values
    # (workaround: these should be included in the cellprofiler result in future).
    df_data_from_one_file['Metadata_AcqID'] = row['plate_acq_id']
    df_data_from_one_file['Metadata_Barcode'] = row['plate_barcode']

    print (f'df_data_from_one_file no: {index} contains {df_data_from_one_file.shape[1]} columns and {df_data_from_one_file.shape[0]} rows. name: {row["qc-file"]}')

    frames_per_file.append(df_data_from_one_file)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# analyses are left. Each file's own row index is kept.
df_all_files = pd.concat(frames_per_file) if frames_per_file else pd.DataFrame()

display(df_all_files.head(2))

df_all_files.shape

Unnamed: 0,project,plate_barcode,plate_acq_name,plate_acq_id,analysis_id,analysis_date,analysis_error,meta,pipeline_name,results
0,AROS-Reproducibility-MoA-Full,P013725,P013725,3072,3248,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch9,/share/data/cellprofiler/automation/results/P0...
1,AROS-Reproducibility-MoA-Full,P013726,P013726,3073,3249,2023-03-23,,"{'type': 'cp-qc', 'priority': None}",384-96_QC-batch9,/share/data/cellprofiler/automation/results/P0...


df_data_from_one_file no: 0 contains 588 columns and 3420 rows. name: /share/data/cellprofiler/automation/results/P013725/3072/3248/qcRAW_images_P013725.parquet
df_data_from_one_file no: 1 contains 588 columns and 3420 rows. name: /share/data/cellprofiler/automation/results/P013726/3073/3249/qcRAW_images_P013726.parquet


Unnamed: 0,AreaOccupied_AreaOccupied_nuclei,AreaOccupied_Perimeter_nuclei,AreaOccupied_TotalArea_nuclei,Count_nuclei,ExecutionTime_01LoadData,ExecutionTime_02MeasureImageQuality,ExecutionTime_03FlagImage,ExecutionTime_04MeasureImageQuality,ExecutionTime_05FlagImage,ExecutionTime_06MeasureImageQuality,...,qc_flag_rawCONC_Blurred,qc_flag_rawCONC_Saturated,qc_flag_rawHOECHST_Blurry,qc_flag_rawHOECHST_Saturated,qc_flag_rawMITO_Blurry,qc_flag_rawMITO_Saturated,qc_flag_rawPHAandWGA_Blurry,qc_flag_rawPHAandWGA_Saturated,qc_flag_rawSYTO_Blurred,qc_flag_rawSYTO_Saturated
0,773018.0,56064.0,9000000.0,261.0,4.65,26.85,0.0,30.790001,0.01,31.780001,...,0,0,1,0,0,0,0,0,0,0
1,420668.0,31480.0,9000000.0,150.0,3.41,27.52,0.01,29.57,0.0,30.93,...,1,0,1,0,0,0,0,0,0,0


(6840, 588)

In [6]:
# Build df_data from df_all_files without modifying the latter
# (assign() works on a copy and returns it).
# Step 1: turn the acquisition id and site number into strings so they can be
# concatenated into identifiers below.
df_data = df_all_files.assign(
    Metadata_AcqID=lambda frame: frame['Metadata_AcqID'].astype(int).astype(str),
    Metadata_Site=lambda frame: frame['Metadata_Site'].astype(int).astype(str),
)

# Step 2: append the identifier and alias columns, in this order.
df_data = df_data.assign(**{
    'ImageID': df_data['Metadata_AcqID'] + '_' + df_data['Metadata_Well'] + '_' + df_data['Metadata_Site'],
    'barcode': df_data['Metadata_Barcode'],
    'well_id': df_data['Metadata_Well'],
    'plate': df_data['Metadata_Barcode'],
    'plate-name': df_data['Metadata_Barcode'],
    'plateWell': df_data['Metadata_Barcode'] + '_' + df_data['Metadata_Well'],
    'site': df_data['Metadata_Site'],
})

display(df_data.tail(2))

Unnamed: 0,AreaOccupied_AreaOccupied_nuclei,AreaOccupied_Perimeter_nuclei,AreaOccupied_TotalArea_nuclei,Count_nuclei,ExecutionTime_01LoadData,ExecutionTime_02MeasureImageQuality,ExecutionTime_03FlagImage,ExecutionTime_04MeasureImageQuality,ExecutionTime_05FlagImage,ExecutionTime_06MeasureImageQuality,...,qc_flag_rawPHAandWGA_Saturated,qc_flag_rawSYTO_Blurred,qc_flag_rawSYTO_Saturated,ImageID,barcode,well_id,plate,plate-name,plateWell,site
3418,527036.0,37106.0,9000000.0,177.0,1.2,17.74,0.0,19.4,0.0,20.33,...,0,0,0,3073_P22_8,P013726,P22,P013726,P013726,P013726_P22,8
3419,542470.0,38359.0,9000000.0,179.0,1.07,17.42,0.0,19.200001,0.0,20.139999,...,0,0,0,3073_P22_9,P013726,P22,P013726,P013726,P013726_P22,9


In [7]:
# Work on a private copy of the concatenated QC data.
data = df_all_files.copy()

# Plates: list the distinct barcodes and order the rows by plate, well and site.
# Best effort: on any failure the plate list falls back to empty.
try:
    PlateNames = sorted(set(data['Metadata_Barcode']))
    print(f'Number of plates: {len(PlateNames)}')
    print(PlateNames)
    data = data.sort_values(['Metadata_Barcode', 'Metadata_Well', 'Metadata_Site']).reset_index(drop=True)
except Exception:
    print('Plate names not specified')
    PlateNames = []

# Wells: distinct well names across all plates.
Wells = sorted(set(data['Metadata_Well']))
NrOfWells = len(Wells)
print(f'Number of wells: {NrOfWells}')


# Row labels are the first character of each well name.
Rows = sorted({well[0] for well in Wells})
print(*Rows)
NrOfRows = len(Rows)

# Column labels are everything after the first character.
Columns = sorted({well[1:] for well in Wells})
NrOfColumns = len(Columns)
print(*Columns)

# Every row/column combination, whether or not it occurs in the data.
AllWells = [row_label + column_label for row_label in Rows for column_label in Columns]

# Sites: distinct site values.
Sites = sorted(set(data['Metadata_Site']))
NrOfSites = len(Sites)
print(f'Number of sites: {NrOfSites}')

# Compare the number of rows in the data with plates x wells x sites.
expected_image_count = len(PlateNames) * NrOfWells * NrOfSites
print(
    f'Processed {data.shape[0]} of {expected_image_count} images'
)

Number of plates: 2
['P013725', 'P013726']
Number of wells: 380
A B C D E F G H I J K L M N O P
01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
Number of sites: 9
Processed 6840 of 6840 images


In [77]:
import re

# Collect the columns whose name contains "ImageQuality_", together with the
# same names with that tag removed.
image_quality_cols = []
image_quality_module = []
for column_name in data.columns:
    if "ImageQuality_" not in column_name:
        continue
    image_quality_cols.append(column_name)
    image_quality_module.append(column_name.replace('ImageQuality_', ''))

# The measure name is whatever precedes the first underscore of the stripped
# column name; keep each distinct measure once, sorted alphabetically.
distinct_measures = set()
for stripped_name in image_quality_module:
    distinct_measures.add(re.sub('_.*', '', stripped_name))
image_quality_measures = sorted(distinct_measures)
count_measures = len(image_quality_measures)

print(f'Image Quality module has measured {count_measures} parameters: {", ".join(image_quality_measures)}')

Image Quality module has measured 22 parameters: Correlation, FocusScore, LocalFocusScore, MADIntensity, MaxIntensity, MeanIntensity, MedianIntensity, MinIntensity, PercentMaximal, PercentMinimal, PowerLogLogSlope, Scaling, StdIntensity, ThresholdBackground, ThresholdKapur, ThresholdMCT, ThresholdMoG, ThresholdOtsu, ThresholdRidlerCalvard, ThresholdRobustBackground, TotalArea, TotalIntensity


In [78]:
# Measures that are excluded from the rest of the analysis.
not_so_useful = ['TotalArea', 'Scaling', 'TotalIntensity', 'Correlation', 'PercentMinimal',
                 'LocalFocusScore', 'MinIntensity', 'MedianIntensity', 'MADIntensity',
                 'ThresholdMoG', 'ThresholdBackground', 'ThresholdKapur', 'ThresholdMCT',
                 'ThresholdOtsu', 'ThresholdRidlerCalvard', 'ThresholdRobustBackground',
                 'PercentMaximal']

# Keep the remaining measures, preserving their existing order.
kept_measures = []
for measure in image_quality_measures:
    if measure in not_so_useful:
        continue
    kept_measures.append(measure)
image_quality_measures = kept_measures
count_measures = len(image_quality_measures)

print(f'I will use {count_measures} parameters: {", ".join(image_quality_measures)}')

# For each kept measure, select the image-quality columns whose name contains
# '_<measure>' and store that sub-frame under the measure name.
data_frame_dictionary = {}
for measure in image_quality_measures:
    measure_columns = [col for col in image_quality_cols if f'_{measure}' in col]
    data_frame_dictionary[measure] = data[measure_columns]

# Alphabetically sorted list of the measure names (the dictionary keys).
data_frame_list = sorted(data_frame_dictionary)

I will use 5 parameters: FocusScore, MaxIntensity, MeanIntensity, PowerLogLogSlope, StdIntensity


In [86]:
# One subplot per selected image-quality measure.
nrSubplots = len(data_frame_list)

# Derive the channel names from the columns of the first measure's frame:
# the channel is the text after the last underscore of each column name.
first_measure_columns = data_frame_dictionary[data_frame_list[0]].columns
ChannelNames = [re.sub('.*_', '', column_name) for column_name in first_measure_columns]
ChannelNames

['CONC', 'HOECHST', 'MITO', 'PHAandWGA', 'SYTO']