In [None]:
import pandas as pd

import os

# Connection string for the image database. Set the IMAGEDB_URI environment
# variable to override it (and keep credentials out of the notebook); when the
# variable is not set, the original hard-coded value is used.
db_uri = os.environ.get(
    'IMAGEDB_URI',
    'postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb',
)

# One row per distinct project, ordered by project name.
query = """
        SELECT project
        FROM image_analyses_per_plate
        GROUP BY project
        ORDER BY project 
        """

# Query database and store result in pandas dataframe
df_projects = pd.read_sql_query(query, db_uri)

# Preview only the first ten projects.
display(df_projects.head(10))

In [None]:
import pandas as pd

# Prefix that the project name has to start with.
NameContains = 'AROS-Reproducibility-MoA'

# Every 'cp-qc' analysis with a non-null analysis_date for the matching
# projects, ordered by plate barcode.
# ('%%' is presumably the driver-escaped form of the SQL '%' wildcard.)
query = f"""
        SELECT *
        FROM image_analyses_per_plate
        WHERE project LIKE '{NameContains}%%'
        AND meta->>'type' = 'cp-qc'
        AND analysis_date IS NOT NULL
        ORDER BY plate_barcode 
        """

# Run the query and keep the result as a dataframe.
df_cp_results = pd.read_sql_query(sql=query, con=db_uri)

display(df_cp_results)

In [None]:
import pandas as pd

# Boolean marker per row: True for every row whose plate_barcode occurs more
# than once (keep=False marks all occurrences, not only the repeats).
duplicates = df_cp_results.duplicated(subset=['plate_barcode'], keep=False)
duplicates_count = duplicates.sum()

# Tell the user when at least one barcode is shared between rows.
if duplicates.any():
    print(f"Warning: There are {duplicates_count} duplicated rows based on 'plate_barcode'. Please check your data.")

display(df_cp_results)

In [None]:
import pandas as pd

# Prefix that the project name has to start with.
NameContains = 'AROS-Reproducibility-MoA'

# Every 'cp-qc' analysis with a non-null analysis_date for the matching
# projects, ordered by plate barcode.
query = f"""
        SELECT *
        FROM image_analyses_per_plate
        WHERE project LIKE '{NameContains}%%'
        AND meta->>'type' = 'cp-qc'
        AND analysis_date IS NOT NULL
        ORDER BY plate_barcode 
        """

# Run the query and keep the result as a dataframe.
df_cp_results = pd.read_sql_query(sql=query, con=db_uri)

# All rows whose barcode is shared with at least one other row.
duplicates = df_cp_results.loc[
    df_cp_results.duplicated(subset=['plate_barcode'], keep=False)
]

if not duplicates.empty:
    # Report, per barcode, how many rows share it and which analysis ids they have.
    grouped_duplicates = duplicates.groupby('plate_barcode')
    for name, analysis_ids in grouped_duplicates['analysis_id']:
        print(f"The plate with barcode {name} is replicated {len(analysis_ids)} times with analysis_id of {analysis_ids.tolist()}")

display(df_cp_results)

In [None]:
# Drop analyses that must not be used, selected by analysis ID.
# NOTE(review): the remark below mentions analysis 2641, but only 3241 is
# excluded here - confirm which IDs are meant to be dropped.
df_cp_results = df_cp_results[~df_cp_results.analysis_id.isin([3241])]
# These analyses were run on wrong channel map. 2641 contains 3456 rows instead of 2772 since it is a JUMP plate.

# Renumber the remaining rows 0..n-1.
df_cp_results = df_cp_results.reset_index(drop=True)

display(df_cp_results)

# Path of the QC result file of each analysis: <results>qcRAW_images_<barcode>.parquet
df_cp_results['qc-file'] = df_cp_results['results'] + 'qcRAW_images_' + df_cp_results['plate_barcode'] + '.parquet'

# Read every QC parquet file. The frames are collected in a list and
# concatenated once after the loop; concatenating inside the loop would copy
# all previously read rows again on every iteration.
qc_frames = []
for index, row in df_cp_results.iterrows():

    df_data_from_one_file = pd.read_parquet(row['qc-file'])

    # Add column and update barcode (as a workaround; it should be included in
    # the cellprofiler result in future).
    df_data_from_one_file['Metadata_AcqID'] = row['plate_acq_id']
    df_data_from_one_file['Metadata_Barcode'] = row['plate_barcode']

    print (f'df_data_from_one_file no: {index} contains {df_data_from_one_file.shape[1]} columns and {df_data_from_one_file.shape[0]} rows. name: {row["qc-file"]}')

    qc_frames.append(df_data_from_one_file)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# was nothing to read (the result the loop-based version produced in that case).
df_all_files = pd.concat(qc_frames) if qc_frames else pd.DataFrame()

display(df_all_files.head(2))

df_all_files.shape

In [None]:
# Work on a copy so df_all_files itself stays untouched.
df_data = df_all_files.copy()

# Acquisition id and site become strings (via int) so they can be concatenated.
for _id_col in ('Metadata_AcqID', 'Metadata_Site'):
    df_data[_id_col] = df_data[_id_col].astype(int).astype(str)

# Derived identifier columns and aliases of the metadata columns.
df_data = df_data.assign(**{
    'ImageID': df_data['Metadata_AcqID'] + '_' + df_data['Metadata_Well'] + '_' + df_data['Metadata_Site'],
    'barcode': df_data['Metadata_Barcode'],
    'well_id': df_data['Metadata_Well'],
    'plate': df_data['Metadata_Barcode'],
    'plate-name': df_data['Metadata_Barcode'],
    'plateWell': df_data['Metadata_Barcode'] + '_' + df_data['Metadata_Well'],
    'site': df_data['Metadata_Site'],
})

display(df_data.tail(2))

In [None]:
# Working copy of the QC data used by all following cells.
data = df_all_files.copy()
try:
    # Distinct plate barcodes, sorted.
    PlateNames = sorted(set(data['Metadata_Barcode']))
    print(f'Number of plates: {len(PlateNames)}')
    print(PlateNames)
    # Order the images by plate, well and site and renumber the rows 0..n-1.
    data = data.sort_values(
        ['Metadata_Barcode', 'Metadata_Well', 'Metadata_Site']
    ).reset_index(drop=True)
except Exception:
    # Best effort: continue without plate names if the lookup or sort fails.
    print('Plate names not specified')
    PlateNames = []

# Distinct well names.
Wells = sorted(set(data['Metadata_Well']))
NrOfWells = len(Wells)
print(f'Number of wells: {NrOfWells}')


# Row label = first character of the well name, column label = the remainder.
Rows = sorted({well[0] for well in Wells})
print(*Rows)
NrOfRows = len(Rows)
Columns = sorted({well[1:] for well in Wells})
NrOfColumns = len(Columns)
print(*Columns)

# Every row/column combination, whether or not it occurs in the data.
AllWells = [row_label + column_label for row_label in Rows for column_label in Columns]

# Distinct site values.
Sites = sorted(set(data['Metadata_Site']))
NrOfSites = len(Sites)
print(f'Number of sites: {NrOfSites}')
print(
    f'Processed {len(data)} of {len(PlateNames) * NrOfWells * NrOfSites} images'
)

In [None]:
import re

# Columns whose name contains 'ImageQuality_'.
image_quality_cols = [name for name in data.columns if "ImageQuality_" in name]
# The same names with the 'ImageQuality_' part removed.
image_quality_module = [name.replace('ImageQuality_', '') for name in image_quality_cols]
# Measure name = everything before the first underscore; distinct and sorted.
image_quality_measures = sorted(set(re.sub('_.*', '', module) for module in image_quality_module))
count_measures = len(image_quality_measures)

print(f'Image Quality module has measured {count_measures} parameters: {", ".join(image_quality_measures)}')

In [None]:
# Measures that are excluded from the QC.
not_so_useful = [
    'TotalArea',
    'Scaling',
    'TotalIntensity',
    'Correlation',
    'PercentMinimal',
    'LocalFocusScore',
    'MinIntensity',
    'MedianIntensity',
    'MADIntensity',
    'ThresholdMoG',
    'ThresholdBackground',
    'ThresholdKapur',
    'ThresholdMCT',
    'ThresholdOtsu',
    'ThresholdRidlerCalvard',
    'ThresholdRobustBackground',
    'PercentMaximal',
]

# Keep the remaining measures in their existing order.
image_quality_measures = list(
    filter(lambda measure: measure not in not_so_useful, image_quality_measures)
)
count_measures = len(image_quality_measures)

print(f'I will use {count_measures} parameters: {", ".join(image_quality_measures)}')

# One dataframe per measure, holding the image-quality columns whose name
# contains '_<measure>'.
data_frame_dictionary = {
    measure: data[[col for col in image_quality_cols if f'_{measure}' in col]]
    for measure in image_quality_measures
}
# Sorted measure names; used to address the dictionary by position.
data_frame_list = sorted(data_frame_dictionary)

In [None]:
# One subplot per measure.
nrSubplots = len(data_frame_list)
# Channel name = text after the last underscore of each column name of the
# first measure's dataframe.
ChannelNames = [re.sub('.*_', '', c) for c in data_frame_dictionary[data_frame_list[0]].columns]
ChannelNames

In [None]:
# Measures that are excluded from the QC.
not_so_useful_set = {
    'TotalArea', 'Scaling', 'TotalIntensity', 'Correlation', 'PercentMinimal',
    'LocalFocusScore', 'MinIntensity', 'MedianIntensity', 'MADIntensity',
    'ThresholdMoG', 'ThresholdBackground', 'ThresholdKapur', 'ThresholdMCT',
    'ThresholdOtsu', 'ThresholdRidlerCalvard', 'ThresholdRobustBackground',
    'PercentMaximal',
}

# Columns whose name starts with 'ImageQuality_' ...
image_quality_cols = data.columns[data.columns.str.startswith('ImageQuality_')]
# ... and the distinct measure names: prefix removed, text before the first '_'.
image_quality_measures_all = (
    image_quality_cols
    .str.replace('ImageQuality_', '')
    .str.split('_')
    .str[0]
    .unique()
)

print(f'Image Quality module has measured {len(image_quality_measures_all)} parameters: {", ".join(image_quality_measures_all)}')

# Drop the excluded measures.
image_quality_measures_filtered = image_quality_measures_all[
    ~image_quality_measures_all.isin(not_so_useful_set)
]
print(f'I will use {len(image_quality_measures_filtered)} parameters: {", ".join(image_quality_measures_filtered)}')

# Only the image-quality columns are needed from here on.
filtered_data = data[image_quality_cols]

# One dataframe per kept measure, holding the columns whose name matches '_<measure>'.
data_frame_dictionary = {
    measure: filtered_data[
        filtered_data.columns[filtered_data.columns.str.contains(f'_{measure}')]
    ]
    for measure in image_quality_measures_filtered
}
# Sorted measure names; used to address the dictionary by position.
data_frame_list = sorted(data_frame_dictionary)


In [None]:
# Measures that are used for the QC.
useful_measures = {'FocusScore', 'MaxIntensity', 'MeanIntensity', 'PowerLogLogSlope', 'StdIntensity'}

# Columns whose name starts with 'ImageQuality_' ...
image_quality_cols = data.columns[data.columns.str.startswith('ImageQuality_')]
# ... and the distinct measure names: prefix removed, text before the first '_'.
image_quality_measures_all = (
    image_quality_cols
    .str.replace('ImageQuality_', '')
    .str.split('_')
    .str[0]
    .unique()
)

print(f'Image Quality module has measured {len(image_quality_measures_all)} parameters: {", ".join(image_quality_measures_all)}')

# Keep only the measures listed in useful_measures.
image_quality_measures_filtered = image_quality_measures_all[
    image_quality_measures_all.isin(useful_measures)
]
print(f'I will use {len(image_quality_measures_filtered)} parameters: {", ".join(image_quality_measures_filtered)}')

# Only the image-quality columns are needed from here on.
filtered_data = data[image_quality_cols]

# One dataframe per kept measure, holding the columns whose name matches '_<measure>'.
data_frame_dictionary = {
    measure: filtered_data[
        filtered_data.columns[filtered_data.columns.str.contains(f'_{measure}')]
    ]
    for measure in image_quality_measures_filtered
}
# Sorted measure names; used to address the dictionary by position.
data_frame_list = sorted(data_frame_dictionary)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Configuration parameters
colors = ['darkorange', 'cornflowerblue', 'forestgreen', 'red', 'yellow']
alpha = 0.5  # transparency
line_width = 0.5
figure_size = (10, 5)
resolution = 300
font_size = 12
num_subplots = len(data_frame_list)

# One row of the figure per measure; the height grows with the number of measures.
fig = plt.figure(figsize=(figure_size[0], 1.5 * num_subplots))
fig.suptitle(NameContains, fontsize=font_size*1.2, x=0.2)

# The title is the key that selects the plotted dataframe, so it always names
# the data shown (a separately built list of measure names can differ in
# length or order from data_frame_list).
for i, current_df_name in enumerate(data_frame_list):
    current_df = data_frame_dictionary[current_df_name]

    # Set subplot settings
    ax = fig.add_subplot(num_subplots, 1, i+1)
    ax.set_title(current_df_name, fontsize=font_size)
    ax.set_facecolor('w')
    ax.spines['bottom'].set_color('w')
    ax.spines['top'].set_color('w')
    ax.spines['left'].set_color('lightgrey')
    ax.spines['right'].set_color('lightgrey')
    ax.set_xlim([0, len(current_df.index)])
    ax.grid(visible=True, which='major', axis='x', color='lightgrey', linestyle='-', linewidth=1, alpha=1)
    ax.grid(visible=True, which='major', axis='y', color='lightgrey', linestyle='', linewidth=0)
    # Only the bottom subplot shows x tick labels.
    ax.tick_params(axis='x', labelbottom=i == num_subplots - 1)

    # Plot the data: one line per column of the measure's dataframe.
    current_df.plot(kind='line', linewidth=line_width, alpha=alpha, ax=ax, legend=None, color=colors)

    # One tick every NrOfSites*NrOfColumns*NrOfRows rows.
    # NOTE(review): this presumably marks plate boundaries, which only holds if
    # every plate contributes a full grid of images - confirm.
    xticks = np.arange(0, len(current_df.index), NrOfSites*NrOfColumns*NrOfRows)
    ax.set_xticks(xticks)

    if i == num_subplots - 1:
        ax.set_xticklabels(PlateNames)
        for tick in ax.xaxis.get_major_ticks():
            tick.label1.set_fontsize(font_size - 6)
            tick.label1.set_rotation(15)

# Adjust subplot layout and set legend
plt.subplots_adjust(top=0.85, hspace=0.60)
legend = fig.legend(ChannelNames, fontsize=8, frameon=False)
# Thicker legend lines so the colours are readable.
for line in legend.get_lines():
    line.set_linewidth(5.0)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd
import math

# Configuration parameters
colors = ['darkorange', 'cornflowerblue', 'forestgreen', 'red', 'yellow']
figure_size = (10, 5)
font_size = 12
num_subplots = len(data_frame_list)
y_range = [-5.5, 6]

# One row of the figure per measure; the height grows with the number of measures.
fig = plt.figure(figsize=(figure_size[0], 1.5 * num_subplots))
fig.suptitle(NameContains, fontsize=font_size*1.2, x=0.2)

# The title is the key that selects the plotted dataframe, so it always names
# the data shown (a separately built list of measure names can differ in
# length or order from data_frame_list).
for i, current_df_name in enumerate(data_frame_list):
    current_df = data_frame_dictionary[current_df_name]

    # Scale the data: StandardScaler is fitted per measure on all its columns.
    scaler = StandardScaler()
    current_df_scaled = pd.DataFrame(scaler.fit_transform(current_df), columns=ChannelNames)

    # Set subplot settings
    ax = fig.add_subplot(num_subplots, 1, i+1)
    ax.set_title(current_df_name + ' Scaled', fontsize=font_size)
    ax.set_facecolor('w')
    ax.spines['bottom'].set_color('w')
    ax.spines['top'].set_color('w')
    ax.spines['left'].set_color('lightgrey')
    ax.spines['right'].set_color('lightgrey')
    ax.set_xlim([0, len(current_df_scaled.index)])
    ax.set_ylim(y_range[0], y_range[1])
    ax.grid(visible=True, which='major', axis='x', color='lightgrey', linestyle='-', linewidth=1, alpha=1)
    ax.grid(visible=True, which='major', axis='y', color='lightgrey', linestyle='-', linewidth=1, alpha=1)
    # Only the bottom subplot shows x tick labels.
    ax.tick_params(axis='x', labelbottom=i == num_subplots - 1)

    # Plot the data: one line per channel.
    current_df_scaled.plot(kind='line', linewidth=0.5, alpha=0.5, ax=ax, legend=None, color=colors)

    # One x tick every NrOfSites*NrOfColumns*NrOfRows rows; y ticks every 2
    # units starting at the first integer inside y_range.
    xticks = np.arange(0, len(current_df_scaled.index), NrOfSites*NrOfColumns*NrOfRows)
    yticks = np.arange(start=math.ceil(y_range[0]), stop=y_range[1], step=2)
    ax.set_xticks(xticks)
    ax.set_yticks(yticks)

    if i == num_subplots - 1:
        ax.set_xticklabels(PlateNames)
        for tick in ax.xaxis.get_major_ticks():
            tick.label1.set_fontsize(font_size - 4)
            tick.label1.set_rotation(15)
    for tick in ax.yaxis.get_major_ticks():
        tick.label1.set_fontsize(6)

# Adjust subplot layout and set legend
plt.subplots_adjust(top=0.85, hspace=0.60)
legend = fig.legend(ChannelNames, fontsize=8, frameon=False)
# Thicker legend lines so the colours are readable.
for line in legend.get_lines():
    line.set_linewidth(5.0)

plt.show()

In [None]:
# List the available measures together with their position.
for i, item in enumerate(data_frame_list):
    print(i, item)

# Ask which measure to inspect; anything that is not a valid position falls
# back to the first measure.
p = input(f'Enter an integer from 0 to {len(data_frame_list) - 1}:')
try:
    p = int(p)
except ValueError:
    p = 0
if not 0 <= p < len(data_frame_list):
    # Negative numbers would silently index from the end and large ones would
    # raise IndexError, so both are rejected explicitly.
    print(f'{p} is not between 0 and {len(data_frame_list) - 1}, using 0 instead')
    p = 0

# The name is the key that selects the dataframe, so the two cannot diverge.
current_df_name = data_frame_list[p]
current_df = data_frame_dictionary[current_df_name]
# Summary statistics without the first row of describe() (the count).
current_df.describe(percentiles =  [.25, .5, .75, .90, .99])[1:]

In [None]:
# Thresholds on the scaled values; values at or beyond them count as outliers.
LowerLimitScaled = -4.5   #('-inf') # e.g. -3 for MeanIntensityScaled
UpperLimitScaled = 4.5    #('inf')

# Name of the flag column: measure name plus both limits.
NewFlagSc = f'OutlierScaled_{data_frame_list[p]}_{LowerLimitScaled}_{UpperLimitScaled}'
print(f'Outliers will be flagged in column: {NewFlagSc}')

# Start a fresh list of flag columns and initialise the new column with 0.
Flags = [NewFlagSc]
data[NewFlagSc] = 0

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected measure; the result gets a fresh 0..n-1 index and
# the channel names as column labels.
scaler = StandardScaler()
current_df_scaled = pd.DataFrame(scaler.fit_transform(current_df), columns=ChannelNames)

# True for every row where at least one channel reaches a limit.
_is_outlier_row = (
    (current_df_scaled.values >= UpperLimitScaled)
    | (current_df_scaled.values <= LowerLimitScaled)
).any(axis=1)

# Plate / well / site of the outlier rows ...
CurrentDataFrameOutliersMetadata = data.loc[
    _is_outlier_row, ["Metadata_Barcode", "Metadata_Well", "Metadata_Site"]
]

# ... and their scaled values.
CurrentDataFrameOutliersValues = current_df_scaled[_is_outlier_row]

# Both parts are joined on the row index.
CurrentDataFrameScaledOutliers = CurrentDataFrameOutliersMetadata.merge(
    CurrentDataFrameOutliersValues, left_index=True, right_index=True
)

print(len(CurrentDataFrameScaledOutliers))
CurrentDataFrameScaledOutliers

In [None]:
# Index labels of the outlier rows; those rows get a 1 in the flag column.
Outliers = CurrentDataFrameScaledOutliers.index.tolist()
data.loc[Outliers, NewFlagSc] = 1
print(
    f'{len(CurrentDataFrameScaledOutliers)} images flagged in column {NewFlagSc}'
)

In [None]:
# Thresholds on the scaled values; values at or beyond them count as outliers.
LowerLimitScaled = -4.5  # float('-inf')
UpperLimitScaled = 4.5  # float('inf')

# Names of all flag columns created below.
Flags = []
for p, CurrentDFName in enumerate(data_frame_list):
    # The name is the dictionary key itself, so name and data always belong
    # together (no lookup into a separately built list of measure names).
    CurrentDataFrame = data_frame_dictionary[CurrentDFName]

    # Standardise the measure; the scaled frame gets a fresh 0..n-1 index.
    x_unscaled = CurrentDataFrame.values
    x_scaled = StandardScaler().fit_transform(x_unscaled)
    CurrentDataFrameScaled = pd.DataFrame(x_scaled, columns=ChannelNames)

    # Flag column name: measure name plus both limits; initialised with 0.
    NewFlagSc = f"OutlierScaled_{CurrentDFName}_{LowerLimitScaled}_{UpperLimitScaled}"
    Flags.append(NewFlagSc)
    data[NewFlagSc] = 0

    # True for every row where at least one channel reaches a limit
    # (computed once and reused for both selections below).
    is_outlier_row = (
        (x_scaled >= UpperLimitScaled) | (x_scaled <= LowerLimitScaled)
    ).any(axis=1)

    # Metadata and scaled values of the outlier rows, joined on the row index.
    CurrentDataFrameOutliersMetadata = data.loc[
        is_outlier_row, ["Metadata_Barcode", "Metadata_Well", "Metadata_Site"]
    ]
    CurrentDataFrameOutliersValues = CurrentDataFrameScaled[is_outlier_row]
    CurrentDataFrameScaledOutliers = CurrentDataFrameOutliersMetadata.merge(
        CurrentDataFrameOutliersValues, left_index=True, right_index=True
    )

    # Mark the outlier rows in the flag column.
    Outliers = CurrentDataFrameScaledOutliers.index.values.tolist()
    data.loc[Outliers, NewFlagSc] = 1

# 'Total' is 1 when any of the flag columns is 1.
data['Total'] = data[Flags].max(axis = 1)
Flags.append('Total')
# Number of flagged images per flag column.
print(data[Flags].sum())

In [None]:
# Scaling bounds: scaled values at or beyond them count as outliers.
lower_limit_scaled = -4.5
upper_limit_scaled = 4.5

# Names of all flag columns created below.
flags = []

for df_name in data_frame_list:

    # Dataframe of the current measure (df_name is its dictionary key).
    current_dataframe = data_frame_dictionary[df_name]

    # Standardise the values of the measure.
    x_scaled = StandardScaler().fit_transform(current_dataframe.values)
    current_dataframe_scaled = pd.DataFrame(x_scaled, columns=ChannelNames)

    # Flag column name: measure name plus both limits.
    new_flag_scaled = f"OutlierScaled_{df_name}_{lower_limit_scaled}_{upper_limit_scaled}"
    flags.append(new_flag_scaled)

    # True for every row where at least one channel reaches a limit.
    is_outlier_condition = (
        (x_scaled >= upper_limit_scaled) | (x_scaled <= lower_limit_scaled)
    ).any(axis=1)

    # The mask has one entry per row of `data`, in row order, so it can be
    # written to the flag column directly (1 = outlier, 0 = not an outlier).
    data[new_flag_scaled] = is_outlier_condition.astype(int)

# Add a 'Total' column which is the maximum of all flag columns
data['Total'] = data[flags].max(axis=1)

# Append 'Total' to the list of flags
flags.append('Total')

# Print the number of flagged images per flag column
print(data[flags].sum())


In [None]:
# Identification columns, the nuclei count, the columns of the first measure
# and every flag column listed in Flags.
df_flags = data[
    ['Metadata_Barcode', 'Metadata_AcqID', 'Metadata_Well', 'Metadata_Site', 'Count_nuclei']
    + data_frame_dictionary[data_frame_list[0]].columns.tolist()
    + Flags
]
df_flags

In [None]:
# Prefix that the project name has to start with.
NameContains = 'AROS-Reproducibility-MoA'

# Every 'cp-features' analysis with a non-null analysis_date for the matching
# projects, ordered by acquisition id and analysis id.
query = f"""
        SELECT *
        FROM image_analyses_per_plate
        WHERE project LIKE '{NameContains}%%'
        AND meta->>'type' = 'cp-features'
        AND analysis_date IS NOT NULL
        ORDER BY plate_acq_id, analysis_id
        """

# Run the query; note that this replaces the content of df_cp_results.
df_cp_results = pd.read_sql_query(sql=query, con=db_uri)

In [None]:
# Display the current content of df_cp_results.
df_cp_results

In [None]:
import os
import datetime
import re
import numpy as np
import pandas as pd

# df_cp_results (one row of analysis metadata per plate) must already be
# defined by an earlier cell.

# Directory for the per-plate output files; created when it does not exist yet.
OutputDir = 'ImageMeanFeatures'
os.makedirs(OutputDir, exist_ok=True)

# Setting the prefix for output file names.
plateNamePrefix = 'ImageMeanPlate'

# The three feature tables that are read for every plate.
feature_names = ['featICF_nuclei', 'featICF_cells', 'featICF_cytoplasm']

print('Start: ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Loop through each plate in the meta data.
for index, oneplate_analysis_meta in df_cp_results.iterrows():
    print(f'Processing plate: {index + 1} of {len(df_cp_results.index)}')

    # Checking if the output file already exists, if so skip this iteration.
    one_plate_filename = f'{OutputDir}/{plateNamePrefix}_{oneplate_analysis_meta["plate_acq_name"]}.parquet'
    if os.path.exists(one_plate_filename):
        print(f'File exists already, skipping this plate: {one_plate_filename}')
        continue

    # Load and process data for each feature: nuclei, cells and cytoplasm.
    df_features = {}  # A dictionary to hold dataframes for each feature.
    for feature_name in feature_names:
        feature_file = f"{oneplate_analysis_meta['results']}{feature_name}.parquet"
        print(f'Reading feature file: {feature_file}')
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        df_feature = pd.read_parquet(feature_file)
        # Every column gets the table name as suffix ('nuclei', 'cells' or
        # 'cytoplasm'); the suffix is the same for all columns of one table.
        object_suffix = re.sub('_.*', '', re.sub('featICF_', '', feature_name))
        df_feature.columns = [f"{col}_{object_suffix}" for col in df_feature.columns]
        df_features[feature_name] = df_feature  # Adding the dataframe to our dictionary.
        print(f'Finished reading file. Dataframe shape: {df_feature.shape}')

    # Merge the features dataframes: nuclei rows are matched to the cells row
    # whose ObjectNumber equals the nucleus' Parent_cells (same barcode/site/well).
    df = df_features['featICF_nuclei'].merge(df_features['featICF_cells'],
                                             left_on=['Metadata_Barcode_nuclei', 'Metadata_Site_nuclei', 'Metadata_Well_nuclei', 'Parent_cells_nuclei'],
                                             right_on=['Metadata_Barcode_cells', 'Metadata_Site_cells', 'Metadata_Well_cells', 'ObjectNumber_cells'],  # Update column names
                                             suffixes=('_nuclei', '_cells'),
                                             how='left')

    # The cytoplasm table is matched on the same nuclei keys.
    df = df.merge(df_features['featICF_cytoplasm'],
                  left_on=['Metadata_Barcode_nuclei', 'Metadata_Site_nuclei', 'Metadata_Well_nuclei', 'Parent_cells_nuclei'],
                  right_on=['Metadata_Barcode_cytoplasm', 'Metadata_Site_cytoplasm', 'Metadata_Well_cytoplasm', 'ObjectNumber_cytoplasm'],  # Update column names
                  suffixes=('_nuclei', '_cytoplasm'),
                  how='left')

    df.reset_index(drop=True, inplace=True)

    # Add plate and barcode information to the dataframe (taken from the
    # analysis metadata, overwriting the barcode read from the file).
    df['Metadata_AcqID_nuclei'] = oneplate_analysis_meta['plate_acq_id']
    df['Metadata_Barcode_nuclei'] = oneplate_analysis_meta['plate_barcode']

    # Renaming the columns.
    df.rename(columns={'Metadata_Barcode_nuclei': 'Metadata_Barcode',
                       'Metadata_Well_nuclei': 'Metadata_Well',
                       'Metadata_Site_nuclei': 'Metadata_Site',
                       'Metadata_AcqID_nuclei': 'Metadata_AcqID'}, inplace=True)

    # Adding an ImageID column by combining certain fields.
    df['ImageID'] = df['Metadata_AcqID'].astype(str) + '_' + df['Metadata_Barcode'] + '_' + df['Metadata_Well'] + '_' + df['Metadata_Site'].astype(str)

    # Select numeric columns to calculate the mean (NaN values are ignored).
    numeric_columns = df.select_dtypes(include=np.number).columns.tolist()
    aggregation_functions = {i: np.nanmean for i in numeric_columns}

    # One row per image: the mean of every numeric column over the image's objects.
    print('Starting grouping by image ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    groupedbyImage = df.groupby(['ImageID', 'Metadata_Barcode', 'Metadata_Well', 'Metadata_Site', 'Metadata_AcqID'], as_index=False).agg(aggregation_functions)

    # Save grouped data to a file.
    print('Saving to parquet file ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    groupedbyImage.to_parquet(one_plate_filename)
    print('Finished saving to parquet file ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Now finally read all one-plate files and concat them into one dataframe.
# The frames are collected first and concatenated once; concatenating inside
# the loop would copy all previously read rows again on every iteration.
one_plate_frames = []
for index, oneplate_analysis_meta in df_cp_results.iterrows():
    one_plate_filename = f'{OutputDir}/{plateNamePrefix}_{oneplate_analysis_meta["plate_acq_name"]}.parquet'
    print(f'read file: {one_plate_filename}')
    df = pd.read_parquet(one_plate_filename)
    one_plate_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
groupedbyImageAllPlates = pd.concat(one_plate_frames) if one_plate_frames else pd.DataFrame()

# NOTE(review): the combined dataframe is never written to all_plates_outfile
# in this cell - add a to_parquet call here if the AllPlates file is wanted.
all_plates_outfile = f'{OutputDir}/{plateNamePrefix}AllPlates.parquet'
groupedbyImageAllPlates


In [None]:
# Keep only the images whose ObjectNumber_nuclei value is at least 10
# (intended to drop images that have less than 10 cells).
# NOTE(review): if ObjectNumber_nuclei holds a per-image mean of object numbers
# rather than an object count, this threshold does not correspond to 10 cells -
# confirm how the column was aggregated.
df = groupedbyImageAllPlates.loc[groupedbyImageAllPlates['ObjectNumber_nuclei'] >= 10]
print("df after dropping low nuclei instances", df.shape)

In [None]:
# Directory for the box plots. exist_ok=True makes the call safe to repeat and
# avoids the gap between checking for the directory and creating it.
OutputDir = 'Boxplots_mean_2023_04_27_P013725_only_MarisCopy_MartinCheck'
os.makedirs(OutputDir, exist_ok=True)

In [None]:
# Plate-layout rows whose layout_id contains 'sarscov2' (case-insensitive).
# The connection string db_uri is the one defined in the notebook's first
# cell; it is not repeated here so the credentials live in one place only.
query = """
        SELECT *
        FROM plate_v1
        WHERE layout_id ilike '%%sarscov2%%'
        """
df_plates = pd.read_sql_query(query, db_uri)
df_plates

In [None]:
# The connection string db_uri is the one defined in the notebook's first
# cell; it is not repeated here so the credentials live in one place only.

# Create a list of barcodes you want to filter on
barcode_list = df_cp_results.plate_barcode.to_list()
if not barcode_list:
    # An empty list would produce "IN ()", which is not valid SQL.
    raise ValueError('df_cp_results contains no plate barcodes to filter on')

# Convert the barcode list to a comma-separated string for SQL. Each distinct
# barcode is listed once (dict.fromkeys keeps the first-seen order); repeating
# a value inside IN (...) would not change the result.
barcode_str = ', '.join(f"'{item}'" for item in dict.fromkeys(barcode_list))

# Substring that layout_id must contain (case-insensitive match).
NameContain = 'AROS'

query = f"""
        SELECT *
        FROM plate_v1
        WHERE layout_id ILIKE '%%{NameContain}%%'
        AND barcode IN ({barcode_str})
        """
df_plates = pd.read_sql_query(query, db_uri)
df_plates.shape

In [None]:
# Show the first row of df_plates to inspect the available columns and values.
df_plates.iloc[0]

In [None]:
# The connection string db_uri is the one defined in the notebook's first
# cell; it is not repeated here so the credentials live in one place only.

# Barcodes of the plates in df_cp_results
barcode_list = df_cp_results.plate_barcode.to_list()
if not barcode_list:
    # Without any condition the WHERE clause would be empty, i.e. invalid SQL.
    raise ValueError('df_cp_results contains no plate barcodes to filter on')

# One condition per distinct barcode: layout_id must contain NameContain
# followed (anywhere later) by the barcode, case-insensitively.
# dict.fromkeys removes repeated barcodes while keeping their order.
NameContain = 'AROS'
conditions = [f"layout_id ILIKE '%%{NameContain}%%{barcode}%%'" for barcode in dict.fromkeys(barcode_list)]

# Join conditions with 'OR' to create the final condition for WHERE clause
where_clause = " OR ".join(conditions)

query = f"""
    SELECT *
    FROM plate_v1
    WHERE {where_clause}
"""

df_plates = pd.read_sql_query(query, db_uri)
df_plates.shape

In [None]:
# No barcode list is used here: the rows are selected by layout name only.

# Substring that layout_id must contain (case-insensitive match).
NameContain = 'AROS'

# All plate-layout rows of the matching layouts that have a non-empty barcode.
query = f"""
        SELECT *
        FROM plate_v1
        WHERE layout_id ILIKE '%%{NameContain}%%'
        AND barcode <> ''
        """
df_plates = pd.read_sql_query(sql=query, con=db_uri)
df_plates.shape

In [None]:
# Attach the plate-layout information to every image: barcode and well of the
# image must equal barcode and well_id of the layout row. Images without a
# matching layout row are kept (left join).
df_merged = df.merge(
    df_plates,
    how='left',
    left_on=['Metadata_Barcode', 'Metadata_Well'],
    right_on=['barcode', 'well_id'],
)
df_merged

In [None]:
# Size of the layout table and number of distinct batchid values
# (a missing value counts as one value, as with len(unique())).
print("df_plates", df_plates.shape)
print("Unique batchid inclusive 6 controls:", df_plates['batchid'].nunique(dropna=False))
df_plates.head(1)

In [None]:
# 'compound' and 'concentration' are copies of 'batchid' and 'cmpd_conc'.
df_merged = df_merged.assign(
    compound=df_merged['batchid'],
    concentration=df_merged['cmpd_conc'],
)

# Distinct compound values before any rows are dropped.
unique_comp_initial = df_merged['compound'].unique()
print("Number of unique compounds", len(unique_comp_initial))
unique_comp_initial

In [None]:
# Remove the rows that have no compound and renumber the rest 0..n-1.
df_merged = df_merged.dropna(subset=['compound']).reset_index(drop=True)
df_merged

In [None]:
# Attach the QC flags to every image; the key columns have the same names on
# both sides. Images without QC data are kept (left join).
df_merged2 = df_merged.merge(
    df_flags,
    on=['Metadata_AcqID', 'Metadata_Barcode', 'Metadata_Well', 'Metadata_Site'],
    how='left',
)
print("df_merged2:", df_merged2.shape)

# Keep only the images for which 'Total' is 0, i.e. no flag was raised.
df_merged3 = df_merged2.loc[df_merged2['Total'] == 0].copy()
print("df_merged3:", df_merged3.shape)

print("Reduction by", len(df_merged2) - len(df_merged3))
print("Number of flagged instances in QC was", len(df_flags.loc[df_flags['Total'] == 1]))

In [None]:
# Images without a QC match get NaN in 'Total' from the left merge with
# df_flags. They have to be looked up in df_merged2: df_merged3 only contains
# rows with Total == 0, so a NaN check on it can never find anything.
df_merged3_nan = df_merged2[df_merged2['Total'].isnull()]
print("There are", len(df_merged3_nan), "NaN instances")

In [None]:
# Number of images per cbkid value.
df_merged3['cbkid'].value_counts()

In [None]:
# Number of images per pert_type value.
df_merged3['pert_type'].value_counts()

In [None]:
# Images whose pert_type is 'poscon' and the distinct batch ids among them.
poscon = df_merged3.loc[df_merged3['pert_type'] == 'poscon']
poscon['batchid'].unique()

In [None]:
# Images whose pert_type is 'negcon' and the distinct batch ids among them.
negcon = df_merged3.loc[df_merged3['pert_type'] == 'negcon']
negcon['batchid'].unique()