In [None]:
import polars as pl

# Connection string for the image database.
# NOTE(review): the user name and password are embedded in this URI; consider
# reading it from an environment variable before sharing the notebook.
db_uri = 'postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb'

# One row per distinct project name (GROUP BY collapses duplicates), sorted by name.
query = """
        SELECT project
        FROM image_analyses_per_plate
        GROUP BY project
        ORDER BY project 
        """

# Query database and store result in Polars dataframe
df_projects = pl.read_database(query, db_uri)

# Show only the first five project names.
df_projects.head(5)

In [5]:
# pharmbio equivalent of the raw SQL project query

from pharmbio.dataset import get_projects_list

# NOTE(review): judging by the recorded output, `lookup` appears to filter project
# names by a case-insensitive substring — confirm against the pharmbio docs.
get_projects_list(lookup='cov')

['Covid19-Profiling', 'sarscov2-repurposing']

In [None]:
import polars as pl
from collections import Counter

# Connection string for the image database.
db_uri = 'postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb'

# Project-name prefix used in the LIKE filter below.
NameContains = 'AROS-Reproducibility-MoA'
query = f"""
        SELECT *
        FROM image_analyses_per_plate
        WHERE project LIKE '{NameContains}%%'
        AND meta->>'type' = 'cp-qc'
        AND analysis_date IS NOT NULL
        ORDER BY plate_barcode 
        """

# Fetch the dated cp-qc analyses of the matching project(s) into a Polars dataframe
df_cp_results = pl.read_database(query, db_uri)

# Value is discarded (not the last expression of the cell); kept as in the original.
df_cp_results['analysis_id'].to_list()

# Tally how often each plate barcode occurs and list the barcodes seen more than once.
plate_barcodes = df_cp_results['plate_barcode'].to_list()
counter = Counter(plate_barcodes)
[barcode for barcode, occurrences in counter.items() if occurrences > 1]

In [1]:
# pharmbio equivalent of the manual query + duplicate check above

from pharmbio.dataset import get_projects_list, ExperimentData

# Per the recorded output, constructing the object queries the database for the
# named project and prints the plates whose analysis is replicated.
data = ExperimentData(Name='AROS-CP')

Quering the db for AROS-CP
Analysis for the plate with barcode P009-P012-CACO2 is replicated 5 times with analysis_id of [446, 476, 477, 478, 479]
Analysis for the plate with barcode P005-P008-A549 is replicated 4 times with analysis_id of [472, 473, 474, 475]
Analysis for the plate with barcode P001-P004-U2OS is replicated 4 times with analysis_id of [468, 469, 470, 471]


In [3]:
# Display the QC table held by the ExperimentData object.
# NOTE(review): in the recorded output each replicated plate appears once, with the
# highest analysis_id of its replicates — confirm that this is the intended rule.
data.df_qc

project,plate_barcode,plate_acq_name,plate_acq_id,analysis_id,analysis_date,analysis_error,meta,pipeline_name,results
str,str,str,i32,i32,str,str,str,str,str
"""AROS-CP""","""CoP013737-U2OS…","""CoP013737-U2OS…",1080,364,"""2021-06-14""",,"""{""type"":""cp-qc…","""384-96_QC-batc…","""/share/data/ce…"
"""AROS-CP""","""P001-P004-U2OS…","""P001-P004-U2OS…",1152,471,"""2021-08-01""",,"""{""type"":""cp-qc…","""384-96_QC-batc…","""/share/data/ce…"
"""AROS-CP""","""P005-P008-A549…","""P005-P008-A549…",1140,475,"""2021-08-01""",,"""{""type"":""cp-qc…","""384-96_QC-batc…","""/share/data/ce…"
"""AROS-CP""","""P009-P012-CACO…","""P009-P012-CACO…",1148,479,"""2021-08-01""",,"""{""type"":""cp-qc…","""384-96_QC-batc…","""/share/data/ce…"


In [None]:
from collections import Counter

# Sample input containing repeated items
my_list = ['apple', 'banana', 'apple', 'pear', 'banana', 'kiwi']

# Start from an empty tally and feed the whole list into it
counter = Counter()
counter.update(my_list)

print(counter)

In [None]:
import polars as pl

# Connection string for the image database.
db_uri = 'postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb'

# Project-name prefix used in the LIKE filter below.
NameContains = 'AROS-Reproducibility-MoA'
query = f"""
        SELECT *
        FROM image_analyses_per_plate
        WHERE project LIKE '{NameContains}%%'
        AND meta->>'type' = 'cp-qc'
        AND analysis_date IS NOT NULL
        ORDER BY plate_barcode 
        """

# Fetch the dated cp-qc analyses of the matching project(s) into a Polars dataframe
df_cp_results = pl.read_database(query, db_uri)

# Rows whose plate_barcode occurs more than once in the result
barcode_is_repeated = pl.col('plate_barcode').is_duplicated()
duplicates = df_cp_results.filter(barcode_is_repeated)

if duplicates.height > 0:
    # Report, per replicated barcode, how many analyses exist and their ids
    grouped_duplicates = duplicates.groupby('plate_barcode')
    for barcode, replicate_rows in grouped_duplicates:
        replicate_ids = replicate_rows['analysis_id'].to_list()
        print(f"The plate with barcode {barcode} is replicated {len(replicate_rows)} times with analysis_id of {replicate_ids}")

# Value is discarded (not the last expression of the cell); kept as in the original.
df_cp_results.n_unique('plate_barcode')

df_cp_results

In [None]:
# Keep the highest analysis_id of each replicated plate_barcode: sort descending so
# that row comes first, keep the first row per barcode, then sort ascending again.
# The result is only displayed; df_cp_results itself is not reassigned.
df_cp_results.sort("analysis_id", descending=True).unique('plate_barcode', keep='first').sort("analysis_id")

In [None]:
# Drop rows by analysis_id: show df_cp_results without the rows whose analysis_id is
# in the list. The result is only displayed; df_cp_results is not reassigned.
df_cp_results.filter(~pl.col('analysis_id').is_in([475, 471, 479]))

In [None]:
# Keep rows by analysis_id: show only the rows whose analysis_id is in the list.
# The result is only displayed; df_cp_results is not reassigned.
df_cp_results.filter(pl.col('analysis_id').is_in([475, 471, 479]))

In [None]:
# for data with csv format : AROS-CP

import polars as pl

# Add the path of each plate's QC file: <results>qcRAW_images_<plate_barcode>.csv
qc_file_path = pl.col('results') + 'qcRAW_images_' + pl.col('plate_barcode') + '.csv'
df_cp_results = df_cp_results.with_columns(qc_file_path.alias('qc-file'))

# Read every CSV file and stack them into one DataFrame
df_all_files = pl.DataFrame()
for idx, plate_row in enumerate(df_cp_results.iter_rows(named=True)):
    csv_path = plate_row['qc-file']

    # Load the file and tag every row with the acquisition id and barcode of its plate
    df_data_from_one_file = pl.read_csv(csv_path).with_columns(
        pl.lit(plate_row['plate_acq_id']).alias('Metadata_AcqID'),
        pl.lit(plate_row['plate_barcode']).alias('Metadata_Barcode')
    )

    print(f'df_data_from_one_file no: {idx} contains {df_data_from_one_file.width} columns and {df_data_from_one_file.height} rows. name: {csv_path}')

    df_all_files = df_all_files.vstack(df_data_from_one_file)


In [None]:
# Data with parquet format : AROS-Reproducibility-MoA

import polars as pl

# Leave out the analyses that should not be loaded
excluded_analysis_ids = [3241]
is_wanted_analysis = ~pl.col('analysis_id').is_in(excluded_analysis_ids)
df_filtered_results = df_cp_results.filter(is_wanted_analysis)

# Path of each plate's QC file: <results>qcRAW_images_<plate_barcode>.parquet
qc_file_path = pl.col('results') + 'qcRAW_images_' + pl.col('plate_barcode') + '.parquet'
df_filtered_results = df_filtered_results.with_columns(qc_file_path.alias('qc-file'))


print(f'Experiment has {df_filtered_results.height} parquet files in its path.\n')

# Accumulator for the rows of all parquet files
df_concatenated_files = pl.DataFrame()

for file_number, plate_row in enumerate(df_filtered_results.iter_rows(named=True), start=1):
    parquet_path = plate_row['qc-file']

    # Load the file and tag every row with the acquisition id and barcode of its plate
    df_single_file_data = pl.read_parquet(parquet_path).with_columns(
        pl.lit(plate_row['plate_acq_id']).alias('Metadata_AcqID'),
        pl.lit(plate_row['plate_barcode']).alias('Metadata_Barcode')
    )

    print(f'File {file_number} contains {df_single_file_data.width} columns and {df_single_file_data.height} rows. Path: {parquet_path}')

    # Append this file's rows to what has been collected so far
    df_concatenated_files = df_concatenated_files.vstack(df_single_file_data)
    


In [None]:
# Add derived identifier columns to a working copy of the concatenated QC data.
df_data = df_concatenated_files.clone()

# with_columns returns a NEW DataFrame and leaves its receiver untouched, so the
# result has to be assigned back; otherwise ImageID never ends up in df_data.
# ImageID = "<Metadata_AcqID>_<Metadata_Well>_<Metadata_Site>"
df_data = df_data.with_columns(
    (pl.col('Metadata_AcqID').cast(pl.Utf8) + '_' + pl.col('Metadata_Well') + '_' + pl.col('Metadata_Site').cast(pl.Utf8)).alias('ImageID')
)

# Columns from the earlier pandas version that have not been ported to polars yet:
# df_data['Metadata_AcqID'] = df_data['Metadata_AcqID'].astype(int).astype(str)
# df_data['Metadata_Site'] = df_data['Metadata_Site'].astype(int).astype(str)
# df_data['ImageID'] = df_data['Metadata_AcqID'] + '_' + df_data['Metadata_Well'] + '_' + df_data['Metadata_Site']
# df_data['barcode'] = df_data['Metadata_Barcode']
# df_data['well_id'] = df_data['Metadata_Well']
# df_data['plate'] = df_data['Metadata_Barcode']
# df_data['plate-name'] = df_data['Metadata_Barcode']
# df_data['plateWell'] = df_data['Metadata_Barcode'] + '_' + df_data['Metadata_Well']
# df_data['site'] = df_data['Metadata_Site']

# display(df_data.tail(2))

# Show the frame including the new ImageID column.
df_data

In [None]:
# Work on a copy of the concatenated QC data. The copy is taken OUTSIDE the try
# block: if df_concatenated_files is missing, the cell now fails right here instead
# of printing 'Plate names not specified' and then continuing with an undefined
# (or stale, from an earlier cell) `data` object.
data = df_concatenated_files.clone()

try:
    # Sorted list of the distinct plate barcodes
    plate_names = (
        data.select('Metadata_Barcode')
        .unique()
        .sort(by='Metadata_Barcode')
        .to_series()
        .to_list()
    )
    print(plate_names)
except Exception:
    # Best effort: carry on without plate names (expected_images below becomes 0)
    print('Plate names not specified')
    plate_names = []

data = data.sort(['Metadata_Barcode','Metadata_Well', 'Metadata_Site'])

# Distinct well names, sorted
wells = data.select('Metadata_Well').unique().sort(by='Metadata_Well').to_series().to_list()
number_of_wells = len(wells)
print(f'Number of wells: {number_of_wells}')

# Row label = first character of the well name
rows = sorted({w[0] for w in wells})
number_of_rows = len(rows)
print(*rows)

# Column label = everything after the first character (sorted as strings)
columns = sorted({w[1:] for w in wells})
number_of_columns = len(columns)
print(*columns)

# Every row/column combination, whether or not that well occurs in the data
all_wells = [(x+y) for x in rows for y in columns]

# Distinct site identifiers, sorted
sites = data.select('Metadata_Site').unique().sort(by='Metadata_Site').to_series().to_list()
number_of_sites = len(sites)
print(f'Number of sites: {number_of_sites}')

# One data row per image; the expectation assumes every plate has every well and site
total_images = data.shape[0]
expected_images = len(plate_names) * number_of_wells * number_of_sites

print(f'Processed {total_images} of {expected_images} images')

In [None]:
import re

# Columns produced by the ImageQuality module (name contains 'ImageQuality_')
image_quality_cols = list(filter(lambda column_name: "ImageQuality_" in column_name, data.columns))

# The same names with the 'ImageQuality_' marker stripped out
image_quality_module = []
for column_name in image_quality_cols:
    image_quality_module.append(column_name.replace('ImageQuality_', ''))

# A measure is whatever precedes the first underscore of the stripped name;
# collect the distinct ones in sorted order
distinct_measures = set()
for stripped_name in image_quality_module:
    distinct_measures.add(re.sub('_.*', '', stripped_name))
image_quality_measures = sorted(distinct_measures)
count_measures = len(image_quality_measures)

print(f'Image Quality module has measured {count_measures} parameters: {", ".join(image_quality_measures)}')

In [None]:
# Measures that are left out of the plots below
not_so_useful = ['TotalArea', 'Scaling', 'TotalIntensity', 'Correlation', 'PercentMinimal',
                 'LocalFocusScore', 'MinIntensity', 'MedianIntensity', 'MADIntensity',
                 'ThresholdMoG', 'ThresholdBackground', 'ThresholdKapur', 'ThresholdMCT',
                 'ThresholdOtsu', 'ThresholdRidlerCalvard', 'ThresholdRobustBackground',
                 'PercentMaximal']

# Keep the remaining measures in their existing order
image_quality_measures = list(filter(lambda name: name not in not_so_useful, image_quality_measures))
count_measures = len(image_quality_measures)

print(f'I will use {count_measures} parameters: {", ".join(image_quality_measures)}')

# For each kept measure, the sub-frame of image-quality columns whose name contains '_<measure>'
data_frame_dictionary = {}
for measure in image_quality_measures:
    measure_columns = [col for col in image_quality_cols if f'_{measure}' in col]
    data_frame_dictionary[measure] = data[measure_columns]

# Measure names in sorted order, used for positional access in later cells
data_frame_list = sorted(data_frame_dictionary)

In [None]:
# Print position (1-based), name and column count of every measure that has more
# than 5 columns. Iterates over data_frame_list itself rather than a fixed
# range(22), which raised IndexError whenever fewer than 22 measures were present
# (e.g. after the not_so_useful filter has been applied).
# Previously noted measures: Correlation, LocalFocusScore, ThresholdMoG, ThresholdOtsu
for position, measure in enumerate(data_frame_list, start=1):
    column_count = len(data_frame_dictionary[measure].columns)
    if column_count > 5:
        print(position, measure, column_count)

In [None]:
# Derive the channel names from the columns of the second measure in data_frame_list:
# the channel is whatever follows the last underscore of the column name.
reference_frame = data_frame_dictionary[data_frame_list[1]]
channel_names = []
for column_name in reference_frame.columns:
    channel_names.append(re.sub('.*_', '', column_name))
channel_names

In [None]:
import plotly.figure_factory as ff
import numpy as np

# One annotated heatmap per plate showing the mean nuclei count of every well.

# x tick labels are derived from the plate columns actually present instead of a
# hard-coded 1..24: create_annotated_heatmap requires len(x) to equal the width of
# the z matrix, so a fixed 24 only worked for plates with exactly 24 columns.
# Numeric labels ('01') become ints (1) as before; anything else is used verbatim.
column_labels = [int(label) if label.isdigit() else label for label in columns]

for plate in plate_names:
    plate_data = data.filter(pl.col('Metadata_Barcode') == plate)
    heatmap_data = []
    for row in rows:
        heatmap_row = []
        for column in columns:
            well = row + column
            count_nuclei = plate_data.filter(pl.col('Metadata_Well') == well)['Count_nuclei'].to_numpy()

            # A well without any image rows is shown as 0
            if count_nuclei.size == 0:
                well_nuclei_count = 0  # Or whatever value you'd like to use for missing data
            else:
                # Mean over the well's images, rounded to a whole number
                well_nuclei_count = np.mean(count_nuclei).round(decimals = 0).astype(int)

            heatmap_row.append(well_nuclei_count)
        heatmap_data.append(heatmap_row)

    # Empty annotation strings: cells carry no text, values are shown on hover only
    annotation_text = [["" for _ in range(len(row))] for row in heatmap_data]
    fig = ff.create_annotated_heatmap(heatmap_data, x=column_labels, y=rows,
                                      annotation_text=annotation_text,
                                      colorscale='OrRd', 
                                      hoverinfo='z')
    fig.update_layout(
        title_text=f'Plate: {plate}',
        width=700)
    fig.update_xaxes(side='bottom')
    # First row label at the top, like on a physical plate
    fig['layout']['yaxis']['autorange'] = "reversed"
    fig.show()


In [None]:
import plotly.figure_factory as ff
import plotly.subplots as sp

# Grid of heatmaps (one subplot per plate) showing the mean nuclei count per well.
# NOTE: `np` is not imported in this cell; it relies on the numpy import of an earlier cell.

# Layout settings for the grid
plot_size = 400
font_ratio = plot_size/400
num_columns = 2
num_rows = -(-len(plate_names) // num_columns)  # Ceiling division to get number of rows needed

# x tick labels are derived from the plate columns actually present instead of a
# hard-coded '1'..'24': create_annotated_heatmap requires len(x) to equal the width
# of the z matrix, so a fixed 24 only worked for plates with exactly 24 columns.
# Numeric labels ('01') are shown without zero padding ('1') as before.
column_labels = [str(int(label)) if label.isdigit() else label for label in columns]

# Create a subplot with num_rows rows and num_columns columns
fig = sp.make_subplots(rows=num_rows, cols=num_columns, subplot_titles=plate_names)

for index, plate in enumerate(plate_names):
    plate_data = data.filter(pl.col('Metadata_Barcode') == plate)
    heatmap_data = []
    for row in rows:
        heatmap_row = []
        for column in columns:
            well = row + column
            count_nuclei = plate_data.filter(pl.col('Metadata_Well') == well)['Count_nuclei'].to_numpy()

            # A well without any image rows is shown as 0
            if count_nuclei.size == 0:
                well_nuclei_count = 0
            else:
                # Mean over the well's images, rounded to a whole number
                well_nuclei_count = np.mean(count_nuclei).round(decimals = 0).astype(int)

            heatmap_row.append(well_nuclei_count)
        heatmap_data.append(heatmap_row)

    # Calculate the subplot row and column indices
    subplot_row = index // num_columns + 1
    subplot_col = index % num_columns + 1

    heatmap = ff.create_annotated_heatmap(
        heatmap_data,
        x=column_labels,
        y=rows,
        annotation_text=heatmap_data,
        colorscale='OrRd',
        hoverinfo='z'
    )

    # Only the heatmap trace is copied into the grid; the figure's layout
    # (including its annotations) is not transferred.
    fig.add_trace(heatmap.data[0], row=subplot_row, col=subplot_col)

# Scale the subplot titles and both axes' tick fonts with the plot size
for i in fig['layout']['annotations']:
    i['font'] = dict(size=12*font_ratio)
fig.update_xaxes(tickfont=dict(size=10*font_ratio), nticks=48, side='bottom')
fig.update_yaxes(autorange="reversed", tickfont=dict(size=10*font_ratio))
fig.update_layout(height=plot_size*num_rows, width=plot_size*1.425*num_columns)
fig.show()

In [None]:
import plotly.subplots as sp
import plotly.graph_objects as go

# One line-plot subplot per image-quality measure, one line per channel column.

# Line colours, reused cyclically per channel
colors = ['blue', 'green', 'red', 'purple', 'orange']

fig = sp.make_subplots(rows=len(image_quality_measures), cols=1, subplot_titles=image_quality_measures, x_title='Plates')

for x in range(len(image_quality_measures)):
    CurrentDataFrame = data_frame_dictionary.get(data_frame_list[x])

    min_val = CurrentDataFrame.min().to_numpy().min()  # minimum of all columns
    max_val = CurrentDataFrame.max().to_numpy().max()  # maximum of all columns

    # x positions are the row indices as strings; identical for every column of
    # this measure, so they are built once per measure
    x_positions = [str(j) for j in range(CurrentDataFrame.height)]

    for i, column in enumerate(CurrentDataFrame.columns):
        channel_name = channel_names[i]

        fig.add_trace(
            go.Scatter(
                x=x_positions,
                y=CurrentDataFrame[column],
                mode='lines',
                line=dict(width=0.5, color=colors[i % len(colors)]),
                # Data traces never get a legend entry of their own; the dummy
                # traces added below provide one entry per channel.
                showlegend=False,
                # Same name in every subplot (it was blanked in the first one only),
                # so hover labels are consistent.
                name=channel_name,
                legendgroup=channel_name, 
            ),
            row=x + 1,
            col=1,
        )

    fig.update_xaxes(range=[0, CurrentDataFrame.height], showticklabels=False, row=x+1, col=1)

    # Vertical dash-dot line at the middle row index, spanning this measure's value range.
    # NOTE(review): y0/y1 are data values although yref is "paper" — presumably
    # row/col re-target the shape to this subplot's axes; confirm against plotly docs.
    fig.add_shape(type="line",
        xref="x", yref="paper",
        x0=CurrentDataFrame.height/2, y0=min_val, x1=CurrentDataFrame.height/2, y1=max_val,
        line=dict(
            color="Black",
            width=1,
            dash="dashdot",
        ),
        row=x + 1,
        col=1
    )

# Dummy traces for the legend
for i, channel_name in enumerate(channel_names):
    fig.add_trace(
        go.Scatter(
            x=[None],  # these traces won't appear
            y=[None],
            mode='lines',
            line=dict(width=3, color=colors[i % len(colors)]),  # this will be the width in the legend
            legendgroup=channel_name,
            name=channel_name,  # this will be the name in the legend
        ),
    )

# Add main title
fig.update_layout(height=1.5*len(image_quality_measures)*100, title_text=NameContains, title_x=0.1, width=1400)

fig.show()
