# pharmbio package

```
pip install pharmbio
```

version 0.1.7

<div style="text-align:center">
    <img src="test/Analysis_Diagram.jpg" alt="Pipeline" width="200"/>
</div>

<div style="text-align:center">
    <img src="test/pharmbio_package.svg" alt="pharmbio python package structure and modules" width="400"/>
</div>

### For security reasons, the URI of the database is defined as an environment variable

In [None]:
# IPython line magic: set DB_URI for this notebook session.
%env DB_URI=postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb

In [None]:
import os

# Plain-Python alternative to the `%env` magic: write DB_URI into the
# environment of the current process.
os.environ[
    "DB_URI"
] = "postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb"

### Retrieve all the experiments in image_db

In [None]:
from pharmbio.dataset.experiment import get_projects_list

# Called without arguments; per the heading above this lists every
# experiment in image_db.
get_projects_list()

### Look up experiments in image_db by name

In [None]:
from pharmbio.dataset.experiment import get_projects_list

# `lookup` narrows the listing to experiments whose name matches "aros".
# NOTE(review): the exact matching rule (substring, case handling) is not
# shown in this notebook -- confirm against get_projects_list.
get_projects_list(lookup="aros")

### Set the logger level
Possible values: 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'.

The level name is not case-sensitive.

In [None]:
from pharmbio.logger import set_logger_level

# The level is passed as a string; lowercase works because, per the note
# above, the name is not case-sensitive.
set_logger_level("debug")

Calling get_image_quality_ref() with the default values for its arguments returns the quality control reference dataframe and, for duplicated plates, keeps only the entry that has the highest experiment id value (analysis_id).

In [None]:
from pharmbio.dataset.image_quality import get_image_quality_ref, get_image_quality_data

# Default arguments: only the experiment name is given.
qc_ref_df = get_image_quality_ref("AROS-Reproducibility-MoA-Full")
qc_ref_df  # last expression of the cell, so the notebook displays it

If we set drop_replication to "None", then we get:

In [None]:
# drop_replication is given the string "None" (not Python's None), so
# duplicated plates stay in the returned reference dataframe.
qc_ref_df = get_image_quality_ref(
    "AROS-Reproducibility-MoA-Full", drop_replication="None"
)
qc_ref_df

As you can see, the plate with barcode 'P013726' is duplicated (it has two analysis_id values, 3241 and 3249). We can choose which experiments to keep by listing their analysis ids. For example, if we only want the plate with analysis id 3249, we can do this:

In [None]:
# keep_replication takes a list of analysis ids; here only analysis 3249
# is requested for the duplicated plate.
qc_ref_df = get_image_quality_ref(
    "AROS-Reproducibility-MoA-Full", keep_replication=[3249]
)
qc_ref_df

If we want to keep more experiments, we just add their analysis ids to the list, like so:

In [None]:
# Same call with both analysis ids of the duplicated plate listed.
qc_ref_df = get_image_quality_ref(
    "AROS-Reproducibility-MoA-Full", keep_replication=[3249, 3241]
)
qc_ref_df

We can also perform advanced filtering. For this example, we first look up another project:

In [None]:
# Relies on get_projects_list having been imported in an earlier cell.
get_projects_list(lookup="cov")

In [None]:
# Unfiltered reference for the project, with drop_replication="None" so
# that no duplicated plates are removed.
qc_ref_df = get_image_quality_ref("sarscov2-repurposing", drop_replication="None")
qc_ref_df

As you can see, we have a lot of rows. Imagine we want to look only at the experiments that were conducted in 2021.

In [None]:
# `filter` maps a column name to a list of strings to search for; here
# analysis_date is searched for "2021".
qc_ref_df = get_image_quality_ref(
    "sarscov2-repurposing",
    drop_replication="None",
    filter={
        "analysis_date": ["2021"],
    },
)
qc_ref_df

We can also put more than one year in the list to include all of them (acts as AND):

In [None]:
# Two search strings are listed under the same filter key.
qc_ref_df = get_image_quality_ref(
    "sarscov2-repurposing",
    drop_replication="None",
    filter={
        "analysis_date": ["2021", "2023"],
    },
)
qc_ref_df

Now we can limit the year to 2023 to get fewer entries:

In [None]:
# Only "2023" is listed for analysis_date this time.
qc_ref_df = get_image_quality_ref(
    "sarscov2-repurposing",
    drop_replication="None",
    filter={
        "analysis_date": ["2023"],
    },
)
qc_ref_df

What if we want only those from 2023 that used VeroE6 cells? Then we add a new key to the filter, like so:

In [None]:
# A second filter key is added: plate_barcode is searched for "VeroE6"
# in addition to the analysis_date condition.
qc_ref_df = get_image_quality_ref(
    "sarscov2-repurposing",
    drop_replication="None",
    filter={"analysis_date": ["2023"], "plate_barcode": ["VeroE6"]},
)
qc_ref_df

So the filter argument can have multiple keys and values. The value of each key must be a list of strings. Each string may be just a part of the full value, even a part from the middle of it. Numbers must also be given as strings, so you can search for whatever part of them you want. For example, if we want to narrow the result above to the rows whose plate_acq_id starts with 37, we can do this:

In [None]:
# Three filter keys combined. plate_acq_id is numeric, but the search
# value is still written as a string ("37").
qc_ref_df = get_image_quality_ref(
    "sarscov2-repurposing",
    drop_replication="None",
    filter={
        "analysis_date": ["2023"],
        "plate_barcode": ["VeroE6"],
        "plate_acq_id": ["37"],
    },
)
qc_ref_df

In [None]:
# When cells are run in order, qc_ref_df is the filtered reference built in
# the previous cell; get_image_quality_data was imported in an earlier cell.
qc_df = get_image_quality_data(qc_ref_df)
qc_df

In [None]:
# get_qc_module is imported here but not called in this cell.
from pharmbio.data_processing.quality_control import get_qc_module, get_qc_data_dict

get_qc_data_dict(qc_df)

In [None]:
# drop_replication is left at its default here; the filter lists two
# plate_acq_id search strings and one plate_barcode search string.
cov_qc_ref_df = get_image_quality_ref(
    "sarscov2-repurposing",
    filter={"plate_acq_id": ["37", "36"], "plate_barcode": ["Vero"]},
)

In [None]:
# Load the image-quality data for the reference built in the previous cell.
cov_qc_df = get_image_quality_data(cov_qc_ref_df)

In [None]:
from pharmbio.data_processing.quality_control import flag_outlier_images

# NOTE(review): this flags qc_df, not the cov_qc_df created in the previous
# cell -- confirm that is intended.
flagged_qc_df = flag_outlier_images(qc_df)

In [None]:
# Scratch cell: get_qc_module and get_channels are imported only for the
# commented-out calls below; just flag_outlier_images is executed.
from pharmbio.data_processing.quality_control import (
    get_qc_module,
    get_channels,
    flag_outlier_images,
)

# get_qc_module(df)
# get_channels(df, out_put='print')
# Result is not assigned; it is only displayed as the cell output.
flag_outlier_images(qc_df)
# flag_outlier_images(df, method='SD', default_sd_step=(-4.5, 4.5))

In [None]:
# Inspect the available column names of the quality-control dataframe.
qc_df.columns

In [None]:
from pharmbio.visualization import plots

# Heatmap of qc_df laid out in two subplot columns. The commented-out
# fragments show optional plate_names / measurement arguments.
plots.plate_heatmap(
    qc_df,  # plate_names=['P100980', 'P100981', 'P100982', 'P100983', 'P100984', 'P100985', 'P100986', 'P100987', 'P100988', 'P100989',],
    subplot_num_columns=2,
    plot_size=450,
)  # measurement='qc_flag_rawHOECHST_Blurry')
# plots.quality_module_lineplot(qc_df)

In [None]:
# Switch back to the AROS experiment. This rebinds qc_ref_df and
# flagged_qc_df, which the following cells use.
qc_ref_df = get_image_quality_ref("AROS-Reproducibility-MoA-Full")
flagged_qc_df = flag_outlier_images(get_image_quality_data(qc_ref_df))

In [None]:
from pharmbio.dataset.cell_morphology import (
    get_cell_morphology_ref,
    get_cell_morphology_data,
)

# Build the cell-morphology reference for the same experiment, then load the
# data, passing in the flagged QC dataframe from the previous cell together
# with explicit thresholds and aggregation level.
cp_ref_df = get_cell_morphology_ref("AROS-Reproducibility-MoA-Full")
cp_df = get_cell_morphology_data(
    cp_ref_df,
    flagged_qc_df=flagged_qc_df,
    site_threshold=3,
    compound_threshold=0.4,
    aggregation_level="cell",
)

In [None]:
# _outlier_series_to_delete is underscore-prefixed, i.e. a private helper of
# the package; it is imported here for inspection.
from pharmbio.dataset.cell_morphology import _outlier_series_to_delete

# Note: site_threshold is 4 here, while the previous cell used 3.
comp_series_to_delete, img_series_to_delete = _outlier_series_to_delete(
    flagged_qc_df, site_threshold=4, compound_threshold=0.4
)
comp_series_to_delete, img_series_to_delete

In [None]:
from pharmbio.dataset.cell_morphology import get_comp_outlier_info, get_outlier_df

# Jupyter echoes only the last expression of a cell, so the return value of
# the first call is not displayed unless the function prints it itself.
get_comp_outlier_info(flagged_qc_df)
get_outlier_df(flagged_qc_df, with_compound_info=True)

In [None]:
# Heatmap of the "outlier_num" column of the outlier dataframe.
plots.plate_heatmap(get_outlier_df(flagged_qc_df), measurement="outlier_num")

In [None]:
# DB_URI is set twice with the same value: once through the IPython magic
# and once through os.environ, so the cell also works without IPython magics
# once the `%env` line is removed.
%env DB_URI=postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb
import os

# Set the environment variable
os.environ[
    "DB_URI"
] = "postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb"

import json
from pharmbio.dataset.image_quality import get_image_quality_ref, get_image_quality_data
from pharmbio.data_processing.quality_control import get_qc_module, get_qc_data_dict, flag_outlier_images
from pharmbio.dataset.cell_morphology import (
    get_cell_morphology_ref,
    get_cell_morphology_data,
    get_comp_outlier_info,
    get_outlier_df,
)
from pharmbio.visualization import plots
from typing import Union, Literal, Tuple, Set, List, Dict
from pharmbio.config import COLORS

class Experiment:
    """Load one experiment's data sets from a JSON settings file.

    Every top-level key of the settings file is copied onto the instance as
    an attribute. ``experiment_name`` is the only setting that is read
    without a fallback; all other settings are optional and fall back to the
    default written next to their lookup in the methods below.

    Attributes computed by ``__init__`` (in this order):
        image_quality_data: result of :meth:`get_image_quality_data`.
        flagged_image_quality_data: result of :meth:`flag_outlier_images`.
        cell_morphology_data: result of :meth:`get_cell_morphology_data`.
        compound_batch_ids: unique ``batch_id`` values of
            ``cell_morphology_data`` as a list.
        compound_outlier_info: result of ``get_comp_outlier_info``.
        outlier_dataframe: result of ``get_outlier_df``.
    """

    # Attributes computed by __init__ rather than read from the settings
    # file; print_setting() leaves them out of its output.
    _DERIVED_ATTRIBUTES = frozenset(
        {
            "image_quality_data",
            "cell_morphology_data",
            "flagged_image_quality_data",
            "compound_batch_ids",
            "compound_outlier_info",
            "outlier_dataframe",
        }
    )

    def __init__(self, json_file):
        """Read ``json_file`` and eagerly build all derived data sets.

        Args:
            json_file: Path of the JSON settings file. Its top-level object
                becomes the instance's settings.
        """
        with open(json_file, "r", encoding="utf-8") as file:
            settings = json.load(file)
        self.__dict__.update(settings)

        # Order matters: flagging reads image_quality_data, and the cell
        # morphology step reads flagged_image_quality_data.
        self.image_quality_data = self.get_image_quality_data()
        self.flagged_image_quality_data = self.flag_outlier_images()
        self.cell_morphology_data = self.get_cell_morphology_data()
        self.compound_batch_ids = (
            self.cell_morphology_data.select("batch_id")
            .unique("batch_id")
            .to_series()
            .to_list()
        )
        self.compound_outlier_info = get_comp_outlier_info(
            flagged_df=self.flagged_image_quality_data
        )
        self.outlier_dataframe = get_outlier_df(
            flagged_qc_df=self.flagged_image_quality_data,
            with_compound_info=self._setting("with_compound_info", False),
        )

    def _setting(self, key, default=None):
        """Return the instance attribute ``key``, or ``default`` if unset.

        Only the instance dictionary is consulted, so methods and class
        attributes are never mistaken for settings.
        """
        return self.__dict__.get(key, default)

    @staticmethod
    def _as_bool(value, setting_name):
        """Convert a settings value to ``bool`` without evaluating it as code.

        Accepts real booleans (JSON ``true``/``false``) and the strings
        "true"/"false"/"1"/"0" in any letter case.

        Raises:
            ValueError: If ``value`` is neither a boolean nor one of the
                recognised strings.
        """
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            normalized = value.strip().lower()
            if normalized in ("true", "1"):
                return True
            if normalized in ("false", "0"):
                return False
        raise ValueError(
            f"Setting {setting_name!r} must be a boolean or the string "
            f"'True'/'False', got {value!r}"
        )

    def get_image_quality_reference_data(self):
        """Return the image-quality reference for ``experiment_name``.

        Optional settings: ``drop_replication`` (default "Auto"),
        ``keep_replication`` (default "None") and ``filter`` (default None).
        """
        return get_image_quality_ref(
            name=self.experiment_name,
            drop_replication=self._setting("drop_replication", "Auto"),
            keep_replication=self._setting("keep_replication", "None"),
            filter=self._setting("filter"),
        )

    def get_image_quality_data(self):
        """Return the image-quality data for the current reference.

        Optional setting: ``force_merging_columns`` (default False).
        """
        return get_image_quality_data(
            self.get_image_quality_reference_data(),
            force_merging_columns=self._setting("force_merging_columns", False),
        )

    def get_cell_morphology_reference_data(self):
        """Return the cell-morphology reference for ``experiment_name``.

        Optional setting: ``filter_cp`` (default None).
        """
        return get_cell_morphology_ref(
            name=self.experiment_name,
            filter=self._setting("filter_cp"),
        )

    def get_cell_morphology_data(self):
        """Return the cell-morphology data, using the flagged QC data.

        Requires ``flagged_image_quality_data`` to be set already. Optional
        settings and defaults: ``site_threshold`` (6),
        ``compound_threshold`` (0.7), ``aggregation_level`` ("cell"),
        ``aggregation_method`` (None), ``path_to_save`` ("data"),
        ``use_gpu`` (False), ``save_plate_separately`` (False).
        """
        return get_cell_morphology_data(
            cell_morphology_ref_df=self.get_cell_morphology_reference_data(),
            flagged_qc_df=self.flagged_image_quality_data,
            site_threshold=self._setting("site_threshold", 6),
            compound_threshold=self._setting("compound_threshold", 0.7),
            aggregation_level=self._setting("aggregation_level", "cell"),
            aggregation_method=self._setting("aggregation_method"),
            path_to_save=self._setting("path_to_save", "data"),
            use_gpu=self._setting("use_gpu", False),
            save_plate_separately=self._setting("save_plate_separately", False),
        )

    def get_image_quality_modules(self):
        """Return ``get_qc_module`` applied to ``image_quality_data``."""
        return get_qc_module(qc_data=self.image_quality_data)

    # Original (misspelled) name, kept so existing callers keep working.
    get_image_guality_modules = get_image_quality_modules

    def get_image_quality_data_dict(self):
        """Return ``get_qc_data_dict`` applied to ``image_quality_data``.

        Optional settings: ``module_to_keep`` and ``module_to_drop``
        (both default None).
        """
        return get_qc_data_dict(
            qc_data=self.image_quality_data,
            module_to_keep=self._setting("module_to_keep"),
            module_to_drop=self._setting("module_to_drop"),
        )

    # Original (misspelled) name, kept so existing callers keep working.
    get_image_guality_data_dict = get_image_quality_data_dict

    def flag_outlier_images(self):
        """Return ``image_quality_data`` with outlier images flagged.

        Optional settings and defaults: ``module_to_keep`` (None),
        ``module_to_drop`` (None), ``method`` ("SD"), ``IQR_normalization``
        (True), ``normalization`` ("zscore"), ``sd_step_dict`` (None),
        ``default_sd_step`` ((-4.5, 4.5)), ``quantile_limit`` (0.25) and
        ``multiplier`` (1.5).

        Raises:
            ValueError: If ``IQR_normalization`` is not a boolean or a
                recognised boolean string.
        """
        # JSON has no tuple type, so a configured step arrives as a list.
        configured_sd_step = self._setting("default_sd_step")
        default_sd_step = (
            tuple(configured_sd_step) if configured_sd_step else (-4.5, 4.5)
        )
        return flag_outlier_images(
            qc_data=self.image_quality_data,
            module_to_keep=self._setting("module_to_keep"),
            module_to_drop=self._setting("module_to_drop"),
            method=self._setting("method", "SD"),
            # Parsed explicitly instead of eval(): the value comes from a
            # file and must never be executed as code.
            IQR_normalization=self._as_bool(
                self._setting("IQR_normalization", True), "IQR_normalization"
            ),
            normalization=self._setting("normalization", "zscore"),
            sd_step_dict=self._setting("sd_step_dict"),
            default_sd_step=default_sd_step,
            quantile_limit=self._setting("quantile_limit", 0.25),
            # Read from its own key; it used to be read from
            # "quantile_limit" by mistake.
            multiplier=self._setting("multiplier", 1.5),
        )

    def plate_heatmap(
        self,
        plate_names: Union[List[str], None] = None,
        subplot_num_columns: int = 2,
        plot_size: int = 400,
        measurement: str = "Count_nuclei",
        plate_well_columns: Union[Dict[str, str], None] = None,
    ):
        """Draw a plate heatmap of ``image_quality_data``.

        All arguments are forwarded unchanged to ``plots.plate_heatmap``.
        """
        plots.plate_heatmap(
            df=self.image_quality_data,
            plate_names=plate_names,
            subplot_num_columns=subplot_num_columns,
            plot_size=plot_size,
            measurement=measurement,
            plate_well_columns=plate_well_columns,
        )

    def well_outlier_heatmap(self):
        """Draw a plate heatmap of ``outlier_num`` from ``outlier_dataframe``."""
        plots.plate_heatmap(self.outlier_dataframe, measurement="outlier_num")

    def quality_module_lineplot(
        self,
        qc_module_to_plot: Union[Set[str], None] = None,
        title: str = "Unnamed",
        plot_size: int = 1400,
        normalization: bool = True,
        normalization_method: Literal["zscore", "minmax"] = "zscore",
        y_axis_range: Tuple = (-5, 5),
        colors: List[str] = COLORS,
    ):
        """Draw the quality-module line plot of ``image_quality_data``.

        All arguments are forwarded unchanged to
        ``plots.quality_module_lineplot``.
        """
        plots.quality_module_lineplot(
            df=self.image_quality_data,
            qc_module_to_plot=qc_module_to_plot,
            title=title,
            plot_size=plot_size,
            normalization=normalization,
            normalization_method=normalization_method,
            y_axis_range=y_axis_range,
            colors=colors,
        )

    def print_setting(self):
        """Print the settings as indented JSON, without the derived data."""
        settings = {
            key: value
            for key, value in self.__dict__.items()
            if key not in self._DERIVED_ATTRIBUTES
        }
        print(json.dumps(settings, indent=4))


# Usage: constructing the object reads the settings file and immediately
# loads and processes all data sets.
data = Experiment("experiment_settings.json")

In [None]:
# Heatmap of the number of outliers, drawn from data.outlier_dataframe.
data.well_outlier_heatmap()

In [None]:
# Plate heatmap with the method's default arguments (measurement "Count_nuclei").
data.plate_heatmap()

In [None]:
# Quality-module line plot with a custom title; other arguments keep their defaults.
data.quality_module_lineplot(title="AROS")