# pharmbio package

```
pip install pharmbio
```

version 0.1.5

<div style="text-align:center">
    <img src="test/Analysis_Diagram.jpg" alt="Pipeline" width="200"/>
</div>

<div style="text-align:center">
    <img src="test/pharmbio_package.svg" alt="pharmbio python package structure and modules" width="400"/>
</div>

### For security reasons, the database URI is defined as an environment variable

In [None]:
%env DB_URI=postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb

In [None]:
import os

# Define DB_URI, the database URI that is read from the environment
# (same value as the %env cell above, set here from plain Python).
os.environ.update(
    {
        "DB_URI": "postgresql://pharmbio_readonly:readonly@imagedb-pg-postgresql.services.svc.cluster.local/imagedb",
    }
)

### Looking up an experiment in image_db by name

In [None]:
from pharmbio.dataset.experiment import get_projects_list

# Look up projects in image_db by name, using 'specs3' as the search term.
# As the last expression in the cell, the result is shown as the cell output.
get_projects_list(lookup='specs3')

### Set the logger level
Possible values: 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'

In [None]:
from pharmbio.logger import set_logger_level

# Switch the pharmbio logger to debug verbosity.
# NOTE(review): the levels listed above are upper-case; confirm that the
# lower-case 'debug' passed here is accepted by set_logger_level.
set_logger_level('debug')

In [None]:
from pharmbio.dataset.image_quality import get_image_quality_ref, get_image_quality_data

# Fetch the image-quality reference for "specs3k", filtered to plate barcode P101334.
qc_ref_df = get_image_quality_ref("specs3k", filter={'plate_barcode': ['P101334']})
# Load the image-quality data for that reference
# (presumably one row per image — TODO confirm against the library docs).
qc_df = get_image_quality_data(qc_ref_df)

In [None]:
# Fetch the image-quality reference for 'sarscov2-repurposing', filtered on both
# plate acquisition id (37, 36) and plate barcode.
# NOTE(review): 'Vero' looks like a partial barcode — presumably the filter
# matches substrings; confirm.
cov_qc_ref_df = get_image_quality_ref('sarscov2-repurposing', filter={'plate_acq_id': ['37', '36'], 'plate_barcode': ['Vero']})

In [None]:
# Load the image-quality data for the 'sarscov2-repurposing' reference fetched above.
cov_qc_df = get_image_quality_data(cov_qc_ref_df)

In [None]:
from pharmbio.data_processing.quality_control import get_qc_module, get_channels, flag_outlier_images
# Alias the specs3k image-quality frame so the examples below can be run on `df`.
df = qc_df
# Uncomment any of the following calls to try the quality-control helpers:
# get_qc_module(df)
# get_channels(df, out_put='print')
# flag_outlier_images(qc_df)
# flag_outlier_images(df, method='SD', default_sd_step=(-4.5, 4.5))

In [None]:
from pharmbio.visualization import plots

# plots.plate_heatmap(cov_qc_df, subplot_num_columns=2)
# plots.quality_module_lineplot(qc_df)

In [None]:
from pharmbio.dataset.cell_morphology import get_cell_morphology_ref, get_cell_morphology_data

# Fetch the cell-morphology reference for "specs3k", filtered to analysis ids 4371 and 4424.
cp_ref_df = get_cell_morphology_ref("specs3k", filter={'analysis_id': ['4371', '4424']})

In [None]:
# Load the morphology data for that reference with aggregation_level='cell'
# (presumably per-cell rows — TODO confirm against the library docs).
cpf_df = get_cell_morphology_data(cp_ref_df, aggregation_level='cell')

In [None]:
import polars as pl
# Optional: show a single column instead of the whole frame.
# cpf_df.select(pl.col('smiles'))
# Display the frame returned by get_cell_morphology_data as the cell output.
cpf_df

In [None]:
import polars as pl
import pandas as pd
from typing import Union, List

def aggregate_data_cpu(
    df: Union[pl.DataFrame, pd.DataFrame],
    columns_to_aggregate: List[str],
    groupby_columns: List[str],
    aggregation_function: str = "mean",
) -> pl.DataFrame:
    """
    Aggregates morphology data per group and returns one row per group.

    A pandas input is converted to polars first. Every column listed in
    ``columns_to_aggregate`` is reduced with ``aggregation_function``. Every
    remaining column that is neither aggregated nor grouped on is treated as
    metadata and keeps the first value found in each group. The result is
    sorted by ``groupby_columns``.

    Args:
        df (Union[pl.DataFrame, pd.DataFrame]): The input DataFrame to be aggregated.
        columns_to_aggregate (List[str]): The list of columns to be aggregated.
        groupby_columns (List[str]): The list of columns to group by.
        aggregation_function (str, optional): Name of the polars expression method
            applied to each aggregated column, e.g. "mean", "median", "sum", "min",
            "max", "first", "last". Defaults to "mean".

    Returns:
        pl.DataFrame: The aggregated DataFrame, sorted by the group-by columns.

    Raises:
        AttributeError: If ``aggregation_function`` is not the name of a polars
            expression method.

    Examples:
        ```python
        df = pd.DataFrame({
            'A': [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
            'B': [1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2],
            'C': [9, 10, 11, 12, 9, 10, 11, 12, 12, 11, 12],
            'D': [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]})

        aggregate_data_cpu(df, columns_to_aggregate=['B', 'C'], groupby_columns=['A'], aggregation_function='mean')
        ```
    """

    # Check if data is in pandas DataFrame, if so convert to polars DataFrame
    if isinstance(df, pd.DataFrame):
        df = pl.from_pandas(df)

    lazy_frame = df.lazy()
    # polars 0.19 renamed `groupby` to `group_by` and later releases dropped the
    # old spelling, so use whichever name the installed version provides.
    if hasattr(lazy_frame, "group_by"):
        grouped = lazy_frame.group_by(groupby_columns)
    else:
        grouped = lazy_frame.groupby(groupby_columns)

    # Look the reducer up by name on each column expression (e.g. pl.col(c).mean()).
    agg_exprs = [
        getattr(pl.col(col), aggregation_function)().alias(col)
        for col in columns_to_aggregate
    ]

    # Columns that are neither aggregated nor grouped on are carried through
    # by taking the first value of each group.
    metadata_column = [
        col
        for col in df.columns
        if col not in columns_to_aggregate and col not in groupby_columns
    ]
    metadata_agg_exprs = [
        pl.col(col).first().alias(col)
        for col in metadata_column
    ]

    all_agg_exprs = agg_exprs + metadata_agg_exprs

    # Execute the aggregation; the query stays lazy until collect().
    agg_df = grouped.agg(all_agg_exprs)

    return agg_df.sort(groupby_columns).collect()

# Demo input: 'A' is the grouping key, 'B' and 'C' are averaged, and 'D' is
# left over, so it is carried through with its first value per group.
df = pd.DataFrame(
    {
        'A': [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
        'B': [1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2],
        'C': [9, 10, 11, 12, 9, 10, 11, 12, 12, 11, 12],
        'D': [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
    }
)

aggregate_data_cpu(
    df,
    columns_to_aggregate=['B', 'C'],
    groupby_columns=['A'],
    aggregation_function='mean',
)

In [None]:
import time
def print_progress(
    iteration: float,
    total: float,
    prefix: str = 'Progress:',
    decimals: int = 2,
    bar_length: int = 50,
) -> None:
    """
    Draw a one-line text progress bar on stdout, overwriting the previous one.

    Args:
        iteration: Number of steps completed so far.
        total: Number of steps that corresponds to 100%. A total of 0 is shown
            as complete instead of raising ZeroDivisionError.
        prefix: Text printed in front of the bar.
        decimals: Number of decimal places shown in the percentage.
        bar_length: Width of the bar in characters.

    Returns:
        None. The bar is written to stdout and flushed immediately.
    """
    if total:
        fraction = iteration / float(total)
        filled_length = int(bar_length * iteration // total)
    else:
        # An empty job has nothing left to do: show a full bar rather than
        # dividing by zero.
        fraction = 1.0
        filled_length = bar_length

    # Keep the bar inside its fixed width even when `iteration` is negative or
    # larger than `total`; the percentage still reports the real ratio.
    filled_length = max(0, min(bar_length, filled_length))

    percent = f"{100 * fraction:.{decimals}f}"
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    # The carriage returns rewind to the start of the line so the next call
    # redraws in place. Flush explicitly because no newline is written.
    print(f'\r{prefix} |{bar}| {percent}%', end='\r', flush=True)

# Demo: step through 0..99 with a short pause per step; with total=99 the
# last call reports 100%.
for i in range(100):
    time.sleep(0.1)
    print_progress(iteration=i, total=99)

