In [None]:
import polars as pl
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt

## Import classification compounds

In [None]:
# Load the classification compound table; later cells use its 'compound' and 'moa' columns.
class_comp = pl.read_csv("data_tables_BF_paper/fl_data.csv")

In [None]:
def enclose_dmso(value):
    """Return "[dmso]" for the bare name "dmso"; return any other value unchanged."""
    return "[dmso]" if value == "dmso" else value

# Wrap the bare "dmso" name in brackets in the 'compound' column. This is the same mapping
# as enclose_dmso, expressed as a native polars expression instead of a per-row Python callback.
class_comp = class_comp.with_columns(
    pl.when(pl.col("compound") == "dmso")
    .then(pl.lit("[dmso]"))
    .otherwise(pl.col("compound"))
    .alias("compound")
)

In [None]:
# Rename 'compound' to the metadata column name used by the other tables, then keep
# one row per distinct (compound name, moa) pair.
class_comp = (
    class_comp
    .rename({"compound": "Metadata_cmpdName"})
    .select(["Metadata_cmpdName", "moa"])
    .unique()
)

In [None]:
# Load the specs3k DeepProfiler metadata table; it is joined on Metadata_cmpdName below.
specs3k_meta = pl.read_csv("/share/data/analyses/benjamin/Single_cell_project/DP_specs3k/inputs/metadata/Metadata_specs3k_DeepProfiler.csv")

In [None]:
# Load a second specs3k metadata table.
# NOTE(review): specs3k_meta_big is not referenced again in the visible cells — confirm it is still needed.
specs3k_meta_big = pl.read_csv("/share/data/analyses/benjamin/Single_cell_project/specs3k/specs3k_metadata.csv")

In [None]:
# Keep the specs3k metadata rows whose compound name occurs in the classification table.
specs3k_filter = class_comp.join(specs3k_meta, on="Metadata_cmpdName", how="inner").unique()
# Distinct compound names that survived the join.
specs3k_class_cbkid = specs3k_filter["Metadata_cmpdName"].unique().to_list()
#specs3k_class_comp = specs3k_meta.filter(pl.col("Metadata_cmpdName").is_in(specs3k_class_cbkid + ["[dmso]"]))
#specs3k_class_comp = specs3k_class_comp.drop(["moa"]).join(specs3k_filter, left_on= ["Metadata_cmpdName", "Metadata_Well", "Metadata_Plate"], right_on= ["cbkid", "well", "barcode"], how = "left")

In [None]:
# Display the compound names matched in the specs3k metadata.
specs3k_class_cbkid

In [None]:
# Load the specs2k compound table and the specs2k DeepProfiler metadata table.
# NOTE(review): specs2k_comp is not referenced again in the visible cells — confirm it is still needed.
specs2k_comp = pl.read_csv("/share/data/analyses/benjamin/Single_cell_project/specs2k_cmpd.csv")
specs2k_meta = pl.read_csv("/share/data/analyses/benjamin/Single_cell_project/DP_specs2k/inputs/metadata/metadata_deepprofilerspecs2k.csv")

In [None]:
# Keep the specs2k metadata rows whose compound name occurs in the classification table.
specs2k_filter = class_comp.join(specs2k_meta, on="Metadata_cmpdName", how="inner").unique()
# Distinct compound names that survived the join.
specs2k_class_cbkid = specs2k_filter["Metadata_cmpdName"].unique().to_list()


In [None]:
# Drop the listed columns from each joined table, then stack specs2k on top of specs3k.
specs5k_classication_list = pl.concat(
    [
        specs2k_filter.drop(["DNA", "ER", "AGP", "Mito", "RNA"]),
        specs3k_filter.drop(["moa_right", "Unnamed: 0", "DNA", "ER", "AGP", "Mito", "RNA"]),
    ]
)
# Upper-case every compound name.
specs5k_classication_list = specs5k_classication_list.with_columns(
    pl.col("Metadata_cmpdName").str.to_uppercase()
)

In [None]:
# Number of rows per MoA in the combined compound list
# (group_by is the spelling already used elsewhere in this notebook; groupby is deprecated).
specs5k_classication_list.group_by("moa").count()

In [None]:
# Save the combined compound list to the working directory.
specs5k_classication_list.write_parquet("BF_moa_specs5k_compound_list.parquet")

In [None]:
# Reload a compound list from disk.
# NOTE(review): this file name differs from the one written by the preceding cell
# ("BF_moa_specs5k_compound_list.parquet") — confirm which file is intended.
specs5k_classication_list = pl.read_parquet("specs5k_compound_list.parquet")

## Generate feature data

In [None]:
# Folder passed to generate_supervised_data as the location of the specs3k per-plate feature files.
specs3k_feature_path = "/home/jovyan/share/data/analyses/benjamin/Single_cell_project_rapids/SPECS/deepprofiler/Results/normalized_features"

In [None]:
# Folder passed to generate_supervised_data as the location of the specs2k per-plate feature files.
specs2k_feature_path = "/home/jovyan/share/data/analyses/benjamin/Single_cell_project_rapids/SPECS2K/deepprofiler/Results/"

In [None]:
import polars as pl
import os
import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def merge_locations(df, location_folder):
    """Append per-site nuclei location columns to ``df`` (sequential version).

    For every distinct (Metadata_Plate, Metadata_Well, Metadata_Site) in ``df``, the file
    ``{location_folder}/{plate}/{well}-{site}-Nuclei.csv`` is read if it exists and its
    columns are appended horizontally to the matching rows of ``df``. The pairing is purely
    positional: row i of the CSV is assumed to belong to row i of the matching subset.

    Sites without a location file are left out of the result. Sites whose CSV row count
    differs from the number of matching rows are reported and skipped, which matches
    ``read_and_merge_single_file``.

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing the three site key columns.
    location_folder : str
        Root folder holding one sub-folder per plate.

    Returns
    -------
    pl.DataFrame
        Vertically stacked merged sites; an empty DataFrame when nothing could be merged.
    """
    site_keys = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]
    combinations = df.select(site_keys).unique()

    # Collect the per-site frames and concatenate once at the end; concatenating inside
    # the loop re-copies the accumulated frame on every iteration.
    merged_sites = []
    for plate, well, site in tqdm.tqdm(combinations.rows(), total=combinations.height):
        file_path = f"{location_folder}/{plate}/{well}-{site}-Nuclei.csv"
        if not os.path.exists(file_path):
            continue

        csv_df = pl.read_csv(file_path)
        site_rows = df.filter(
            (pl.col("Metadata_Plate") == plate)
            & (pl.col("Metadata_Well") == well)
            & (pl.col("Metadata_Site") == site)
        )
        # A positional merge is only meaningful when both sides have the same number of rows.
        if len(csv_df) != len(site_rows):
            print(f"{(plate, well, site)} doesn't match")
            continue

        merged_sites.append(pl.concat([site_rows, csv_df], how="horizontal"))

    if not merged_sites:
        return pl.DataFrame()
    return pl.concat(merged_sites, how="vertical")


def read_and_merge_single_file(df, plate, well, site, location_folder):
    """Merge one site's location CSV onto the matching rows of ``df``.

    Reads ``{location_folder}/{plate}/{well}-{site}-Nuclei.csv`` and appends its columns
    horizontally to the rows of ``df`` with the same plate, well and site.

    Returns the merged frame, or ``None`` when the file does not exist or when its row
    count differs from the number of matching rows in ``df``.
    """
    file_path = f"{location_folder}/{plate}/{well}-{site}-Nuclei.csv"
    if not os.path.exists(file_path):
        return None

    locations = pl.read_csv(file_path)
    site_rows = df.filter(
        (pl.col("Metadata_Plate") == plate)
        & (pl.col("Metadata_Well") == well)
        & (pl.col("Metadata_Site") == site)
    )
    # The merge is positional, so the two frames must have the same number of rows.
    if len(locations) != len(site_rows):
        return None
    return pl.concat([site_rows, locations], how="horizontal")

def merge_locations_parallel(df, location_folder, max_workers=10):
    """Append per-site nuclei location columns to ``df`` using a thread pool.

    One ``read_and_merge_single_file`` task is submitted for every distinct
    (Metadata_Plate, Metadata_Well, Metadata_Site) in ``df``. Sites for which that helper
    returns ``None`` (missing file or row-count mismatch) are left out of the result;
    their number is printed so the loss is visible.

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing the three site key columns.
    location_folder : str
        Root folder holding one sub-folder per plate.
    max_workers : int
        Size of the thread pool.

    Returns
    -------
    pl.DataFrame
        Vertically stacked merged sites, in task completion order.

    Raises
    ------
    ValueError
        If no site could be merged at all.
    """
    site_keys = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]
    # Select the key columns before converting to dicts so that only three values per site
    # are materialised as Python objects.
    combinations = df.select(site_keys).unique().to_dicts()

    dfs_to_concat = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                read_and_merge_single_file,
                df,
                comb["Metadata_Plate"],
                comb["Metadata_Well"],
                comb["Metadata_Site"],
                location_folder,
            )
            for comb in combinations
        ]

        for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            if result is not None:
                dfs_to_concat.append(result)

    skipped = len(combinations) - len(dfs_to_concat)
    if skipped:
        print(f"{skipped} of {len(combinations)} sites skipped (missing file or row-count mismatch)")

    if not dfs_to_concat:
        raise ValueError(f"no location files could be merged from {location_folder!r}")

    # Concatenate all DataFrames at once at the end.
    return pl.concat(dfs_to_concat, how="vertical")

In [None]:
import os
import gc
import tqdm
def generate_supervised_data(cmpd_df, feature_path,
                             file_template="sc_profiles_normalized_specs2k_{plate}.parquet"):
    """Collect the single-cell feature rows of the listed compounds, plate by plate.

    For every plate in ``cmpd_df`` the per-plate parquet file is read from ``feature_path``
    and reduced to the rows whose ``Metadata_cmpdName`` is listed for that plate.

    Parameters
    ----------
    cmpd_df : pl.DataFrame
        Needs the columns ``Metadata_Plate`` and ``Metadata_cmpdName``.
    feature_path : str
        Folder containing one parquet file per plate.
    file_template : str
        File name pattern with a ``{plate}`` placeholder. The default is the name that
        used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    pl.DataFrame
        The filtered rows of all plates that had a file, stacked in sorted plate order.

    Raises
    ------
    ValueError
        If none of the plates has a feature file under ``feature_path``.
    """
    plates = sorted(cmpd_df["Metadata_Plate"].unique())
    sc_features = []
    missing_plates = []
    for p in tqdm.tqdm(plates):
        file_path = os.path.join(feature_path, file_template.format(plate=p))
        if not os.path.exists(file_path):
            missing_plates.append(p)
            continue

        plate_cmpds = cmpd_df.filter(pl.col("Metadata_Plate") == p)["Metadata_cmpdName"].unique().to_list()
        features = pl.read_parquet(file_path)
        sc_features.append(features.filter(pl.col("Metadata_cmpdName").is_in(plate_cmpds)))
        # Release the unfiltered plate frame before the next plate is read.
        del features
        gc.collect()

    if missing_plates:
        print(f"No feature file found for {len(missing_plates)} plate(s): {missing_plates}")
    if not sc_features:
        raise ValueError(f"no feature files matching {file_template!r} found in {feature_path!r}")
    return pl.concat(sc_features)

In [None]:
# Collect the single-cell features of the classification compounds from the specs3k folder.
# NOTE(review): generate_supervised_data looks for files named
# "sc_profiles_normalized_specs2k_{plate}.parquet" and skips plates without such a file —
# confirm that this "specs2k" naming also applies to the specs3k folder.
specs3k_sc_features = generate_supervised_data(specs5k_classication_list, specs3k_feature_path).unique()

In [None]:
# Collect the single-cell features of the classification compounds from the specs2k folder.
specs2k_sc_features = generate_supervised_data(specs5k_classication_list, specs2k_feature_path).unique()

In [None]:
# Root folder of the specs3k location CSVs; merge them onto the specs3k features with 15 threads.
location_path = "/home/jovyan/share/data/analyses/benjamin/Single_cell_project/DP_specs3k/inputs/locations/"
specs3k_sc_locations = merge_locations_parallel(specs3k_sc_features, location_path, max_workers = 15)

In [None]:
# Root folder of the specs2k location CSVs; merge them onto the specs2k features with 15 threads.
location_path2k = "/home/jovyan/share/data/analyses/benjamin/Single_cell_project/DP_specs2k/inputs/locations/"
specs2k_sc_locations = merge_locations_parallel(specs2k_sc_features, location_path2k, max_workers = 15)

In [None]:
# Keep only nuclei whose centre lies strictly inside the window 250 < X < 2250 and 250 < Y < 2250.
specs2k_sc_locations = (
    specs2k_sc_locations
    .filter(pl.col("Nuclei_Location_Center_X") > 250)
    .filter(pl.col("Nuclei_Location_Center_X") < 2250)
    .filter(pl.col("Nuclei_Location_Center_Y") > 250)
    .filter(pl.col("Nuclei_Location_Center_Y") < 2250)
)

In [None]:
# Keep only nuclei whose centre lies strictly inside the window 250 < X < 2250 and 250 < Y < 2250.
specs3k_sc_locations = (
    specs3k_sc_locations
    .filter(pl.col("Nuclei_Location_Center_X") > 250)
    .filter(pl.col("Nuclei_Location_Center_X") < 2250)
    .filter(pl.col("Nuclei_Location_Center_Y") > 250)
    .filter(pl.col("Nuclei_Location_Center_Y") < 2250)
)

In [None]:
# Save the location-merged, position-filtered specs2k table.
specs2k_sc_locations.write_parquet("sc_profiles_classification_specs2k.parquet")

In [None]:
# NOTE(review): this writes specs3k_sc_features (the frame from before the location merge and
# position filter), whereas the specs2k cell writes specs2k_sc_locations — confirm the asymmetry is intended.
specs3k_sc_features.write_parquet("sc_profiles_classification_specs3k.parquet")

In [None]:
# Replace the compound annotation columns of the specs3k cells with those from the combined
# compound list, matched on plate, well, compound name and site (left join keeps every cell).
specs3k_sc_features_total = (
    specs3k_sc_locations
    .drop(["Metadata_cmpdConc", "moa", "compound_name"])
    .join(
        specs5k_classication_list,
        left_on=["Metadata_Plate", "Metadata_Well", "Metadata_cmpdName", "Metadata_Site"],
        right_on=["Metadata_Plate", "Metadata_Well", "Metadata_cmpdName", "Metadata_Site"],
        how="left",
    )
)
#specs3k_sc_features_total = specs3k_sc_features_total.with_columns(
#    specs3k_sc_features_total['moa_broad'].fill_null(pl.lit("DMSO"))
#)

In [None]:
# Replace the compound annotation columns of the specs2k cells with those from the combined
# compound list, matched on plate, well, compound name and site (left join keeps every cell).
specs2k_sc_features_total = (
    specs2k_sc_locations
    .drop(["Unnamed: 0", "Metadata_cmpdConc", "moa", "compound_name"])
    .join(
        specs5k_classication_list,
        left_on=["Metadata_Plate", "Metadata_Well", "Metadata_cmpdName", "Metadata_Site"],
        right_on=["Metadata_Plate", "Metadata_Well", "Metadata_cmpdName", "Metadata_Site"],
        how="left",
    )
)
# Exclude three plates from the specs2k table.
specs2k_sc_features_total = specs2k_sc_features_total.filter(
    ~pl.col("Metadata_Plate").is_in(["P103620", "P103621", "P103619"])
)

In [None]:
# Stack the specs3k and specs2k single-cell tables into one frame.
specs5k_sc_features_total = pl.concat([specs3k_sc_features_total, specs2k_sc_features_total])

In [None]:
# Save the combined single-cell table to the working directory.
specs5k_sc_features_total.write_parquet("sc_profiles_classification_specs5k_total_BF.parquet")

In [None]:
# Reload a combined single-cell table from disk.
# NOTE(review): this path differs from the file written by the preceding cell
# ("sc_profiles_classification_specs5k_total_BF.parquet") — confirm which file is intended.
specs5k_sc_features_total = pl.read_parquet("datasets/sc_profiles_classification_specs5k_total.parquet")

In [None]:
# Exclude three plates (the same IDs that are removed from the specs2k table before concatenation).
specs5k_sc_features_total = specs5k_sc_features_total.filter(~pl.col("Metadata_Plate").is_in(["P103620", "P103621", "P103619"]))

## Show summary stats

In [None]:
def show_group_dist(feature_df, group_col):
    """Show a bar chart of the number of rows per distinct value of ``group_col``.

    Parameters
    ----------
    feature_df : pl.DataFrame
        Frame to summarise.
    group_col : str
        Column whose distinct values become the bars.

    The bars are sorted by group value so that repeated runs give the same plot; a null
    group key is shown as "null". Nothing is returned; the figure is shown with
    ``plt.show()``.
    """
    # Group and count the values (group_by output order is not guaranteed, hence the sort).
    grouped_df = (
        feature_df
        .group_by(group_col)
        .agg(pl.count().alias('count'))
        .sort(group_col)
    )
    # A null key cannot be used as a tick label, so give it a printable name.
    group_names = ["null" if name is None else name for name in grouped_df[group_col].to_list()]
    counts = grouped_df['count'].to_list()

    # Set a larger figure size for better readability
    plt.figure(figsize=(12, 6))  # Width, Height in inches

    # Plot the bars with a custom color and wider bars
    plt.bar(group_names, counts, color='dodgerblue', width=0.6)

    # Rotate and align the x labels so long group names stay readable
    plt.xticks(rotation=45, ha='right', fontsize=10)
    plt.yticks(fontsize=10)

    # Axis labels and title
    plt.xlabel('Group', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.title('Number of Data Points per Group', fontsize=14)

    # Tight layout so the rotated labels are not clipped
    plt.tight_layout()

    plt.show()

In [None]:
# Row counts per "moa_broad" group.
# NOTE(review): no visible cell creates a "moa_broad" column — confirm the loaded table has one.
show_group_dist(specs5k_sc_features_total, "moa_broad")

## Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
def encode_labels(df):
    """Return ``df`` with an added "label" column holding the LabelEncoder codes of its "moa" column."""
    encoder = LabelEncoder().fit(df["moa"])
    codes = list(encoder.transform(df["moa"]))
    return df.with_columns(pl.Series(name="label", values=codes))

In [None]:
# Add the integer "label" column derived from "moa".
specs5k_sc_features_total = encode_labels(specs5k_sc_features_total)

In [None]:
# Number of rows per encoded label.
specs5k_sc_features_total.group_by("label").count()

## Undersampling

In [None]:
from imblearn.under_sampling import NearMiss
import numpy as np

In [None]:
def stratified_sampling_pl(df, class_col, stratify_cols, fraction, seed=None):
    """Downsample every class towards ``fraction`` times the size of the smallest class.

    Parameters
    ----------
    df : pl.DataFrame
        Dataset to sample from.
    class_col : str
        Column holding the class labels.
    stratify_cols : list of str
        Columns used to stratify the sampling inside each class.
    fraction : float
        Target size of every class, as a fraction of the smallest class size.
    seed : int or None
        Seed passed to ``DataFrame.sample``; ``None`` (default) keeps the sampling random.

    Returns
    -------
    pl.DataFrame
        Concatenation of the (possibly downsampled) classes. Because sampling is done by
        fraction, class sizes are only approximately equal to the target.
    """
    if df.height == 0:
        return df

    # Class sizes via group_by/agg so the result does not depend on the column name that
    # value_counts() uses for its counts (it differs between polars versions).
    class_sizes = df.group_by(class_col).agg(pl.count().alias("n_rows"))
    target_size = int(class_sizes["n_rows"].min() * fraction)

    downsampled_frames = []
    for class_label in class_sizes[class_col].to_list():
        class_df = df.filter(pl.col(class_col) == class_label)
        if class_df.height == 0:
            # e.g. a null class label, which an equality filter never matches
            continue

        # `fraction` is already part of target_size; multiplying by it again here would
        # shrink every class by fraction squared.
        downsample_fraction = min(1.0, target_size / class_df.height)

        if downsample_fraction >= 1.0:
            sampled_df = class_df
        elif downsample_fraction >= 0.1:
            # Sample inside every stratum so each one keeps the same share of the class.
            sampled_df = class_df.group_by(stratify_cols).apply(
                lambda group: group.sample(fraction=downsample_fraction, seed=seed)
            )
        else:
            # Very small fractions are sampled over the whole class instead of per stratum
            # (presumably because small strata would otherwise round down to zero rows).
            sampled_df = class_df.sample(fraction=downsample_fraction, seed=seed)

        downsampled_frames.append(sampled_df)

    if not downsampled_frames:
        return df.head(0)
    return pl.concat(downsampled_frames)

def sample_n_rows_per_group(df, group_cols, fraction, seed=None, n_samples=0):
    """Sample a fraction of the rows of every group, leaving small groups untouched.

    Parameters
    ----------
    df : pl.DataFrame
        Frame to sample from.
    group_cols : str or list of str
        Columns that define the groups.
    fraction : float
        Fraction of rows drawn (without replacement) from every sampled group.
    seed : int or None
        Seed passed to ``DataFrame.sample``.
    n_samples : int
        Groups with at most this many rows are returned unchanged. The original body read
        ``n_samples`` without ever defining it, so every call raised NameError; it is now
        a keyword argument whose default of 0 means "sample every group".

    Returns
    -------
    pl.DataFrame
        The sampled groups, concatenated.
    """
    def sample_group(group_df):
        if len(group_df) <= n_samples:
            return group_df
        return group_df.sample(fraction=fraction, with_replacement=False, seed=seed)

    # Group the DataFrame and apply the sampling function to each group.
    return df.group_by(group_cols).apply(sample_group)

In [None]:
def undersampling(df, strategy, control_label=6, seed=42):
    """Undersample ``df`` (which must have a "label" column) with the chosen strategy.

    Parameters
    ----------
    df : pl.DataFrame
        Single-cell table with a "label" column.
    strategy : str
        * ``"nearmiss"`` (the historical spelling ``"nearmmiss"`` is still accepted):
          imblearn NearMiss version 1 on the columns whose name contains "Feature"; the
          selected feature rows are joined back onto ``df`` to recover the other columns.
        * ``"random"``: ``stratified_sampling_pl`` on "label", stratified by plate, well,
          site and compound name.
        * ``"control_group_sampling"``: only the class ``control_label`` is downsampled, to
          roughly the size of the largest other class.
    control_label : int
        Encoded label of the control class. The default 6 is the value that used to be
        hard-coded; it depends on the label encoding of the data at hand.
    seed : int
        Seed for the control-group sampling (default 42, the previously hard-coded value).

    Returns
    -------
    pl.DataFrame
        The undersampled table.

    Raises
    ------
    ValueError
        For an unknown ``strategy``.
    """
    strata_cols = ["Metadata_Plate", "Metadata_Well", "Metadata_Site", "Metadata_cmpdName"]

    if strategy in ("nearmiss", "nearmmiss"):
        feature_cols = [col for col in df.columns if "Feature" in col]
        # The pandas copy is only needed by imblearn, so it is made inside this branch.
        df_pd = df.to_pandas()
        nm = NearMiss(version=1, n_jobs=-1)
        X_res, y_res = nm.fit_resample(df_pd[feature_cols], df_pd['label'])

        df_resampled = pl.DataFrame(X_res).with_columns(pl.Series('label', y_res))
        resampled_df = df_resampled.join(df, on=feature_cols, how='left')
        # Drop the unnamed column only when the input actually carried one.
        if "" in resampled_df.columns:
            resampled_df = resampled_df.drop("")
        return resampled_df

    if strategy == "random":
        return stratified_sampling_pl(df, "label", strata_cols, 1)

    if strategy == "control_group_sampling":
        control_group = df.filter(pl.col('label') == control_label)
        other_groups = df.filter(pl.col('label') != control_label)
        if control_group.height == 0 or other_groups.height == 0:
            # Nothing to balance against; avoids a division by zero / max of nothing.
            return df

        most_abundant_class_size = (
            other_groups.group_by('label').agg(pl.count().alias('count'))['count'].max()
        )
        sample_rate = most_abundant_class_size / control_group.height
        print(sample_rate)

        if sample_rate >= 1.0:
            # The control class is already no larger than the largest other class.
            sampled = control_group
        else:
            # For very small rates the site level is left out of the grouping
            # (presumably so the groups are large enough to keep some rows).
            group_cols = strata_cols if sample_rate >= 0.1 else [
                "Metadata_Plate", "Metadata_Well", "Metadata_cmpdName"
            ]
            sampled = control_group.group_by(group_cols).apply(
                lambda group: group.sample(fraction=sample_rate, seed=seed)
            )

        # Concatenate the sampled control group back with the other data.
        return pl.concat([other_groups, sampled])

    raise ValueError(
        f"unknown undersampling strategy {strategy!r}; "
        "expected 'nearmiss', 'random' or 'control_group_sampling'"
    )


In [None]:
import gc
# Run a garbage-collection pass before the undersampling step below.
gc.collect()

In [None]:
# Downsample only the control class of the combined table.
resampled_specs5k_big = undersampling(specs5k_sc_features_total, "control_group_sampling")

In [None]:
# Class distribution after undersampling. The frame produced by the undersampling cell is
# resampled_specs5k_big; resampled_specs5k is only loaded from disk in a later cell, so
# referring to it here fails on a fresh kernel.
show_group_dist(resampled_specs5k_big, "moa")

In [None]:
def prepare_class_data(df, plate2k, plate3k):
    """Drop the unnamed column (if any) and add a "project" column derived from the plate ID.

    Parameters
    ----------
    df : pl.DataFrame
        Needs a "Metadata_Plate" column.
    plate2k, plate3k : list of str
        Plate IDs belonging to the specs2k and specs3k projects.

    Returns
    -------
    pl.DataFrame
        ``df`` with "project" set to "specs2k", "specs3k" or "other".
    """
    # The unnamed column is not always present; dropping it unconditionally would raise.
    if "" in df.columns:
        df = df.drop("")

    project = (
        pl.when(pl.col("Metadata_Plate").is_in(plate2k)).then(pl.lit("specs2k"))
        .when(pl.col("Metadata_Plate").is_in(plate3k)).then(pl.lit("specs3k"))
        .otherwise(pl.lit("other"))
        .alias("project")
    )
    return df.with_columns(project)

In [None]:
# Tag every row with its source project based on the plate ID.
# NOTE: specs2k_plates and specs3k_plates are defined in the following cell, which has to be run first.
resampled_specs5k_big = prepare_class_data(resampled_specs5k_big, specs2k_plates, specs3k_plates)

In [None]:
# Plate IDs of the specs2k project (33 plates).
specs2k_plates = [
    "P103617", "P103602", "P103595", "P103597", "P103613", "P103591",
    "P103615", "P103607", "P103619", "P103606", "P103616", "P103601",
    "P103603", "P103620", "P103614", "P103621", "P103593", "P103592",
    "P103612", "P103608", "P103600", "P103609", "P103618", "P103589",
    "P103605", "P103590", "P103599", "P103610", "P103604", "P103611",
    "P103598", "P103596", "P103594",
]
# Plate IDs of the specs3k project (53 plates).
specs3k_plates = [
    "P101382", "P101339", "P101338", "P101337", "P101354", "P101350",
    "P101360", "P101375", "P101363", "P101335", "P101373", "P101372",
    "P101352", "P101334", "P101369", "P101336", "P101345", "P101377",
    "P101346", "P101366", "P101359", "P101361", "P101364", "P101365",
    "P101362", "P101374", "P101380", "P101367", "P101358", "P101342",
    "P101371", "P101341", "P101368", "P101348", "P101370", "P101379",
    "P101386", "P101353", "P101381", "P101351", "P101357", "P101384",
    "P101347", "P101343", "P101387", "P101385", "P101355", "P101340",
    "P101378", "P101344", "P101349", "P101376", "P101356",
]

In [None]:
# Total number of plates across both projects.
len(specs3k_plates) + len(specs2k_plates)

In [None]:
# Save the undersampled table to the working directory.
resampled_specs5k_big.write_parquet("specs5k_undersampled_BF_moa.parquet")

## Filter non-significant compounds

In [None]:
# Load the undersampled table from the datasets/ folder.
resampled_specs5k = pl.read_parquet("datasets/specs5k_undersampled_BF_moa.parquet")

In [None]:
# Load the e-test results table (read with ignore_errors=True).
etest_res = pl.read_csv("etest_res_specs5k_200_samples_50000_perms_BF.csv", ignore_errors=True)

In [None]:
# Keep only the rows whose significant_adj flag is True.
sign_cmp = etest_res.filter(pl.col("significant_adj") == True)

In [None]:
# Step 1: number of rows per compound (group_by is the spelling already used elsewhere in
# this notebook; groupby is deprecated).
group_counts = resampled_specs5k.group_by('Metadata_cmpdName').agg([
    pl.count().alias('count')
])
# Step 2: compounds with FEWER than 200 rows. The variable keeps its historical name
# (`groups_over_200`) because a later cell reads it, but the filter is `count < 200`.
groups_over_200 = group_counts.filter(pl.col('count') < 200)['Metadata_cmpdName']

In [None]:
# Compounds to keep: the values of the unnamed ("") column of the significant e-test rows,
# the compounds selected by the row-count filter, and the "[DMSO]" control — de-duplicated.
bf_comps = list({*sign_cmp[""], *groups_over_200, "[DMSO]"})

In [None]:
# Keep only the cells whose compound is in bf_comps.
resampled_specs5k_sign = resampled_specs5k.filter((pl.col("Metadata_cmpdName").is_in(bf_comps)))

In [None]:
# Row counts per MoA after the compound filter.
show_group_dist(resampled_specs5k_sign, "moa")

In [None]:
# Save the filtered table to the working directory.
resampled_specs5k_sign.write_parquet("specs5k_undersampled_significant_BF.parquet")

## Aggregate features to per-well medians

In [None]:
# Feature columns are those whose name contains "Feature".
features_fixed = [feat for feat in resampled_specs5k_sign.columns if "Feature" in feat]
# Median of every feature per (moa, project, plate, well, compound) group.
# group_by is the spelling already used elsewhere in this notebook; groupby is deprecated.
resampled_specs5k_aggregated = (
    resampled_specs5k_sign
    .group_by(["moa", "project", 'Metadata_Plate', 'Metadata_Well', 'Metadata_cmpdName'])
    .agg([pl.col(feature).median().alias(feature) for feature in features_fixed])
)

In [None]:
# Display the aggregated table.
resampled_specs5k_aggregated

In [None]:
# Save the aggregated table to the working directory.
resampled_specs5k_aggregated.write_parquet("specs5k_undersampled_moa_aggregated_BF.parquet")

## Split into training CSV files

In [None]:
import polars as pl
import tqdm
def stratified_split(df, group_columns, n_splits=3):
    """Split ``df`` into ``n_splits`` frames so every group is spread evenly over them.

    The rows of each distinct combination of ``group_columns`` are cut into ``n_splits``
    consecutive slices whose sizes differ by at most one row (the first ``size % n_splits``
    slices get the extra row). Slice i of every group goes into split i. Rows are taken in
    their existing order; no shuffling is done.

    Parameters
    ----------
    df : pl.DataFrame
        Frame to split.
    group_columns : list of str
        Columns that define the groups.
    n_splits : int
        Number of splits to produce (at least 1).

    Returns
    -------
    list of pl.DataFrame
        ``n_splits`` frames with the same columns as ``df``; a split that receives no rows
        is an empty frame with that schema.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # partition_by yields one frame per group in a single pass, which replaces filtering
    # the whole frame once per group and the string-cast helper column used for that.
    split_parts = [[] for _ in range(n_splits)]
    for group_df in tqdm.tqdm(df.partition_by(group_columns, maintain_order=True)):
        base_size, remainder = divmod(group_df.height, n_splits)

        start_idx = 0
        for i in range(n_splits):
            slice_length = base_size + (1 if i < remainder else 0)
            if slice_length:
                split_parts[i].append(group_df.slice(start_idx, slice_length))
            start_idx += slice_length

    # Concatenate once per split instead of after every slice.
    return [pl.concat(parts) if parts else df.head(0) for parts in split_parts]

In [None]:
# Split every (moa, compound, plate, well) group over three frames (the default n_splits).
# NOTE(review): the input is resampled_specs5k, not the filtered resampled_specs5k_sign, although
# the CSV files written from these splits are named "*_significant" — confirm which frame is intended.
split = stratified_split(resampled_specs5k, ["moa", "Metadata_cmpdName", "Metadata_Plate", "Metadata_Well"])

In [None]:
# Feature columns of the splits: every column whose name contains "Feature".
features_fixed = [feat for feat in split[0].columns if "Feature" in feat]

In [None]:
# Write every split as a CSV holding only the feature columns and the integer label.
output_dir = "BF_training_split_3_ALL"
# Make sure the target folder exists so the write does not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
for i, split_df in enumerate(split):
    file_name = f"{output_dir}/specs5k_moa_split_{i}_significant.csv"
    split_df.select(features_fixed + ["label"]).write_csv(file_name)

In [None]:
# Show the distinct (moa, label) pairs present in the first split.
split[0].select(["moa", "label"]).unique()

In [None]:
# Re-read the three split CSVs and stack them into one frame.
# NOTE: the name `all` shadows Python's built-in all(); it is kept because later cells read it.
all = pl.concat(
    [
        pl.read_csv(f"BF_training_split_3_ALL/specs5k_moa_split_{i}_significant.csv")
        for i in [0, 1, 2]
    ]
)

In [None]:
# Preview the de-duplicated rows; the result is not assigned, so `all` itself is unchanged.
all.unique()

In [None]:
# Save the stacked table (as built above, without de-duplication) next to the individual splits.
all.write_csv("BF_training_split_3_ALL/specs5k_moa_split_ALL_significant.csv")