In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib
import matplotlib.gridspec as gridspec
import seaborn as sns
import scipy.stats as stats
import scipy.optimize as opt
import pickle
import itertools
import random
import os
import ast
from library2_utils.color_scheme import cell_line_colors, cell_line_symbols
from library2_utils.transfer_functions import transfer_function
from library2_utils.mirna_combinations import get_combinations
from library2_utils.additive_model import add_mirna_expression
from library2_utils.design_utilities import tsi

# Global matplotlib text settings: 7 pt font size, Helvetica family.
plt.rcParams.update({'font.size': 7, 'font.family': 'Helvetica'})

# Cell-line names in two groups, plus their concatenation.
cell_lines_subset = ["HEK293T", "HeLa", "SKNSH", "MCF7", "HUH7", "A549"]
cell_lines_rest = ["HaCaT", "JEG3", "Tera1", "PC3"]
cell_lines_measured = [*cell_lines_subset, *cell_lines_rest]

# Derived name variants (suffix/prefix) for every measured cell line.
cell_lines_measured_UTR = [f"{name}_3UTR" for name in cell_lines_measured]
cell_lines_measured_pred = [f"predicted_{name}" for name in cell_lines_measured]
cell_lines_all_target = [f"target_{name}" for name in cell_lines_measured]

# Hyphenated labels mapped to the hyphen-free spelling used above.
label_rename = {
    hyphenated: hyphenated.replace("-", "")
    for hyphenated in ("HUH-7", "JEG-3", "Tera-1", "SK-N-SH", "PC-3")
}

# get mirbase (first CSV column becomes the index)
mirbase = pd.read_csv("../microrna_data/mirbase_extended.csv", index_col=0)

# Folders for this notebook's figures and generated design tables.
# exist_ok=True makes creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
base_plot_folder = "../plots/16_design_for_tissues"
os.makedirs(base_plot_folder, exist_ok=True)
output_folder = "../outputs/16_design_for_tissues"
os.makedirs(output_folder, exist_ok=True)


# Maps long key names to their shorthand labels.
key_shorthand = dict([
    ("24_miRNA_full_subset_quality_AND4", "quality_subset_AND4"),
    ("25_miRNA_full_subset_quality_AND5", "quality_subset_AND5"),
    ("26_miRNA_full_subset_quality_AND6", "quality_subset_AND6"),
    ("27_miRNA_full_quality_AND4", "quality_all_AND4"),
    ("28_miRNA_full_quality_AND5", "quality_all_AND5"),
    ("29_miRNA_full_quality_AND6", "quality_all_AND6"),
    ("30_miRNA_AND4_subset_mse_designs", "mse_subset_AND4"),
    ("31_miRNA_AND5_subset_mse_designs", "mse_subset_AND5"),
    ("32_miRNA_AND6_subset_mse_designs", "mse_subset_AND6"),
    ("33_miRNA_AND4_all_mse_designs", "mse_all_AND4"),
    ("34_miRNA_AND5_all_mse_designs", "mse_all_AND5"),
    ("35_miRNA_AND6_all_mse_designs", "mse_all_AND6"),
])
# These four keys map to themselves.
key_shorthand.update(
    (category, category)
    for category in ("subset_quality", "full_quality", "subset_mse", "all_mse")
)

# Colormap of the design heatmaps and edge colour of the target-marker boxes.
main_colormap, box_color = "rocket", "deepskyblue"

In [None]:
# Folder with the processed, merged tissue dataset and its pickled companions.
dataset_processed_folder = "../microrna_data/15_human_data_merge/processed"
# Merged expression table; the first CSV column becomes the index.
# NOTE(review): values are presumably log10-scaled (a later cell applies 10**) - confirm.
merged_dataset = pd.read_csv(os.path.join(dataset_processed_folder, "15.3_merged_tissue_datasets.csv"), index_col=0)

# NOTE(review): pickle.load can execute arbitrary code - only load these files
# from a trusted source.
# tissue_popt is later unpacked as the positional parameters of transfer_function.
with open(os.path.join(dataset_processed_folder, "tissue_data_popt.pkl"), "rb") as f:
    tissue_popt = pickle.load(f)
with open(os.path.join(dataset_processed_folder, "tissue_data_scales.pkl"), "rb") as f:
    tissue_scales = pickle.load(f)

# CREATE TISSUE DESIGNS

In [3]:
def add_mirna_combs(mirna_expr, combs):
    """Sum microRNA expression over each combination of microRNAs.

    Parameters
    ----------
    mirna_expr : pandas.DataFrame
        Expression table (index: microRNA names, columns: cell lines/tissues).
    combs : sequence of tuple
        Combinations of microRNA names, all of the same length. A microRNA
        listed several times in one combination is added several times.

    Returns
    -------
    pandas.DataFrame
        Float frame with one row per entry of ``combs`` (repeated combinations
        are kept), indexed by a MultiIndex with levels ``miRNA1``..``miRNAk``
        and sorted by that index. Columns are those of ``mirna_expr``.

    Raises
    ------
    ValueError
        If ``combs`` is empty (the number of index levels cannot be inferred).
    KeyError
        If a combination names a microRNA missing from ``mirna_expr``.
    """
    combs = [tuple(comb) for comb in combs]
    if not combs:
        raise ValueError("combs must contain at least one combination")

    level_names = [f'miRNA{i+1}' for i in range(len(combs[0]))]
    multiindex = pd.MultiIndex.from_tuples(combs, names=level_names)

    # Build all summed rows first and create the frame in one go instead of
    # writing row by row through .loc.
    summed_rows = np.vstack(
        [mirna_expr.loc[list(comb), :].sum(axis=0).values for comb in combs]
    )
    added_expression = pd.DataFrame(
        summed_rows, index=multiindex, columns=mirna_expr.columns, dtype="float"
    )
    return added_expression.sort_index()

def calculate_mse(df, mse_target, loss_emphasis):
    """Weighted mean squared error of each design against a target profile.

    df holds one design per row and one cell line/tissue per column.
    mse_target gives the desired value per column and loss_emphasis the weight
    per column; in this notebook both are passed as dicts keyed by column name.
    Returns, per row of df, the mean over columns of the weighted squared
    deviation from the target.
    """
    return (
        (df - mse_target)
        .pow(2)
        .mul(loss_emphasis, axis=1)
        .mean(axis=1)
    )

def calculate_fitness(pop, expression, loss_emphasis={}, mse_target=[]):
    """Score every design in a population as the inverse of its weighted MSE.

    pop is a list of microRNA tuples and expression the microRNA expression
    table. The summed expression of each design is passed column by column
    through transfer_function (with the parameters in tissue_popt) and the
    result is compared to mse_target. An empty loss_emphasis weights every
    column with 1. Returns 1/mse per design.
    """
    # Transfer-function output per design and column (additive model).
    summed_expression = add_mirna_combs(expression, pop)
    stability = summed_expression.apply(
        lambda column: transfer_function(column, *tissue_popt)
    )

    # No emphasis given: weight all columns equally.
    if len(loss_emphasis) == 0:
        weights = {column: 1 for column in stability.columns}
    else:
        weights = loss_emphasis

    return 1/calculate_mse(stability, mse_target, weights)

def evaluate_fitness(pop, expression, loss_emphasis={}, mse_target=[]):
    """Return the transfer-function output per design with a "quality" column.

    The frame has one row per design in pop and one column per column of
    expression; "quality" holds the fitness from calculate_fitness for the
    same arguments.
    """
    stability = add_mirna_combs(expression, pop).apply(
        lambda column: transfer_function(column, *tissue_popt)
    )
    quality = calculate_fitness(pop, expression, loss_emphasis, mse_target)
    stability["quality"] = quality
    return stability

def drop_duplicates(df):
    """Drop designs that contain the same microRNAs in a different order.

    Assumes every index entry of ``df`` is a tuple of microRNA names. Two rows
    are duplicates when their sorted tuples are equal. The input frame is not
    modified.

    Returns
    -------
    tuple
        ``(deduplicated, duplicates)``: ``deduplicated`` keeps the first row of
        each group of equivalent designs; ``duplicates`` is a boolean Series
        (same index as ``df``) that is True for every row that has at least
        one equivalent row.
    """
    # Order-independent key per design.
    canonical = [tuple(sorted(design)) for design in df.index]
    # assign() works on a copy, so the caller's frame does not gain a column.
    keyed = df.assign(sorted_index=canonical)
    duplicates = keyed.duplicated(subset='sorted_index', keep=False)
    deduplicated = (
        keyed.drop_duplicates(subset='sorted_index', keep='first')
        .drop(columns='sorted_index')
    )
    return deduplicated, duplicates

def select_parents(fitnesses):
    """Pick two parents by tournament selection.

    Each parent is the index label with the highest fitness among
    three entries sampled from ``fitnesses``. Returns a 2-tuple of labels.
    """
    tournament_size = 3

    def run_tournament():
        contenders = fitnesses.sample(tournament_size)
        # highest fitness first; its index label is the winner
        return contenders.sort_values(ascending=False).index[0]

    return tuple(run_tournament() for _ in range(2))

def crossover(parent1, parent2, n):
    """Single-point crossover of two designs of length n.

    A cut position is drawn uniformly from 0..n-1; the child takes parent1
    before the cut and parent2 from the cut onwards.
    """
    cut = random.randint(0, n - 1)
    head, tail = parent1[:cut], parent2[cut:]
    return head + tail

def mutate(child, miRNAs, n):
    """With 20% probability replace one position of child by a random microRNA.

    The position is drawn from 0..n-1 and the replacement from miRNAs.
    Always returns a tuple.
    """
    if random.random() >= 0.2:
        # no mutation this time
        return tuple(child)

    position = random.randint(0, n - 1)
    replacement = random.choice(miRNAs)
    mutated = list(child)
    mutated[position] = replacement
    return tuple(mutated)

# -----------------------------------------------------------------------

def determine_mirna_usage(df):
    """Count how often each microRNA occurs across all designs in df.index.

    Every occurrence counts, including repeats within one design. Returns a
    dict ordered from most to least used.
    """
    counts = {}
    for design in df.index.tolist():
        for mirna in design:
            counts[mirna] = counts.get(mirna, 0) + 1

    # order by count, highest first
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

def count_mirnas_per_design(df):
    """Count in how many designs each microRNA appears.

    df is a dataframe whose index entries are designs (tuples of microRNAs).
    A microRNA used several times within one design is counted once for that
    design. Returns a dataframe indexed by microRNA with a single 'count'
    column, sorted by count in descending order.
    """
    designs_containing = {}
    for design in df.index.tolist():
        # dict.fromkeys removes repeats within the design, keeping first-seen order
        for mirna in dict.fromkeys(design):
            designs_containing[mirna] = designs_containing.get(mirna, 0) + 1

    counts = pd.DataFrame.from_dict(designs_containing, orient='index', columns=['count'])
    return counts.sort_values(by=['count'], ascending=False)

# -----------------------------------------------------------------------

def generate_genetic_design(target, n_mirnas, mirnas, mirna_expression, loss_emphasis={},
                            no_designs=10, generations=30, population_size=500, seed=None):
    """Search for microRNA combinations matching a target profile with a genetic algorithm.

    Parameters
    ----------
    target : per-column target values, forwarded as ``mse_target``.
    n_mirnas : int
        Number of microRNA positions per design.
    mirnas : sequence
        Candidate microRNA names to draw from.
    mirna_expression : pandas.DataFrame
        Expression table forwarded to the fitness functions.
    loss_emphasis : dict
        Per-column loss weights; empty means equal weights.
    no_designs : int
        Maximum number of designs to return.
    generations, population_size : int
        Size of the search.
    seed : int or None
        If given, seeds both ``random`` and NumPy's global generator before
        the search so the run is reproducible. ``None`` (default) leaves the
        generators untouched, as before.

    Returns
    -------
    pandas.DataFrame
        The best ``no_designs`` distinct designs of the final population,
        sorted by descending "quality".
    """
    if seed is not None:
        # crossover/mutate draw from `random`; select_parents samples through
        # pandas, which falls back to NumPy's global generator.
        random.seed(seed)
        np.random.seed(seed)

    # Initial population
    population = [tuple(random.choice(mirnas) for _ in range(n_mirnas)) for _ in range(population_size)]

    # Run the GA for a set number of generations
    for _ in range(generations):
        fitnesses = calculate_fitness(pop=population,
                                      expression=mirna_expression,
                                      loss_emphasis=loss_emphasis,
                                      mse_target=target)
        new_population = []
        for _ in range(population_size):
            parent1, parent2 = select_parents(fitnesses)
            child = crossover(parent1, parent2, n_mirnas)
            child = mutate(child, mirnas, n_mirnas)
            new_population.append(child)
        population = new_population

    # Rank the final population and keep the best distinct designs
    designs = evaluate_fitness(population, mirna_expression, loss_emphasis=loss_emphasis, mse_target=target)
    designs, _ = drop_duplicates(designs)
    designs = designs.sort_values(by=['quality'], ascending=False)
    return designs.head(no_designs)

def add_numbered_index(df, base_name):
    """Move the microRNA (multi-)index into columns and number the rows.

    The returned frame is indexed by "<base_name>_1", "<base_name>_2", ...
    in row order.
    """
    # index levels become ordinary columns
    flattened = df.reset_index()
    # 1-based design labels
    flattened.index = [f"{base_name}_{position}" for position in range(1, len(flattened) + 1)]
    return flattened

def generate_mse_designs(mse_targets, designs_per_target, base_name, mirna_data,
                         loss="mse", n_mirnas=4, loss_emphases={}, increase_diversity=1):
    """Run the genetic design search once per target profile and stack the results.

    For every entry of mse_targets the search is repeated increase_diversity
    times, each round producing int(designs_per_target/increase_diversity)
    designs. After each round the five microRNAs present in the most designs
    are removed from the candidate pool for the next round. Each design row is
    annotated with "target", "emphasis" (both as strings) and "type"
    (base_name). loss_emphases is a list parallel to mse_targets; if empty,
    every target gets an empty emphasis. The loss argument is not used.
    Returns all designs concatenated into one dataframe.
    """
    all_mirnas = list(mirna_data.index)

    if len(loss_emphases) == 0:
        loss_emphases = [{} for _ in mse_targets]

    per_target_frames = []
    for position, mse_target in enumerate(mse_targets):
        print(f"Processing {base_name} {position+1}/{len(mse_targets)}")
        candidate_pool = all_mirnas.copy()
        round_frames = []

        for _ in range(increase_diversity):
            emphasis = loss_emphases[position]
            designs_per_round = int(designs_per_target/increase_diversity)
            designs = generate_genetic_design(
                target=mse_target,
                n_mirnas=n_mirnas,
                mirnas=candidate_pool,
                mirna_expression=mirna_data,
                loss_emphasis=emphasis,
                no_designs=designs_per_round,
            )

            # keep the search settings next to each design
            designs["target"] = str(mse_target)
            designs["emphasis"] = str(emphasis)
            designs["type"] = base_name

            # exclude the five most widely used microRNAs from the next round
            most_used = count_mirnas_per_design(designs).head(5).index.to_list()
            candidate_pool = [mirna for mirna in candidate_pool if mirna not in most_used]
            round_frames.append(designs.head(designs_per_round))

        per_target_frames.append(pd.concat(round_frames))

    return pd.concat(per_target_frames)

In [4]:
def make_multiple_mse_designs(base_name, mirna_dataset, targets, emphases, cell_lines_used,
                              loss, sublabel, designs_per_cell_line, n):
    """Generate designs for several targets/emphases and label them.

    Calls generate_mse_designs with n microRNA positions and a single round
    per target, casts the cell_lines_used columns to float, stores
    str(sublabel) in a "sublabel" column and replaces the index by
    "1_miRNA_full_<base_name>_AND<n>_<k>" labels. The loss argument is not
    used.
    """
    designs = generate_mse_designs(
        mse_targets=targets,
        designs_per_target=designs_per_cell_line,
        base_name=base_name,
        mirna_data=mirna_dataset,
        n_mirnas=n,
        loss_emphases=emphases,
        increase_diversity=1,
    )

    designs[cell_lines_used] = designs[cell_lines_used].astype("float")
    designs["sublabel"] = str(sublabel)

    return add_numbered_index(designs, base_name=f"1_miRNA_full_{base_name}_AND{n}")

In [5]:
# Tissue-only table: drop the measured cell-line columns, then raise 10 to the
# stored values (presumably undoing a log10 scaling - TODO confirm).
merged_dataset_tissues = 10 ** merged_dataset.drop(columns=cell_lines_measured)

In [None]:
# log10 expression per tissue (also used by the clustering cell below)
merged_dataset_tissues_log10 = np.log10(merged_dataset_tissues)

# Squared pairwise correlation between tissue columns, drawn as a heatmap.
fig, ax = plt.subplots(figsize=(5.5, 5.5))
sns.heatmap(
    merged_dataset_tissues_log10.corr() ** 2,
    cmap="viridis",
    square=True,
    cbar_kws={'label': r'$r^2$', 'shrink': 0.8},
    ax=ax,
)
fig.savefig(os.path.join(base_plot_folder, "tissue_correlation_heatmap.png"), dpi=300)

In [None]:
# NOTE(review): notebook convention is to keep imports in the first cell.
from scipy.cluster.hierarchy import dendrogram, linkage

# One row per tissue, one column per microRNA, so that tissues are clustered.
df_t = merged_dataset_tissues_log10.transpose()

# Perform hierarchical clustering (Ward linkage) on the tissue rows
linked = linkage(df_t, method='ward')

# Plot the dendrogram with tissue names as vertical leaf labels
plt.figure(figsize=(4, 3))
dendrogram(linked, labels=df_t.index, leaf_rotation=90)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Tissues')
plt.ylabel('Distance')
plt.show()

In [21]:
# broad organs
# broad organs
broad_organs = [
    'brain', 'bone', 'muscle', 'stomach', 'liver', 'lung', 'thyroid',
    'lymph_node', 'spleen', 'kidney', 'testis', 'skin', 'adipocyte',
]
# brain regions
brain_regions = [
    'thalamus', 'white_matter', 'grey_matter', 'nucleus_caudatus',
    'cerebellum', 'spinal_cord', 'dura_mater',
]
# target numbers: values passed as n (microRNA positions per design) below
target_numbers = list(range(1, 9))

In [None]:
designs_per_cell_line = 1
base_name = "single_target_broad_organs"
cell_lines_used = broad_organs

# One target profile per tissue: 0 for the tissue in focus, 1 for all others.
single_targets = [
    {cell_line: (0 if cell_line == focus else 1) for cell_line in cell_lines_used}
    for focus in cell_lines_used
]

# Matching loss weights: the focus tissue gets len(cell_lines_used)/2, the rest 1.
single_emphases = [
    {cell_line: (len(cell_lines_used)/2.0 if cell_line == focus else 1)
     for cell_line in cell_lines_used}
    for focus in cell_lines_used
]

# Expression restricted to the tissues in use ...
mirna_dataset = merged_dataset_tissues[cell_lines_used]
# ... and to microRNAs exceeding 3000 in at least one of them.
mirna_dataset = mirna_dataset[mirna_dataset.max(axis=1) > 3000]
# An all-zero "empty" entry lets a design leave a position unused.
mirna_dataset.loc["empty"] = 0

# Output folder for this design family
output_folder_curr = os.path.join(output_folder, base_name)
os.makedirs(output_folder_curr, exist_ok=True)

# One design table per number of microRNA positions.
for target_number in target_numbers:
    designs = make_multiple_mse_designs(
        base_name=base_name,
        mirna_dataset=mirna_dataset,
        targets=single_targets,
        emphases=single_emphases,
        cell_lines_used=cell_lines_used,
        loss="mse",
        sublabel=cell_lines_used,
        designs_per_cell_line=designs_per_cell_line,
        n=target_number,
    )

    designs.to_csv(os.path.join(output_folder_curr, f"designs_{base_name}_{target_number}.csv"))

In [None]:
designs_per_cell_line = 1
base_name = "single_active_broad_organs"
cell_lines_used = broad_organs

# One target profile per tissue: 1 for the tissue in focus, 0 for all others.
single_actives = [
    {cell_line: (1 if cell_line == focus else 0) for cell_line in cell_lines_used}
    for focus in cell_lines_used
]

# Matching loss weights: the focus tissue gets len(cell_lines_used), the rest 1.
single_emphases = [
    {cell_line: (len(cell_lines_used) if cell_line == focus else 1)
     for cell_line in cell_lines_used}
    for focus in cell_lines_used
]

# Expression restricted to the tissues in use ...
mirna_dataset = merged_dataset_tissues[cell_lines_used]
# ... and to microRNAs exceeding 3000 in at least one of them.
mirna_dataset = mirna_dataset[mirna_dataset.max(axis=1) > 3000]
# An all-zero "empty" entry lets a design leave a position unused.
mirna_dataset.loc["empty"] = 0

# Output folder for this design family
output_folder_curr = os.path.join(output_folder, base_name)
os.makedirs(output_folder_curr, exist_ok=True)

# One design table per number of microRNA positions.
for target_number in target_numbers:
    designs = make_multiple_mse_designs(
        base_name=base_name,
        mirna_dataset=mirna_dataset,
        targets=single_actives,
        emphases=single_emphases,
        cell_lines_used=cell_lines_used,
        loss="mse",
        sublabel=cell_lines_used,
        designs_per_cell_line=designs_per_cell_line,
        n=target_number,
    )

    designs.to_csv(os.path.join(output_folder_curr, f"designs_{base_name}_{target_number}.csv"))

In [None]:
designs_per_cell_line = 1
base_name = "single_target_brain_regions"
cell_lines_used = brain_regions

# One target profile per region: 0 for the region in focus, 1 for all others.
single_targets = [
    {cell_line: (0 if cell_line == focus else 1) for cell_line in cell_lines_used}
    for focus in cell_lines_used
]

# Matching loss weights: the focus region gets len(cell_lines_used)/2, the rest 1.
single_emphases = [
    {cell_line: (len(cell_lines_used)/2.0 if cell_line == focus else 1)
     for cell_line in cell_lines_used}
    for focus in cell_lines_used
]

# Expression restricted to the regions in use ...
mirna_dataset = merged_dataset_tissues[cell_lines_used]
# ... and to microRNAs exceeding 3000 in at least one of them.
mirna_dataset = mirna_dataset[mirna_dataset.max(axis=1) > 3000]
# An all-zero "empty" entry lets a design leave a position unused.
mirna_dataset.loc["empty"] = 0

# Output folder for this design family
output_folder_curr = os.path.join(output_folder, base_name)
os.makedirs(output_folder_curr, exist_ok=True)

# A single run with six microRNA positions per design.
designs = make_multiple_mse_designs(
    base_name=base_name,
    mirna_dataset=mirna_dataset,
    targets=single_targets,
    emphases=single_emphases,
    cell_lines_used=cell_lines_used,
    loss="mse",
    sublabel=cell_lines_used,
    designs_per_cell_line=designs_per_cell_line,
    n=6,
)

designs.to_csv(os.path.join(output_folder_curr, f"designs_{base_name}_6.csv"))

In [None]:
designs_per_cell_line = 1
base_name = "single_active_brain_regions"
cell_lines_used = brain_regions

# One target profile per region: 1 for the region in focus, 0 for all others.
single_actives = [
    {cell_line: (1 if cell_line == focus else 0) for cell_line in cell_lines_used}
    for focus in cell_lines_used
]

# Matching loss weights: the focus region gets len(cell_lines_used), the rest 1.
single_emphases = [
    {cell_line: (len(cell_lines_used) if cell_line == focus else 1)
     for cell_line in cell_lines_used}
    for focus in cell_lines_used
]

# Expression restricted to the regions in use ...
mirna_dataset = merged_dataset_tissues[cell_lines_used]
# ... and to microRNAs exceeding 3000 in at least one of them.
mirna_dataset = mirna_dataset[mirna_dataset.max(axis=1) > 3000]
# An all-zero "empty" entry lets a design leave a position unused.
mirna_dataset.loc["empty"] = 0

# Output folder for this design family
output_folder_curr = os.path.join(output_folder, base_name)
os.makedirs(output_folder_curr, exist_ok=True)

# A single run with six microRNA positions per design.
designs = make_multiple_mse_designs(
    base_name=base_name,
    mirna_dataset=mirna_dataset,
    targets=single_actives,
    emphases=single_emphases,
    cell_lines_used=cell_lines_used,
    loss="mse",
    sublabel=cell_lines_used,
    designs_per_cell_line=designs_per_cell_line,
    n=6,
)

designs.to_csv(os.path.join(output_folder_curr, f"designs_{base_name}_6.csv"))

# Plot the designs

In [13]:
def extract_target_cell_lines(cell_line_dict, target_value=0):
    """Return the names whose value equals target_value as one ", "-joined string.

    Parameters
    ----------
    cell_line_dict : dict or str
        Mapping of name -> value, or its string representation (parsed with
        ast.literal_eval, e.g. when read back from a CSV column).
    target_value
        Value that marks a name as selected (default 0).

    Returns
    -------
    str
        Selected names in mapping order, renamed through label_rename where an
        entry exists; an empty string if nothing matches.
    """
    # isinstance also accepts str subclasses, unlike an exact type comparison
    if isinstance(cell_line_dict, str):
        cell_line_dict = ast.literal_eval(cell_line_dict)

    selected = (name for name, value in cell_line_dict.items() if value == target_value)
    # apply rename dictionary; unknown names pass through unchanged
    return ", ".join(label_rename.get(name, name) for name in selected)

def get_cell_line_to_row_mapping(df):
    """Map each cell line to its position (0, 1, 2, ...).

    df is either a list of cell-line names or a DataFrame whose columns are
    the cell lines. Any other input type yields an empty dict.
    """
    if type(df) == list:
        names = df
    elif type(df) == pd.DataFrame:
        names = df.columns
    else:
        names = []
    return {cell_line: row for row, cell_line in enumerate(names)}

In [8]:
plot_folder = os.path.join(base_plot_folder, "tissue_designs")
# exist_ok=True creates the folder only if needed, without the
# check-then-create race of testing os.path.exists() first.
os.makedirs(plot_folder, exist_ok=True)

# broad organs
broad_organs = ['brain', 'bone', 'muscle', 'stomach', 'liver', 'lung', 'thyroid', 'lymph_node', 'spleen', 'kidney', 'testis', 'skin', 'adipocyte']
# brain regions
brain_regions = ['thalamus', 'white_matter', 'grey_matter', 'nucleus_caudatus', 'cerebellum', 'spinal_cord', 'dura_mater']

In [14]:
def plot_design_heatmap(df, filename, organ_dict_or_list, design_type, plot_folder, title=None):
    """Plot a tissues x designs heatmap and outline each design's target tissue(s).

    Parameters
    ----------
    df : pandas.DataFrame
        One design per row, with one column per tissue and a "target" column
        holding the target dict (or its string form).
    filename : str
        Output file stem; also selects the figure size ("brain_regions" gives
        a smaller figure, anything else the broad-organ size).
    organ_dict_or_list : list or dict
        Tissue names to plot (for a dict, its values).
    design_type : str
        If it contains "active", tissues with target value 1 are outlined; if
        it contains "target", tissues with target value 0.
    plot_folder : str
        Folder receiving "<filename>.png" and "<filename>.svg".
    title : str, optional
        Figure title.

    Notes
    -----
    Adds or overwrites the column ``df["target_cell_lines"]`` on the frame
    passed in; the aggregation cell later groups by this column.
    """
    if "active" in design_type:
        df["target_cell_lines"] = df["target"].apply(lambda x: extract_target_cell_lines(x, 1))
    if "target" in design_type:
        df["target_cell_lines"] = df["target"].apply(lambda x: extract_target_cell_lines(x, 0))

    if isinstance(organ_dict_or_list, list):
        wanted = organ_dict_or_list
    else:
        wanted = organ_dict_or_list.values()
    curr_cell_set = [column for column in df.columns if column in wanted]

    # heatmap row of every tissue
    cell_line_to_row_map = get_cell_line_to_row_mapping(curr_cell_set)

    # Default to the broad-organ size so an unexpected filename no longer
    # leaves figsize undefined; "brain_regions" keeps precedence.
    figsize = (3, 1.8)
    if "brain_regions" in filename:
        figsize = (2, 1.2)
    # No plt.clf() here: with no figure open it would create an extra empty
    # figure that is never closed.
    fig, ax = plt.subplots(figsize=figsize)

    # Create the heatmap (tissues as rows, designs as columns)
    sns.heatmap(df[curr_cell_set].T, cmap=main_colormap, vmin=0, vmax=1, fmt=".2f", square=True, annot=False,
                cbar=True, cbar_kws={'label': 'stability'}, ax=ax)

    # Outline the target tissue cell(s) in every design column
    for x_pos, target_cell_lines in enumerate(df["target_cell_lines"]):
        for cell_line in target_cell_lines.split(", "):
            target_row = cell_line_to_row_map[cell_line]
            rect = patches.Rectangle((x_pos, target_row), 1, 1, linewidth=1.5,
                                     edgecolor=box_color, facecolor='none')
            ax.add_patch(rect)

    # Small margin so the outlines at the border are not clipped
    ax.set_xlim([-0.1, df[curr_cell_set].shape[0] + 0.1])
    ax.set_ylim([len(df[curr_cell_set].columns) + 0.1, -0.1])
    ax.set_yticks(np.arange(0.5, len(curr_cell_set)+0.5))
    ax.set_yticklabels([cell.replace("predicted_", "").replace("_", " ") for cell in curr_cell_set], rotation=0)
    ax.set_xticks([])
    ax.set_xlabel("Designs")

    if title:
        ax.set_title(title, fontsize=8)

    fig.tight_layout()
    for extension in ["png", "svg"]:
        fig.savefig(os.path.join(plot_folder, f"{filename}.{extension}"), dpi=300)
    plt.close(fig)

In [10]:
# Design-family key -> folder holding that family's design CSVs.
design_folders = {
    key: os.path.join(output_folder, folder_name)
    for key, folder_name in [
        ("broad_organs_single_target", "single_target_broad_organs"),
        ("broad_organs_single_active", "single_active_broad_organs"),
        ("brain_regions_single_target", "single_target_brain_regions"),
        ("brain_regions_single_active", "single_active_brain_regions"),
    ]
}
# Same keys -> plot sub-folder named after the key.
plot_folders = {key: os.path.join(plot_folder, key) for key in design_folders}

In [None]:
# Tissue list belonging to each design family.
organs = {
    "broad_organs_single_target": broad_organs,
    "broad_organs_single_active": broad_organs,
    "brain_regions_single_target": brain_regions,
    "brain_regions_single_active": brain_regions,
}

# Only the first two families (broad organs) are processed here.
for current in list(organs.keys())[:2]:
    design_folder = design_folders[current]
    # os.listdir returns files in arbitrary order; sort for a deterministic run
    design_files = sorted(os.listdir(design_folder))
    plot_folder_curr = plot_folders[current]
    os.makedirs(plot_folder_curr, exist_ok=True)

    # Read every design table and plot it. plot_design_heatmap adds the
    # "target_cell_lines" column to df, which the groupby below relies on.
    dfs = []
    for design_file in design_files:
        if design_file.endswith(".csv"):
            df = pd.read_csv(os.path.join(design_folder, design_file), index_col=0)
            dfs.append(df)
            plot_design_heatmap(df, design_file.split(".")[0], organs[current],
                                design_file.split(".")[0], plot_folder_curr)

    all_dfs = pd.concat(dfs)
    # Index labels end in "..._AND<n>_<k>"; take the last character of the
    # "AND<n>" part (works for single-digit n only). The result is a string.
    all_dfs["target_number"] = all_dfs.index.str.split("_").str[-2].str[-1]
    mirna_columns = [column for column in all_dfs if "miRNA" in column]

    # replace "empty" by nan
    all_dfs = all_dfs.replace("empty", np.nan)

    # find out how many miRNAs are being used at all
    all_dfs["targets_used"] = all_dfs[mirna_columns].count(axis=1)

    # find out how many unique miRNAs are being used
    all_dfs["unique_mirnas"] = all_dfs[mirna_columns].nunique(axis=1)

    # get the maximum quality for each target
    all_dfs["quality_max"] = all_dfs.groupby("target_cell_lines")["quality"].transform("max")

    # calculate relative qualities
    all_dfs["quality_relative"] = 100 * all_dfs["quality"] / all_dfs["quality_max"]

    if "active" in current:
        all_dfs_active = all_dfs.copy()
    if "target" in current:
        all_dfs_target = all_dfs.copy()
    # Sorted so it lines up with groupby("target_number") output in the plots.
    # NOTE: this rebinds target_numbers (integers in the design-generation
    # cells) to a list of strings; re-run that cell before generating designs.
    target_numbers = sorted(all_dfs["target_number"].unique())

In [None]:
# create a plot of the mean targets used and unique miRNAs used
# Mean (+/- std) number of non-empty microRNA positions per design, by maximum
# allowed number of targets.
fig, ax = plt.subplots(figsize=(2, 1.6))
for frame, label, color in [
    (all_dfs_target, "single inactive", "tab:red"),
    (all_dfs_active, "single active", "tab:blue"),
]:
    grouped = frame.groupby("target_number")["targets_used"]
    mean_by_target = grouped.mean()
    std_by_target = grouped.std()
    # x comes from the groupby index so x and y are always aligned, instead of
    # relying on the order of a separately computed global.
    ax.errorbar(x=mean_by_target.index.tolist(), y=mean_by_target, yerr=std_by_target, label=label,
                fmt="o-", color=color, capsize=3, markersize=5)
ax.set_xlabel("Maximum allowed number of targets")
ax.set_ylabel("Targets used in designs")
ax.legend(loc="upper left")
ax.set_yticks(np.arange(1, 9, 1))
ax.set_ylim([0.5, 8.5])
fig.tight_layout()
# "extension" rather than "format", which would shadow the builtin notebook-wide
for extension in ["png", "svg"]:
    fig.savefig(os.path.join(plot_folder, f"mean_targets_used.{extension}"), dpi=300)

In [None]:
# Swarm plot of the number of microRNA positions used per design.
fig, ax = plt.subplots(figsize=(6, 1.8))
# Shared, explicit category order so both overlays use the same x positions.
category_order = sorted(set(all_dfs_active["target_number"]) | set(all_dfs_target["target_number"]))
# Colours follow the other figures: single active = blue, single inactive = red.
sns.swarmplot(x='target_number', y='targets_used', data=all_dfs_active, order=category_order,
              color='tab:blue', size=3, alpha=1, ax=ax)
sns.swarmplot(x='target_number', y='targets_used', data=all_dfs_target, order=category_order,
              color='tab:red', size=3, alpha=1, ax=ax)

# Custom legend: one empty scatter per group gives exactly one entry each
ax.scatter([], [], c='tab:red', s=9, label="single inactive")
ax.scatter([], [], c='tab:blue', s=9, label="single active")

# Set labels
ax.set_xlabel("Maximum allowed number of targets")
ax.set_ylabel("Targets used in design")

ax.legend(loc='upper left')
plt.show()

In [None]:
# create a plot of the mean targets used and unique miRNAs used
# Relative design quality (% of the best quality reached for the same target)
# by maximum allowed number of targets.
fig, ax = plt.subplots(figsize=(2, 1.6))
# Shared, explicit category order so both overlays use the same x positions.
category_order = sorted(set(all_dfs_active["target_number"]) | set(all_dfs_target["target_number"]))
sns.stripplot(x='target_number', y='quality_relative', data=all_dfs_active, order=category_order,
              alpha=0.6, color='tab:blue', size=3, ax=ax)
sns.stripplot(x='target_number', y='quality_relative', data=all_dfs_target, order=category_order,
              alpha=0.6, color='tab:red', size=3, ax=ax)

# Add custom legend
ax.scatter([], [], c='tab:red', alpha=0.6, s=9, label="single inactive")
ax.scatter([], [], c='tab:blue', alpha=0.6, s=9, label="single active")

ax.set_xlabel("Maximum allowed number of targets")
ax.set_ylabel("Weighted relative\ndesign quality (%)")
ax.legend(loc='lower right')
ax.set_ylim([0, 105])
fig.tight_layout()
# "extension" rather than "format", which would shadow the builtin notebook-wide
for extension in ["png", "svg"]:
    fig.savefig(os.path.join(plot_folder, f"quality_targets_used.{extension}"), dpi=300)

In [None]:
# create a plot of the mean targets used and unique miRNAs used
# Mean (+/- std) number of distinct microRNAs per design, by maximum allowed
# number of targets.
fig, ax = plt.subplots(figsize=(2, 1.6))
for frame, label, color in [
    (all_dfs_target, "single inactive", "tab:red"),
    (all_dfs_active, "single active", "tab:blue"),
]:
    grouped = frame.groupby("target_number")["unique_mirnas"]
    mean_by_target = grouped.mean()
    std_by_target = grouped.std()
    # x comes from the groupby index so x and y are always aligned, instead of
    # relying on the order of a separately computed global.
    ax.errorbar(x=mean_by_target.index.tolist(), y=mean_by_target, yerr=std_by_target, label=label,
                fmt="o-", color=color, capsize=3, markersize=5)
ax.set_xlabel("Maximum allowed number of targets")
ax.set_ylabel("Unique targets")
ax.legend(loc="upper left")
ax.set_yticks(np.arange(1, 9, 1))
ax.set_ylim([0.5, 8.5])
fig.tight_layout()
# "extension" rather than "format", which would shadow the builtin notebook-wide
for extension in ["png", "svg"]:
    fig.savefig(os.path.join(plot_folder, f"unique_targets_used.{extension}"), dpi=300)