In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Root directory joined with target / model_type / exp to locate per-run outputs.
data_dir = "data"

In [None]:
from sklearn.preprocessing import scale, minmax_scale
from sklearn.metrics import root_mean_squared_error, ndcg_score
def calc_test(true_scores, pred_scores, k=10):
    """Score predictions against reference values with rank-based metrics.

    Args:
        true_scores: 1-D sequence of reference scores.
        pred_scores: 1-D sequence of predicted scores, same length as true_scores.
        k: rank cut-off used for NDCG@k.

    Returns:
        dict with keys 'spearman' (Spearman rank correlation, NaN when it is
        undefined) and 'ndcg' (NDCG@k).
    """
    rho, _ = stats.spearmanr(true_scores, pred_scores)

    # NDCG@k: ndcg_score takes 2-D (n_queries, n_items) inputs, so both vectors
    # are wrapped as a single query; the true scores are rescaled to [0, 5]
    # along that query so the relevance values are non-negative.
    std_tgts = minmax_scale([true_scores], feature_range=(0, 5), axis=1)
    ndcg_val = ndcg_score(std_tgts, [pred_scores], k=k)

    # RMSE is intentionally not reported, so it is not computed either.
    return {
        'spearman': rho,
        'ndcg': ndcg_val,
    }

import numpy as np
from typing import Union, Tuple

def calculate_mean_similarity(latent_matrix: np.ndarray):
    """Return the mean pairwise cosine similarity between the rows of a matrix.

    Self-similarities (the diagonal) are excluded from the average.

    Args:
        latent_matrix: 2-D array of shape (N, H), one vector per row, N >= 2.

    Returns:
        Sum of the off-diagonal cosine similarities divided by N * (N - 1).

    Raises:
        TypeError: latent_matrix is not a numpy.ndarray.
        ValueError: latent_matrix is not 2-dimensional or has fewer than 2 rows.
    """
    # Input validation
    if not isinstance(latent_matrix, np.ndarray):
        raise TypeError("latent_matrix must be numpy.ndarray")
    if latent_matrix.ndim != 2:
        raise ValueError("latent_matrix must be 2-dimensional")

    n_rows = latent_matrix.shape[0]
    if n_rows < 2:
        raise ValueError("Number of samples must be greater than 1")

    # Row-wise L2 norms; a zero norm is swapped for a tiny value so the
    # division below cannot divide by zero (such rows stay all-zero).
    row_norms = np.linalg.norm(latent_matrix, axis=1, keepdims=True)
    safe_norms = np.where(row_norms == 0, 1e-8, row_norms)
    unit_rows = latent_matrix / safe_norms

    # Cosine similarity matrix, with self-similarity zeroed out.
    cosine = np.dot(unit_rows, unit_rows.T)
    np.fill_diagonal(cosine, 0)

    # Average over the N * (N - 1) ordered pairs of distinct rows.
    return cosine.sum() / (n_rows * (n_rows - 1))

import logomaker
def draw_logo(seqs, ax=None):
    """Draw a sequence logo for one or more sequences.

    Args:
        seqs: a single sequence string, or a list of sequences handed to
            logomaker.alignment_to_matrix.
        ax: optional Axes forwarded to logomaker.Logo.
    """
    sequences = [seqs] if isinstance(seqs, str) else seqs
    counts = logomaker.alignment_to_matrix(sequences)

    logo = logomaker.Logo(
        counts,
        shade_below=.5,
        fade_below=.5,
        color_scheme='NajafabadiEtAl2017',
        ax=ax,
    )

    # Hide the frame and the y ticks so only the letters remain.
    for side in ('right', 'top', 'bottom', 'left'):
        logo.ax.spines[side].set_visible(False)
    logo.ax.set_yticks([])

def get_ddg(path):
    """Load a results CSV and return the lowest ddG score per case.

    Args:
        path: CSV with 'scored_state', 'case_name' and 'total_score' columns.

    Returns:
        pandas Series indexed by case_name (sorted) holding the minimum
        total_score among the rows whose scored_state is "ddG".
    """
    table = pd.read_csv(path)
    ddg_rows = table.loc[table["scored_state"] == "ddG"]
    best_per_case = ddg_rows.groupby("case_name")["total_score"].min()
    return best_per_case.sort_index()

def mutstr_to_mutseq(mutstr, wt_seq, offset=0, indel_indices=None):
    """Apply a comma-separated mutation string to a wild-type sequence.

    Each token is read as ``<wt><chain><position><mut>``: the first character
    is the expected wild-type residue, the second character is skipped (the
    chain letter written by mutseq_to_mutstr), the last character is the new
    residue and everything in between is the integer position label.

    Args:
        mutstr: mutation string such as "SB95A,GB99Y"; "" means no mutation.
        wt_seq: wild-type sequence.
        offset: position label of the first residue when indel_indices is None.
        indel_indices: optional explicit position label for every residue of
            wt_seq, in sequence order; overrides offset.

    Returns:
        The mutated sequence as a string.

    Raises:
        AssertionError: a token's wild-type residue does not match the sequence.
        KeyError: a token's position label is unknown.
    """
    if indel_indices is None:
        label_to_index = {i + offset: i for i in range(len(wt_seq))}
    else:
        label_to_index = {label: i for i, label in enumerate(indel_indices)}

    residues = list(wt_seq)
    for mutation in mutstr.split(','):
        if not mutation:
            # "" (what mutseq_to_mutstr returns for an unmutated sequence)
            # splits into one empty token; there is nothing to apply.
            continue
        wt, mut = mutation[0], mutation[-1]
        pos = label_to_index[int(mutation[2:-1])]
        assert wt == residues[pos], (
            f"mutation {mutation!r} expects {wt!r} but sequence has {residues[pos]!r}"
        )
        residues[pos] = mut
    return ''.join(residues)

def mutseq_to_mutstr(mutseq, wt_seq, chain, offset=0, indel_indices=None):
    """Describe how a sequence differs from the wild type as a mutation string.

    Args:
        mutseq: mutated sequence, same length as wt_seq.
        wt_seq: wild-type sequence.
        chain: chain identifier inserted after the wild-type residue.
        offset: position label of the first residue when indel_indices is None.
        indel_indices: optional explicit position label per residue; overrides
            offset.

    Returns:
        Comma-separated ``<wt><chain><position><mut>`` tokens in sequence
        order; "" when the two sequences are identical.

    Raises:
        AssertionError: mutseq and wt_seq differ in length.
    """
    if indel_indices is None:
        index_to_label = {i: i + offset for i in range(len(wt_seq))}
    else:
        index_to_label = dict(enumerate(indel_indices))

    assert len(mutseq)==len(wt_seq)

    return ','.join(
        f"{wt}{chain}{index_to_label[i]}{mut}"
        for i, (wt, mut) in enumerate(zip(wt_seq, mutseq))
        if wt != mut
    )


In [None]:
import pandas as pd
from pygmo import hypervolume

def greedy_hypervolume_subset(points, n, ref_point):
    """Greedily pick up to ``n`` rows of ``points`` that maximise hypervolume.

    At every step the remaining row that yields the largest hypervolume when
    added to the current selection is appended to the selection.

    Args:
        points: 2-D numpy array, one objective vector per row (rows are
            gathered with an integer list, so a plain list will not work).
        n: number of rows to select; capped at len(points).
        ref_point: reference point handed to pygmo's hypervolume.compute.

    Returns:
        (selected, hv): the chosen row indices in selection order and the
        hypervolume of that selection (0.0 when nothing was selected).

    Note:
        tqdm is imported in a later cell of this notebook, so this function
        can only be called after that import has run.
    """
    selected = []
    remaining = list(range(len(points)))
    selected_hv = 0.0

    # Cap the number of rounds at the number of available rows: once the pool
    # is empty there is nothing left to evaluate, and the reported hypervolume
    # must stay the one reached by the last real selection.
    for _ in tqdm(range(min(n, len(remaining)))):
        best_hv = -float('inf')
        best_idx = None

        for idx in remaining:
            # Hypervolume of the current selection plus this candidate.
            candidate_hv = hypervolume(points[selected + [idx]]).compute(ref_point)
            if candidate_hv > best_hv:
                best_hv = candidate_hv
                best_idx = idx

        if best_idx is None:
            # No candidate compared greater than -inf (e.g. NaN volumes);
            # further rounds would repeat the same outcome.
            break

        selected.append(best_idx)
        remaining.remove(best_idx)
        selected_hv = best_hv

    return selected, selected_hv

def normalize_score(score):
    """Rescale a pandas Series so its 5% quantile maps to 0 and its 95% quantile to ~1.

    Values outside that quantile range are not clipped; a 1e-10 term in the
    denominator keeps the division defined when both quantiles coincide.
    """
    low = score.quantile(0.05)
    high = score.quantile(0.95)
    return (score - low) / (high - low + 1e-10)


In [None]:
# Sequence registered for each dataset key (presumably the CDR-H3 loop of the
# parent antibody -- confirm).
h3_dict = {
    "4D5_HER2_fitness_1N8Z": "SRWGGDGFYAMDY",
    "5A12_Ang2_fitness_4ZFG": "ARFVFFLPYAMDY",
    "5A12_VEGF_fitness_4ZFF": "ARFVFFLPYAMDY",
}

# Experiment identifier -> human-readable label.
exp2name = {
    "greedy": "Greedy",
    "greedy_unbias": "Greedy(no bias)",
    "greedy_unbias_offline": "Greedy-Offline(no bias)",
    "greedy_offline": "Greedy-Offline",
    "ucb": "UCB",
    "ucb_unbias": "UCB(no bias)",
    "ga": "GA",
    "ucb_offline": "UCB-Offline",
    "greedy_multi": "Greedy(Multi-Objective)",
    "ucb_multi": "UCB(Multi-Objective)",
    "ucb_multi_offline": "UCB-Offline(Multi-Objective)",
    "greedy_multi_offline": "Greedy-Offline(Multi-Objective)",
    "greedy_sum": "Greedy(Sum)",
    "greedy_sum_offline": "Greedy-Offline(Sum)",
}


In [None]:
# Experiment identifiers of interest for the dual-target runs (keys of exp2name).
exps_dual = [
    "greedy",
    "greedy_multi",
]
# Font size setting (presumably for figures drawn later -- confirm).
font_size = 15


In [None]:
# Job table; its "CONFIG" column lists the YAML config files that the
# run-loading cell opens one by one.
jobdf = pd.read_csv("jobs/job_dual.csv")

In [None]:
import yaml
import os

In [None]:
# Per-objective scale factors: the run-loading cell zips these with score_cols
# and multiplies each normalized score column by its factor.
# NOTE(review): distinct from the later `ref_point` list ([2, 2, 1]) that is
# passed to the hypervolume selection -- confirm the differing third value is intended.
ref_points = [2, 2, 2]

In [None]:
import os
cycles = {}
dfs = []
configs = []
# Objective columns that are scaled by ref_points and summed into sum_score.
score_cols = ["flxddg_0_std", "flxddg_1_std", "ablang2_perplexity_std"]
for confpath in jobdf["CONFIG"]:
    with open(confpath) as f:
        data = yaml.safe_load(f)

    # data["data_dir"] is read as "<root>/<target>/<model_type>/<exp>[/...]".
    path_parts = data["data_dir"].split("/")
    target = path_parts[1]
    model_type = path_parts[2]
    exp = path_parts[3]

    # One training table per objective, taken from the "9" sub-directory
    # (presumably the final cycle, given MAXCYCLE is 10 -- confirm).
    dfs_ = []
    for du_target in ["target_0", "target_1"]:
        df = pd.read_csv(os.path.join(data_dir, "..", data["data_dir"], "9", du_target, "train_data", "training_data.csv"))
        df["target"] = target
        df["model_type"] = model_type
        dfs_.append(df)

    # NOTE(review): the two tables are combined by row index, which assumes
    # both CSVs list the same sequences in the same order -- confirm.
    df = dfs_[0].copy()
    df["DMS_score_0"] = dfs_[0]["DMS_score"]
    df["DMS_score_1"] = dfs_[1]["DMS_score"]
    df = df.drop("DMS_score", axis=1)
    # Sign-flipped scores, so that lower is better for the normalized objectives.
    df["flxddg_0"] = -dfs_[0]["DMS_score"]
    df["flxddg_1"] = -dfs_[1]["DMS_score"]

    df["mutations"] = df["mutations"].fillna("")
    # NOTE(review): mutations are always expressed relative to the
    # "4D5_HER2_fitness_1N8Z" entry of h3_dict, whatever the run's target is
    # -- confirm this is intended.
    df["mutations_wt"] = df["mutseq"].apply(lambda x: mutseq_to_mutstr(x, h3_dict["4D5_HER2_fitness_1N8Z"], "B", offset=0))
    df["exp"] = exp

    # Quantile-normalized objectives (used for the hypervolume selection).
    df["flxddg_0_std"] = normalize_score(df["flxddg_0"])
    df["flxddg_1_std"] = normalize_score(df["flxddg_1"])
    df["ablang2_perplexity_std"] = normalize_score(df["ablang2_perplexity"])
    df["IP_seq_std"] = normalize_score(-df["IP_seq"])

    # The loop variable is deliberately NOT called `ref_point`: that name is a
    # module-level list used by the hypervolume cells, and rebinding it here
    # to a scalar would break them whenever this cell is re-run.
    for score_col, scale_factor in zip(score_cols, ref_points):
        df[score_col] *= scale_factor

    df["#Mutation"] = df["mutations_wt"].apply(lambda x: len(x.split(",")) if x != "" else 0)
    df["sum_score"] = df[score_cols].sum(axis=1)
    df["sum_score_2"] = df[score_cols[:2]].sum(axis=1)
    dfs.append(df)
    configs.append({
        "target": target,
        "MAXCYCLE": 10,
        "model_type": model_type,
        "exp": exp,
        "data_dir": data["data_dir"]
    })
len(dfs)

In [None]:
# Objective slot ("target_0" / "target_1") -> name of its flex-ddG test set directory.
test_targets = {
    "target_0": "5A12_Ang2_fitness_4ZFG",
    "target_1": "5A12_VEGF_fitness_4ZFF",
}

In [None]:
# Per "<target>_<mode>" key: minimum ddG per case and the sampled mutations.
flex_ddg_dfs = {}
sampled_seq_dfs = {}
flex_ddg_df_alls = {}
for target in test_targets.values():
    for mode in ["bias", "unbias"]:
        # get_ddg performs the same "ddG rows -> min total_score per case_name
        # -> sort by case_name" reduction that used to be spelled out here.
        flex_ddg_df = get_ddg(f"flexddgs/{target}/{mode}/outputs-results.csv")
        flex_ddg_dfs[target+"_"+mode] = flex_ddg_df
        sampled_seq_dfs[target+"_"+mode] = pd.read_csv(f"flexddgs/{target}/{mode}/sampled_mutations.csv", index_col=0)

# Test sets: the "bias" sampled mutations with the sign-flipped ddG as DMS_score.
test_dfs = {target: pd.read_csv(f"flexddgs/{target}/bias/sampled_mutations.csv") for target in test_targets.values()}
for target in test_dfs:
    # NOTE(review): scores are attached positionally (.values), which assumes
    # the rows of sampled_mutations.csv are in the same order as the sorted
    # case_name index of the ddG series -- confirm (string sorting would put
    # "10" before "2").
    test_dfs[target]["DMS_score"] = - flex_ddg_dfs[target+"_bias"].values


In [None]:
# Reference point for the hypervolume selection, one bound per entry of score_cols.
ref_point = [2, 2, 1]
# Normalized objective columns; the first two are the per-target flex-ddG scores.
score_cols = [
    "flxddg_0_std",
    "flxddg_1_std",
    "ablang2_perplexity_std",
]


In [None]:
from fast_pareto import is_pareto_front, nondominated_rank


In [None]:
import yaml
from tqdm import tqdm

# Number of sequences kept by every selection strategy.
N = 40


def _stack_cycles(frames_by_cycle):
    """Concatenate {cycle: frame} into one frame with 'CYCLE' and 'index' columns.

    'CYCLE' is the dict key and 'index' is the row label the row had in its
    source frame.
    """
    stacked = pd.concat(frames_by_cycle)
    stacked.index.names = ["CYCLE", "index"]
    return stacked.reset_index()


def _within_reference(frame, cols, ref):
    """Return a copy of frame keeping rows with frame[col] <= bound for every (col, bound)."""
    kept = frame.copy()
    for col, bound in zip(cols, ref):
        kept = kept[kept[col] <= bound]
    return kept


def _pareto_ranked_by_cycle(frame, cols, n_cycles, n_select, max_perplexity=None):
    """Per cycle, keep the n_select rows with the lowest non-dominated rank on cols.

    The pool for key c + 1 is every row with cycle <= c, optionally restricted
    to rows whose 'ablang2_perplexity' is below max_perplexity.
    """
    picks = {}
    for cycle in range(n_cycles):
        pool = frame[frame["cycle"] <= cycle]
        if max_perplexity is not None:
            pool = pool[pool["ablang2_perplexity"] < max_perplexity]
        ranks = nondominated_rank(pool[cols].values)
        picks[cycle + 1] = pool.iloc[np.argsort(ranks)][:n_select]
    return picks


def _hypervolume_by_cycle(frame, cols, ref, n_cycles, n_select):
    """Per cycle, greedily pick n_select rows (from cycles <= that cycle) by hypervolume."""
    picks = {}
    for cycle in range(n_cycles):
        pool = frame[frame["cycle"] <= cycle]
        chosen, _ = greedy_hypervolume_subset(pool[cols].values, n_select, ref)
        picks[cycle + 1] = pool.iloc[chosen]
    return picks


all_df_merges = []
filter_df_merges = []
top_df_merges_0 = []
top_df_merges_1 = []
non_dominated_df_merges = []
hv_df_merges = []
dual_df_merges = []
sum_df_merges = []
cycle_df_merges = []
for df, conf in zip(dfs, configs):
    n_cycles = conf["MAXCYCLE"]

    # Cumulative pools: key c + 1 holds every row with cycle <= c.
    upto = {cycle + 1: df[df["cycle"] <= cycle] for cycle in range(n_cycles)}
    all_df_merges.append(_stack_cycles(upto))

    # Rows belonging to exactly one cycle.
    cycle_df_merges.append(_stack_cycles(
        {cycle + 1: df[df["cycle"] == cycle] for cycle in range(n_cycles)}
    ))

    # Best N per objective (higher DMS score first).
    top_df_merges_0.append(_stack_cycles(
        {key: pool.sort_values("DMS_score_0", ascending=False).head(N) for key, pool in upto.items()}
    ))
    top_df_merges_1.append(_stack_cycles(
        {key: pool.sort_values("DMS_score_1", ascending=False).head(N) for key, pool in upto.items()}
    ))

    # Best N by the summed, scaled objectives (lower sum first).
    sum_df_merges.append(_stack_cycles(
        {key: pool.sort_values("sum_score", ascending=True).head(N) for key, pool in upto.items()}
    ))

    # Pareto ranking on the two flex-ddG objectives, with and without the
    # perplexity < 10 pre-filter.
    filter_df_merges.append(_stack_cycles(
        _pareto_ranked_by_cycle(df, score_cols[:2], n_cycles, N, max_perplexity=10)
    ))
    non_dominated_df_merges.append(_stack_cycles(
        _pareto_ranked_by_cycle(df, score_cols[:2], n_cycles, N)
    ))

    # Greedy hypervolume selection on all three objectives; rows beyond the
    # reference point in any objective are dropped first.
    hv_df_merges.append(_stack_cycles(
        _hypervolume_by_cycle(_within_reference(df, score_cols, ref_point), score_cols, ref_point, n_cycles, N)
    ))

    # Same, restricted to the two flex-ddG objectives.
    dual_df_merges.append(_stack_cycles(
        _hypervolume_by_cycle(
            _within_reference(df, score_cols[:2], ref_point[:2]), score_cols[:2], ref_point[:2], n_cycles, N
        )
    ))

# One long frame per strategy across all runs.
top_df_merge_cat_0 = pd.concat(top_df_merges_0)
top_df_merge_cat_1 = pd.concat(top_df_merges_1)
all_df_merge_cat = pd.concat(all_df_merges)
hv_df_merge_cat = pd.concat(hv_df_merges)
dual_df_merge_cat = pd.concat(dual_df_merges)
sum_df_merge_cat = pd.concat(sum_df_merges)
filter_df_merge_cat = pd.concat(filter_df_merges)
non_dominated_df_merge_cat = pd.concat(non_dominated_df_merges)
cycle_df_merge_cat = pd.concat(cycle_df_merges)

def _diversity_table(selection, emb, conf):
    """Summarise one selection per cycle: embedding diversity and mutation counts.

    Args:
        selection: frame with 'CYCLE', 'index' and '#Mutation' columns.
        emb: embedding matrix whose rows are addressed by the 'index' column
            (assumes row i of emb belongs to row i of the run's training
            table -- confirm).
        conf: run config providing 'MAXCYCLE', 'target', 'model_type', 'exp'.

    Returns:
        DataFrame with one row per cycle 1..MAXCYCLE and columns CYCLE,
        Diversity (1 - mean pairwise cosine similarity), mean_mutation_num,
        median_mutation_num, target, model_type, exp.
    """
    rows = []
    for cycle in range(1, conf["MAXCYCLE"] + 1):
        members = selection[selection["CYCLE"] == cycle]
        rows.append({
            "CYCLE": cycle,
            "Diversity": 1 - calculate_mean_similarity(emb[members["index"].values]),
            "mean_mutation_num": members["#Mutation"].mean(),
            "median_mutation_num": members["#Mutation"].median(),
        })
    table = pd.DataFrame(
        rows, columns=["CYCLE", "Diversity", "mean_mutation_num", "median_mutation_num"]
    )
    table["target"] = conf["target"]
    table["model_type"] = conf["model_type"]
    table["exp"] = conf["exp"]
    return table


all_divs = []
top_divs_0 = []
top_divs_1 = []
hv_divs = []
dual_divs = []
sum_divs = []
filter_divs = []
non_dominated_divs = []
cycle_divs = []

# (per-run selection frames, list receiving that strategy's diversity table)
_diversity_jobs = [
    (all_df_merges, all_divs),
    (top_df_merges_0, top_divs_0),
    (top_df_merges_1, top_divs_1),
    (hv_df_merges, hv_divs),
    (dual_df_merges, dual_divs),
    (sum_df_merges, sum_divs),
    (filter_df_merges, filter_divs),
    (non_dominated_df_merges, non_dominated_divs),
    (cycle_df_merges, cycle_divs),
]

for run_idx, conf in enumerate(configs):
    # Embeddings are read from the run's "9"/target_0 training directory.
    input_dir = os.path.join(data_dir, conf["target"], conf["model_type"], conf["exp"], "9", "target_0", "train_data")
    emb = np.load(os.path.join(input_dir, "embedding.npy"))

    for selection_frames, tables in _diversity_jobs:
        tables.append(_diversity_table(selection_frames[run_idx], emb, conf))

all_divs_cat = pd.concat(all_divs, ignore_index=True)
top_divs_cat_0 = pd.concat(top_divs_0, ignore_index=True)
top_divs_cat_1 = pd.concat(top_divs_1, ignore_index=True)
hv_divs_cat = pd.concat(hv_divs, ignore_index=True)
dual_divs_cat = pd.concat(dual_divs, ignore_index=True)
sum_divs_cat = pd.concat(sum_divs, ignore_index=True)
filter_divs_cat = pd.concat(filter_divs, ignore_index=True)
non_dominated_divs_cat = pd.concat(non_dominated_divs, ignore_index=True)
cycle_divs_cat = pd.concat(cycle_divs, ignore_index=True)


In [None]:
# Only runs whose config "target" is in this list are scored on the test sets;
# any other target is printed and skipped.
targets = ["5A12_dual", "5A12_dual_weak"]

In [None]:
def _test_metrics(conf, cycle, du_target):
    """Score one run's test-set predictions for one cycle and one objective.

    Loads "test_inference_bias.npy" from the run's <cycle>/<du_target>/train_data
    directory, pairs it with the matching test set and returns the calc_test
    metrics together with the run identifiers (CYCLE is 1-based).
    """
    pred_dir = os.path.join(
        data_dir, conf["target"], conf["model_type"], conf["exp"], str(cycle), du_target, "train_data"
    )
    test_df = test_dfs[test_targets[du_target]].copy()
    # Assigning as a column also checks that the prediction count matches the test set.
    test_df["Pred"] = np.load(os.path.join(pred_dir, "test_inference_bias.npy"))
    return {
        **calc_test(test_df["DMS_score"], test_df["Pred"]),
        "CYCLE": cycle + 1,
        "target": conf["target"],
        "model_type": conf["model_type"],
        "exp": conf["exp"],
    }


all_test_scores_0 = []
all_test_scores_1 = []
for conf in configs:
    target = conf["target"]
    if target not in targets:
        print(target)
        continue
    for cycle in range(conf["MAXCYCLE"]):
        all_test_scores_0.append(_test_metrics(conf, cycle, "target_0"))
        all_test_scores_1.append(_test_metrics(conf, cycle, "target_1"))

all_test_scores_cat_0 = pd.DataFrame(all_test_scores_0)
all_test_scores_cat_1 = pd.DataFrame(all_test_scores_1)

# An undefined Spearman correlation (NaN) is reported as 0.
all_test_scores_cat_0["spearman"] = all_test_scores_cat_0["spearman"].fillna(0)
all_test_scores_cat_1["spearman"] = all_test_scores_cat_1["spearman"].fillna(0)

# Objective-suffixed copies of the metric columns.
all_test_scores_cat_0["spearman_0"] = all_test_scores_cat_0["spearman"]
all_test_scores_cat_1["spearman_1"] = all_test_scores_cat_1["spearman"]

all_test_scores_cat_0["ndcg_0"] = all_test_scores_cat_0["ndcg"]
all_test_scores_cat_1["ndcg_1"] = all_test_scores_cat_1["ndcg"]

In [None]:
# Save the per-cycle frames (rows whose "cycle" equals each cycle, all runs
# concatenated) to the current working directory.
# NOTE(review): the other exports go under ../results/flexddg_online/dual/ --
# confirm this file is meant to live next to the notebook.
cycle_df_merge_cat.to_csv("cycle_df_merge_cat_dual.csv",index=False)

In [None]:
# Export the dual-target results. All four files belong to the same results
# directory; the VEGF test scores previously used a path without the
# "results/" segment and therefore landed in (or failed on) a different folder.
results_dir = "../results/flexddg_online/dual"
sum_df_merge_cat.to_csv(f"{results_dir}/sum_results.csv", index=False)
all_test_scores_cat_0.to_csv(f"{results_dir}/all_results_test_Ang2.csv", index=False)
all_test_scores_cat_1.to_csv(f"{results_dir}/all_results_test_VEGF.csv", index=False)
all_df_merge_cat.to_csv(f"{results_dir}/all_results.csv", index=False)