# Main AL Figures

In [None]:
import numpy as np
import string

In [None]:
# Three training-set-fraction schedules. Each holds five log-spaced fractions
# running from its starting fraction (0.01, 0.05 or 0.10) up to 1, rounded to
# three decimals.
train_sizes_001, train_sizes_005, train_sizes_010 = (
    np.round(np.logspace(np.log10(start), np.log10(1), 5), 3)
    for start in (0.01, 0.05, 0.10)
)

# One schedule per output line.
print(train_sizes_001, train_sizes_005, train_sizes_010, sep="\n")

In [None]:
# Union of the three schedules: de-duplicated, sorted ascending, and each value
# rounded to three decimals once more.
all_sizes = [
    round(size, 3)
    for size in sorted({*train_sizes_001, *train_sizes_005, *train_sizes_010})
]
print(all_sizes)

### Change of uncertainty over time in AL

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

al_results_dir = "../src/active_learning/al_results"

# Walk the results tree and record every sub-directory below the root.
results_dirs = []
for rootdir, dirs, files in os.walk(al_results_dir):
    for subdir in dirs:
        results_dirs.append(os.path.join(rootdir, subdir))

results_dirs = [s.split('/') for s in results_dirs]
# Keep only directories with 9 path parts (no "dropout" part) or 10 path parts
# (with a "dropout" part). The first 4 parts are the components of
# `al_results_dir` itself and are stripped off.
results_dirs = [l[4:] for l in results_dirs if ((len(l)==9 and "dropout" not in l) or (len(l)==10 and "dropout" in l))]

# Only these configurations get a figure in this cell.
main_fig_names = [
    "aav-sampled-cnn-esm-ensemble",
    "meltome-mixed_split-gp-esm-gp",
    "gb1-sampled-gp-esm-gp",
    "meltome-mixed_split-cnn-esm-evidential",
    "meltome-mixed_split-cnn-esm-ensemble",
    "gb1-sampled-cnn-esm-ensemble",
    "aav-sampled-cnn-esm-evidential",
    "gb1-sampled-cnn-esm-evidential",
]

# strategy -> (matplotlib format string, legend label). Insertion order is the
# plotting order, which determines the colour each strategy receives.
strategy_styles = {
    "random": ("o-", "Random"),
    "explorative_sample": ("o:", "Explorative Sample"),
    "explorative_greedy": ("o--", "Explorative Greedy"),
}

for dir_names in results_dirs:
    # "dropout" runs have one extra directory level below the "dropout" part.
    depth = 6 if dir_names[4] == "dropout" else 5
    results_dir = os.path.join(al_results_dir, *dir_names[:depth])

    fig_name = "-".join(dir_names[:5])
    if fig_name not in main_fig_names:
        continue
    print(fig_name)

    csv_list = [s for s in os.listdir(results_dir) if s.endswith(".csv")]
    if not csv_list:
        # Nothing to plot for this configuration. This is the only case that is
        # skipped; read/parse errors are no longer silently swallowed.
        print(f"  no CSV files in {results_dir}, skipping")
        continue

    # Read every CSV once and concatenate in a single call.
    df = pd.concat([pd.read_csv(os.path.join(results_dir, csv)) for csv in csv_list])
    df = df.sort_values(["Strategy", "Trial"])

    # restrict which training data sizes to plot
    df = df.loc[df["Train Data Ratio"].isin(all_sizes)]

    # Drop duplicates because some training data sizes are repeated if they happen to be in multiple ranges
    df = df.drop_duplicates(ignore_index=True)

    # This figure uses only the schedule that starts at 0.10.
    train_sizes = train_sizes_010
    df_ = df.loc[df["Train Data Ratio"].isin(train_sizes)].copy()

    print(sorted(df_["Train Data Ratio"].unique()))

    f, ax = plt.subplots(figsize=(3, 3))

    for strategy, (plotstyle, strategy_label) in strategy_styles.items():
        # Build the mask from df_ itself (not df), and aggregate only the plotted
        # column: averaging the whole frame fails on the string "Strategy"
        # column in pandas >= 2.0.
        uncertainty = (
            df_.loc[df_["Strategy"] == strategy]
            .groupby("Train Data Ratio")["MeanUncertainty"]
        )
        unc_mean = uncertainty.mean()
        unc_std = uncertainty.std()

        ax.plot(unc_mean.index, unc_mean, plotstyle, label=strategy_label)
        # Shaded band: mean +/- one standard deviation at each ratio.
        ax.fill_between(unc_mean.index, unc_mean - unc_std, unc_mean + unc_std, alpha=0.3)

    ax.set_xlabel("Train Data Ratio")
    ax.set_ylabel("Test Mean Uncertainty")

    f.tight_layout()

    # Make sure the output directory exists before writing the PDF.
    os.makedirs("al_figures/main9", exist_ok=True)
    f.savefig(f"al_figures/main9/{fig_name}_meanUnc.pdf",
              facecolor='white')
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures.
    plt.close(f)

#### Figure 5

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

al_results_dir = "../src/active_learning/al_results"

# Walk the results tree and record every sub-directory below the root.
results_dirs = []
for rootdir, dirs, files in os.walk(al_results_dir):
    for subdir in dirs:
        results_dirs.append(os.path.join(rootdir, subdir))

results_dirs = [s.split('/') for s in results_dirs]
# Keep only directories with 9 path parts (no "dropout" part) or 10 path parts
# (with a "dropout" part). The first 4 parts are the components of
# `al_results_dir` itself and are stripped off.
results_dirs = [l[4:] for l in results_dirs if ((len(l)==9 and "dropout" not in l) or (len(l)==10 and "dropout" in l))]

# Only these configurations get a figure in this cell.
main_fig_names = [
    "aav-sampled-cnn-esm-ensemble",
    "meltome-mixed_split-gp-esm-gp",
    "gb1-sampled-gp-esm-gp",
    "meltome-mixed_split-cnn-esm-evidential",
    "meltome-mixed_split-cnn-esm-ensemble",
    "gb1-sampled-cnn-esm-ensemble",
    "aav-sampled-cnn-esm-evidential",
    "gb1-sampled-cnn-esm-evidential",
]

# strategy -> (matplotlib format string, legend label). Insertion order is the
# plotting order, which determines the colour each strategy receives.
strategy_styles = {
    "random": ("o-", "Random"),
    "explorative_sample": ("o:", "Explorative Sample"),
    "explorative_greedy": ("o--", "Explorative Greedy"),
}

# Fixed y-axis range per figure-name prefix.
rho_ylim_by_prefix = {
    "meltome": (0.4, 0.7),
    "gb1": (0.5, 0.88),
    "aav": (0.6, 0.85),
}

for dir_names in results_dirs:
    # "dropout" runs have one extra directory level below the "dropout" part.
    depth = 6 if dir_names[4] == "dropout" else 5
    results_dir = os.path.join(al_results_dir, *dir_names[:depth])

    fig_name = "-".join(dir_names[:5])
    if fig_name not in main_fig_names:
        continue
    print(fig_name)

    csv_list = [s for s in os.listdir(results_dir) if s.endswith(".csv")]
    if not csv_list:
        # Nothing to plot for this configuration. This is the only case that is
        # skipped; read/parse errors are no longer silently swallowed.
        print(f"  no CSV files in {results_dir}, skipping")
        continue

    # Read every CSV once and concatenate in a single call.
    df = pd.concat([pd.read_csv(os.path.join(results_dir, csv)) for csv in csv_list])
    df = df.sort_values(["Strategy", "Trial"])

    # restrict which training data sizes to plot
    df = df.loc[df["Train Data Ratio"].isin(all_sizes)]

    # Drop duplicates because some training data sizes are repeated if they happen to be in multiple ranges
    df = df.drop_duplicates(ignore_index=True)

    # One figure per training-size schedule.
    for train_sizes in [train_sizes_001, train_sizes_005, train_sizes_010]:
        df_ = df.loc[df["Train Data Ratio"].isin(train_sizes)].copy()

        print(sorted(df_["Train Data Ratio"].unique()))

        f, ax = plt.subplots(figsize=(3, 3))

        for strategy, (plotstyle, strategy_label) in strategy_styles.items():
            # Build the mask from df_ itself (not df), and aggregate only the
            # plotted column: averaging the whole frame fails on the string
            # "Strategy" column in pandas >= 2.0.
            rho = (
                df_.loc[df_["Strategy"] == strategy]
                .groupby("Train Data Ratio")["TestRho"]
            )
            rho_mean = rho.mean()
            rho_std = rho.std()

            ax.plot(rho_mean.index, rho_mean, plotstyle, label=strategy_label)
            # Shaded band: mean +/- one standard deviation at each ratio.
            ax.fill_between(rho_mean.index, rho_mean - rho_std, rho_mean + rho_std, alpha=0.3)

        ax.set_xlabel("Train Data Ratio")
        ax.set_ylabel(r"Test $\rho$")

        for prefix, ylim in rho_ylim_by_prefix.items():
            if fig_name.startswith(prefix):
                ax.set_ylim(*ylim)
                break

        f.tight_layout()

        # File-name suffix: the schedule's smallest ratio with punctuation
        # removed (e.g. 0.01 -> "001").
        starting_point = str(min(train_sizes)).translate(str.maketrans('', '', string.punctuation))
        # Make sure the output directory exists before writing the PDF.
        os.makedirs("al_figures/main9", exist_ok=True)
        f.savefig(f"al_figures/main9/{fig_name}_{starting_point}.pdf",
                  facecolor='white')
        plt.show()
        # Release the figure so repeated iterations do not accumulate open figures.
        plt.close(f)

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

al_results_dir = "../src/active_learning/al_results"

# Walk the results tree and record every sub-directory below the root.
results_dirs = []
for rootdir, dirs, files in os.walk(al_results_dir):
    for subdir in dirs:
        results_dirs.append(os.path.join(rootdir, subdir))

results_dirs = [s.split('/') for s in results_dirs]
# Keep only directories with 9 path parts (no "dropout" part) or 10 path parts
# (with a "dropout" part). The first 4 parts are the components of
# `al_results_dir` itself and are stripped off.
results_dirs = [l[4:] for l in results_dirs if ((len(l)==9 and "dropout" not in l) or (len(l)==10 and "dropout" in l))]

# Only these configurations are reported in this cell.
main_fig_names = [
    "aav-sampled-cnn-esm-ensemble",
    "meltome-mixed_split-gp-esm-gp",
    "gb1-sampled-gp-esm-gp",
    "meltome-mixed_split-cnn-esm-evidential",
    "meltome-mixed_split-cnn-esm-ensemble",
    "gb1-sampled-cnn-esm-ensemble",
    "aav-sampled-cnn-esm-evidential",
    "gb1-sampled-cnn-esm-evidential",
]

# NumPy 2.0 renamed `trapz` to `trapezoid`; use whichever this install provides.
trapezoid = getattr(np, "trapezoid", None) or np.trapz

# The single strategy whose area under the curve is reported.
strategy = "explorative_greedy"
strategy_label = "Explorative Greedy"

for dir_names in results_dirs:
    # "dropout" runs have one extra directory level below the "dropout" part.
    depth = 6 if dir_names[4] == "dropout" else 5
    results_dir = os.path.join(al_results_dir, *dir_names[:depth])

    fig_name = "-".join(dir_names[:5])
    if fig_name not in main_fig_names:
        continue
    print(fig_name)

    csv_list = [s for s in os.listdir(results_dir) if s.endswith(".csv")]
    if not csv_list:
        # Nothing to report for this configuration. This is the only case that
        # is skipped; read/parse errors are no longer silently swallowed.
        print(f"  no CSV files in {results_dir}, skipping")
        continue

    # Read every CSV once and concatenate in a single call.
    df = pd.concat([pd.read_csv(os.path.join(results_dir, csv)) for csv in csv_list])
    df = df.sort_values(["Strategy", "Trial"])

    # restrict which training data sizes to use
    df = df.loc[df["Train Data Ratio"].isin(all_sizes)]

    # Drop duplicates because some training data sizes are repeated if they happen to be in multiple ranges
    df = df.drop_duplicates(ignore_index=True)

    # Only the schedule that starts at 0.10 is used here.
    train_sizes = train_sizes_010
    df_ = df.loc[df["Train Data Ratio"].isin(train_sizes)].copy()

    # Mean TestRho per training ratio for the chosen strategy. Build the mask
    # from df_ itself (not df), and aggregate only the needed column: averaging
    # the whole frame fails on the string "Strategy" column in pandas >= 2.0.
    rho_mean = (
        df_.loc[df_["Strategy"] == strategy]
        .groupby("Train Data Ratio")["TestRho"]
        .mean()
    )

    # calculate area under curve of mean TestRho vs. Train Data Ratio
    auc = trapezoid(rho_mean.to_numpy(), x=rho_mean.index.to_numpy())
    print(f"{strategy_label} AUC: {auc}")

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" does not continue
# into the section below — confirm before removing. A bare `raise` with no
# active exception also raises RuntimeError, but with the opaque message
# "No active exception to reraise"; an explicit message is used instead.
raise RuntimeError("Stopping here; run the cells below manually to regenerate all AL figures.")

# All AL Figures

#### Figures S5-S57

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

al_results_dir = "../src/active_learning/al_results"

# Walk the results tree and record every sub-directory below the root.
results_dirs = []
for rootdir, dirs, files in os.walk(al_results_dir):
    for subdir in dirs:
        results_dirs.append(os.path.join(rootdir, subdir))

results_dirs = [s.split('/') for s in results_dirs]
# Keep only directories with 9 path parts (no "dropout" part) or 10 path parts
# (with a "dropout" part). The first 4 parts are the components of
# `al_results_dir` itself and are stripped off.
results_dirs = [l[4:] for l in results_dirs if ((len(l)==9 and "dropout" not in l) or (len(l)==10 and "dropout" in l))]

# strategy -> (matplotlib format string, legend label). Insertion order is the
# plotting order, which determines the colour each strategy receives.
strategy_styles = {
    "random": ("o-", "Random"),
    "explorative_sample": ("o:", "Explorative Sample"),
    "explorative_greedy": ("o--", "Explorative Greedy"),
}

# (CSV column, y-axis label) for the four panels, in row-major panel order.
metric_panels = [
    ("TestRho", "Test Spearman Rank Correlation"),
    ("TestRMSE", "Test RMSE"),
    ("TestMAE", "Test MAE"),
    ("TestR2", "Test R2"),
]
metric_columns = [column for column, _ in metric_panels]

for dir_names in results_dirs:
    print(dir_names)

    # "dropout" runs have one extra directory level below the "dropout" part.
    depth = 6 if dir_names[4] == "dropout" else 5
    results_dir = os.path.join(al_results_dir, *dir_names[:depth])

    # Sorted so that "last" in the de-duplication below is deterministic.
    csv_list = sorted([s for s in os.listdir(results_dir) if s.endswith(".csv")])
    if not csv_list:
        # Nothing to plot for this configuration. This is the only case that is
        # skipped; read/parse errors are no longer silently swallowed.
        print(f"  no CSV files in {results_dir}, skipping")
        continue

    # Read every CSV once and concatenate in a single call.
    df = pd.concat([pd.read_csv(os.path.join(results_dir, csv)) for csv in csv_list])
    df = df.sort_values(["Strategy", "Trial"])

    # Don't use old / duplicate results, only use most recent
    df = df.drop_duplicates(subset=["Strategy", "Trial", "Train Data Ratio"], keep="last")

    f, axes = plt.subplots(2, 2, figsize=(10, 10))

    for strategy, (plotstyle, strategy_label) in strategy_styles.items():
        # Aggregate only the plotted metric columns: averaging the whole frame
        # fails on the string "Strategy" column in pandas >= 2.0.
        grouped = df.loc[df["Strategy"] == strategy].groupby("Train Data Ratio")[metric_columns]
        df_mean = grouped.mean()
        df_std = grouped.std()

        for ax, (column, _) in zip(axes.flat, metric_panels):
            ax.plot(df_mean.index, df_mean[column], plotstyle, label=strategy_label)
            # Shaded band: mean +/- one standard deviation at each ratio.
            ax.fill_between(
                df_mean.index,
                df_mean[column] - df_std[column],
                df_mean[column] + df_std[column],
                alpha=0.3,
            )

    # Label each panel once, after all strategies have been drawn.
    for ax, (_, ylabel) in zip(axes.flat, metric_panels):
        ax.set_xlabel("Train Data Ratio")
        ax.set_ylabel(ylabel)
        ax.legend()

    f.tight_layout()
    # Make sure the output directory exists before writing the PDF.
    os.makedirs("al_figures", exist_ok=True)
    f.savefig(f"al_figures/{dir_names[0]}-{dir_names[1]}-{dir_names[2]}-{dir_names[3]}-{dir_names[4]}.pdf",
              facecolor='white')
    plt.show()
    # Release the figure so the loop does not accumulate open 10x10 figures.
    plt.close(f)