<h1>Extracting results</h1>
<p>Here we extract the data required to plot results from the output files generated by the edge-prediction experiments (the <code>experiments/Edge Prediction</code> directory).</p>

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None 
from glob import glob
import json

<h3>collecting data</h3>
<p>The following function derives the feature name reported for each row, recodes some True/False columns as string labels (<code>DANS</code>/<code>UNS</code>), and returns a subset of columns as a DataFrame, which is later written to a CSV file that can be used for plotting.</p>

In [None]:
def extract_core_results(df):
    """Reduce a raw edge-prediction results table to the columns used for plotting.

    The input frame is left untouched: every derived column is added to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw results. Must provide ``features_names``,
        ``use_scale_free_distribution``,
        ``('model_parameters', 'edge_features')``,
        ``('features_parameters', 'use_scale_free_distribution')``,
        ``('model_parameters', 'use_scale_free_distribution')`` and every
        column named in ``columns`` below that is not derived here.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the plotting columns. Rows whose features flag
        is present but differs from the model flag are dropped; the index of the
        surviving rows is preserved.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    features_flag_column = "('features_parameters', 'use_scale_free_distribution')"
    model_flag_column = "('model_parameters', 'use_scale_free_distribution')"

    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()

    # When a model was given edge features, report the row under the first edge
    # feature instead of the original feature name. The cell holds a list written
    # with single quotes, so they are swapped for double quotes before JSON parsing.
    df["features_names"] = [
        json.loads(edge_feature.replace("'", "\""))[0] if pd.notna(edge_feature) else feature_name
        for feature_name, edge_feature in zip(
            df["features_names"],
            df["('model_parameters', 'edge_features')"],
        )
    ]

    # Recode the evaluation-time flag as a string label.
    df["evaluation_negative_sampling_method"] = [
        "DANS" if use_scale_free_distribution else "UNS"
        for use_scale_free_distribution in df["use_scale_free_distribution"]
    ]

    # Keep rows where features and model agree on the flag, plus rows whose
    # features carry no flag at all.
    consistent = [
        pd.isna(features_flag) or features_flag == model_flag
        for features_flag, model_flag in zip(df[features_flag_column], df[model_flag_column])
    ]
    df = df[consistent].copy()

    # NaN is truthy in Python, so a missing features flag must not be tested
    # directly: fall back to the model flag for those rows. For all other rows
    # the two flags are equal thanks to the filter above.
    # NOTE(review): if the model flag is missing too, the label is still "DANS";
    # confirm whether that case can occur in the data.
    df["model_negative_examples"] = [
        "DANS" if (model_flag if pd.isna(features_flag) else features_flag) else "UNS"
        for features_flag, model_flag in zip(df[features_flag_column], df[model_flag_column])
    ]

    columns = [
        "evaluation_mode", "features_names", "evaluation_negative_sampling_method",
        "model_negative_examples", "accuracy", "balanced_accuracy",
        "false_discovery_rate", "matthews_correlation_coefficient",
        "precision", "recall", "specificity", "f1_score", "auroc", "auprc",
    ]
    return df[columns].copy()

In [None]:
# Root directory of the experiment outputs. The loading cells below search
# "<indirectory>/experiments/Edge Prediction/..." underneath it.
indirectory = input("Directory containing the 'experiments' folder: ")

<h3>STRING Protein-Protein Association data</h3>

In [None]:
# Collect every per-holdout result file for the HomoSapiens graph. The paths are
# sorted because glob returns them in filesystem order, which would otherwise
# make the row order of the concatenated table vary between runs.
string_paths = sorted(
    glob(f"{indirectory}/experiments/Edge Prediction/HomoSapiens/holdout_*/*.csv.gz")
)
if not string_paths:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        f"No result files found under {indirectory!r} for "
        "'experiments/Edge Prediction/HomoSapiens/holdout_*/*.csv.gz'"
    )

# The first CSV column is the saved index; it is discarded after stacking.
df = pd.concat(
    [pd.read_csv(path, index_col=0) for path in string_paths]
).reset_index(drop=True)

In [None]:
# Reduce the raw STRING table to the columns needed for plotting.
string_results = extract_core_results(df)

In [None]:
# Preview the first rows as a sanity check of the extraction.
string_results.head()

In [None]:
# Save the extracted STRING results to the current working directory.
string_results.to_csv("string_results.csv")

<h3>SLI synthetic lethality results</h3>

In [None]:
# Collect every per-holdout result file for the "(SLDB | HomoSapiens)" graph.
# That directory name contains no glob metacharacters, so it is matched literally.
# The paths are sorted because glob returns them in filesystem order, which would
# otherwise make the row order of the concatenated table vary between runs.
sli_paths = sorted(
    glob(f"{indirectory}/experiments/Edge Prediction/(SLDB | HomoSapiens)/holdout_*/*.csv.gz")
)
if not sli_paths:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        f"No result files found under {indirectory!r} for "
        "'experiments/Edge Prediction/(SLDB | HomoSapiens)/holdout_*/*.csv.gz'"
    )

# The first CSV column is the saved index; it is discarded after stacking.
df = pd.concat(
    [pd.read_csv(path, index_col=0) for path in sli_paths]
).reset_index(drop=True)

In [None]:
# Reduce the raw SLI table to the columns needed for plotting.
sli_results = extract_core_results(df)

In [None]:
# Preview the first rows as a sanity check of the extraction.
sli_results.head()

In [None]:
# Save the extracted SLI results to the current working directory.
sli_results.to_csv("sli_results.csv")

<H1>Aggregating Results</H1>
<p>Here, we calculate the mean and standard deviation of the
    balanced accuracy, false discovery rate, matthews correlation coefficient,
    F1 score, AUROC, and AUPRC.</p>

In [None]:
def get_mean_and_std(df):
    """Summarise the evaluation metrics per method, mode and evaluation sampling.

    Parameters
    ----------
    df : pandas.DataFrame
        One of ``string_results`` or ``sli_results``.

    Returns
    -------
    pandas.DataFrame
        One row per (methods, mode, evaluation) group. The columns form a
        two-level index: the three grouping keys, then a ``mean`` and a ``std``
        column for each metric, then a trailing ``approach`` label.

    Raises
    ------
    KeyError
        If one of the columns to be renamed is missing (``errors="raise"``).
    """
    # Only these seven graph / random-walk methods are of interest.
    selected_methods = {
        'First-order LINE', 'Second-order LINE',
        'DeepWalk SkipGram', 'DeepWalk CBOW',
        'Walklets SkipGram', 'Walklets CBOW',
        'HOPE',
    }
    # Metrics to summarise, in the order their columns appear in the result.
    metric_names = (
        'balanced_accuracy',
        'false_discovery_rate',
        'matthews_correlation_coefficient',
        'f1_score',
        'auroc',
        'auprc',
    )
    # Shorter column names for conciseness.
    short_names = {
        "evaluation_negative_sampling_method": "evaluation",
        "evaluation_mode": "mode",
        "features_names": "methods",
    }

    is_selected = df['features_names'].isin(selected_methods)
    renamed = df[is_selected].rename(columns=short_names, errors="raise")

    # Mean and standard deviation of every metric within each group.
    aggregations = {metric: ['mean', 'std'] for metric in metric_names}
    grouped = renamed.groupby(['methods', 'mode', 'evaluation'])
    summary = grouped.agg(aggregations).reset_index()

    # Convenience label of the form "<evaluation> (<mode>)" for plotting.
    summary["approach"] = summary["evaluation"] + " (" + summary["mode"] + ")"
    return summary

In [None]:
# Rename columns from tuples to simple strings for readability
columns = ["methods",'mode','evaluation', "balanced_acc.mean","balanced_acc.std","FDR.mean",
                    "FDR.std", "MCC.mean", "MCC.std", "F1.mean", "F1.std", "AUROC.mean",
                     "AUROC.std", "AUPRC.mean","AUPRC.std", "approach"]

<h3>SLI</h3>

In [None]:
# Mean and standard deviation of each metric for the SLI results.
sli_stats = get_mean_and_std(sli_results)

In [None]:
# Replace the tuple column labels with the flat names defined in `columns`
# (the list must have one entry per column), then save the table.
sli_stats.columns = columns
sli_stats.to_csv("sli_stats.csv")

<h3>STRING</h3>

In [None]:
# Mean and standard deviation of each metric for the STRING results,
# followed by a preview of the first rows.
string_stats = get_mean_and_std(string_results)
string_stats.head()

In [None]:
# Replace the tuple column labels with the flat names defined in `columns`
# (the list must have one entry per column), then save the table.
string_stats.columns = columns
string_stats.to_csv("string_stats.csv")