# Imports

In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

# general
import re
import collections
import pickle
import warnings 
import joblib
import pathlib
import datetime

# data
import numpy as np
import pandas as pd
import h5py

# ml / stats
import sklearn
import scipy.stats

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# init matplotlib defaults
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'

In [None]:
%run -m rpy2.situation

# Parameters

In [None]:
# Location of the benchmark result files (same path as the loaders' default argument).
benchmark_dir = pathlib.Path("../_out/benchmark")
# Directory that figures are saved to; created on first run if it does not exist.
out_dir = pathlib.Path("../_out/figures")
out_dir.mkdir(parents=True, exist_ok=True)
# Display name of the evaluated method, interpolated into table and legend labels.
method_name = "CorALS"

# Functions

## Load runtime / memory

In [None]:
def load_benchmark(file, benchmark_dir="../_out/benchmark"):
    """Read the memory/runtime measurements stored in one benchmark HDF5 file.

    Every HDF5 group that contains a ``memory`` dataset yields one row per
    (memory, runtime) pair; groups without it are reported via ``print`` and
    skipped.

    Parameters
    ----------
    file : str or pathlib.Path
        File name, resolved relative to ``benchmark_dir``.
    benchmark_dir : str or pathlib.Path
        Directory containing ``file``.

    Returns
    -------
    pandas.DataFrame
        Columns ``algorithm`` (HDF5 group name), ``repetition`` (position of
        the measurement within the group), ``memory``, ``runtime``,
        ``memory_backend`` (group attribute, ``"default"`` if absent) and
        ``timestamp`` (group attribute converted to ``datetime``, ``None`` if
        absent).
    """
    column_names = ["algorithm", "repetition", "memory", "runtime", "memory_backend", "timestamp"]
    h5_path = pathlib.Path(benchmark_dir) / file
    rows = []

    def collect(name, node):
        # only groups carry measurements; datasets are visited as well but ignored
        if not isinstance(node, h5py.Group):
            return
        if "memory" not in node:
            print("Skipping: ", name)
            return

        memory_values = node["memory"][:]
        runtime_values = node["runtime"][:]
        attributes = node.attrs
        backend = attributes["memory_backend"] if "memory_backend" in attributes else "default"
        recorded_at = (
            datetime.datetime.fromtimestamp(attributes["timestamp"])
            if "timestamp" in attributes
            else None
        )
        for repetition, (mem, run) in enumerate(zip(memory_values, runtime_values)):
            rows.append((name, repetition, mem, run, backend, recorded_at))

    with h5py.File(h5_path, "r") as h5_file:
        h5_file.visititems(collect)

    return pd.DataFrame(rows, columns=column_names)

In [None]:
def load_benchmark_experiments(benchmark_dir="../_out/benchmark"):
    """Load every ``benchmark_*`` file in ``benchmark_dir`` into one DataFrame.

    Each file is read with ``load_benchmark`` and annotated with the experiment
    settings encoded in its file name as ``___``-terminated ``key-value``
    fields: ``language``, ``prefix``, ``context`` (a ``topk`` context also
    carries a percentage that is stored in ``context_param``), ``data`` and
    ``n_repeat``. The file name itself is stored in the ``file`` column.

    Parameters
    ----------
    benchmark_dir : str or pathlib.Path
        Directory searched (non-recursively) for ``benchmark_*`` files.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all per-file results (per-file indices are kept).

    Raises
    ------
    ValueError
        If no benchmark file is found or a file name lacks an expected field.
    """
    benchmark_dir = pathlib.Path(benchmark_dir)

    def filename_field(pattern, file_name):
        # fail with the offending file name instead of an AttributeError on None
        match = re.search(pattern, file_name)
        if match is None:
            raise ValueError(f"File name {file_name!r} does not match {pattern!r}")
        return match.group(1)

    benchmark_experiments = []

    for f in sorted(benchmark_dir.glob("benchmark_*")):

        print(f.name)

        # `f` already includes `benchmark_dir`, hence the empty base directory
        results = load_benchmark(f, "")

        results["language"] = filename_field(r"lang-(.*?)___", f.name)
        results["prefix"] = filename_field(r"prefix-(.*?)___", f.name)

        # a "topk-<x>percent" context is split into its name and numeric parameter
        context = filename_field(r"context-(.*?)___", f.name)
        context_param = None
        if context.startswith("topk"):
            context_param = float(context.split("-")[1].replace("percent", ""))
            context = context.split("-")[0]
        results["context"] = context
        results["context_param"] = context_param

        # data
        results["data"] = filename_field(r"data-(.*?)___", f.name)

        # repeat (the field is terminated by the next field or the ".h5" extension)
        results["n_repeat"] = int(filename_field(r"repeat-(.*?)(?:\.h5|___)", f.name))

        # filename
        results["file"] = f.name

        benchmark_experiments.append(results)

    if not benchmark_experiments:
        raise ValueError(f"No 'benchmark_*' files found in {benchmark_dir}")

    benchmark_experiments = pd.concat(benchmark_experiments)

    # dropping first run (0) to account for compiler optimization
    # this is a bit questionable depending on application scenario
    # NOTE(review): this filters on `n_repeat` (parsed from the file name), not
    # on the per-measurement `repetition` index -- confirm which one is intended.
    benchmark_experiments = benchmark_experiments[benchmark_experiments.n_repeat >= 1]
    return benchmark_experiments

## Accuracy

In [None]:
def load_accuracy(file, benchmark_dir="../_out/benchmark"):
    """Read the accuracy metrics stored in one HDF5 file.

    Parameters
    ----------
    file : str or pathlib.Path
        File name, resolved relative to ``benchmark_dir``.
    benchmark_dir : str or pathlib.Path
        Directory containing ``file``.

    Returns
    -------
    pandas.DataFrame
        One column per dataset whose HDF5 path contains ``metrics/`` (named
        after the second path component), indexed by the values of the
        ``approximation_factors`` dataset (index name
        ``approximation_factor``).
    """
    metrics = collections.OrderedDict()

    def collect_metric(name, node):
        if "metrics/" in name and isinstance(node, h5py.Dataset):
            metric_name = name.split("/")[1]
            metrics[metric_name] = node[...]

    with h5py.File(pathlib.Path(benchmark_dir) / file, "r") as h5_file:
        factors = h5_file["approximation_factors"][...]
        h5_file.visititems(collect_metric)

    frame = pd.DataFrame(metrics, index=factors)
    frame.index.name = "approximation_factor"
    return frame

In [None]:
def load_accuracy_experiments(benchmark_dir="../_out/benchmark"):
    """Load every accuracy result file in ``benchmark_dir`` into one DataFrame.

    Files matching ``ac*curacy_*`` are read with ``load_accuracy`` and
    annotated with settings parsed from the file name: ``data``, ``context``
    (``topk`` or ``topkdiff``), ``context_param`` (the percentage following the
    context), ``method`` and ``spearman`` (both ``'unknown'`` when the file
    name does not mention them).

    Parameters
    ----------
    benchmark_dir : str or pathlib.Path
        Directory searched (non-recursively) for accuracy files.

    Returns
    -------
    pandas.DataFrame
        Metric columns indexed by ``(data, context, context_param, method,
        spearman, approximation_factor)``.

    Raises
    ------
    ValueError
        If no accuracy file is found.
    """
    benchmark_dir = pathlib.Path(benchmark_dir)

    exps = []

    for f in sorted(benchmark_dir.glob("ac*curacy_*")):
        print(f.name)

        # `f` already includes `benchmark_dir`, hence the empty base directory
        results = load_accuracy(f, "")
        # the separator after "data" is matched by "." (any character), as before
        results["data"] = re.search(r"data.(.*?)___", f.name).group(1)
        results["context"] = re.search(r"(topk(diff)?)", f.name).group(1)
        results["context_param"] = float(re.search(r"topk(diff)?-(.*?)percent", f.name).group(2))

        # greedy match: the value extends up to the last "." or "_" in the name
        if "method" in f.name:
            results["method"] = re.search(r"method-(.*)[._]", f.name).group(1)
        else:
            results["method"] = 'unknown'

        if "spearman" in f.name:
            results["spearman"] = re.search(r"spearman-(.*)[._]", f.name).group(1)
        else:
            results["spearman"] = 'unknown'

        exps.append(results)

    if not exps:
        raise ValueError(f"No 'ac*curacy_*' files found in {benchmark_dir}")

    exps = pd.concat(exps)

    # move the approximation factor from the index into the combined index
    return exps.reset_index().set_index(["data", "context", "context_param", "method", "spearman", "approximation_factor"])

## Formatting

In [None]:
def format_timespan(timespan_in_seconds):
    """Format a duration in seconds as ``[h:][m:]s.s``.

    The value is rounded to one decimal first so that the seconds field never
    reads ``60.0``. Once a larger unit is shown, all smaller units are
    zero-padded (``1:00:05.0``, ``1:05.0``), durations below one minute are
    shown as plain seconds (``5.5``).

    Parameters
    ----------
    timespan_in_seconds : float
        Duration in seconds; ``NaN`` marks a missing value.

    Returns
    -------
    str
        The formatted duration, or ``"-"`` for ``NaN``.
    """
    if np.isnan(timespan_in_seconds):
        return "-"

    # round before splitting so e.g. 59.96 s carries over into "1:00.0"
    total = round(float(timespan_in_seconds), 1)
    hours, remainder = divmod(total, 60 ** 2)
    minutes, seconds = divmod(remainder, 60)

    if hours > 0:
        # always keep the minutes field, even when it is zero
        return f"{hours:.0f}:{minutes:02.0f}:{seconds:04.1f}"
    if minutes > 0:
        return f"{minutes:.0f}:{seconds:04.1f}"
    return f"{seconds:.1f}"

In [None]:
def format_memory(memory_in_mb):
    """Render a memory amount given in MB as ``"<x.x> GB"`` (divides by 1024).

    ``NaN`` (missing value) is rendered as ``"-"``.
    """
    if not np.isnan(memory_in_mb):
        gigabytes = memory_in_mb / 1024
        return f"{gigabytes:.01f} GB"
    return "-"

In [None]:
# Maps dataset identifiers to the display names used in tables.
# `format_benchmarks` renames the `data` index with this mapping and keeps only
# the listed datasets, in this order; the data table looks names up by key.
datasets_map = collections.OrderedDict([
    ("preeclampsia_postprocessed_nonegatives_dropduplicates", "Preeclampsia"),
    ("pregnancy_postprocessed_nonegatives_dropduplicates", "Pregnancy"),
    ("cancer_postprocessed_nonegatives_dropduplicates_sample-0.25", "Cancer (0.25)"),
    ("cancer_postprocessed_nonegatives_dropduplicates_sample-0.50", "Cancer (0.50)"),
    ("cancer_postprocessed_nonegatives_dropduplicates_sample-1.00", "Cancer (1.00)"),
    ("singlecell_postprocessed", "Single Cell"),
    ("singlecell_large_postprocessed", "Single Cell 2"),
    ("large_synthetic_mn_m-500_n-200000_postprocessed", "Sim"),
])

In [None]:
def format_benchmarks(
        algorithms=None, global_select=True, statistic="runtime", statistic_formatter="auto", column_regex=None, stats="default"):
    """Summarise one benchmark statistic per dataset and algorithm.

    Reads the notebook-level ``benchmark_experiments`` frame and
    ``datasets_map``. For every ``(name, mask)`` pair the rows selected by
    ``global_select & mask`` are labelled with ``name``; median, mean, standard
    deviation and count of ``statistic`` are then computed per algorithm and
    dataset. Rows are datasets (renamed and ordered via ``datasets_map``,
    unknown datasets are dropped), columns are
    ``(statistic, algorithm, aggregate)`` in the order of ``algorithms``.

    Parameters
    ----------
    algorithms : list of (str, boolean mask)
        Display name and row selection for each algorithm. Required.
    global_select : bool or boolean mask
        Selection applied to all algorithms.
    statistic : str
        Column to aggregate, e.g. ``"runtime"`` or ``"memory"``.
    statistic_formatter : callable, "auto" or None
        Formatter for median/mean/std cells. ``"auto"`` picks
        ``format_timespan`` for runtime and ``format_memory`` for memory;
        ``None`` skips formatting.
    column_regex : str or None
        If given, only columns matching this regex are kept.
    stats : object
        Unused; kept for backward compatibility.

    Returns
    -------
    (pandas.DataFrame, Styler or None, str or None)
        The numeric table, its styled version and its LaTeX rendering (the
        latter two are ``None`` when ``statistic_formatter`` is ``None``).

    Raises
    ------
    ValueError
        If ``algorithms`` is empty/``None`` or no formatter is known for
        ``statistic`` with ``statistic_formatter="auto"``.
    """
    if not algorithms:
        raise ValueError("`algorithms` must be a non-empty list of (name, mask) pairs")

    selections = []
    for algorithm_name, algorithm_select in algorithms:
        # select data
        df = benchmark_experiments.loc[global_select & algorithm_select].copy()
        if df.shape[0] == 0:
            # placeholder row so the algorithm still gets its (empty) columns;
            # the "dummy" dataset is dropped again when datasets are sorted
            df = pd.DataFrame({"data": ["dummy"], "algorithm": [algorithm_name], statistic: [np.nan]})
        df["algorithm"] = algorithm_name
        selections.append(df)
    table = pd.concat(selections)

    # calculate stats
    table = table[["data", "algorithm", statistic]].groupby(["algorithm", "data"]).agg(["median", "mean", "std", "count"])

    # pull up algorithms into columns
    table = table.unstack(level=0).reorder_levels([0, 2, 1], axis=1)

    # sort algorithms
    columns = [c for n, _ in algorithms for c in table.columns if c[1] == n]
    table = table[columns]

    # rename datasets
    table = table.rename(datasets_map, axis=0, level="data")

    # filter unwanted stats
    if column_regex is not None:
        table = table.filter(regex=column_regex)

    # sort datasets (and drop unwanted ones)
    table = table.loc[[d for d in datasets_map.values() if d in table.index], :]

    # set formatter
    if statistic_formatter == "auto":
        if statistic == "runtime":
            statistic_formatter = format_timespan
        elif statistic == "memory":
            statistic_formatter = format_memory
        else:
            raise ValueError(f"No known formatter for statistic: {statistic}")

    # format
    if statistic_formatter is not None:

        formatted = table.style\
            .format(statistic_formatter, subset=table.filter(regex="median|mean|std").columns)

        # counts are left unformatted
        formatters = \
            [statistic_formatter if re.match("median|mean|std", c[2]) else None for c in table.columns]

        latex = table.to_latex(formatters=formatters, na_rep="-")
    else:
        formatted = None
        latex = None

    # done
    return table, formatted, latex

# Load experiments and prepare visualizations

In [None]:
benchmark_experiments = load_benchmark_experiments()
benchmark_experiments

In [None]:
benchmark_experiments[
    (benchmark_experiments.prefix == "full_default") 
    & (benchmark_experiments.context == "fast") 
    & (benchmark_experiments.language == "julia") 
    & (benchmark_experiments.algorithm == "cor_cor")
    & (benchmark_experiments.data.str.contains("0.5"))].head(20)

In [None]:
# # experiments overview
# experiment_identifiers = ["language", "prefix", "context", "context_param", "data"]
# benchmark_experiments[experiment_identifiers].drop_duplicates().sort_values(experiment_identifiers)

In [None]:
accuracy_experiments = load_accuracy_experiments()

In [None]:
accuracy_experiments

In [None]:
colors = sns.color_palette()
styles = ["-", "--", ":"]

In [None]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt 
from matplotlib.lines import Line2D

# Stand-alone legend figure: colour encodes the language, line style
# distinguishes the default implementation from the evaluated method.
patchList = []
for language_color, language in zip(colors, ["Python", "Julia", "R"]):
    for line_style, variant in zip(styles, ["default", method_name]):
        data_key = Line2D(
            [0], [0], marker="o", color=language_color, linestyle=line_style,
            label=f"{language} {variant}",
            markersize=0)  # markersize=0: show the line only
        patchList.append(data_key)

# draw only the legend on an otherwise empty axis
fig, axes = plt.subplots(1,1, dpi=150, figsize=(2,2))
ax = axes
ax.legend(handles=patchList, loc="upper left", frameon=False)
ax.axis("off")
figures_path = pathlib.Path('../_out/figures')
figures_path.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_path / 'legend.pdf', bbox_inches='tight')

# Main Paper

In [None]:
columns_stats = ["prefix", "language", "algorithm", "data", "memory", "runtime", "context_param", "memory_backend"]
agg_stats = ["median", "mean", "std", "size"]

## Data Table

In [None]:
%%time
import coralsarticle.data.utils
data_path = pathlib.Path("../data/benchmark/")
data = collections.OrderedDict()
for p in data_path.glob("*"):
    if re.match("(pree|preg|canc|sing|large_synthetic)", p.name): 
        df = coralsarticle.data.utils.load_h5(p)
        name = re.search("(.*).h5", p.name).group(1)
        data[name] = df

In [None]:
def format_data(d, name):
    """Build a one-row summary table for a dataset.

    ``d`` must expose ``shape`` with samples on axis 0 and features on axis 1
    (as implied by the column labels); ``name`` is the displayed dataset name.
    The feature/sample ratio is rounded to the nearest integer.
    """
    n_samples = d.shape[0]
    n_features = d.shape[1]
    ratio = int(np.round(n_features / n_samples))
    row = {
        "Dataset": name,
        "Features": n_features,
        "Samples": n_samples,
        "Feature/Sample Ratio": ratio,
    }
    return pd.DataFrame([row])

data_df = pd.concat([
    format_data(df, datasets_map[n]) for n, df in data.items()
])
data_df

In [None]:
print(data_df.to_latex(index=False))

In [None]:
x = data_df.Features
y = x ** 2

xx = np.linspace(min(x),600000)
yy = xx**2


fig, ax = plt.subplots(figsize=(10,10))
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.plot(xx, yy)


ax.scatter(x, y, s=200)

In [None]:
# clean-up
del data

## Full Matrix

In [None]:
# everything with non-synthetic data
select = \
    ~(benchmark_experiments["data"].str.startswith("synthetic")) \
    & (benchmark_experiments["context"] == "fast")

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    r = benchmark_experiments[select][columns_stats]\
        .groupby(["data", "language", "algorithm", "prefix"])\
        .agg(agg_stats) \
        .sort_index()
    display(r)

In [None]:
# select benchmark experiments
b = benchmark_experiments
algorithms_default = [  
    (f"{method_name}",            (b.language == "python") & (b.algorithm == "cor_matrix_symmetrical_nocopy")),
    (f"{method_name} (parallel)", (b.language == "python") & (b.algorithm == "cor_matrix_symmetrical_nthreads-64")),
    ("R",             (b.language == "r")      & (b.algorithm == "cor_cor")),
    ("Julia",         (b.language == "julia")  & (b.algorithm == "cor_cor")),
    ("Python",        (b.language == "python") & (b.algorithm == "cor_corrcoef")),
    ("WGCNA",                         (b.language == "r")      & (b.algorithm == "cor_wgcna")),
    ("coop",                          (b.language == "r")      & (b.algorithm == "cor_coop")),
    ("Rfast",                     (b.language == "r")      & (b.algorithm == "cor_rfast")),
    ("HiClimR",                   (b.language == "r")      & (b.algorithm == "cor_hiclimr")),
]

In [None]:
# general filter
# Rows shared by all full-matrix comparisons: non-synthetic datasets,
# prefix "full_default", context "fast". Parentheses replace the trailing
# backslashes, whose last continuation line was only a comment.
global_select = (
    ~(benchmark_experiments["data"].str.startswith("synthetic"))
    & (benchmark_experiments["prefix"] == "full_default")
    & (benchmark_experiments["context"] == "fast")
    # & (benchmark_experiments["language"] == "python")
)

In [None]:
# long stats
df, f, l = format_benchmarks(algorithms_default, global_select, statistic="runtime", column_regex=".*")
display(f)

In [None]:
# long stats
df, f, l = format_benchmarks(algorithms_default, global_select, statistic="memory", column_regex=".*")
display(f)

In [None]:
# stats for paper (runtime)
df, f, l = format_benchmarks(algorithms_default, global_select, statistic="runtime", column_regex="median")
display(f)
print(l)

In [None]:
# stats for paper (memory)
df, f, l = format_benchmarks(algorithms_default, global_select, statistic="memory", column_regex="median")
display(f)
print(l)

In [None]:
import sklearn.linear_model

# feature counts; measurements (`y`) exist only for the first len(y) of them
# NOTE(review): `y` is plotted as memory in "Gb" -- confirm source and unit
x = np.array([16897, 32211, 64813, 129626, 200000, 259252])
y = np.array([ 2.38289074,   8.038218,  31.60920241, 125.55584884])

# power-law fit: linear regression in log-log space on the measured points
lr = sklearn.linear_model.LinearRegression()
lr.fit(np.log(x[:len(y)].reshape(-1,1)), np.log(y))

# extrapolate to all feature counts
yp = np.exp(lr.predict(np.log(x).reshape(-1,1)))

# fitted curve
xx = np.linspace(min(x),200000)
yy = np.exp(lr.predict(np.log(xx).reshape(-1,1)))
plt.scatter(xx, yy, s=10)

# measured points (large dots) and predictions (crosses)
plt.scatter(x[:len(y)], y, s=100)
plt.scatter(x, yp, marker="x", c="black")
plt.xscale("log")
plt.yscale("log")

plt.xticks(x, x, rotation=320)
plt.yticks(yp, [f"{predicted:.02f} Gb" for predicted in yp])
plt.xlabel("number of features")
plt.ylabel("memory")

# predictions for round feature counts
xx = np.array([16000, 32000, 64000, 128000, 250000])
yy = np.exp(lr.predict(np.log(xx).reshape(-1,1)))
for n_features, predicted in zip(xx, yy):
    print(f"features: {n_features:10d} -> memory: {predicted:10.02f}")

## Top-K

In [None]:
# everything with non-synthetic data

select = \
        ~(benchmark_experiments["data"].str.startswith("synthetic")) \
        & (benchmark_experiments["context"] == "topk")

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    r = benchmark_experiments[select][columns_stats]\
        .groupby(["data", "language", "context_param", "algorithm", "prefix", "memory_backend"])\
        .agg(agg_stats)\
        .sort_index()
    display(r)

In [None]:
b = benchmark_experiments
algorithms_default = [  
    ("R",             (b.language == "r")      & (b.algorithm == "topk_matrix")),
    ("Julia",         (b.language == "julia")  & (b.algorithm == "topk_matrix")),
    ("Python",        (b.language == "python") & (b.algorithm == "topk_matrix")),
    (f"{method_name}",            (b.language == "python") & (b.algorithm == "topk_balltree_combined_tree")),
]
algorithms_runtime = [
    (f"{method_name} (parallel)", (b.language == "python") & (b.algorithm == "topk_balltree_combined_tree_optimized_parallel_64") & (b.memory_backend == "psutil_uss")),
]
algorithms_memory = [
    (f"{method_name} (parallel)", (b.language == "python") & (b.algorithm == "topk_balltree_combined_tree_optimized_parallel_64") & (b.memory_backend == "psutil_uss")),
]

In [None]:
# Rows shared by all top-k comparisons: non-synthetic datasets, context
# "topk" with parameter 0.1. Parentheses replace the trailing backslashes,
# whose last continuation line was only a comment.
global_select = (
    ~(benchmark_experiments["data"].str.startswith("synthetic"))
    & (benchmark_experiments["context"] == "topk")
    & (benchmark_experiments["context_param"] == 0.1)
    # & (benchmark_experiments["language"] == "python")
)

In [None]:
df, f, l = format_benchmarks(algorithms_default + algorithms_runtime, global_select, statistic="runtime", column_regex=".*")
display(f)

In [None]:
df, f, l = format_benchmarks(algorithms_default + algorithms_runtime, global_select, statistic="runtime", column_regex="median")
display(f)
print(l)

In [None]:
df, f, l = format_benchmarks(algorithms_default + algorithms_memory, global_select, statistic="memory", column_regex="median")
display(f)
print(l)

In [None]:
# feature counts; measurements (`y`) exist only for the first len(y) of them
# NOTE(review): `y` is plotted as memory in "Gb" -- confirm source and unit
x = np.array([16897, 32211, 64813, 129626, 200000, 259252, 600000])
y = np.array([ 7.5, 27.3, 157.6])
# y = np.array([ 6.4, 23.3, 93.8])
# y = np.array([ 6.8, 23.7, 94.3])

# power-law fit: linear regression in log-log space on the measured points
import sklearn.linear_model
lr = sklearn.linear_model.LinearRegression()
lr.fit(np.log(x[:len(y)].reshape(-1,1)), np.log(y))

# extrapolate to all feature counts
yp = np.exp(lr.predict(np.log(x).reshape(-1,1)))


# fitted curve over the full feature range
xx = np.linspace(min(x),max(x))
yy = np.exp(lr.predict(np.log(xx).reshape(-1,1)))
plt.scatter(xx, yy, s=10)

# measured points (large dots) and predictions (crosses)
plt.scatter(x[:len(y)], y, s=100)
plt.scatter(x, yp, marker="x", c="black")
plt.xscale("log")
plt.yscale("log")

plt.xticks(x, x, rotation=320)
plt.yticks(yp, [f"{yy:.02f} Gb" for yy in yp])
plt.xlabel("number of features")
plt.ylabel("memory")

# predictions for round feature counts
xx = np.array([16000, 32000, 64000, 128000, 200000, 250000])
yy = np.exp(lr.predict(np.log(xx).reshape(-1,1)))
for xxx, yyy in zip(xx, yy):
    print(f"features: {xxx:10d} -> memory: {yyy:10.02f}")

# Supplement


## Euclidean distance vs. correlation 

In [None]:
def f(cor):
    """Return ``sqrt(2 - 2 * cor)``, plotted below as Euclidean distance vs. correlation."""
    squared = 2 - 2*cor
    return np.sqrt(squared)

fig, ax = plt.subplots(1,1, figsize=(2,2), dpi=300)
c = np.linspace(-1,1,100)
ax.plot(c, f(c), linewidth=2)
ax.set(
    xlabel="correlation",
    ylabel="Euclidean distance")
ax.axhline(np.sqrt(2), color="grey", linestyle="--")
ax.axvline(0, color="grey", linestyle="--")

fig.savefig(out_dir / "dist-vs-cor.pdf", bbox_inches="tight", dpi=300)

## Tree variants

In [None]:
b = benchmark_experiments
algorithms = [  
    ("twice",        (b.language == "python") & (b.algorithm == "topk_balltree_twice")),
    ("no-dual",      (b.language == "python") & (b.algorithm == "topk_balltree_combined_tree_no-dual")),
    ("tree",         (b.language == "python") & (b.algorithm == "topk_balltree_combined_tree")),
]


In [None]:
# Rows shared by the tree-variant comparison: non-synthetic datasets, context
# "topk" with parameter 0.1. Parentheses replace the trailing backslashes,
# whose last continuation line was only a comment.
global_select = (
    ~(benchmark_experiments["data"].str.startswith("synthetic"))
    & (benchmark_experiments["context"] == "topk")
    & (benchmark_experiments["context_param"] == 0.1)
    # & (benchmark_experiments["language"] == "python")
)

In [None]:
df, f, l = format_benchmarks(algorithms, global_select, statistic="runtime", column_regex="median")
display(f)
print(l)

In [None]:
df, f, l = format_benchmarks(algorithms, global_select, statistic="memory", column_regex="median")
display(f)
print(l)

## Full matrix - Synthetic

In [None]:
benchmark_experiments

In [None]:
results = benchmark_experiments[
    (benchmark_experiments["data"].str.startswith("synthetic")) 
    & (benchmark_experiments["context"] == "fast") 
#     & (benchmark_experiments["algorithm"].isin(algorithms)) 
].copy()
results.loc[:, "m"] = results["data"].str.extract(".*m-(.*?)_.*").values.flatten().astype(int)
results.loc[:, "n"] = results["data"].str.extract(".*n-(.*?)_").values.flatten().astype(int)

In [None]:
results.algorithm.unique()

In [None]:
# algorithms shown per language (first entry: default implementation)
algorithms_python_final = [
    "cor_corrcoef",
    "cor_matrix_symmetrical_nocopy"
]

algorithms_julia_final = [
    "cor_cor", 
    "cor_symmetrical_nocopy2"
]

algorithms_r_final = [
    "cor_cor", 
    "cor_symmetrical_nocopy"
]

fig, axes = plt.subplots(1,1, figsize=(3,2), dpi=150)
ax = axes

# vertical marker at n = 20000
ax.axvline(20000, linestyle="-", color="lightgrey")

# One colour per language, one line style per algorithm within a language.
# A cell-local list is used so the notebook-level `styles` is not overwritten.
line_styles = ["-", "--", ":", "dashdot", "dashdot", "dashdot"]
language_algorithms = [
    ("python", colors[0], algorithms_python_final),
    ("julia", colors[1], algorithms_julia_final),
    ("r", colors[2], algorithms_r_final),
]
for language, color, algorithm_names in language_algorithms:
    for i, a in enumerate(algorithm_names):
        # median runtime per n for the fixed m = 50
        select = results[(results["language"] == language) & (results["algorithm"] == a) & (results["m"] == 50)][["m", "n", "runtime", "memory"]].groupby("n").median()
        ax.plot(select.index, select["runtime"], color=color, linestyle=line_styles[i], label=a, marker="+")

ax.set_xscale("log")
ax.set_yscale("log")
ax.set_ylim([0.01,10])

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.set(xlabel="features", ylabel="seconds")

ax.set_xticks([1000,3000,10000,30000])
ax.set_xticklabels([1000,3000,10000,30000])

ax.set_yticks([0.1, 1, 10])
ax.set_yticklabels([0.1,1,10])

fig.savefig(out_dir / "fast_runtime_features.pdf", bbox_inches="tight")

In [None]:
# algorithms shown per language (first entry: default implementation)
algorithms_python_final = [
    "cor_corrcoef",
    "cor_matrix_symmetrical_nocopy"
]

algorithms_julia_final = [
    "cor_cor", 
    "cor_symmetrical_nocopy2"
]

algorithms_r_final = [
    "cor_cor", 
    "cor_symmetrical_nocopy"
]

fig, axes = plt.subplots(1,1, figsize=(3,2), dpi=150)
ax = axes

# vertical marker at m = 50
ax.axvline(50, linestyle="-", color="lightgrey")

# One colour per language, one line style per algorithm within a language.
# Defined locally so the cell does not depend on a `styles` value left behind
# by whichever cell ran last.
line_styles = ["-", "--", ":"]
language_algorithms = [
    ("python", colors[0], algorithms_python_final),
    ("julia", colors[1], algorithms_julia_final),
    ("r", colors[2], algorithms_r_final),
]
for language, color, algorithm_names in language_algorithms:
    for i, a in enumerate(algorithm_names):
        # median runtime per m for the fixed n = 20000
        select = results[(results["language"] == language) & (results["algorithm"] == a) & (results["n"] == 20000)][["m", "n", "runtime", "memory"]].groupby("m").median()
        ax.plot(select.index, select["runtime"], color=color, linestyle=line_styles[i], label=a, marker="+")

ax.set_xscale("log")
ax.set_yscale("log")

ax.set_xlim([10,500])
# A lower limit of 0 is invalid on a log axis (matplotlib warns and ignores
# it), so only the upper limit is set.
ax.set_ylim(top=10)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.set(xlabel="samples", ylabel="seconds")

ax.set_xticks([10,25, 50,100, 200,500])
ax.set_xticklabels([10,25,50, 100, 200,500])

ax.set_yticks([1,2,5,10])
ax.set_yticklabels([1,2,5,10])

fig.savefig(out_dir / "fast_runtime_samples.pdf", bbox_inches="tight")

## Top-k - Synthetic

In [None]:
algorithms_python = ["topk_matrix", "topk_balltree_combined_tree"]
algorithms_julia = ["topk_matrix", "topk_balltree_nn_combined_tree"]
algorithms_r = ["topk_matrix"]

In [None]:
results = benchmark_experiments[
    (benchmark_experiments["data"].str.startswith("synthetic")) 
    & (benchmark_experiments["context"] == "topk") 
    & (benchmark_experiments["context_param"] == 0.1)
    & (benchmark_experiments["algorithm"].isin(algorithms_python + algorithms_julia)) 
].copy()
results.loc[:, "m"] = results["data"].str.extract(".*m-(.*?)_.*").values.flatten().astype(int)
results.loc[:, "n"] = results["data"].str.extract(".*n-(.*?)_").values.flatten().astype(int)

In [None]:
fig, axes = plt.subplots(1,1, figsize=(3,2), dpi=150)
ax = axes

# vertical marker at n = 20000
ax.axvline(20000, linestyle="-", color="lightgrey")

# (language, algorithm, colour, line style, label) of every plotted curve:
# colour encodes the language, dashed lines the ball-tree based variant
curves = [
    ("python", "topk_matrix", colors[0], "-", "Python cor"),
    ("python", "topk_balltree_combined_tree", colors[0], "--", "Python LR"),
    ("julia", "topk_matrix", colors[1], "-", "Julia cor"),
    ("julia", "topk_balltree_nn_combined_tree", colors[1], "--", "Julia LR"),
    ("r", "topk_matrix", colors[2], "-", "R cor"),
]
for language, algorithm, color, line_style, label in curves:
    # median runtime per n for the fixed m = 50
    select = results[(results["language"] == language) & (results["algorithm"] == algorithm) & (results["m"] == 50)][["m", "n", "runtime", "memory"]].groupby("n").median()
    ax.plot(select.index, select["runtime"], color=color, linestyle=line_style, label=label, marker="+")

ax.set_xscale("log")
ax.set_yscale("log")

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.set(xlabel="features", ylabel="seconds")

ax.set_xticks([1000,3000,10000,30000])
ax.set_xticklabels([1000,3000,10000,30000])

ax.set_yticks([0.1, 1, 10,100, 1000])
ax.set_yticklabels([0.1,1,10,100, 1000])

fig.savefig(out_dir / "topk_runtime_features.pdf", bbox_inches="tight")

In [None]:
fig, axes = plt.subplots(1,1, figsize=(3,2), dpi=150)
ax = axes

# vertical marker at m = 50
ax.axvline(50, linestyle="-", color="lightgrey")

# (language, algorithm, colour, line style, label) of every plotted curve:
# colour encodes the language, dashed lines the ball-tree based variant
curves = [
    ("python", "topk_matrix", colors[0], "-", "Python cor"),
    ("python", "topk_balltree_combined_tree", colors[0], "--", "Python LR"),
    ("julia", "topk_matrix", colors[1], "-", "Julia cor"),
    ("julia", "topk_balltree_nn_combined_tree", colors[1], "--", "Julia LR"),
    ("r", "topk_matrix", colors[2], "-", "R cor"),
]
for language, algorithm, color, line_style, label in curves:
    # median runtime per m for the fixed n = 20000
    select = results[(results["language"] == language) & (results["algorithm"] == algorithm) & (results["n"] == 20000)][["m", "n", "runtime", "memory"]].groupby("m").median()
    ax.plot(select.index, select["runtime"], color=color, linestyle=line_style, label=label, marker="+")

ax.set_yscale("log")
ax.set_xscale("log")

ax.set_xticks([10,25, 50,100, 200,500])
ax.set_xticklabels([10,25,50, 100, 200,500])

ax.set_yticks([1,10,100, 1000])
ax.set_yticklabels([1,10,100, 1000])

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax.set(xlabel="samples", ylabel="seconds")
fig.savefig(out_dir / "topk_runtime_samples.pdf", bbox_inches="tight")

In [None]:
fig, axes = plt.subplots(1,1, figsize=(3,2), dpi=150)
ax = axes

r = results[(results["n"] == 20000) & (results["m"] == 50)]
sns.barplot(
    x=[n.capitalize() for n in r["language"].values], 
    y=r["memory"].values / 1024, 
    hue=[f"{method_name}" if "balltree" in n else "default" for n in r["algorithm"].values], 
    ax=ax, 
    palette=["black", "grey"])
# sns.swarmplot(
#     x=[n.capitalize() for n in r["language"].values], 
#     y=r["memory"].values / 1024, 
#     hue=[f"{method_name}" if "balltree" in n else "default" for n in r["algorithm"].values], 
#     ax=ax, 
#     palette=["black", "grey"])

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_ylim([0.1,11])

# plt.yscale("log")
ax.set(xlabel=None, ylabel="memory (Gb)")

fig.savefig(out_dir / "topk_memory_50x20000.pdf", bbox_inches="tight")

## Parallelization

In [None]:
results = benchmark_experiments.copy()

# Split algorithm names such as "<base>_parallel_64" / "<base>_nthreads-64"
# into the base name and the degree of parallelism.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would leave the names unchanged.
results["algorithm_base"] = results.algorithm.str.replace(r"(_parallel.*)?(_nthreads.*)?", "", regex=True)
results["algorithm_parallel"] = results.algorithm.str.extract(r".*_parallel_([0-9]+).*")
results["algorithm_threads"] = results.algorithm.str.extract(r".*_nthreads-([0-9]+).*")

# names without a suffix are recorded with a parallelism of 1
results.loc[results.algorithm_parallel.isna(),"algorithm_parallel"] = 1 
results.loc[results.algorithm_threads.isna(),"algorithm_threads"] = 1 

results["algorithm_parallel"] = results["algorithm_parallel"].astype(int)
results["algorithm_threads"] = results["algorithm_threads"].astype(int)

results

### Full matrix

In [None]:
results.data.unique()

In [None]:
algorithms_python

In [None]:
r = results[
    (results.context == "fast") 
    & (results.language == "python") 
    & (results.data.str.contains("cancer_postprocessed_nonegatives_dropduplicates_sample-0.25"))
]
r

In [None]:
# using copy actually increases runtimes with more cores!
algorithms_python = ["cor_corrcoef", "cor_matrix_symmetrical", "cor_matrix_symmetrical_nocopy"]
# algorithms_python = ["cor_corrcoef", "cor_matrix_symmetrical_nocopy"]
# algorithms_python = ["cor_corrcoef", "cor_matrix_symmetrical"]
# algorithms_python = ["cor_matrix_symmetrical"]
# algorithms_python = ["cor_matrix_symmetrical_nocopy"]
# algorithms_python = ["cor_matrix_symmetrical", "cor_matrix_symmetrical_nocopy"]
algorithms_julia = []

r = results[
    results.algorithm_base.isin(algorithms_python) 
    & (results.context == "fast") 
    & (results.language == "python") 
    & (results.data.str.contains("cancer_postprocessed_nonnegatives_dropduplicates_sample-0.25"))]

r = results[
#     results.algorithm_base.isin(algorithms_python) 
    (results.context == "fast") 
    & (results.language == "python") 
    & (results.data.str.contains("cancer_postprocessed_nonegatives_dropduplicates_sample-0.25"))
]
r

fig, ax = plt.subplots(1,1)
sns.lineplot(x=r.algorithm_threads.values, y=r.runtime.values, hue=r.algorithm_base.values, ax=ax)#, order=[1,2,4,8,16,32,64])
ax.set_xscale("log")
ax.set_yscale("log")

ax.set_xticks([1,2,4,8,16,32,64])
ax.set_xticklabels([1,2,4,8,16,32,64])
# plt.yscale("log")
# plt.ylim([5,20])

In [None]:
def _median_per_thread_count(frame, algorithm_base):
    """Median of the numeric columns per thread count for one base algorithm, sorted by thread count."""
    return (
        frame[frame.algorithm_base == algorithm_base]
        .groupby(["algorithm_base", "algorithm_threads"])
        # numeric_only=True: `frame` also holds string columns, for which
        # pandas >= 2.0 raises instead of silently dropping them
        .median(numeric_only=True)
        .reset_index()
        .sort_values("algorithm_threads")
    )

# each frame is sorted on its own thread counts instead of sharing one order array
rr = _median_per_thread_count(r, "cor_matrix_symmetrical_nocopy")
rr2 = _median_per_thread_count(r, "cor_matrix_symmetrical")
rr3 = _median_per_thread_count(r, "cor_corrcoef")

fig, ax1 = plt.subplots(1,1, figsize=(3,2), dpi=150)

# runtime curves; line styles correspond to the entries of legend_parallel.pdf
for medians, line_style in [(rr2, "--"), (rr, ":"), (rr3, "-")]:
    g1, = ax1.plot(
        medians["algorithm_threads"].values, 
        medians["runtime"].values,
        marker="o",
        color=colors[0],
        linestyle=line_style)

ax1.set_xticks(rr["algorithm_threads"].values)
ax1.set_ylabel("seconds", color=g1.get_color())
ax1.set_xlabel("cores")

ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)

# memory of the no-copy variant as bars on a secondary axis
# NOTE(review): memory is divided by 1000 here but by 1024 elsewhere in this
# notebook -- confirm the intended unit conversion.
ax2 = ax1.twinx()
ax2.bar(
    rr["algorithm_threads"].values, 
    rr["memory"].values / 1000,
    color="lightgrey", 
    # width proportional to x so the bars look equally wide on the log axis
    width=.3 * rr["algorithm_threads"].values,
    align="center")

ax2.set_ylabel("memory (Gb)", color="grey")
ax2.set_ylim([0,66])

ax2.spines['top'].set_visible(False)

# draw the runtime curves on top of the memory bars
ax1.set_zorder(1)
ax1.patch.set_visible(False)

ax1.set(xscale="log", yscale="log")
ax2.set(xscale="log")
ax1.set_xticks(rr["algorithm_threads"].values)
ax1.set_xticklabels(rr["algorithm_threads"].values)

ax1.set_yticks([5, 10,20,40, 80,160])
ax1.set_yticklabels([5, 10,20,40,80,160])

fig.tight_layout()
fig.savefig(out_dir / "threads_real-data_fast.pdf", bbox_inches="tight")

In [None]:
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt 
from matplotlib.lines import Line2D

# Stand-alone legend figure for the thread-scaling runtime plot.
# Each (linestyle, label) pair corresponds to one line style used in that plot.
legend_entries = [
    ("-", "corrcoef"),
    ("--", f"{method_name} (copy)"),
    (":", f"{method_name} (no copy)"),
]

# markersize=0 hides the marker so that only the line style shows in the legend.
patchList = [
    Line2D(
        [0], [0], marker="o", color=colors[0], linestyle=linestyle,
        label=label,
        markersize=0)
    for linestyle, label in legend_entries
]

fig, axes = plt.subplots(1, 1, dpi=150, figsize=(2, 2))
ax = axes
ax.legend(handles=patchList, loc="upper left", frameon=False)
ax.axis("off")
# Save through the shared output directory instead of a hard-coded path.
fig.savefig(out_dir / "legend_parallel.pdf", bbox_inches="tight")

### Top-k

In [None]:
# Inspect the distinct parallelism settings in `r`.
# NOTE(review): `r` is still whatever an earlier cell assigned; the top-k
# selection is only assigned in the following cell.
r["algorithm_parallel"].unique()

In [None]:
# Select the top-k benchmark rows: Python implementations with context
# parameter 0.1 whose data set name contains the given string.
r = results[
    (results.context == "topk")
    & (results.language == "python")
    & (results.context_param == 0.1)
    # regex=False matches the name literally; with the default regex matching
    # the "." in "1.00" would act as a wildcard for any character.
    & results.data.str.contains(
        "cancer_postprocessed_nonegatives_dropduplicates_sample-1.00",
        regex=False,
    )
]
r

In [None]:
def _mean_by_parallel(frame, memory_backend):
    """Mean and count of runtime/memory per (algorithm, parallelism) group.

    Rows are restricted to algorithms whose base name starts with
    ``topk_balltree_combined_tree_optimized``, to the ``topk_default`` prefix
    and to the given ``memory_backend``. Only ``runtime`` and ``memory`` are
    aggregated, so non-numeric columns never reach ``mean`` (which newer
    pandas versions reject).
    """
    selected = frame[
        frame.algorithm_base.str.startswith("topk_balltree_combined_tree_optimized")
        & (frame.prefix == "topk_default")
        & (frame.memory_backend == memory_backend)
    ]
    return (
        selected
        .groupby(["algorithm_base", "algorithm_parallel"])[["runtime", "memory"]]
        .agg(["mean", "count"])
        .reset_index()
    )


# Summary of the runs recorded with the "psutil" memory backend.
runtimes = _mean_by_parallel(r, "psutil")
display(runtimes)

# Summary of the runs recorded with the "psutil_uss" memory backend.
memory = _mean_by_parallel(r, "psutil_uss")
display(memory)

In [None]:
# Start from the runtime summary and overwrite its mean-memory column with the
# mean memory taken from the `memory` summary.
aggregate = runtimes.copy()
is_topk_tree = aggregate["algorithm_base"].str.startswith("topk_balltree_combined_tree_optimized").values
# The values are assigned by position, so this relies on `runtimes` and
# `memory` listing the same groups in the same order.
aggregate.loc[is_topk_tree, ("memory", "mean")] = memory["memory"]["mean"].values
aggregate

In [None]:
rr = aggregate

# Order the summary rows by core count so line and bars run left to right.
order = np.argsort(rr["algorithm_parallel"].values)
cores_sorted = rr["algorithm_parallel"].values[order]
runtime_minutes = rr["runtime"]["mean"].values[order] / 60
memory_gb = rr["memory"]["mean"].values[order] / 1000  # shown as "memory (Gb)" on the axis

fig, ax1 = plt.subplots(1, 1, figsize=(3, 2), dpi=150)

# Left axis: mean runtime as a line.
(g1,) = ax1.plot(cores_sorted, runtime_minutes, marker="o")

ax1.set_xticks(rr["algorithm_parallel"].values)
ax1.set_ylabel("minutes", color=g1.get_color())
ax1.set_xlabel("cores")

for side in ("right", "top"):
    ax1.spines[side].set_visible(False)

# Right axis: mean memory as grey bars. Bar widths grow with x so the bars
# keep a similar visual width once the x-axis is logarithmic.
ax2 = ax1.twinx()
ax2.bar(
    cores_sorted,
    memory_gb,
    color="lightgrey",
    width=.3 * cores_sorted,
    align="center",
)
ax2.set_ylabel("memory (Gb)", color="grey")
ax2.set_ylim([0, 80])
ax2.spines['top'].set_visible(False)

# Draw the runtime line on top of the memory bars.
ax1.set_zorder(1)
ax1.patch.set_visible(False)

# Only the x-axis is logarithmic here; the runtime axis stays linear.
ax2.set(xscale="log")

# Re-apply the ticks after the scale change and label them with the raw core counts.
tick_positions = rr["algorithm_parallel"].values
ax1.set_xticks(tick_positions)
ax1.set_xticklabels(tick_positions)

fig.tight_layout()
fig.savefig(out_dir / "threads_real-data_topk.pdf", bbox_inches="tight")

## Comprehensive comparison with other libraries

Also see `scripts` for small-scale experiments.

## Accuracy

### Plot

In [None]:
# One accuracy table per data set; every lookup uses the same
# ("topk", 1, "unknown") tail of the index key.
accuracies = [
    (label, accuracy_experiments.loc[(dataset, "topk", 1, "unknown")])
    for label, dataset in [
        ("preeclampsia", "preeclampsia_negative_dropduplicates"),
        ("pregnancy", "pregnancy_negative_dropduplicates"),
        ("cancer", "cancer_negative_dropduplicates_sample-0.25"),
    ]
]

# One figure per column of the accuracy tables, with one line per data set.
# The columns are taken from the first table.
for c in accuracies[0][1].columns:

    fig, ax = plt.subplots(1, 1, figsize=(3, 2), dpi=150)

    for n, accuracy in accuracies:
        ax.plot(accuracy.reset_index()["approximation_factor"], accuracy[c].values, label=n, marker="+")

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    ax.set(xlabel="approximation factor", ylabel=c)
    ax.legend()
    # Save through the shared output directory instead of a hard-coded path.
    fig.savefig(
        out_dir / f"accuracy_real-data_topk-1percent_{c}.pdf", bbox_inches="tight")