# Summary

# Imports

In [None]:
import importlib
import os
import sys
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import seaborn as sns
from scipy import stats

In [None]:
%matplotlib inline

In [None]:
# Show up to 100 columns when a DataFrame is displayed. The fully qualified
# key is used because the short form "max_columns" is resolved by pattern
# matching and fails when more than one registered option matches it.
pd.set_option("display.max_columns", 100)

In [None]:
# Absolute path of the ``src`` directory that sits next to the current working
# directory; ``strict=True`` makes this fail immediately if it does not exist.
SRC_PATH = (Path.cwd() / '..' / 'src').resolve(strict=True)

# Put ``src`` at the front of the import path exactly once.
if sys.path.count(SRC_PATH.as_posix()) == 0:
    sys.path.insert(0, SRC_PATH.as_posix())

# Import the local helper module and reload it so that edits made since the
# last import are picked up without restarting the kernel.
import helper
importlib.reload(helper)

# Parameters

In [None]:
# Relative path identifying this notebook. Its final component is used as the
# fallback output directory name when ``OUTPUT_DIR`` is not set.
NOTEBOOK_PATH = Path('validation_protherm_dataset_combined')
NOTEBOOK_PATH

In [None]:
# Output directory: taken from the ``OUTPUT_DIR`` environment variable when it
# is set, otherwise named after the notebook. The path is made absolute and the
# directory (including missing parents) is created if needed.
OUTPUT_PATH = Path(os.environ.get('OUTPUT_DIR', NOTEBOOK_PATH.name)).resolve()
OUTPUT_PATH.mkdir(exist_ok=True, parents=True)
OUTPUT_PATH

In [None]:
# Version string read from the environment; ``None`` when the variable is unset.
PROJECT_VERSION = os.environ.get("PROJECT_VERSION")

In [None]:
# Debug mode is on whenever the ``CI`` environment variable is absent.
DEBUG = os.environ.get("CI") is None
DEBUG

In [None]:
# In debug mode the notebook always uses a fixed version string; otherwise the
# version must already have been provided through ``PROJECT_VERSION``.
if DEBUG:
    PROJECT_VERSION = "0.1"
elif PROJECT_VERSION is None:
    # Raised explicitly (instead of using ``assert``) so the check is still
    # performed under ``python -O`` and the failure explains what is missing.
    raise AssertionError(
        "PROJECT_VERSION must be set in the environment when not running in debug mode"
    )

PROJECT_VERSION

In [None]:
# if DEBUG:
#     %load_ext autoreload
#     %autoreload 2

# `DATAPKG`

In [None]:
# Maps a dataset key to the input files found for it; filled in below.
DATAPKG = {}

In [None]:
# Collect, in sorted order, every ``*_dataset.parquet`` file found one
# directory level below the versioned ``validation_protherm_dataset`` folder
# inside ``DATAPKG_OUTPUT_DIR``.
DATAPKG['validation_protherm_dataset'] = sorted(
    (
        Path(os.environ['DATAPKG_OUTPUT_DIR'])
        / "adjacency-net-v2"
        / f"v{PROJECT_VERSION}"
        / "validation_protherm_dataset"
    ).glob("*/*_dataset.parquet")
)

In [None]:
# Show the list of parquet files that were found.
DATAPKG['validation_protherm_dataset']

# Dataset

In [None]:
# Combined dataset; stays ``None`` until the first parquet file has been read.
validation_protherm_dataset = None


def assert_eq(a1, a2):
    """Raise ``AssertionError`` unless ``a1`` and ``a2`` hold equal values.

    The first element of ``a1`` decides how the comparison is done: if it is
    a NumPy array, both inputs are treated as sequences of arrays and compared
    pair by pair (shape and values must match); otherwise the inputs are
    compared element-wise with a single ``==``. Because ``==`` is used, NaN
    values never compare equal to each other.

    Args:
        a1: Sequence to check, for example a ``pandas.Series`` column.
        a2: Sequence that ``a1`` is expected to match.

    Raises:
        AssertionError: If the values differ, or, for array-valued inputs, if
            the two sequences have different lengths. It is raised explicitly
            rather than through ``assert`` so the check is not stripped when
            Python runs with ``-O``.
    """
    # Fetch the first element by iteration rather than ``a1[0]``: on a Series
    # ``[0]`` is a label lookup and fails when the index has no label 0.
    # Empty input yields ``None`` and falls through to the element-wise branch.
    first = next(iter(a1), None)

    if isinstance(first, np.ndarray):
        # ``zip`` would silently ignore trailing items, so check lengths first.
        if len(a1) != len(a2):
            raise AssertionError(f"Length mismatch: {len(a1)} != {len(a2)}")
        for position, (b1, b2) in enumerate(zip(a1, a2)):
            # ``array_equal`` also rejects arrays whose shapes differ, which a
            # broadcasting ``==`` could report as equal.
            if not np.array_equal(b1, b2):
                raise AssertionError(f"Arrays differ at position {position}")
    elif not np.asarray(a1 == a2).all():
        raise AssertionError("Values differ")
            

# Suffix given to columns of the incoming frame that already exist in the
# combined frame.
DUP_SUFFIX = "_dup"

# Read every dataset file and outer-join it onto the combined frame by index.
# ``validate="1:1"`` makes pandas reject duplicate index values on either side.
for file in DATAPKG['validation_protherm_dataset']:
    df = pq.read_table(file, use_pandas_metadata=True).to_pandas(integer_object_nulls=True)
    if validation_protherm_dataset is None:
        validation_protherm_dataset = df
        continue

    validation_protherm_dataset = validation_protherm_dataset.merge(
        df,
        how="outer",
        left_index=True,
        right_index=True,
        validate="1:1",
        suffixes=("", DUP_SUFFIX),
    )

    # Each duplicated column must agree with the column it shadows; once that
    # is confirmed the duplicates carry no information and are dropped together.
    dup_columns = [
        col for col in validation_protherm_dataset.columns if col.endswith(DUP_SUFFIX)
    ]
    for col in dup_columns:
        col_ref = col[: -len(DUP_SUFFIX)]
        assert_eq(validation_protherm_dataset[col], validation_protherm_dataset[col_ref])
    validation_protherm_dataset = validation_protherm_dataset.drop(columns=dup_columns)

# Fail here with a clear message instead of leaving the dataset as ``None``
# and hitting an unrelated attribute error in a later cell.
if validation_protherm_dataset is None:
    raise FileNotFoundError(
        "No '*_dataset.parquet' files were found for 'validation_protherm_dataset'"
    )

In [None]:
# Preview the first two rows of the combined dataset.
validation_protherm_dataset.head(2)

# Plot

In [None]:
# Reference rows as (feature, correlation, pvalue, is_network); the
# correlation values are fixed and no p-value is recorded for them.
data_ref = [
    ('Provean', 0.25, None, False),
    ('FoldX', 0.48, None, False),
    ('ELASPIC', 0.54, None, False),
]

# Rows computed from the dataset: one per column whose name ends in "_change".
data_net = []

for column in validation_protherm_dataset.columns:
    if not column.endswith("_change"):
        continue
    # Spearman rank correlation of the column against the negated ``ddg_exp``.
    corr, pvalue = stats.spearmanr(
        validation_protherm_dataset[column],
        -validation_protherm_dataset['ddg_exp'],
    )
    # Only the first seven characters of the column name are kept as the label.
    data_net.append((column[:7], corr, pvalue, True))

# Order the computed rows from highest to lowest correlation.
data_net = sorted(data_net, key=lambda row: row[1], reverse=True)

df = pd.DataFrame(
    [*data_ref, *data_net],
    columns=['feature', 'correlation', 'pvalue', 'is_network'],
)
df

In [None]:
# Bar chart with one bar per row of ``df``, written to OUTPUT_PATH as both PNG
# and PDF.
cmap = plt.get_cmap("Set1")

# Optional display names for the x tick labels; a feature without an entry is
# labelled with the first seven characters of its name.
feature_names = {}

# The figure gets wider as the number of bars grows.
with plt.rc_context(rc={'figure.figsize': (2 + 0.5 * len(df), 4), 'font.size': 13}):
    x = np.arange(len(df))
    # Network rows and reference rows are drawn in two different colours.
    c = [cmap(2) if is_network else cmap(1) for is_network in df['is_network']]
    plt.bar(x, df['correlation'], color=c)
    plt.xticks(x, [feature_names.get(f, f[:7]) for f in df['feature'].values], rotation=45)
    plt.ylabel("Correlation")
    plt.title("Predicting Protherm ΔΔG")
    # The y-range is set once. An earlier ``ylim(0.4, 1)`` call was always
    # overridden by this one and has been removed.
    plt.ylim(0, 1)

    plt.tight_layout()
    plt.savefig(OUTPUT_PATH.joinpath("protherm_correlations.png"), dpi=300, bbox_inches="tight")
    plt.savefig(OUTPUT_PATH.joinpath("protherm_correlations.pdf"), bbox_inches="tight")