# Summary

----

# Imports

In [1]:
import torch

In [2]:
%run _imports.ipynb

Setting the PACKAGE_VERSION environment variable.
Setting the DOCS_SECRET_KEY environment variable.
Setting the PYTHON_VERSION environment variable.
Setting the SPARK_MASTER environment variable.
Setting the SPARK_ARGS environment variable.
Setting the DB_TYPE environment variable.
Setting the DB_PORT environment variable.


2018-04-30 13:41:37.891125


In [3]:
import math
from sklearn import metrics

In [4]:
import pagnn
import pagnn.training.gan
import pagnn.prediction.gan

In [5]:
pagnn.settings.CUDA = False

# Parameters

In [6]:
NOTEBOOK_NAME = 'validate_trained_network'
NOTEBOOK_PATH = Path(NOTEBOOK_NAME)
NOTEBOOK_PATH.mkdir(exist_ok=True)

In [7]:
validation_files = {
    'permute': 'validation_gan_permute_80_1000.pickle',
    'exact': 'validation_gan_exact_80_1000.pickle',
    'start': 'validation_gan_start_80_1000.pickle',
    'stop': 'validation_gan_stop_80_1000.pickle',
    'middle': 'validation_gan_middle_80_1000.pickle',
    'edges': 'validation_gan_edges_80_1000.pickle',
}

# Workspace

In [8]:
validation_data = dict()
for name, file in validation_files.items():
    with Path('train_neural_network').joinpath(file).open('rb') as fin:
        validation_data[name] = pickle.load(fin)

In [9]:
validation_data['permute'][0]

DataSetGAN(seqs=[b'YQLEKDPIAGAETFYVDGAANRETKLGKAGYVTDRGRRKIVSLTETTNQKTELQAIYIALQDSGSEVNIVTDSQYALGIIQAQPDKSESELVNQIIEQLIGKERVYLSWVPAHKGIGGNEQVDKLVSSG', b'VTDRGRRKIVSLTETTNQKTELQAIYIALQDSGSEVNIVTDSQYALGIIQAQPDKSESELVNQIIEQLIGKERVYLSWVPAHKGIGGNEQVDKLVSSGYQLEKDPIAGAETFYVDGAANRETKLGKAGY'], adjs=[<129x129 sparse matrix of type '<class 'numpy.int16'>'
	with 1682 stored elements in COOrdinate format>], targets=[1, 0], meta=None)

In [10]:
mutation_data = dict()
for name in ['protherm', 'humsavar']:
    mutation_data[name] = pagnn.training.gan.get_mutation_dataset(name, NOTEBOOK_PATH.parent)

## Functions

In [11]:
def load_networks(unique_name, step):

    args = pagnn.prediction.gan.Args(
        input_file='',
        output_file='',
        work_path=(
            NOTEBOOK_PATH.parent
            .joinpath('train_neural_network')
            .joinpath(unique_name)),
        step=step,
        nseqs=10_000,
    )

    # Training arguments
    args_training = pagnn.training.gan.Args(root_path=args.work_path.parent)
    args_training.unique_name = args.work_path.name

    # Load network
    net_d = pagnn.models.AESeqAdjApplyExtra(
        'discriminator',
        hidden_size=args_training.hidden_size,
        bottleneck_size=1,
    )

    net_g = pagnn.models.AESeqAdjApplyExtra(
        'generator',
        hidden_size=args_training.hidden_size,
        bottleneck_size=16,
        encoder_network=net_d,
    )

    net_d.load_state_dict(
        torch.load(
            args_training.work_path.joinpath('models').joinpath(f'net_d-step_{args.step}.model')
            .as_posix(),
            map_location='cpu'))
    net_g.load_state_dict(
        torch.load(
            args_training.work_path.joinpath('models').joinpath(f'net_g-step_{args.step}.model')
            .as_posix(),
            map_location='cpu'))
    
    return net_d, net_g

In [12]:
def get_bottleneck_indices(adjs):
    idxs = []
    start = 0
    for i, adj in enumerate(adjs):
        stop = start + adj[4].shape[1]
        idxs.append((math.floor(start / 4), math.ceil(stop / 4),))
        start = stop
    assert idxs[-1][1] == math.ceil(sum(adj[4].shape[1] for adj in adjs) / 4)
    return idxs

In [13]:
def evaluate_validation_dataset(net_d, datasets, batch_size):
    """

    Returns:
        A tuple of targets and outputs arrays.
            - Targets are ΔΔG values.
            - Outputs are (pred_mut [low] - pred_wt [high]), so they should be *positive* for
              stabilizing mutations and *negative* for destabilizing mutations (i.e. the
              *reverse* of ΔΔG).
    """
    outputs = []
    targets = []
    datasets = iter(datasets)
    
    last_run = False
    while not last_run:
        batch_pos = ([], [], [])
        batch_neg = ([], [], [])
        while len(batch_pos[0]) < batch_size:
            try:
                dataset = next(datasets)
            except StopIteration:
                last_run = True
                break
            else:
                datavar = net_d.dataset_to_datavar(dataset)
                batch_pos[0].append(datavar.seqs[0:1, :, :])
                batch_pos[1].append(datavar.adjs)
                batch_pos[2].append(dataset.targets[0])  # 1 / 1 / 1
                batch_neg[0].append(datavar.seqs[1:2, :, :])
                batch_neg[1].append(datavar.adjs)
                batch_neg[2].append(dataset.targets[1])  # 0 / ddG / 0 or 1

        if len(batch_pos[0]) == 0:
            break

        datavar_pos = (
            torch.cat(batch_pos[0], 2),
            batch_pos[1],
        )
        datavar_neg = (
            torch.cat(batch_neg[0], 2),
            batch_neg[1],
        )

        with torch.no_grad():
            output_pos = net_d(*datavar_pos)
            output_neg = net_d(*datavar_neg)

        output = []
        target = []
        for i, (start, stop) in enumerate(get_bottleneck_indices(datavar_pos[1])):
            output.append(float(output_pos[:, :, start:stop].sigmoid().mean()))
            output.append(float(output_neg[:, :, start:stop].sigmoid().mean()))
            target.extend([batch_pos[2][i], batch_neg[2][i]])
        assert (i + 1) == len(batch_pos[2]) == len(batch_neg[2])
        assert output_pos.shape == output_neg.shape
        assert stop <= output_pos.shape[2] <= (stop + 1)
        
        outputs.extend(output)
        targets.extend(target)

    outputs_ar = np.array(outputs)
    targets_ar = np.array(targets)
    return targets_ar, outputs_ar

In [14]:
def evaluate_mutation_dataset(net_d, datasets, batch_size):
    """

    Returns:
    A tuple of targets and outputs arrays.
        - Targets are 0 for benign, -1 for deleterious.
        - Outputs are (pred_mut [low] - pred_wt [high]), so they should be *positive* for
            stabilizing mutations and *negative* for destabilizing mutations (i.e. the
            *reverse* of ΔΔG).
    """
    outputs = []
    targets = []
    datasets = iter(datasets)
    
    last_run = False
    while not last_run:
        batch_pos = ([], [], [])
        while len(batch_pos[0]) < batch_size:
            try:
                dataset = next(datasets)
            except StopIteration:
                last_run = True
                break
            else:
                datavar = net_d.dataset_to_datavar(dataset)
                # Pos
                batch_pos[0].append(datavar.seqs[0:1, :, :])
                batch_pos[1].append(datavar.adjs)
                batch_pos[2].append(dataset.targets[0])  # 1 / 1 / 1
                # Neg
                batch_pos[0].append(datavar.seqs[1:2, :, :])
                batch_pos[1].append(datavar.adjs)
                batch_pos[2].append(dataset.targets[1])  # 0 / ddG / 0 or 1

        if len(batch_pos[0]) == 0:
            break

        datavar_pos = (
            torch.cat(batch_pos[0], 2),
            batch_pos[1],
        )

        with torch.no_grad():
            output_pos = net_d(*datavar_pos)

        output = []
        target = []
        for i, (start, stop) in enumerate(get_bottleneck_indices(datavar_pos[1])):
            output.append(float(output_pos[:, :, start:stop].sigmoid().mean()))
            target.append(batch_pos[2][i])
        assert (i + 1) == len(batch_pos[2])
        assert stop <= output_pos.shape[2] <= (stop + 1)
        
        outputs.extend(output)
        targets.extend(target)

    outputs_ar = np.array(outputs)
    targets_ar = np.array(targets)
    return targets_ar, outputs_ar

In [15]:
def worker(unique_name, step=12462):
    step = int(step)
    net_d, net_g = load_networks(unique_name, step)

    net_d.train()
    net_g.train()

    assert net_d.training
    assert net_g.training

    all_scores = {}
    for fn in [evaluate_mutation_dataset, evaluate_validation_dataset]:
        for batch_size in [1, 64]:
            scores = {}
            for name, datasets in validation_data.items():
                targets_valid, outputs_valid = fn(net_d, datasets, batch_size)
                scores.update({
                    f'{name}-auc': metrics.roc_auc_score(targets_valid, outputs_valid),
                    f'{name}-targets-mean': targets_valid.mean(),
                    f'{name}-outputs-mean': outputs_valid.mean(),
                })
            for name, datasets in mutation_data.items():
                targets_valid, outputs_valid = fn(net_d, datasets, batch_size)
                targets_muts = targets_valid[1::2]
                outputs_muts = outputs_valid[1::2] - outputs_valid[0::2]
                if 'protherm' in name:
                    # Protherm predicts ΔΔG, so positive values are destabilizing
                    scores[f'{name}-spearman_corr'] = sp.stats.spearmanr(-targets_valid, outputs_valid).correlation
                elif 'humsavar' in name:
                    # For humsavar: 0 = stable, 1 = deleterious
                    scores[f'{name}-auc'] = metrics.roc_auc_score(1 - targets_valid, outputs_valid)
                else:
                    scores[f'{name}-auc'] = metrics.roc_auc_score(targets_valid + 1, outputs_valid)
                scores.update({
                    f'{name}-targets-mean': targets_valid.mean(),
                    f'{name}-outputs-mean': outputs_valid.mean(),
                })
            all_scores[(fn.__name__, batch_size)] = scores
    return all_scores

## Execute

In [16]:
unique_name = 'permute-seq-0-test_x14-0.1.9.dev-4a07eef'

In [17]:
model_path = (
    NOTEBOOK_PATH
    .parent
    .joinpath('train_neural_network')
    .joinpath(unique_name)
    .joinpath('models')
)
d_models = [int(re.findall('net_d-step_(\d+).model', str(p))[0]) for p in model_path.glob('net_d-step_*.model')]
g_models = [int(re.findall('net_g-step_(\d+).model', str(p))[0]) for p in model_path.glob('net_g-step_*.model')]
assert not set(d_models) ^ set(g_models)
steps = d_models

In [18]:
df = pd.DataFrame({'step': d_models}, index=range(len(d_models)))
df['unique_name'] = 'permute-seq-0-test_x14-0.1.9.dev-4a07eef'

In [19]:
len(df)

112

In [None]:
task_id = int(os.environ['SLURM_ARRAY_TASK_ID'])

In [None]:
df = df[task_id:task_id + 1]

In [None]:
all_scores = [
    worker(unique_name, step)
    for unique_name, step
    in zip(df['unique_name'].values, df['step'].values)
]

In [21]:
with NOTEBOOK_PATH.joinpath(f'all_scores_{task_id}.pickle').open('wb') as fout:
    pickle.dump(all_scores, fout, pickle.HIGHEST_PROTOCOL)

In [23]:
raise Exception("Done!")

Exception: Done!

In [34]:
!ls {NOTEBOOK_PATH}/all_scores.pickle

validate_trained_network/all_scores.pickle


In [45]:
with NOTEBOOK_PATH.joinpath('all_scores_0.pickle').open('rb') as fin:
    data = pickle.load(fin)

In [46]:
data

[{('evaluate_mutation_dataset', 1): {'edges-auc': 0.599506,
   'edges-outputs-mean': 0.6557192423933427,
   'edges-targets-mean': 0.5,
   'exact-auc': 0.617356,
   'exact-outputs-mean': 0.6807881731571156,
   'exact-targets-mean': 0.5,
   'humsavar-auc': 0.5327501649326212,
   'humsavar-outputs-mean': 0.7160360887895476,
   'humsavar-targets-mean': 0.7586159360352909,
   'middle-auc': 0.6135455,
   'middle-outputs-mean': 0.6350259415384935,
   'middle-targets-mean': 0.5,
   'permute-auc': 0.6127520000000001,
   'permute-outputs-mean': 0.6178972894671771,
   'permute-targets-mean': 0.5,
   'protherm-outputs-mean': 0.753119285922337,
   'protherm-spearman_corr': -0.04370535501024037,
   'protherm-targets-mean': 1.0729990058253231,
   'start-auc': 0.6127005000000001,
   'start-outputs-mean': 0.6341591125544404,
   'start-targets-mean': 0.5,
   'stop-auc': 0.6058479999999999,
   'stop-outputs-mean': 0.6565360083819569,
   'stop-targets-mean': 0.5},
  ('evaluate_mutation_dataset', 64): {'ed

## Submit

In [37]:
JOB_ID = 'job_1'
NOTEBOOK_PATH.joinpath(JOB_ID).mkdir(parents=True, exist_ok=True)

In [38]:
script = f"""\
#!/bin/bash
#SBATCH --time=12:00:00
#SBATCH --nodes=1
# SBATCH --exclusive
# SBATCH --mem=0
#SBATCH --account=def-pmkim
# SBATCH --account=rrg-pmkim
#SBATCH --job-name={NOTEBOOK_NAME}
#SBATCH --export=ALL
#SBATCH --output={NOTEBOOK_PATH.absolute()}/{JOB_ID}/slurm-%A_%a.out
#SBATCH --array=0-111
set -ev

unset XDG_RUNTIME_DIR

jupyter nbconvert ./07-{NOTEBOOK_NAME}.ipynb \\
    --to html \\
    --execute \\
    --ExecutePreprocessor.timeout=$((60 * 60 * 24))
"""

In [39]:
with NOTEBOOK_PATH.with_suffix('.sh').open('wt') as fout:
    fout.write(script)

In [40]:
!cat {NOTEBOOK_NAME}.sh

#!/bin/bash
#SBATCH --time=12:00:00
#SBATCH --nodes=1
# SBATCH --exclusive
# SBATCH --mem=0
#SBATCH --account=def-pmkim
# SBATCH --account=rrg-pmkim
#SBATCH --job-name=validate_trained_network
#SBATCH --export=ALL
#SBATCH --output=/gpfs/fs0/scratch/p/pmkim/strokach/datapkg/adjacency-net/notebooks/validate_trained_network/job_1/output.log
#SBATCH --array=0-111
set -ev

unset XDG_RUNTIME_DIR

jupyter nbconvert ./07-validate_trained_network.ipynb \
    --to html \
    --execute \
    --ExecutePreprocessor.timeout=$((60 * 60 * 24))


In [41]:
!chmod +x {NOTEBOOK_NAME}.sh

In [42]:
# !./{NOTEBOOK_NAME}.sh

In [43]:
!sbatch ./{NOTEBOOK_NAME}.sh

Submitted batch job 45974
