# Summary

# Imports

In [1]:
import pyarrow

In [2]:
import concurrent.futures
import itertools
import multiprocessing
import os
import os.path as op
import pickle
import subprocess
import tempfile
from functools import partial
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import seaborn as sns
import sqlalchemy as sa
from scipy import stats

from kmtools import py_tools, sequence_tools

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Show up to 100 columns when displaying DataFrames.
# The fully-qualified key is required: a bare "max_columns" is matched as a
# pattern against every option name and is rejected as ambiguous on pandas
# versions that also define "styler.render.max_columns".
pd.set_option("display.max_columns", 100)

# Parameters

In [5]:
# Relative path identifying this notebook (as the variable name suggests).
# Its final component is the default output-directory name used by
# OUTPUT_PATH when the OUTPUT_DIR environment variable is not set.
NOTEBOOK_PATH = Path('validation_training_stats')
NOTEBOOK_PATH

PosixPath('validation_training_stats')

In [6]:
# Output directory: $OUTPUT_DIR when set, otherwise a directory named after
# the notebook, relative to the current working directory.
OUTPUT_PATH = Path(os.environ.get('OUTPUT_DIR', NOTEBOOK_PATH.name))
OUTPUT_PATH = OUTPUT_PATH.resolve()
# Create the directory (and any missing parents); an existing one is fine.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH

PosixPath('/home/kimlab1/database_data/datapkg/adjacency-net-v2/notebooks/validation_training_stats')

In [7]:
# Job-array coordinates and the network selection come from the environment.
TASK_ID = os.environ.get("SLURM_ARRAY_TASK_ID")
# ORIGINAL_ARRAY_TASK_COUNT, when set and non-empty, takes precedence over
# SLURM_ARRAY_TASK_COUNT.
TASK_COUNT = (
    os.environ.get("ORIGINAL_ARRAY_TASK_COUNT")
    or os.environ.get("SLURM_ARRAY_TASK_COUNT")
)
NETWORK_NAME = os.environ.get("NETWORK_NAME")

# Convert the raw strings to integers; values that are unset stay None.
if TASK_ID is not None:
    TASK_ID = int(TASK_ID)
if TASK_COUNT is not None:
    TASK_COUNT = int(TASK_COUNT)

TASK_ID, TASK_COUNT

(None, None)

In [8]:
# Debug mode is on unless a CI environment variable is present (any value,
# including an empty string, counts as present).
DEBUG = os.environ.get("CI") is None
DEBUG

True

In [9]:
if not DEBUG:
    # Outside debug mode the network must be chosen via the NETWORK_NAME
    # environment variable.
    assert NETWORK_NAME is not None
else:
    # Debug runs always use this fixed network, overriding any value read
    # from the environment.
    NETWORK_NAME = "bd46824"

In [10]:
# In debug mode, enable IPython's autoreload extension in mode 2, which
# reloads imported modules before executing code so that local edits to
# library code take effect without restarting the kernel.
if DEBUG:
    %load_ext autoreload
    %autoreload 2

# `DATAPKG`

In [11]:
# Registry mapping a data-package name to its output directory; the entries
# are filled in by the cells that follow.
DATAPKG = {}

In [12]:
# "master" directory of the uniparc-domain-wstructure package under
# $DATAPKG_OUTPUT_DIR (a KeyError is raised if the variable is not set).
DATAPKG['uniparc-domain-wstructure'] = (
    Path(os.environ['DATAPKG_OUTPUT_DIR']) / "uniparc-domain-wstructure" / "master"
)

In [13]:
# "master" directory of the adjacency-net-v2 package under
# $DATAPKG_OUTPUT_DIR (a KeyError is raised if the variable is not set).
DATAPKG['adjacency_net_v2'] = (
    Path(os.environ['DATAPKG_OUTPUT_DIR']) / "adjacency-net-v2" / "master"
)

In [14]:
# "master" directory of the hhsuite-wstructure package under
# $DATAPKG_OUTPUT_DIR (a KeyError is raised if the variable is not set).
DATAPKG['hhsuite-wstructure'] = (
    Path(os.environ['DATAPKG_OUTPUT_DIR']) / "hhsuite-wstructure" / "master"
)

# Network evaluation

In [15]:
# Execute the companion notebook so the names it defines become available here.
# NOTE(review): presumably this is what defines TRAINED_NETWORKS, which the
# prediction cells below index by network name — confirm.
%run trained_networks.ipynb

In [16]:
def predict_with_dcn_old(input_, network_info, network_state):
    """Run the ``dcn_old`` predictor from ``pagnn`` on ``input_``.

    Args:
        input_: Data handed unchanged to ``pagnn.prediction.dcn_old.main``
            (the demo cell in this notebook passes a pandas DataFrame).
        network_info: Forwarded to ``Args`` as ``network_info``.
        network_state: Forwarded to ``Args`` as ``network_state``.

    Returns:
        Whatever ``pagnn.prediction.dcn_old.main`` returns for these arguments.
    """
    # Imported at call time rather than with the notebook's top-level imports.
    from pagnn.prediction.dcn_old import Args as DcnOldArgs, main as run_dcn_old

    return run_dcn_old(
        DcnOldArgs(network_info=network_info, network_state=network_state),
        input_,
    )

In [17]:
def predict_with_dcn(input_, network_info, network_state):
    """Run the ``dcn`` predictor from ``pagnn`` on ``input_``.

    Args:
        input_: Data handed unchanged to ``pagnn.prediction.dcn.main``
            (the demo cell in this notebook passes a pandas DataFrame).
        network_info: Forwarded to ``Args`` as ``network_info``.
        network_state: Forwarded to ``Args`` as ``network_state``.

    Returns:
        Whatever ``pagnn.prediction.dcn.main`` returns for these arguments.
    """
    # Imported at call time rather than with the notebook's top-level imports.
    from pagnn.prediction.dcn import Args as DcnArgs, main as run_dcn

    return run_dcn(
        DcnArgs(network_info=network_info, network_state=network_state),
        input_,
    )

## Demo dataset

In [18]:
# Demo input, given relative to the working directory.
input_file = Path("demo_datasets/demo_dataset_1/input.parquet")
# strict=True makes resolve() raise FileNotFoundError if the file is missing.
input_file = input_file.resolve(strict=True)
input_file

PosixPath('/home/kimlab1/database_data/datapkg/adjacency-net-v2/notebooks/demo_datasets/demo_dataset_1/input.parquet')

In [19]:
# Read the demo Parquet file with pyarrow and convert it to a pandas
# DataFrame, then preview the first two rows.
input_df = pq.read_table(input_file).to_pandas()
input_df.head(2)

Unnamed: 0,sequence,adjacency_idx_1,adjacency_idx_2
0,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKGEL...,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, ...","[1, 2, 4, 5, 8, 157, 160, 0, 2, 3, 4, 5, 157, ..."
1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, ...","[1, 2, 4, 5, 8, 157, 160, 0, 2, 3, 4, 5, 6, 15..."


In [20]:
# Score the demo inputs with the 'dcn_old_0' entry of TRAINED_NETWORKS;
# keywords are listed in the same order as the function signature.
predict_with_dcn_old(
    input_df,
    network_info=TRAINED_NETWORKS['dcn_old_0']['network_info'],
    network_state=TRAINED_NETWORKS['dcn_old_0']['network_state'],
)

Unnamed: 0,predictions
0,0.539994
1,0.577878
2,0.59299
3,0.533424
4,0.544132
5,0.530011
6,0.555791
7,0.44644
8,0.260939
9,0.260939


In [21]:
# Score the demo inputs with the 'bd46824' entry of TRAINED_NETWORKS;
# keywords are listed in the same order as the function signature.
predict_with_dcn(
    input_df,
    network_info=TRAINED_NETWORKS['bd46824']['network_info'],
    network_state=TRAINED_NETWORKS['bd46824']['network_state'],
)

Final output_channels: 256


Initializing custom network


Unnamed: 0,predictions
0,0.000782
1,0.000498
2,0.000525
3,0.000545
4,0.000609
5,0.000668
6,0.000652
7,0.00071
8,0.000252
9,0.000252
