# Summary

In this notebook we train a NN with multiple concatenated sequences per batch.

---

# Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.autograd import Variable

In [2]:
%run _imports.ipynb

Setting the PACKAGE_NAME environment variable.
Setting the PACKAGE_VERSION environment variable.
Setting the DOCS_SECRET_KEY environment variable.
Setting the PYTHON_VERSION environment variable.
Setting the SPARK_MASTER environment variable.
Setting the SPARK_ARGS environment variable.
Setting the DB_TYPE environment variable.
Setting the DB_PORT environment variable.


2018-01-30 18:49:21.542655


In [3]:
%run _settings.ipynb

/home/kimlab2/database_data/databin/uniparc_domain/0.1/adjacency_matrix.parquet


In [4]:
from typing import NamedTuple
import random
random.seed(42)

import h5py
from sklearn import metrics

In [5]:
import pagnn
importlib.reload(pagnn)

<module 'pagnn' from '/home/kimlab1/strokach/working/pagnn/pagnn/__init__.py'>

In [6]:
NOTEBOOK_NAME = 'test_pagnn'
NOTEBOOK_PATH = Path(NOTEBOOK_NAME).absolute()
NOTEBOOK_PATH.mkdir(exist_ok=True)

# Parameters

In [7]:
sorted(Path('threshold_by_pc_identity').iterdir())

[PosixPath('threshold_by_pc_identity/adjacency_matrix_test_gt0.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_test_gt40.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_test_gt60.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_test_gt80.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_training_gt0.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_training_gt40.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_training_gt60.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_training_gt80.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_validation_gt0.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_validation_gt40.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_validation_gt60.parquet'),
 PosixPath('threshold_by_pc_identity/adjacency_matrix_validation_gt80.parquet')]

In [8]:
sorted(Path('group_by_sequence_length').iterdir())

[PosixPath('group_by_sequence_length/adjacency_matrix_test_gt0_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_test_gt40_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_test_gt60_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_test_gt80_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_training_gt0_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_training_gt40_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_training_gt60_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_training_gt80_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_validation_gt0_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_validation_gt40_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacency_matrix_validation_gt60_gbseqlen.parquet'),
 PosixPath('group_by_sequence_length/adjacenc

In [465]:
importlib.reload(pagnn)

<module 'pagnn' from '/home/kimlab1/strokach/working/pagnn/pagnn/__init__.py'>

In [466]:
# Dedicated, seeded RNG so that whatever randomness `iter_domain_rows` uses is
# repeatable between runs of this cell.
random_state = np.random.RandomState(24)

# Row source for the validation split (the "gt80" parquet file).
# `columns` pairs parquet column names with the names used downstream
# (presumably a rename mapping old -> new; verify against pagnn.iter_domain_rows).
rows_gen = pagnn.iter_domain_rows(
    Path('threshold_by_pc_identity').joinpath('adjacency_matrix_validation_gt80.parquet'),
    columns={
        'qseq': 'sequence',
        'residue_idx_1_corrected': 'adjacency_idx_1',
        'residue_idx_2_corrected': 'adjacency_idx_2',
    },
    random_state=random_state,
)

# Wraps `rows_gen`; the second argument (50_000) is presumably a batch size —
# TODO confirm against pagnn.iter_dataset_batches.
# NOTE(review): `datasets_gen` shares `rows_gen` with any other consumer of it,
# so iterating `rows_gen` directly elsewhere also advances this pipeline.
datasets_gen = pagnn.iter_dataset_batches(rows_gen, 50_000)

In [467]:
# Collect the `qseq_length_bin=*` entries inside the length-grouped validation
# parquet dataset, sorted so the order is stable across runs.
parquet_folders = sorted(
    Path('group_by_sequence_length')
    .joinpath('adjacency_matrix_validation_gt80_gbseqlen.parquet')
    .glob('qseq_length_bin=*')
)

# Weights derived from the folders above (presumably one per length bin,
# used for sampling; verify against pagnn.get_weights).
weights = pagnn.get_weights(parquet_folders)

In [470]:
# Fresh seeded RNG for the length-grouped row generator.
# NOTE: this rebinds the notebook-level name `random_state`.
random_state = np.random.RandomState(42)

# Row generator over the per-length-bin folders, using the same column
# mapping as `rows_gen`.
rows_gbseqlen_gen = pagnn.iter_dataset_rows(
    parquet_folders,
    columns={
        'qseq': 'sequence',
        'residue_idx_1_corrected': 'adjacency_idx_1',
        'residue_idx_2_corrected': 'adjacency_idx_2',
    },
    seq_length_constraint=True,
    random_state=random_state,
)
# Prime the generator: `send(None)` advances it to its first `yield` so that
# later `send(value)` calls are legal. The value yielded at that point is
# printed (it was `None` when this cell was last run).
print(rows_gbseqlen_gen.send(None))

None


In [444]:
# dataset = next(datasets_gen)
# dataset

In [445]:
# Seeded RNG handed to the negative-example generation below.
random_state = np.random.RandomState(42)

# Parallel lists: datasets_neg[i] is the negative example built for datasets_pos[i].
datasets_pos = []
datasets_neg = []

# NOTE(review): this consumes `rows_gen` directly; `datasets_gen` was built on
# the same generator, so it will see fewer (or no) rows afterwards.
for row in tqdm.tqdm_notebook(rows_gen):
    dataset_pos = pagnn.row_to_dataset(row)
    try:
        # Negative examples are drawn through the primed, length-grouped
        # generator `rows_gbseqlen_gen`.
        dataset_neg = pagnn.add_negative_example(
            [dataset_pos],
            method='exact',
            datagen=rows_gbseqlen_gen,
            random_state=random_state,
        )
    except pagnn.MaxNumberOfTriesExceededError as e:
        # No negative could be produced for this row: report it and skip the
        # row entirely so the two lists stay aligned.
        print(e)
        continue
    datasets_pos.append(dataset_pos)
    datasets_neg.append(dataset_neg)

MaxNumberOfTriesExceededError: 1025

In [169]:
def catcher():
    """Coroutine-style generator used to experiment with ``send()``.

    Yields ``step * multiplier`` for ``step`` in ``0..999``. The multiplier
    starts at 1 and is replaced by whatever value the caller sends in; each
    received value is echoed with ``print``.
    """
    multiplier = 1
    step = 0
    while step < 1000:
        # The value of the `yield` expression is whatever the caller sent.
        multiplier = yield step * multiplier
        print(f"Received: {multiplier}")
        step += 1


def multi(gen):
    """Wrap a send-capable generator, scaling every value it yields by 100_000.

    Values sent into the wrapper are forwarded unchanged to ``gen``; the very
    first forwarded value is ``None``, which primes ``gen``.

    When ``gen`` is exhausted the wrapper finishes normally. Letting the
    ``StopIteration`` from ``gen.send()`` escape the generator body would be
    turned into ``RuntimeError`` on Python 3.7+ (PEP 479).
    """
    val = None
    while True:
        try:
            item = gen.send(val)
        except StopIteration:
            # Inner generator is done: end this generator cleanly.
            return
        val = yield item * 100_000

In [170]:
g = catcher()
print("o", next(g))

print(g.send(10))
print(g.send(100))
print(g.send(1000))

o 0
Received: 10
10
Received: 100
200
Received: 1000
3000


In [192]:
g = (i for i in range(100))
g.send(None)

0

In [194]:
def foo():
    """Minimal generator function: calling it returns a generator that yields 1 once."""
    yield 1
    
    
type(foo())

generator

In [171]:
g = catcher()

g = multi(g)
next(g)

print(g.send(10))
print(g.send(100))
print(g.send(1000))
print(g.send(1000))
print(g.send(1000))
print(g.send(1000))

Received: 10
1000000
Received: 100
20000000
Received: 1000
300000000
Received: 1000
400000000
Received: 1000
500000000
Received: 1000
600000000


In [55]:
dataset_1

[DataSet(seq=b'GECLEWLRRYLENGKEMLQRRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRENLR', adj=<79x79 sparse matrix of type '<class 'numpy.float64'>'
	with 439 stored elements in COOrdinate format>, target=0),
 DataSet(seq=b'ILLRYYNQSEAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDR', adj=<86x86 sparse matrix of type '<class 'numpy.float64'>'
	with 474 stored elements in COOrdinate format>, target=0),
 DataSet(seq=b'VDLGTLRGYYNQSEAGSHTIQIMYGCDVGSDGSHSMRYFYTAMSRPGRGEPRFIAVGYVDDTQFVRFDSDAASPREEPRAPWIEQEGPEYWDRNTQIYKAQAQTDRESLRNLRGYYNQSEAGSHTWQTMYGCDLGP', adj=<136x136 sparse matrix of type '<class 'numpy.float64'>'
	with 778 stored elements in COOrdinate format>, target=0),
 DataSet(seq=b'DGRLLRGHDQYAYDGKDYIALNEDLSSWTAADTAAQITQRKWEAARVAEQLRAYLEGTCVEWLRRYLENGKETLQRSHSMRYFDTAVSRPGRGEPRFISVGYVDDTQFVRFDSDAGSPRGEPRAPWVEQEGPEYWDRETQKYKRQAQADRVNLRKLRGYYNQSEDGSHTLQWMYGCDLGPDGRLLRGYDQSAYDGKDYIALNE', adj=<203x203 sparse matrix of type '<class 'numpy.float64'>'
	wit

In [None]:
raise Exception

In [None]:
np.hstack([np.arange(5), np.array([1,2,3])])

In [None]:
np.r_[range(5), [1,2,3]]

In [None]:
np.r_[[1,2,3], [4,5,6]]

In [None]:
rs = np.random.RandomState(42)
rs.rand(32)

In [None]:
import torch
from scipy import sparse

In [None]:
def gen_seq(length=10_000):
    """Return a dense one-hot matrix of shape ``(20, length)`` and dtype int16.

    Every column holds exactly one 1; its row is drawn from a
    ``RandomState`` seeded with 41, so the output is deterministic.
    """
    rng = np.random.RandomState(41)
    hot_rows = rng.randint(0, 20, length, dtype=np.int16)
    one_hot = np.zeros((20, length), dtype=np.int16)
    # One entry per column: (sampled row, column position).
    one_hot[hot_rows, np.arange(length, dtype=int)] = 1
    return one_hot


assert (gen_seq(30).sum(axis=0) == 1).all()

In [None]:
def gen_seq_sparse(length=10_000):
    """Return a sparse (COO) one-hot matrix of shape ``(20, length)``, dtype int16.

    Every column holds exactly one 1; its row is drawn from a
    ``RandomState`` seeded with 41, so the output is deterministic.

    The shape is passed explicitly. Without it ``coo_matrix`` infers the
    number of rows from the largest sampled index, which can be fewer than
    20, and it cannot build an empty matrix at all (``length=0``).
    """
    idx_1 = np.random.RandomState(41).randint(0, 20, length, dtype=int)
    idx_2 = np.arange(length, dtype=int)
    seq = sparse.coo_matrix(
        (np.ones(length, dtype=np.int16), (idx_1, idx_2)),
        shape=(20, length),
        dtype=np.int16,
    )
    return seq


assert (gen_seq_sparse(30).toarray().sum(axis=0) == 1).all()

In [None]:
def gen_adj(seq_length=10_000, interaction_length=200_000):
    """Return a dense int16 matrix of shape ``(seq_length, interaction_length)``.

    Every column holds exactly one 1; its row is drawn from a
    ``RandomState`` seeded with 41, so the output is deterministic.
    """
    rng = np.random.RandomState(41)
    hot_rows = rng.randint(0, seq_length, interaction_length, dtype=int)
    adj = np.zeros((seq_length, interaction_length), dtype=np.int16)
    # One entry per column: (sampled row, column position).
    adj[hot_rows, np.arange(interaction_length, dtype=int)] = 1
    return adj


assert (gen_adj(10, 100).sum(axis=0) == 1).all()

In [None]:
def gen_adj_sparse(seq_length=10_000, interaction_length=200_000):
    """Return a sparse (COO) int16 matrix of shape ``(seq_length, interaction_length)``.

    Every column holds exactly one 1; its row is drawn from a
    ``RandomState`` seeded with 41, so the output is deterministic.

    The shape is passed explicitly. Without it ``coo_matrix`` infers the
    number of rows from the largest sampled index, so the result could have
    fewer than ``seq_length`` rows and fail to line up in a later matrix
    product.
    """
    idx_1 = np.random.RandomState(41).randint(0, seq_length, interaction_length, dtype=int)
    idx_2 = np.arange(interaction_length, dtype=int)
    adj = sparse.coo_matrix(
        (np.ones(interaction_length, dtype=np.int16), (idx_1, idx_2)),
        shape=(seq_length, interaction_length),
        dtype=np.int16,
    )
    return adj


# Sanity check for the sparse generator (this line previously re-tested the dense `gen_adj`).
assert (gen_adj_sparse(10, 100).toarray().sum(axis=0) == 1).all()

In [None]:
def to_torch_sparse(sparray):
    """Convert a SciPy sparse matrix into a float32 ``torch`` sparse COO tensor.

    Parameters
    ----------
    sparray : scipy.sparse matrix
        Any SciPy sparse format; it is converted to COO first.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor of dtype ``torch.float32`` with the same shape as
        ``sparray``.

    Notes
    -----
    The size is passed explicitly. If it is omitted, torch infers it from
    the largest index present, which silently drops trailing all-zero rows
    and columns.
    """
    coo = sparray.tocoo()
    # Sparse COO indices must be int64; SciPy commonly stores them as int32.
    indices = torch.as_tensor(np.vstack([coo.row, coo.col]), dtype=torch.long)
    values = torch.as_tensor(coo.data, dtype=torch.float32)
    return torch.sparse_coo_tensor(indices, values, size=tuple(coo.shape))

In [None]:
def test_mat_multiply_nocuda(use_streams):
    """Timing helper: multiply one dense sequence matrix by one dense adjacency matrix.

    Builds a 600-column sequence matrix and a (600, 2400) adjacency matrix,
    then repeats the product 10_000 times. No ``.cuda()`` transfer is made.

    ``use_streams`` is accepted but not used. Later cells redefine this
    function under the same name.
    """
    seq_dense = to_torch_sparse(gen_seq_sparse(600)).to_dense()
    adj_dense = to_torch_sparse(gen_adj_sparse(600, 2400)).to_dense()

    for _ in range(10_000):
        # The product is discarded; only the elapsed time matters.
        _product = seq_dense @ adj_dense

In [None]:
%time test_mat_multiply_nocuda(None)

In [None]:
def test_mat_multiply_nocuda(use_streams):
    """Timing helper: multiply consecutive 600-column slices by the adjacency matrix.

    Builds a 6000-column sequence matrix and a (600, 2400) adjacency matrix.
    Each of the 1_000 rounds walks the sequence matrix in 10 consecutive
    slices of 600 columns and multiplies each slice by the adjacency matrix.
    No ``.cuda()`` transfer is made.

    ``use_streams`` is accepted but not used. This definition replaces the
    earlier function of the same name.
    """
    seq_dense = to_torch_sparse(gen_seq_sparse(6000)).to_dense()
    adj_dense = to_torch_sparse(gen_adj_sparse(600, 2400)).to_dense()

    for _ in range(1_000):
        for offset in range(0, 10 * 600, 600):
            # The product is discarded; only the elapsed time matters.
            _product = seq_dense[:, offset:offset + 600] @ adj_dense

In [None]:
%time test_mat_multiply_nocuda(None)

In [None]:
def test_mat_multiply_nocuda(use_streams):
    """Timing helper: multiply consecutive 600-column slices by the adjacency matrix.

    Builds a 60000-column sequence matrix and a (600, 2400) adjacency matrix.
    Each of the 100 rounds walks the sequence matrix in 100 consecutive
    slices of 600 columns and multiplies each slice by the adjacency matrix.
    No ``.cuda()`` transfer is made.

    ``use_streams`` is accepted but not used. This definition replaces the
    earlier functions of the same name.
    """
    seq_dense = to_torch_sparse(gen_seq_sparse(60000)).to_dense()
    adj_dense = to_torch_sparse(gen_adj_sparse(600, 2400)).to_dense()

    for _ in range(100):
        for offset in range(0, 100 * 600, 600):
            # The product is discarded; only the elapsed time matters.
            _product = seq_dense[:, offset:offset + 600] @ adj_dense

In [None]:
%time test_mat_multiply_nocuda(None)