# Summary

# Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.autograd import Variable

In [2]:
%run _imports.ipynb

Setting the PYTHON_VERSION environment variable.
Setting the SPARK_MASTER environment variable.
Setting the SPARK_ARGS environment variable.
Setting the DB_TYPE environment variable.
Setting the DB_PORT environment variable.


2017-12-13 18:35:59.863847


In [3]:
%run _settings.ipynb

In [4]:
import random
from typing import NamedTuple

import h5py
from sklearn import metrics

In [5]:
import pagnn
importlib.reload(pagnn)

<module 'pagnn' from '/home/kimlab1/strokach/working/pagnn/pagnn/__init__.py'>

In [6]:
NOTEBOOK_NAME = 'generate_datasets'
NOTEBOOK_PATH = Path(NOTEBOOK_NAME).absolute()
NOTEBOOK_PATH.mkdir(exist_ok=True)

# Parameters

## Gene3D domains

In [7]:
ADJACENCY_MATRIX_PARQUET_FILES = list(ADJACENCY_MATRIX_PATH.glob('*/*.parquet'))

In [8]:
GENE3D_DOMAINS = sorted({op.basename(op.dirname(f)) for f in ADJACENCY_MATRIX_PARQUET_FILES})

random.seed(42)
random.shuffle(GENE3D_DOMAINS)
GENE3D_DOMAINS[:3]

['database_id=G3DSA%3A2.40.128.20',
 'database_id=G3DSA%3A3.50.40.10',
 'database_id=G3DSA%3A2.60.40.830']

In [9]:
with open(f'{NOTEBOOK_NAME}/gene3d_domains.pickle', 'wb') as fout:
    pickle.dump(GENE3D_DOMAINS, fout, pickle.HIGHEST_PROTOCOL)

## Training / validation domains

In [10]:
breakpoint1 = len(GENE3D_DOMAINS) * 3 // 4
print(breakpoint1)
breakpoint2 = len(GENE3D_DOMAINS) * 7 // 8
print(breakpoint2)

TRAINING_DOMAINS = GENE3D_DOMAINS[:breakpoint1]
VALIDATION_DOMAINS = GENE3D_DOMAINS[breakpoint1:breakpoint2]
TEST_DOMAINS = GENE3D_DOMAINS[breakpoint2:]

1029
1201


In [11]:
assert (len(TRAINING_DOMAINS) + len(VALIDATION_DOMAINS) + len(TEST_DOMAINS)) == len(GENE3D_DOMAINS)

In [12]:
with open(f'{NOTEBOOK_NAME}/training_domains.pickle', 'wb') as fout:
    pickle.dump(TRAINING_DOMAINS, fout, pickle.HIGHEST_PROTOCOL)
    
with open(f'{NOTEBOOK_NAME}/validation_domains.pickle', 'wb') as fout:
    pickle.dump(VALIDATION_DOMAINS, fout, pickle.HIGHEST_PROTOCOL)
    
with open(f'{NOTEBOOK_NAME}/test_domains.pickle', 'wb') as fout:
    pickle.dump(TEST_DOMAINS, fout, pickle.HIGHEST_PROTOCOL)

## Training / validation parquet files

In [13]:
_training_domains = set(TRAINING_DOMAINS)
TRAINING_PARQUET_FILES = [
    f for f in ADJACENCY_MATRIX_PARQUET_FILES if op.basename(op.dirname(f)) in _training_domains]

In [14]:
_validation_domains = set(VALIDATION_DOMAINS)
VALIDATION_PARQUET_FILES = [
    f for f in ADJACENCY_MATRIX_PARQUET_FILES if op.basename(op.dirname(f)) in _validation_domains]

In [15]:
_test_domains = set(TEST_DOMAINS)
TEST_PARQUET_FILES = [
    f for f in ADJACENCY_MATRIX_PARQUET_FILES if op.basename(op.dirname(f)) in _test_domains]

In [16]:
assert (
    len(TRAINING_PARQUET_FILES) + 
    len(VALIDATION_PARQUET_FILES) +
    len(TEST_PARQUET_FILES)) == len(ADJACENCY_MATRIX_PARQUET_FILES)

In [17]:
with open(f'{NOTEBOOK_NAME}/training_parquet_files.pickle', 'wb') as fout:
    pickle.dump(TRAINING_PARQUET_FILES, fout, pickle.HIGHEST_PROTOCOL)
    
with open(f'{NOTEBOOK_NAME}/validation_parquet_files.pickle', 'wb') as fout:
    pickle.dump(VALIDATION_PARQUET_FILES, fout, pickle.HIGHEST_PROTOCOL)
    
with open(f'{NOTEBOOK_NAME}/test_parquet_files.pickle', 'wb') as fout:
    pickle.dump(TEST_PARQUET_FILES, fout, pickle.HIGHEST_PROTOCOL)

# Training data

## Benchmark

### Generate

In [None]:
subsample = 100
domain_folders = [ADJACENCY_MATRIX_PATH.joinpath(d) for d in VALIDATION_DOMAINS]
columns = ['qseq', 'residue_idx_1_corrected', 'residue_idx_2_corrected']

for compression in ['lzf']:  # [None, 'gzip', 'lzf']:
    for shuffle in [0, 1]:
        random.seed(42)
        hdf5_file = f'{NOTEBOOK_NAME}/validation-{compression}-{shuffle}.h5'
        if op.isfile(hdf5_file):
            print(f"File already exists: {hdf5_file}")
            continue
        with h5py.File(hdf5_file, 'w') as hdf5_file:
            for domain_folder in tqdm.tqdm_notebook(domain_folders, total=len(domain_folders)):
                domain = domain_folder.name.partition('A%3A')[-1]
                for row_idx, row in enumerate(pagnn.iter_domain_rows(
                        domain_folder, columns=columns, subsample=subsample)):
                    row_key = f'/{domain}/{row_idx:05d}'
                    try:
                        dataset = pagnn.row_to_dataset(row, num_real=1, num_fake=1)
                    except pagnn.SequenceTooShortError:
                        print('Skipping short sequence...')
                        continue
                    # Write
                    ds = hdf5_file.create_group(row_key)
                    for name, data in dataset._asdict().items():
                        ds.create_dataset(
                            name, 
                            data=data,
                            compression=compression,
                            compression_opts=None if compression in [None, 'lzf'] else 6,
                            shuffle=bool(shuffle))

Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...
Skipping short sequence...


### Compare performance

In [33]:
from typing import Optional

class Timer:
    label: str
    interval: Optional[float]

    # === Private Attributes ===
    _start: Optional[float]
    _end: Optional[float]
    _is_verbose: bool

    def __init__(self,
                 label: str = 'Your code',
                 is_verbose: bool = True) -> None:
        """Initialize a Timer.

        `label` describes the block of code.
        """
        self.label = label
        self._is_verbose = is_verbose
        self.interval = None
        self.start = None
        self.end = None

    def __enter__(self) -> 'Timer':
        """Enter a timed context."""
        self.start = time.perf_counter()
        return self

    # The parameters have more specific types than object, but for simplicity,
    # we are declaring them as objects.
    def __exit__(self, exc_type: object, exc_value: object,
                 exc_trace: object) -> bool:
        """Exit a timed context."""
        self.end = time.perf_counter()
        self.interval = self.end - self.start
        if self._is_verbose:
            print(f'{self.label} took {self.interval} seconds')

        return False

In [34]:
for compression in [None, 'gzip', 'lzf']:
    for shuffle in [0, 1]:
        hdf5_file = f'{NOTEBOOK_NAME}/validation-{compression}-{shuffle}.h5'
        with Timer(hdf5_file):
            with h5py.File(hdf5_file, 'r') as store:
                for grp in tqdm.tqdm_notebook(store):
                    for subgroup in store[grp]:
                        assert store[grp][subgroup]['seqs'][:].shape[0]
                        assert store[grp][subgroup]['adj'][:].any()
                        assert store[grp][subgroup]['targets'][:].any()


generate_datasets/validation-None-0.h5 took 13.268215323332697 seconds



generate_datasets/validation-None-1.h5 took 15.799906645901501 seconds



generate_datasets/validation-gzip-0.h5 took 18.78152546705678 seconds



generate_datasets/validation-gzip-1.h5 took 20.182525789365172 seconds



generate_datasets/validation-lzf-0.h5 took 19.45821071602404 seconds



generate_datasets/validation-lzf-1.h5 took 19.524443845730275 seconds


In [35]:
for compression in [None, 'gzip', 'lzf']:
    for shuffle in [0, 1]:
        hdf5_file = f'/home/strokach/databin/adjacency-net/validation-{compression}-{shuffle}.h5'
        with Timer(hdf5_file):
            with h5py.File(hdf5_file, 'r') as store:
                for grp in tqdm.tqdm_notebook(store):
                    for subgroup in store[grp]:
                        assert store[grp][subgroup]['seqs'][:].shape[0]
                        assert store[grp][subgroup]['adj'][:].any()
                        assert store[grp][subgroup]['targets'][:].any()


/home/strokach/databin/adjacency-net/validation-None-0.h5 took 13.307342018000782 seconds



/home/strokach/databin/adjacency-net/validation-None-1.h5 took 15.636534132994711 seconds



/home/strokach/databin/adjacency-net/validation-gzip-0.h5 took 19.711461490020156 seconds



/home/strokach/databin/adjacency-net/validation-gzip-1.h5 took 21.033986631780863 seconds



/home/strokach/databin/adjacency-net/validation-lzf-0.h5 took 18.769805277697742 seconds



/home/strokach/databin/adjacency-net/validation-lzf-1.h5 took 20.113161090761423 seconds


In [36]:
for compression in [None, 'gzip', 'lzf']:
    for shuffle in [0, 1]:
        hdf5_file = f'/dev/shm/validation-{compression}-{shuffle}.h5'
        with Timer(hdf5_file):
            with h5py.File(hdf5_file, 'r') as store:
                for grp in tqdm.tqdm_notebook(store):
                    for subgroup in store[grp]:
                        assert store[grp][subgroup]['seqs'][:].shape[0]
                        assert store[grp][subgroup]['adj'][:].any()
                        assert store[grp][subgroup]['targets'][:].any()


/dev/shm/validation-None-0.h5 took 13.400442116893828 seconds



/dev/shm/validation-None-1.h5 took 15.682986418250948 seconds



/dev/shm/validation-gzip-0.h5 took 19.23002047231421 seconds



/dev/shm/validation-gzip-1.h5 took 20.462590687908232 seconds



/dev/shm/validation-lzf-0.h5 took 18.184145013801754 seconds



/dev/shm/validation-lzf-1.h5 took 19.25778759876266 seconds


### Conclusions

- `validation-None-0.h5` is best for fast disks like SSDs and RAM disks.
- `validation-gzip-0.h5` or `validation-lzf-0.h5` is best for slower disks like NFS and is probably the best overall (18.8 seconds vs. 15.6 seconds for RAM disk).

In [37]:
!cp {NOTEBOOK_NAME}/validation-gzip-0.h5 {NOTEBOOK_NAME}/validation.h5 

## Generate dataset

In [None]:
domain_folders = [ADJACENCY_MATRIX_PATH.joinpath(d) for d in VALIDATION_DOMAINS]
columns = ['qseq', 'residue_idx_1_corrected', 'residue_idx_2_corrected']

In [None]:
compression = 'gzip'
compression_opts=6
shuffle = 0
subsample = 100

random.seed(42)
with h5py.File(f'{NOTEBOOK_NAME}/validation.h5', 'w') as store:
    for domain_folder in tqdm.tqdm_notebook(domain_folders, total=len(domain_folders)):
        domain = domain_folder.name.partition('A%3A')[-1]
        for row_idx, row in enumerate(pagnn.iter_domain_rows(
                domain_folder, columns=columns, subsample=subsample)):
            row_key = f'/{domain}/{row_idx:05d}'
            try:
                dataset = pagnn.row_to_dataset(row, num_real=1, num_fake=1)
            except pagnn.SequenceTooShortError:
                print('Skipping short sequence...')
                continue
            # Write
            ds = store.create_group(row_key)
            for name, data in dataset._asdict().items():
                ds.create_dataset(
                    name, 
                    data=data,
                    compression=compression,
                    compression_opts=None if compression in [None, 'lzf'] else compression_opts,
                    shuffle=bool(shuffle))