## Summary

### Submitting jobs

Note: These jobs must be submitted from the <code>./notebooks</code> folder.

**Cedar**

```bash
NOTEBOOK_PATH=$(realpath 03_add_adjacency_distances.ipynb) ORIGINAL_ARRAY_TASK_COUNT=1027 sbatch --array=1,17,20,24,32,33,34,37,42,56,63,67,70,76,96,119,126,130,135,151,156,164,167,170,171,172,173,179,181,182,183,187,195,199,204,207,209,217,219,222,230,232,235,238,239,250,251,252,253,262,269,271,274,281,282,284,292,293,295,298,300,301,305,307,308,314,317,319,326,327,328,329,330,331,332,333,334,337,349,354,358,374,379,382,383,386,392,393,394,396,397,400,410,413,414,416,420,421,422,427,428,431,441,444,454,455,458,459,470,481,486,488,502,503,504,505,512,513,514,515,516,518,522,523,525,531,536,540,542,553,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,572,573,576,590,592,593,594,595,597,601,605,609,613,614,616,626,628,631,663,664,668,672,675,678,679,684,689,691,693,695,698,699,702,704,705,709,711,712,722,725,730,733,741,742,744,746,758,761,771,773,776,777,778,779,780,781,782,783,785,786,787,789,791,792,793,800,807,812,813,815,816,817,818,820,821,822,823,824,826,828,830,831,832,834,836,838,841,842,843,844,845,846,847,848,849,850,852,853,854,855,856,858,860,861,862,863,864,865,866,867,868,871,872,873,874,875,876,877,878,880,883,886,887,888,889,890,891,893,895,896,897,898,899,900,901,904,905,906,907,908,911,912,913,915,917,919,920,932,935,937,938,939,940,941,942,943,946,949,952,954,957,960,965,966,967,969,971,974,975,977,980,981,1011,1012 --time=72:00:00 --nodes=1 --tasks-per-node=48 --mem=0 --job-name=add-adjacency-distances --account=rrg-pmkim --output=/scratch/strokach/tmp/log/run-notebook-cpu-%j-%N.log ../scripts/run_notebook_cpu.sh
```

----

## Imports

In [None]:
import concurrent.futures
import concurrent.futures.process
import importlib
import logging
import os
import shlex
import shutil
import socket
import subprocess
import sys
import traceback
from functools import partial
from itertools import islice
from pathlib import Path

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import pyarrow as pa
import pyarrow.parquet as pq
import tenacity
import yaml
from kmbio import PDB
from kmtools import structure_tools

In [None]:
%matplotlib inline

# Show up to 100 columns when displaying DataFrames. The fully-qualified option
# name is used because the bare "max_columns" pattern is ambiguous on pandas
# versions that also define "styler.render.max_columns" (it raises OptionError).
pd.set_option("display.max_columns", 100)

In [None]:
# Locate the sibling ``src`` directory (it must exist) and put it first on the
# import path so that the project-local ``helper`` module can be imported.
SRC_PATH = (Path.cwd() / ".." / "src").resolve(strict=True)

_src_dir = SRC_PATH.as_posix()
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

import helper
# Reload so that edits made to helper.py are picked up when the cell is re-run.
importlib.reload(helper)

## Parameters

In [None]:
# Notebook name; the CI_JOB_NAME environment variable overrides the default.
NOTEBOOK_PATH = Path(os.environ.get("CI_JOB_NAME", "03_add_adjacency_distances"))
NOTEBOOK_PATH

In [None]:
# Output directory: OUTPUT_DIR if set, otherwise a folder named after the
# notebook (relative to the working directory). It is created if missing.
_output_dir = os.environ.get("OUTPUT_DIR", NOTEBOOK_PATH.name)
OUTPUT_PATH = Path(_output_dir).resolve()
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH

In [None]:
# Display the current working directory as cell output.
Path.cwd().expanduser()

In [None]:
# Number of worker processes: a fixed 40 on hosts whose name contains "scinet",
# otherwise half of the CPUs this process is allowed to run on (at least 1).
_on_scinet = "scinet" in socket.gethostname()
CPU_COUNT = 40 if _on_scinet else max(1, len(os.sched_getaffinity(0)) // 2)

CPU_COUNT

In [None]:
# Root of the partitioned adjacency-matrix Parquet dataset. The variable is read
# through os.environ so that an unset DATAPKG_OUTPUT_DIR fails immediately with
# a KeyError naming the variable, rather than the opaque TypeError raised by
# Path(None).
ADJACENCY_MATRIX_PARQUET_PATH = Path(os.environ["DATAPKG_OUTPUT_DIR"]).joinpath(
    "adjacency-net-v2", "v0.3", "training_dataset", "adjacency_matrix.parquet"
)

In [None]:
# Read the SLURM array coordinates from the environment. When only part of an
# array is resubmitted, ORIGINAL_ARRAY_TASK_COUNT (if set and non-empty) takes
# precedence over SLURM_ARRAY_TASK_COUNT.
_raw_task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
_raw_task_count = os.environ.get("ORIGINAL_ARRAY_TASK_COUNT") or os.environ.get(
    "SLURM_ARRAY_TASK_COUNT"
)

# Missing values stay None; present values are parsed as integers.
TASK_ID = None if _raw_task_id is None else int(_raw_task_id)
TASK_COUNT = None if _raw_task_count is None else int(_raw_task_count)

TASK_ID, TASK_COUNT

In [None]:
# Optionally shift the task id by TASK_ID_OFFSET. The offset is applied only
# when a task id was actually provided: in an interactive run TASK_ID is None
# and `None += int` would raise a TypeError.
task_id_offset = os.getenv("TASK_ID_OFFSET")
if task_id_offset is not None and TASK_ID is not None:
    TASK_ID += int(task_id_offset)

In [None]:
# Without a task id the notebook is treated as an interactive (debug) run.
DEBUG = TASK_ID is None

if not DEBUG:
    # Batch run: both array coordinates must be known.
    assert TASK_ID is not None
    assert TASK_COUNT is not None
else:
    # Interactive run: fall back to a fixed partition.
    TASK_ID, TASK_COUNT = 17, 1027

TASK_ID, TASK_COUNT

In [None]:
# assert len(os.listdir(ADJACENCY_MATRIX_PARQUET_PATH)) == TASK_COUNT, (
#     len(os.listdir(ADJACENCY_MATRIX_PARQUET_PATH)),
#     TASK_COUNT
# )

In [None]:
# In interactive (DEBUG) runs, load IPython's autoreload extension and set it to
# mode 2. These are IPython magics, so this cell only runs inside IPython/Jupyter.
if DEBUG:
    %load_ext autoreload
    %autoreload 2

## `DATAPKG`

In [None]:
# Registry of external data packages; each entry maps resource names to paths.
DATAPKG = dict()

In [None]:
# Register the "pdb-mmcif" resource of the pdb-ffindex package (2018-09-06
# release) under the DATAPKG_OUTPUT_DIR root.
_datapkg_root = Path(os.environ["DATAPKG_OUTPUT_DIR"])
DATAPKG["pdb-ffindex"] = {
    "pdb_mmcif_ffindex": _datapkg_root / "pdb-ffindex" / "2018-09-06" / "pdb-mmcif",
}

In [None]:
# Display the registered "pdb-ffindex" entry as cell output.
DATAPKG["pdb-ffindex"]

## Load data

In [None]:
# All `database_id=*` partition directories of the dataset, in sorted order.
folders = sorted(
    entry
    for entry in ADJACENCY_MATRIX_PARQUET_PATH.glob("database_id=*")
    if entry.is_dir()
)
folders[:3]

In [None]:
# TASK_ID is 1-based: task N handles the N-th partition directory.
_task_folder = folders[TASK_ID - 1]
files = sorted(_task_folder.glob("*.parquet"))

print(files[:2])
print(len(files))

In [None]:
# Load the first row group of the first file as a pandas DataFrame (used for the
# single-row tests below), indexed by the stored "__index_level_0__" column.
_first_file = pq.ParquetFile(files[0])
_first_row_group = _first_file.read_row_group(0, use_pandas_metadata=True)
df = _first_row_group.to_pandas(integer_object_nulls=True).set_index("__index_level_0__")

In [None]:
# Preview the first two rows of the loaded row group.
df.head(2)

## Run pipeline

### Test on a single row

In [None]:
# Materialise up to three rows as namedtuples and keep the first as a test row.
_first_rows = list(islice(df.itertuples(), 3))
row = _first_rows[0]

In [None]:
# URL prefix handed to the helper: "ff://" + path of the mmCIF ffindex + "?".
STRUCTURE_URL_PREFIX = "ff://" + str(DATAPKG["pdb-ffindex"]["pdb_mmcif_ffindex"]) + "?"
STRUCTURE_URL_PREFIX

In [None]:
# results = helper.get_adjacency_with_distances_and_orientations(
#     row, max_cutoff=12, min_cutoff=None, structure_url_prefix=STRUCTURE_URL_PREFIX
# )

In [None]:
# ar = results["distance"][0]

In [None]:
# fg, ax = plt.subplots()
# ax.hist(ar.to_pylist(), range=(0, 12), bins=100)
# None

### Test as part of a multiprocessing worker

In [None]:
def worker(data):
    """Process one input row and report either a result or a failure record.

    Args:
        data: Mapping from column name to value for a single row (the main loop
            passes ``row._asdict()``). It must contain every column listed in
            ``scalar_columns`` and ``array_columns`` below.

    Returns:
        A ``(result, failure)`` tuple where exactly one element is ``None``.
        On success ``result`` is the output of
        ``helper.get_adjacency_with_distances_and_orientations`` merged with the
        row's original columns and passed through
        ``helper.downcast_and_compress``. If anything in that step raises,
        ``result`` is ``None`` and ``failure`` holds the error type, message and
        traceback together with the row's original columns.
    """
    row = helper.to_namedtuple(data)

    # Columns copied through unchanged, each wrapped in a one-element list.
    scalar_columns = (
        "Index",
        "uniparc_id",
        "sequence",
        "database",
        "interpro_name",
        "interpro_id",
        "domain_start",
        "domain_end",
        "domain_length",
        "structure_id",
        "model_id",
        "chain_id",
        "pc_identity",
        "alignment_length",
        "mismatches",
        "gap_opens",
        "q_start",
        "q_end",
        "s_start",
        "s_end",
        "evalue_log10",
        "bitscore",
        "qseq",
        "sseq",
    )
    # Array-valued columns that are converted to pyarrow arrays.
    array_columns = (
        "a2b",
        "b2a",
        "residue_id_1",
        "residue_id_2",
        "residue_aa_1",
        "residue_aa_2",
    )

    base = {column: [data[column]] for column in scalar_columns}
    for column in array_columns:
        if data[column].dtype in (int, float):
            # Numeric arrays are stored as integers, with nulls mapped to None.
            values = pa.array([(int(i) if pd.notnull(i) else None) for i in data[column]])
        else:
            values = pa.array(data[column].tolist())
        base[column] = [values]

    result = None
    failure = None
    try:
        # Retry only on StopIteration: up to 5 attempts with a random 0.5-2 s
        # pause in between; `reraise=True` surfaces the original exception
        # instead of a tenacity RetryError once the attempts are exhausted.
        get_adjacency = tenacity.retry(
            reraise=True,
            retry=tenacity.retry_if_exception_type(StopIteration),
            wait=tenacity.wait_random(min=0.5, max=2),
            stop=tenacity.stop_after_attempt(5),
        )(helper.get_adjacency_with_distances_and_orientations)
        # Keys in `base` take precedence over same-named keys from the helper.
        result = {
            **get_adjacency(
                row, max_cutoff=12, min_cutoff=None, structure_url_prefix=STRUCTURE_URL_PREFIX
            ),
            **base,
        }
        result = helper.downcast_and_compress(result)
    except Exception as error:
        # Drop any partially built result (e.g. when downcast_and_compress
        # failed) so that the row is never reported as both success and failure.
        result = None
        # format_exc() returns the traceback with its own line breaks; joining
        # the individual lines with "\n" would double every newline.
        traceback_string = traceback.format_exc()
        failure = {
            "error_type": [repr(type(error))],
            "error_message": [str(error)],
            "error_traceback": [traceback_string],
            **base,
        }
        failure = helper.downcast_and_compress(failure)

    return result, failure

In [None]:
# result, failure = worker(row._asdict())

In [None]:
# batch = pa.RecordBatch.from_arrays(list(result.values()), list(result.keys()))

### Run for all rows

In [None]:
def get_new_file(file, failed=False):
    """Map an input Parquet file path to the path of its Arrow output file.

    The directory three levels above the file gets a ``_wdistances`` suffix and
    the file name is cut at its first dot and given an ``.arrow`` extension.
    When ``failed`` is true, a ``failed`` directory is inserted directly below
    the renamed directory.

    Args:
        file: ``pathlib.Path`` with at least four components.
        failed: Whether to return the path used for failed rows.

    Returns:
        The derived ``pathlib.Path``.
    """
    components = list(file.parts)
    renamed_root = components[-4] + "_wdistances"
    arrow_name = components[-1].split(".")[0] + ".arrow"
    marker = ["failed"] if failed else []
    return Path(*components[:-4], renamed_root, *marker, *components[-3:-1], arrow_name)

In [None]:
# Silence everything below CRITICAL from the structure-fixing logger.
_fixes_logger = logging.getLogger("kmtools.structure_tools.fixes")
_fixes_logger.setLevel(logging.CRITICAL)

In [None]:
# Process every Parquet file assigned to this task. For each input file, rows
# that succeed are appended to one Arrow file and rows that fail to another
# (under a "failed" directory). Writers are created lazily on the first batch so
# that the schema can be taken from the data.
total_num_rows_processed = 0
for file in tqdm(files):
    ds = pq.ParquetFile(file)

    new_file = get_new_file(file)
    new_file.parent.mkdir(parents=True, exist_ok=True)
    writer = None

    new_file_failed = get_new_file(file, failed=True)
    new_file_failed.parent.mkdir(parents=True, exist_ok=True)
    writer_failed = None

    try:
        for row_group in tqdm(range(ds.num_row_groups), leave=False):
            df = (
                ds.read_row_group(row_group, use_pandas_metadata=True)
                .to_pandas(integer_object_nulls=True)
                .set_index("__index_level_0__")
            )
            num_rows_processed = 0
            # If the worker pool dies, start a fresh pool and resume from the
            # first row whose outcome has not been received yet.
            # NOTE(review): a row that crashes the pool before any other result
            # is received would make this loop restart forever - confirm whether
            # a retry limit is needed.
            while num_rows_processed < len(df):
                try:
                    with concurrent.futures.ProcessPoolExecutor(CPU_COUNT) as pool:
                        futures = pool.map(
                            worker,
                            (t._asdict() for t in df.iloc[num_rows_processed:].itertuples()),
                            chunksize=1,
                        )
                        for result, failure in tqdm(
                            futures, leave=False, total=len(df) - num_rows_processed
                        ):
                            num_rows_processed += 1
                            if result is None:
                                assert failure is not None
                            if result:
                                batch = pa.RecordBatch.from_arrays(
                                    list(result.values()), list(result.keys())
                                )
                                if writer is None:
                                    writer = pa.RecordBatchFileWriter(new_file, batch.schema)
                                writer.write_batch(batch)
                            if failure:
                                batch = pa.RecordBatch.from_arrays(
                                    list(failure.values()), list(failure.keys())
                                )
                                if writer_failed is None:
                                    writer_failed = pa.RecordBatchFileWriter(
                                        new_file_failed, batch.schema
                                    )
                                writer_failed.write_batch(batch)
                except concurrent.futures.process.BrokenProcessPool as e:
                    print(
                        f"ProcessPool crashed while processing row_group '{row_group}' in file '{file}'. "
                        f"The error is '{type(e)}': '{e}'."
                    )
            total_num_rows_processed += num_rows_processed
    finally:
        # Always release the output files, even when an unexpected exception
        # interrupts processing of this input file.
        if writer is not None:
            writer.close()
        if writer_failed is not None:
            writer_failed.close()

### Test that everything went ok

In [None]:
# Count the record batches written for successful rows. Only `new_file` from the
# last iteration of the processing loop is inspected. The writer is created
# lazily, so the file may not exist if no row succeeded; a failure to open it
# (ArrowIOError) is reported as zero batches.
try:
    reader = pa.RecordBatchFileReader(new_file)
except pa.ArrowIOError:
    num_successful_batches = 0
else:
    num_successful_batches = reader.num_record_batches
    
num_successful_batches

In [None]:
# Count the record batches written for failed rows. Only `new_file_failed` from
# the last iteration of the processing loop is inspected. The writer is created
# lazily, so the file may not exist if no row failed; a failure to open it
# (ArrowIOError) is reported as zero batches.
try:
    reader_failed = pa.RecordBatchFileReader(new_file_failed)
except pa.ArrowIOError:
    num_failed_batches = 0
else:
    num_failed_batches = reader_failed.num_record_batches
    
num_failed_batches

### Write a `._SUCCESS` file

In [None]:
# Create (or truncate) an empty "._SUCCESS" marker next to the last output file.
_success_marker = new_file.parent.joinpath("._SUCCESS")
with _success_marker.open("wt"):
    pass