# Summary

# Imports

In [1]:
import os
import shutil
import subprocess
import shlex
import yaml
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Show up to 100 columns when rendering wide DataFrames.
# The fully-qualified option key is used on purpose: the abbreviated
# "max_columns" pattern is ambiguous on pandas >= 1.4 (it also matches
# "styler.render.max_columns") and raises OptionError there.
pd.set_option("display.max_columns", 100)

# Parameters

In [3]:
# Notebook identifier: taken from the CI_JOB_NAME environment variable when it
# is set, otherwise falls back to this notebook's own name.
NOTEBOOK_PATH = Path(os.environ.get("CI_JOB_NAME", "training_dataset_wdistances"))
NOTEBOOK_PATH

PosixPath('training_dataset_wdistances')

In [4]:
# Output directory: the OUTPUT_DIR environment variable when it is set,
# otherwise a directory named after the notebook; resolved to an absolute path.
OUTPUT_PATH = Path(os.environ.get('OUTPUT_DIR', NOTEBOOK_PATH.name)).resolve()
# Create the directory (and any missing parents); an existing one is fine.
OUTPUT_PATH.mkdir(exist_ok=True, parents=True)
OUTPUT_PATH

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/datapkg/adjacency-net-v2/notebooks/training_dataset_wdistances')

# `DATAPKG`

In [5]:
# Registry of input data locations, keyed by data package name.
DATAPKG = dict()

In [6]:
# Location of the training dataset under the data package output root.
# DATAPKG_OUTPUT_DIR must be set; a missing variable raises KeyError here.
DATAPKG['adjacency-net-v2'] = {
    'training_dataset': Path(
        os.environ['DATAPKG_OUTPUT_DIR'],
        "adjacency-net-v2",
        "master",
        "training_dataset",
        "adjacency_matrix.parquet",
    ),
}

# Load data

In [7]:
# Collect the parquet part files that sit one directory below the dataset
# root, in sorted order so the listing is stable between runs.
files = list(DATAPKG['adjacency-net-v2']['training_dataset'].glob("*/*.parquet"))
files.sort()

# Quick sanity check: the first few paths and the total number of files.
print(files[:3], len(files), sep="\n")

[PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/databin/adjacency-net-v2/master/training_dataset/adjacency_matrix.parquet/database_id=G3DSA%3A1.10.10.10/part-00000-3b8a9efe-9588-498e-bb62-742bb68b4eb5-c000.snappy.parquet'), PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/databin/adjacency-net-v2/master/training_dataset/adjacency_matrix.parquet/database_id=G3DSA%3A1.10.10.180/part-00000-53479571-ef99-4995-bf26-90d1fad763a8-c000.snappy.parquet'), PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/databin/adjacency-net-v2/master/training_dataset/adjacency_matrix.parquet/database_id=G3DSA%3A1.10.10.190/part-00000-1917c3d6-f0f8-46b2-898a-0b2760f1e622-c000.snappy.parquet')]
1029


In [13]:
# Open the first parquet file and load only its first row group as a DataFrame.
ds = pq.ParquetFile(files[0])
df = (
    ds
    .read_row_group(0, use_pandas_metadata=True)
    # Per pyarrow's `integer_object_nulls`: integer columns containing nulls
    # are converted to object dtype.
    .to_pandas(integer_object_nulls=True)
)

In [15]:
# Preview the first two rows of the loaded row group.
df.head(n=2)

Unnamed: 0,__index_level_0__,uniparc_id,sequence,database,interpro_name,interpro_id,domain_start,domain_end,domain_length,structure_id,model_id,chain_id,pc_identity,alignment_length,mismatches,gap_opens,q_start,q_end,s_start,s_end,evalue_log10,bitscore,qseq,sseq,a2b,b2a,residue_idx_1,residue_idx_2,residue_id_1,residue_id_2,residue_aa_1,residue_aa_2,residue_idx_1_corrected,residue_idx_2_corrected
0,572204,UPI00006AEA71,MTDKNMKEYAKRIETAFSKIMKKLGPEMSKLAEGLTPPQFFVLKLL...,Gene3D,Winged helix-like DNA-binding domain superfamily,IPR036388,1,106,106,4XRF,0.0,A,44.94,89.0,45.0,3.0,16.0,101.0,11.0,98.0,-15.522879,72.0,AFSKIMKKLGPEMSKL--AEGLTPPQFFVLKLLQTNGRT-VTEIAE...,AFS-IGKKMQTELLEQMQATGLTPPQFYILKILDHYGASRATTLAK...,"[1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,...","[1.0, 2.0, 3.0, nan, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[10.0, 10.0, 10.0, 10.0, 10.0, 11.0, 11.0, 11....","[11.0, 12.0, 13.0, 14.0, 15.0, 10.0, 12.0, 13....","[11.0, 11.0, 11.0, 11.0, 11.0, 12.0, 12.0, 12....","[12.0, 13.0, 14.0, 15.0, 16.0, 11.0, 13.0, 14....","[A, A, A, A, A, F, F, F, F, F, S, S, S, S, S, ...","[F, S, I, G, K, A, S, I, G, K, A, F, I, G, K, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 2.0, 4.0, 5.0, 6.0, 0.0, 2.0, 4.0, 5.0, ..."
1,652811,UPI0000DDE113,HNTLPDFEMPNEERVHLAAESFRLLADPTRIKILWALLQGESSVAC...,Gene3D,Winged helix-like DNA-binding domain superfamily,IPR036388,35,140,106,2LKP,0.0,A,42.17,83.0,48.0,0.0,19.0,101.0,23.0,105.0,-11.522879,60.8,AESFRLLADPTRIKILWALLQGESSVACLADLVGAAPTAVSQHLAK...,ASTLQALATPSRLMILTQLRNGPLPVTDLAEAIGMEQSAVSHQLRV...,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[22.0, 22.0, 22.0, 22.0, 22.0, 23.0, 23.0, 23....","[23.0, 24.0, 25.0, 26.0, 27.0, 22.0, 24.0, 25....","[23.0, 23.0, 23.0, 23.0, 23.0, 24.0, 24.0, 24....","[24.0, 25.0, 26.0, 27.0, 28.0, 23.0, 25.0, 26....","[A, A, A, A, A, S, S, S, S, S, S, T, T, T, T, ...","[S, T, L, Q, A, A, T, L, Q, A, L, A, S, L, Q, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 2.0, 3.0, 4.0, 5.0, 0.0, 2.0, 3.0, 4.0, ..."
