# Summary

# Imports

In [88]:
import importlib
import os
import sys
from os.path import getsize, join
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import tqdm

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Show up to 100 columns when displaying wide DataFrames.
# The fully qualified option name is required: the bare "max_columns" pattern
# matches more than one option on recent pandas versions and raises OptionError.
pd.set_option("display.max_columns", 100)

In [3]:
# Execute the sibling `spark.ipynb` notebook in this kernel's namespace.
# NOTE(review): the `spark` session object used by later cells is not defined
# anywhere in this notebook, so it is presumably created there — confirm.
%run spark.ipynb

In [4]:
# Absolute path of the project's `src` directory, located next to the notebook
# folder; `strict=True` makes this fail immediately if the directory is missing.
SRC_PATH = (Path.cwd() / '..' / 'src').resolve(strict=True)

# Put `src` at the front of the import path exactly once, even if this cell is re-run.
src_dir = SRC_PATH.as_posix()
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Import the project-local helper package and reload it to pick up edits.
import helper
importlib.reload(helper)

<module 'helper' from '/gpfs/fs0/scratch/p/pmkim/strokach/datapkg/adjacency-net-v2/src/helper/__init__.py'>

# Parameters

In [5]:
# Name of this notebook run: taken from the `CI_JOB_NAME` environment variable
# when it is set, otherwise a fixed default.
NOTEBOOK_PATH = Path(
    os.environ.get("CI_JOB_NAME", "add_adjacency_distances_test")
)
NOTEBOOK_PATH

PosixPath('add_adjacency_distances_test')

In [6]:
# Directory that receives this notebook's outputs: `OUTPUT_DIR` from the
# environment when set, otherwise a folder named after the notebook.
output_dir_name = os.environ.get('OUTPUT_DIR', NOTEBOOK_PATH.name)
OUTPUT_PATH = Path(output_dir_name).resolve()

# Create the directory (and any missing parents); a no-op when it already exists.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH

PosixPath('/gpfs/fs0/scratch/p/pmkim/strokach/datapkg/adjacency-net-v2/notebooks/add_adjacency_distances_test')

In [7]:
# Debug mode is on whenever the `CI` environment variable is absent.
DEBUG = os.environ.get("CI") is None

# Raw (string or missing) array-task settings from the environment.
# `ORIGINAL_ARRAY_TASK_COUNT` takes precedence over `SLURM_ARRAY_TASK_COUNT`.
raw_task_id = os.environ.get("SLURM_ARRAY_TASK_ID")
raw_task_count = (
    os.environ.get("ORIGINAL_ARRAY_TASK_COUNT")
    or os.environ.get("SLURM_ARRAY_TASK_COUNT")
)

# Convert to integers, leaving `None` when the variable is not set.
TASK_ID = None if raw_task_id is None else int(raw_task_id)
TASK_COUNT = None if raw_task_count is None else int(raw_task_count)

TASK_ID, TASK_COUNT

(None, None)

In [8]:
# Outside of CI (DEBUG is True when the `CI` environment variable is absent),
# enable IPython's autoreload extension so edited modules are re-imported
# automatically before each cell runs.
if DEBUG:
    %load_ext autoreload
    %autoreload 2

# `DATAPKG`

In [9]:
# Root directory of all data packages; raises KeyError if the variable is unset.
datapkg_output_dir = Path(os.environ['DATAPKG_OUTPUT_DIR'])

# Locations of the data packages this notebook reads, keyed by package name.
DATAPKG = {
    'training_dataset': datapkg_output_dir.joinpath(
        "adjacency-net-v2", "master", "training_dataset"
    ),
    'training_dataset_wdistances': datapkg_output_dir.joinpath(
        "adjacency-net-v2", "master", "training_dataset_wdistances"
    ),
    'pdb_mmcif_ffindex': datapkg_output_dir.joinpath(
        "pdb-ffindex", "master", "pdb_mmcif_ffindex", "pdb-mmcif"
    ),
}

# Count rows

### Count number of input rows in `training_dataset`

In [10]:
# Query that counts every row of the `training_dataset` adjacency-matrix
# parquet directory.
training_dataset_parquet = f"{DATAPKG['training_dataset']}/adjacency_matrix.parquet"
sql_query = (
    "select count(*)\n"
    f"from parquet.`{training_dataset_parquet}`\n"
)

print(sql_query)

select count(*)
from parquet.`/gpfs/fs0/scratch/p/pmkim/strokach/databin/adjacency-net-v2/master/training_dataset/adjacency_matrix.parquet`



In [11]:
# Submit the count query to Spark.
# NOTE(review): `spark` is not defined in this notebook; presumably it comes
# from the `%run spark.ipynb` cell — confirm.
ds = spark.sql(sql_query)

In [12]:
# Collect the query result into a pandas DataFrame.
df = ds.toPandas()

In [13]:
# Show the row count of `training_dataset`.
df  # 47402360 rows in the recorded run

Unnamed: 0,count(1)
0,47402360


### Count number of input rows in `training_dataset_wdistances`

In [14]:
# Query that counts every row of the `training_dataset_wdistances`
# adjacency-matrix parquet directory.
wdistances_parquet = f"{DATAPKG['training_dataset_wdistances']}/adjacency_matrix.parquet"
sql_query = (
    "select count(*)\n"
    f"from parquet.`{wdistances_parquet}`\n"
)

print(sql_query)

select count(*)
from parquet.`/gpfs/fs0/scratch/p/pmkim/strokach/databin/adjacency-net-v2/master/training_dataset_wdistances/adjacency_matrix.parquet`



In [15]:
# Submit the count query for the dataset with distances to Spark.
# NOTE(review): `spark` is not defined in this notebook; presumably it comes
# from the `%run spark.ipynb` cell — confirm.
ds_wdistances = spark.sql(sql_query)

In [16]:
# Collect the query result into a pandas DataFrame.
df_wdistances = ds_wdistances.toPandas()

In [17]:
# Show the row count of `training_dataset_wdistances`.
df_wdistances  # 49418401 rows in the recorded run

Unnamed: 0,count(1)
0,49418401


In [18]:
# Ratio of rows in `training_dataset` to rows in `training_dataset_wdistances`.
# Computed from the query results rather than from numbers copied out of earlier
# outputs, so it stays correct when the underlying data changes.
df.iloc[0, 0] / df_wdistances.iloc[0, 0]

0.9592046492965242

### Make sure there is exactly one parquet file per folder

In [None]:
# Report every partition folder of the wdistances dataset that does not contain
# exactly one parquet file. Nothing is printed when all folders are fine.
adjacency_matrix_dir = DATAPKG['training_dataset_wdistances'].joinpath("adjacency_matrix.parquet")

for folder in sorted(adjacency_matrix_dir.glob("*")):
    # `glob("*")` also yields plain files that sit next to the partition folders
    # (e.g. a `_SUCCESS` marker, if present). They can never contain parquet
    # files, so skip them instead of reporting them as broken folders.
    if not folder.is_dir():
        continue
    parquet_files = list(folder.glob("*.parquet"))
    if len(parquet_files) != 1:
        print(folder)

### Make sure we are not missing any folders

In [42]:
# Names of all partition folders in `training_dataset` that hold at least one
# parquet file.
folder_set_1 = {
    parquet_file.parent.name
    for parquet_file in
    DATAPKG['training_dataset'].joinpath("adjacency_matrix.parquet").glob("*/*.parquet")
}

In [43]:
# Names of all partition folders in `training_dataset_wdistances` that hold at
# least one parquet file.
folder_set_2 = {
    parquet_file.parent.name
    for parquet_file in
    DATAPKG['training_dataset_wdistances'].joinpath("adjacency_matrix.parquet").glob("*/*.parquet")
}

In [44]:
# Every folder found in the wdistances dataset must also exist in the original
# dataset (the difference equals the symmetric difference exactly when the
# second set has no members of its own).
# NOTE(review): folders present only in the original dataset do NOT fail this
# check — confirm that is the intended direction.
assert folder_set_2.issubset(folder_set_1)

### Make sure we are not missing any files

In [45]:
# File names of all parquet part files in `training_dataset`, across every
# partition folder.
file_set_1 = {
    parquet_file.name
    for parquet_file in
    DATAPKG['training_dataset'].joinpath("adjacency_matrix.parquet").glob("*/*.parquet")
}

In [46]:
# File names of all parquet part files in `training_dataset_wdistances`, across
# every partition folder.
file_set_2 = {
    parquet_file.name
    for parquet_file in
    DATAPKG['training_dataset_wdistances'].joinpath("adjacency_matrix.parquet").glob("*/*.parquet")
}

In [47]:
# Every file name found in the wdistances dataset must also exist in the
# original dataset (the difference equals the symmetric difference exactly when
# the second set has no members of its own).
# NOTE(review): files present only in the original dataset do NOT fail this
# check — confirm that is the intended direction.
assert file_set_2.issubset(file_set_1)

### Go over every file to find the cause of the row-count mismatch

In [84]:
# All parquet part files of the dataset with distances, one per partition folder.
parquet_files_wdistances = list(
    DATAPKG['training_dataset_wdistances'].joinpath("adjacency_matrix.parquet").glob("*/*.parquet")
)

sql_query_template = """
select count(*)
from parquet.`{file}`
"""


def count_parquet_rows(parquet_file):
    """Return the number of rows Spark counts in a single parquet file.

    Args:
        parquet_file: Path of the parquet file to count.

    Returns:
        The single value of the `count(*)` query result.
    """
    counts = spark.sql(sql_query_template.format(file=parquet_file)).toPandas()
    return counts.values.item()


# One `(original file, original row count, wdistances row count)` tuple per part file.
data = []
for file_wdistances in tqdm.tqdm_notebook(parquet_files_wdistances, total=len(parquet_files_wdistances)):
    # Build the path of the matching file in `training_dataset` from DATAPKG and
    # the partition/file names, rather than overwriting a positional path
    # component, which silently breaks if the directory depth ever changes.
    file = DATAPKG['training_dataset'].joinpath(
        "adjacency_matrix.parquet",
        file_wdistances.parent.name,
        file_wdistances.name,
    )

    count1 = count_parquet_rows(file)
    count2 = count_parquet_rows(file_wdistances)

    data.append((file, count1, count2))

HBox(children=(IntProgress(value=0, max=1024), HTML(value='')))




In [102]:
# Tabulate the per-file row counts collected in `data`.
row_counts_df = pd.DataFrame(
    data, columns=["filepath", "count", "count_wdistances"],
)

# Derive the file name and partition folder name while `filepath` still holds
# Path objects, then store the path itself as a plain POSIX string.
original_paths = row_counts_df["filepath"].tolist()
row_counts_df["filename"] = [path.name for path in original_paths]
row_counts_df["dirname"] = [path.parent.name for path in original_paths]
row_counts_df["filepath"] = row_counts_df["filepath"].map(lambda path: path.as_posix())

row_counts_df.head()

Unnamed: 0,filepath,count,count_wdistances,filename,dirname
0,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,54,54,part-00000-2ae3a02a-2611-4bda-ac19-41025a3a6e7...,database_id=G3DSA%3A3.40.1000.20
1,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,28058,28058,part-00000-eea29c72-b7db-4c8b-807e-c187f1d1488...,database_id=G3DSA%3A3.30.1330.90
2,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,4538,4538,part-00000-11be5e6c-a2b9-4dab-b163-65c195fa083...,database_id=G3DSA%3A3.90.1420.10
3,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,2582,2582,part-00000-d147c9a8-08a6-4b6a-b6f2-55b5926a501...,database_id=G3DSA%3A2.60.40.350
4,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,3548,3548,part-00000-fc143921-ddbc-487a-9f77-0dc98ccd786...,database_id=G3DSA%3A1.10.1240.20


In [103]:
# Files whose wdistances version has more rows than the original, smallest
# original row count first.
has_extra_rows = row_counts_df["count"].lt(row_counts_df["count_wdistances"])
row_counts_df.loc[has_extra_rows].sort_values("count")

Unnamed: 0,filepath,count,count_wdistances,filename,dirname
398,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,16466,20560,part-00000-242795e5-a8d2-47e7-a0f6-c1594b2ac29...,database_id=G3DSA%3A1.10.1300.10
997,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,17399,22418,part-00000-1753af42-68f6-40a4-adb9-4e967dea91d...,database_id=G3DSA%3A1.20.1530.10
883,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,19498,32718,part-00000-9e4cfbc4-eede-40a1-a680-0c5070ac3c4...,database_id=G3DSA%3A3.90.80.10
184,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,19581,33408,part-00000-2dad638a-a8ed-43d9-b0bd-2a09f667645...,database_id=G3DSA%3A3.20.20.410
705,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,19622,25524,part-00000-1f36ff16-bee1-48b6-9d6a-bbeed6fb5a9...,database_id=G3DSA%3A1.10.640.10
657,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,20594,24442,part-00000-dda261fd-ade5-4654-9edf-a107a4a8f0b...,database_id=G3DSA%3A2.70.98.40
65,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,22054,24508,part-00000-b2a9f2bc-0798-43fb-8cfc-2c3e8b2d4e6...,database_id=G3DSA%3A3.10.200.10
515,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,22839,34170,part-00000-3b93e72a-6d8c-4fe9-9036-0771124d38c...,database_id=G3DSA%3A1.50.10.100
449,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,24027,29744,part-00000-77651157-a2d0-4504-a92f-ac0bbddd589...,database_id=G3DSA%3A3.40.50.1170
208,/gpfs/fs0/scratch/p/pmkim/strokach/databin/adj...,24482,31206,part-00000-d4bcc900-e42c-4bca-8378-c44a3634ea7...,database_id=G3DSA%3A3.90.1600.10


In [105]:
# Full path of the example file inspected below. It is selected by partition
# name rather than by row label, because row labels follow the order in which
# `glob` returned the files, which is not guaranteed to be stable between runs.
row_counts_df.loc[
    row_counts_df["dirname"] == "database_id=G3DSA%3A1.10.1300.10", "filepath"
].iloc[0]

'/gpfs/fs0/scratch/p/pmkim/strokach/databin/adjacency-net-v2/master/training_dataset/adjacency_matrix.parquet/database_id=G3DSA%3A1.10.1300.10/part-00000-242795e5-a8d2-47e7-a0f6-c1594b2ac294-c000.snappy.parquet'

In [106]:
# Load the example file from the original dataset. The path is derived from
# DATAPKG instead of a hard-coded absolute path, so the cell keeps working when
# the data lives under a different `DATAPKG_OUTPUT_DIR`.
df = pq.read_table(
    DATAPKG['training_dataset'].joinpath(
        "adjacency_matrix.parquet",
        "database_id=G3DSA%3A1.10.1300.10",
        "part-00000-242795e5-a8d2-47e7-a0f6-c1594b2ac294-c000.snappy.parquet",
    ).as_posix()
).to_pandas()

In [107]:
# Load the same example file from the dataset with distances. The path is
# derived from DATAPKG instead of a hard-coded absolute path, so the cell keeps
# working when the data lives under a different `DATAPKG_OUTPUT_DIR`.
df2 = pq.read_table(
    DATAPKG['training_dataset_wdistances'].joinpath(
        "adjacency_matrix.parquet",
        "database_id=G3DSA%3A1.10.1300.10",
        "part-00000-242795e5-a8d2-47e7-a0f6-c1594b2ac294-c000.snappy.parquet",
    ).as_posix()
).to_pandas()

In [110]:
# Preview the first two rows of the example file from the original dataset.
df.head(2)

Unnamed: 0,__index_level_0__,uniparc_id,sequence,database,interpro_name,interpro_id,domain_start,domain_end,domain_length,structure_id,model_id,chain_id,pc_identity,alignment_length,mismatches,gap_opens,q_start,q_end,s_start,s_end,evalue_log10,bitscore,qseq,sseq,a2b,b2a,residue_idx_1,residue_idx_2,residue_id_1,residue_id_2,residue_aa_1,residue_aa_2,residue_idx_1_corrected,residue_idx_2_corrected
0,15454188,UPI00018BCA32,XXXXXXXXXNDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPL...,Gene3D,"3\'5\'-cyclic nucleotide phosphodiesterase, ca...",IPR036971,581,696,116,3ECM,0.0,A,94.0,100.0,6.0,0.0,10.0,109.0,171.0,270.0,-62.0,198.0,NDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPLAALEENRET...,NDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPLATLEENGET...,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[170.0, 170.0, 170.0, 170.0, 171.0, 171.0, 171...","[171.0, 172.0, 173.0, 174.0, 170.0, 172.0, 173...","[171.0, 171.0, 171.0, 171.0, 172.0, 172.0, 172...","[172.0, 173.0, 174.0, 175.0, 171.0, 173.0, 174...","[N, N, N, N, D, D, D, D, D, D, Y, Y, Y, Y, Y, ...","[D, Y, R, T, N, Y, R, T, L, R, N, D, R, T, L, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 2.0, 3.0, 4.0, 0.0, 2.0, 3.0, 4.0, 5.0, ..."
1,29180086,UPI00015349A4,TEQEDVLAKELEDVNKWGLHVFRIAELSGNRPLTVIMHTIFQERDL...,Gene3D,"3\'5\'-cyclic nucleotide phosphodiesterase, ca...",IPR036971,1,348,348,1MKD,0.0,A,100.0,328.0,0.0,0.0,1.0,328.0,1.0,328.0,-inf,680.0,TEQEDVLAKELEDVNKWGLHVFRIAELSGNRPLTVIMHTIFQERDL...,TEQEDVLAKELEDVNKWGLHVFRIAELSGNRPLTVIMHTIFQERDL...,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 2.0, 3.0, 4.0, 0.0, 2.0, 3.0, 4.0, 5.0, ...","[1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...","[2.0, 3.0, 4.0, 5.0, 1.0, 3.0, 4.0, 5.0, 6.0, ...","[T, T, T, T, E, E, E, E, E, Q, Q, Q, Q, Q, Q, ...","[E, Q, E, D, T, Q, E, D, V, T, E, E, D, V, L, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 2.0, 3.0, 4.0, 0.0, 2.0, 3.0, 4.0, 5.0, ..."


In [111]:
# Preview the first two rows of the example file from the dataset with distances.
df2.head(2)

Unnamed: 0,uniparc_id,sequence,database,interpro_name,interpro_id,domain_start,domain_end,domain_length,structure_id,model_id,chain_id,pc_identity,alignment_length,mismatches,gap_opens,q_start,q_end,s_start,s_end,evalue_log10,bitscore,qseq,sseq,a2b,b2a,residue_idx_1,residue_idx_2,residue_id_1,residue_id_2,residue_aa_1,residue_aa_2,residue_idx_1_corrected,residue_idx_2_corrected,distances,error_adding_distances
15454188,UPI00018BCA32,XXXXXXXXXNDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPL...,Gene3D,"3\'5\'-cyclic nucleotide phosphodiesterase, ca...",IPR036971,581,696,116,3ECM,0.0,A,94.0,100.0,6.0,0.0,10.0,109.0,171.0,270.0,-62.0,198.0,NDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPLAALEENRET...,NDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPLATLEENGET...,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[170.0, 170.0, 170.0, 170.0, 171.0, 171.0, 171...","[171.0, 172.0, 173.0, 174.0, 170.0, 172.0, 173...","[171.0, 171.0, 171.0, 171.0, 172.0, 172.0, 172...","[172.0, 173.0, 174.0, 175.0, 171.0, 173.0, 174...","[N, N, N, N, D, D, D, D, D, D, Y, Y, Y, Y, Y, ...","[D, Y, R, T, N, Y, R, T, L, R, N, D, R, T, L, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 62, 66, 2, 3, ...","[1.3271933981359927, 3.162341385338672, 3.1021...",
29180086,UPI00015349A4,TEQEDVLAKELEDVNKWGLHVFRIAELSGNRPLTVIMHTIFQERDL...,Gene3D,"3\'5\'-cyclic nucleotide phosphodiesterase, ca...",IPR036971,1,348,348,1MKD,0.0,A,100.0,328.0,0.0,0.0,1.0,328.0,1.0,328.0,-inf,680.0,TEQEDVLAKELEDVNKWGLHVFRIAELSGNRPLTVIMHTIFQERDL...,TEQEDVLAKELEDVNKWGLHVFRIAELSGNRPLTVIMHTIFQERDL...,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 2.0, 3.0, 4.0, 0.0, 2.0, 3.0, 4.0, 5.0, ...","[1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...","[2.0, 3.0, 4.0, 5.0, 1.0, 3.0, 4.0, 5.0, 6.0, ...","[T, T, T, T, E, E, E, E, E, Q, Q, Q, Q, Q, Q, ...","[E, Q, E, D, T, Q, E, D, V, T, E, E, D, V, L, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...","[1, 2, 3, 4, 5, 6, 7, 8, 26, 27, 2, 3, 4, 5, 6...","[1.3295726313800746, 3.9149616453980354, 3.181...",


In [113]:
# Number of distinct `__index_level_0__` values in the original file.
# In the recorded run this is 16466, the same as the file's row count.
len(set(df["__index_level_0__"]))

16466

In [114]:
# Number of distinct index values in the wdistances file (10280 in the recorded run).
len(set(df2.index))

10280

In [115]:
# Total number of rows in the wdistances file. The recorded run gives 20560,
# exactly twice the 10280 distinct index values, i.e. the file contains
# repeated index values.
len(df2.index)

20560

In [116]:
# Show every wdistances row that carries index value 15454188.
is_example_row = df2.index == 15454188
df2.loc[is_example_row]

Unnamed: 0,uniparc_id,sequence,database,interpro_name,interpro_id,domain_start,domain_end,domain_length,structure_id,model_id,chain_id,pc_identity,alignment_length,mismatches,gap_opens,q_start,q_end,s_start,s_end,evalue_log10,bitscore,qseq,sseq,a2b,b2a,residue_idx_1,residue_idx_2,residue_id_1,residue_id_2,residue_aa_1,residue_aa_2,residue_idx_1_corrected,residue_idx_2_corrected,distances,error_adding_distances
15454188,UPI00018BCA32,XXXXXXXXXNDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPL...,Gene3D,"3\'5\'-cyclic nucleotide phosphodiesterase, ca...",IPR036971,581,696,116,3ECM,0.0,A,94.0,100.0,6.0,0.0,10.0,109.0,171.0,270.0,-62.0,198.0,NDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPLAALEENRET...,NDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPLATLEENGET...,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[170.0, 170.0, 170.0, 170.0, 171.0, 171.0, 171...","[171.0, 172.0, 173.0, 174.0, 170.0, 172.0, 173...","[171.0, 171.0, 171.0, 171.0, 172.0, 172.0, 172...","[172.0, 173.0, 174.0, 175.0, 171.0, 173.0, 174...","[N, N, N, N, D, D, D, D, D, D, Y, Y, Y, Y, Y, ...","[D, Y, R, T, N, Y, R, T, L, R, N, D, R, T, L, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 62, 66, 2, 3, ...","[1.3271933981359927, 3.162341385338672, 3.1021...",
15454188,UPI00018BCA32,XXXXXXXXXNDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPL...,Gene3D,"3\'5\'-cyclic nucleotide phosphodiesterase, ca...",IPR036971,581,696,116,3ECM,0.0,A,94.0,100.0,6.0,0.0,10.0,109.0,171.0,270.0,-62.0,198.0,NDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPLAALEENRET...,NDYRTLRQGIIDMVLATEMTKHFEHVNKFVNSINKPLATLEENGET...,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ...","[170.0, 170.0, 170.0, 170.0, 171.0, 171.0, 171...","[171.0, 172.0, 173.0, 174.0, 170.0, 172.0, 173...","[171.0, 171.0, 171.0, 171.0, 172.0, 172.0, 172...","[172.0, 173.0, 174.0, 175.0, 171.0, 173.0, 174...","[N, N, N, N, D, D, D, D, D, D, Y, Y, Y, Y, Y, ...","[D, Y, R, T, N, Y, R, T, L, R, N, D, R, T, L, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 62, 66, 2, 3, ...","[1.3271933981359927, 3.162341385338672, 3.1021...",


In [117]:
# Persist the per-file row counts as a parquet file in the output directory,
# without the pandas index.
table = pa.Table.from_pandas(row_counts_df, preserve_index=False)
pq.write_table(
    table,
    OUTPUT_PATH.joinpath("row_counts.parquet"),
    flavor="spark",
    version="2.0",
)