# Summary

# Imports

In [None]:
import importlib
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Show up to 100 columns when displaying DataFrames. The fully qualified
# option name is used on purpose: pandas treats the key as a pattern, and the
# short form "max_columns" is ambiguous on pandas versions that also define
# "styler.render.max_columns", where it raises OptionError.
pd.set_option("display.max_columns", 100)

In [None]:
# Execute the companion `spark.ipynb` notebook. Presumably this is what defines
# the `spark` session used by later cells — confirm against that notebook.
%run spark.ipynb

In [None]:
# Locate the `src` directory next to the current working directory; strict
# resolution raises if the directory does not exist.
SRC_PATH = (Path.cwd() / '..' / 'src').resolve(strict=True)

# Put it at the front of the module search path, but only once.
src_dir = SRC_PATH.as_posix()
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Import the `helper` module and reload it so that a re-run of this cell picks
# up the current version of the module.
import helper
importlib.reload(helper)

# Parameters

In [None]:
# `os` is imported here because this is the first cell that needs it and no
# earlier visible cell imports it; without this the cell only works if `os`
# happens to have been put into the namespace by something else.
import os

# Name of this notebook's run: taken from the CI_JOB_NAME environment variable
# when it is set, otherwise a fixed default.
NOTEBOOK_PATH = Path(os.getenv("CI_JOB_NAME", "add_adjacency_distances_test"))
NOTEBOOK_PATH

In [None]:
# Output directory: $OUTPUT_DIR when set, otherwise a directory named after
# NOTEBOOK_PATH; `resolve()` turns it into an absolute path.
output_dir = os.environ.get('OUTPUT_DIR', NOTEBOOK_PATH.name)
OUTPUT_PATH = Path(output_dir).resolve()

# Create the directory and any missing parents; an existing directory is fine.
OUTPUT_PATH.mkdir(exist_ok=True, parents=True)
OUTPUT_PATH

In [None]:
# A run is treated as a debug run whenever no `CI` environment variable exists.
DEBUG = not ("CI" in os.environ)

# Array-task coordinates as raw strings (None when unset). A non-empty
# ORIGINAL_ARRAY_TASK_COUNT takes precedence over SLURM_ARRAY_TASK_COUNT.
TASK_ID = os.environ.get("SLURM_ARRAY_TASK_ID")
TASK_COUNT = (
    os.environ.get("ORIGINAL_ARRAY_TASK_COUNT")
    or os.environ.get("SLURM_ARRAY_TASK_COUNT")
)

# Convert to integers, leaving unset values as None.
if TASK_ID is not None:
    TASK_ID = int(TASK_ID)
if TASK_COUNT is not None:
    TASK_COUNT = int(TASK_COUNT)

TASK_ID, TASK_COUNT

In [None]:
# Outside CI (DEBUG is true), load IPython's autoreload extension and switch
# it to mode 2.
if DEBUG:
    %load_ext autoreload
    %autoreload 2

# `DATAPKG`

In [None]:
# Locations of the data packages used by this notebook. Everything is rooted
# at the directory named by the DATAPKG_OUTPUT_DIR environment variable; a
# KeyError is raised here if that variable is not set.
datapkg_root = Path(os.environ['DATAPKG_OUTPUT_DIR'])
adjacency_net_dir = datapkg_root.joinpath("adjacency-net-v2", "master")

DATAPKG = {
    'training_dataset': adjacency_net_dir.joinpath("training_dataset"),
    'training_dataset_wdistances': adjacency_net_dir.joinpath("training_dataset_wdistances"),
    'pdb_mmcif_ffindex': datapkg_root.joinpath(
        "pdb-ffindex", "master", "pdb_mmcif_ffindex", "pdb-mmcif"),
}

# Count rows

### Count number of input rows in `training_dataset`

In [None]:
# Build the Spark SQL statement that counts the rows of the
# `adjacency_matrix.parquet` dataset inside the `training_dataset` package,
# then echo it so the exact query is visible in the notebook output.
training_dataset_dir = DATAPKG['training_dataset']
sql_query = (
    "select count(*)\n"
    f"from parquet.`{training_dataset_dir}/adjacency_matrix.parquet`\n"
)

print(sql_query)

In [None]:
# Submit the count query through the `spark` session. `spark` is not defined in
# the visible cells; presumably it comes from `%run spark.ipynb` — confirm.
ds = spark.sql(sql_query)

In [None]:
# Bring the query result back as a pandas DataFrame.
df = ds.toPandas()

In [None]:
# Display the count result for `training_dataset`.
df  # 47402360 (presumably the value seen on a previous run — confirm)

### Count number of rows in `training_dataset_wdistances`

In [None]:
# Build the Spark SQL statement that counts the rows of the
# `adjacency_matrix.parquet` dataset inside the `training_dataset_wdistances`
# package, then echo it so the exact query is visible in the notebook output.
wdistances_dir = DATAPKG['training_dataset_wdistances']
sql_query = (
    "select count(*)\n"
    f"from parquet.`{wdistances_dir}/adjacency_matrix.parquet`\n"
)

print(sql_query)

In [None]:
# Submit the count query through the `spark` session. `spark` is not defined in
# the visible cells; presumably it comes from `%run spark.ipynb` — confirm.
ds_wdistances = spark.sql(sql_query)

In [None]:
# Bring the query result back as a pandas DataFrame.
df_wdistances = ds_wdistances.toPandas()

In [None]:
# Display the count result for `training_dataset_wdistances`.
df_wdistances  # 49418401 (presumably the value seen on a previous run — confirm)

In [None]:
import os
from os.path import join, getsize

# Walk every entry directly under the `adjacency_matrix.parquet` dataset of
# `training_dataset_wdistances` and print those that do not contain exactly
# one `*.parquet` file.
adjacency_matrix_dir = DATAPKG['training_dataset_wdistances'] / "adjacency_matrix.parquet"
for folder in adjacency_matrix_dir.glob("*"):
    parquet_files = [*folder.glob("*.parquet")]
    if len(parquet_files) != 1:
        print(folder)