# Summary

Split the Gene3D domains into training, validation, and test sets, and save each list of domains as a YAML file.

----

# Imports

In [1]:
import os
import random
import yaml
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Use the fully qualified option name: option keys are matched as patterns, and
# the bare "max_columns" is ambiguous on pandas versions that also define
# "styler.render.max_columns" (set_option then raises OptionError).
pd.set_option("display.max_columns", 100)

# Parameters

In [3]:
# Relative path named after this notebook; its last component is reused below
# as the default output directory name.
NOTEBOOK_PATH = Path("training_validation_test_split")
NOTEBOOK_PATH

PosixPath('training_validation_test_split')

In [4]:
# Output directory: $OUTPUT_DIR when that variable is set, otherwise a directory
# named after the notebook (resolved against the current working directory).
OUTPUT_PATH = Path(
    os.environ.get('OUTPUT_DIR', NOTEBOOK_PATH.name)
).resolve()
# Create the directory (and any missing parents); a no-op if it already exists.
OUTPUT_PATH.mkdir(exist_ok=True, parents=True)
OUTPUT_PATH

PosixPath('/home/kimlab2/database_data/datapkg/adjacency-net-v2/notebooks/training_validation_test_split')

# `DATAPKG`

In [5]:
# Registry mapping a data package name to the directory holding its outputs.
DATAPKG = dict()

In [6]:
# Location of the "uniparc-domain-wstructure" package ("master" subdirectory)
# under $DATAPKG_OUTPUT_DIR. Raises KeyError if that variable is not set.
DATAPKG['uniparc-domain-wstructure'] = (
    Path(os.environ['DATAPKG_OUTPUT_DIR']) / "uniparc-domain-wstructure" / "master"
)

# Load data

In [7]:
# Path to the "adjacency_matrix.parquet" entry produced by the
# "remove_duplicate_matches" step of the uniparc-domain-wstructure package.
adjacency_matrix_path = (
    DATAPKG['uniparc-domain-wstructure']
    / "remove_duplicate_matches"
    / "adjacency_matrix.parquet"
)
adjacency_matrix_path

PosixPath('/home/kimlab1/database_data/datapkg_output_dir/uniparc-domain-wstructure/master/remove_duplicate_matches/adjacency_matrix.parquet')

# Construct training / validation / test datasets

## All Gene3D domains

In [8]:
# Collect the names of all "database_id=*" entries under the adjacency matrix
# path. Sorting first makes the starting order independent of the order in
# which glob() yields entries.
GENE3D_DOMAINS = [
    partition.name for partition in adjacency_matrix_path.glob("database_id=*")
]
GENE3D_DOMAINS.sort()

# Fixed seed so the shuffle (and therefore the split below) is reproducible.
random.seed(42)
random.shuffle(GENE3D_DOMAINS)
GENE3D_DOMAINS[:3]

['database_id=G3DSA%3A2.40.128.20',
 'database_id=G3DSA%3A3.50.40.10',
 'database_id=G3DSA%3A2.60.40.830']

In [9]:
# Save the shuffled list of all domains as a block-style YAML sequence.
# Write into OUTPUT_PATH (the resolved directory created above) rather than the
# relative NOTEBOOK_PATH, so that the OUTPUT_DIR override is honoured and the
# target directory is guaranteed to exist. With OUTPUT_DIR unset, both names
# point at the same directory.
with OUTPUT_PATH.joinpath("all_gene3d_domains.yaml").open("wt") as fout:
    yaml.dump(GENE3D_DOMAINS, fout, default_flow_style=False)

## Split into training / validation / test

In [10]:
# Cut the shuffled list at 3/4 and 7/8 of its length: the first three quarters
# become the training set, and the remainder is divided between validation
# and test.
n_domains = len(GENE3D_DOMAINS)

breakpoint1 = n_domains * 3 // 4
print(breakpoint1)
breakpoint2 = n_domains * 7 // 8
print(breakpoint2)

TRAINING_DOMAINS, VALIDATION_DOMAINS, TEST_DOMAINS = (
    GENE3D_DOMAINS[:breakpoint1],
    GENE3D_DOMAINS[breakpoint1:breakpoint2],
    GENE3D_DOMAINS[breakpoint2:],
)

# Report the size of the full list followed by the size of each split.
print('----')
for domains in (GENE3D_DOMAINS, TRAINING_DOMAINS, VALIDATION_DOMAINS, TEST_DOMAINS):
    print(len(domains))

1029
1201
----
1373
1029
172
172


In [11]:
# Sanity check: the sizes of the three splits add up to the size of the full list.
assert sum(
    map(len, (TRAINING_DOMAINS, VALIDATION_DOMAINS, TEST_DOMAINS))
) == len(GENE3D_DOMAINS)

In [12]:
# Save each split as "<split>_domains.yaml" (block-style YAML sequence).
# Files are written into OUTPUT_PATH (the resolved directory created above)
# rather than the relative NOTEBOOK_PATH, so that the OUTPUT_DIR override is
# honoured and the target directory is guaranteed to exist. With OUTPUT_DIR
# unset, both names point at the same directory.
for split_name, split_domains in [
    ("training", TRAINING_DOMAINS),
    ("validation", VALIDATION_DOMAINS),
    ("test", TEST_DOMAINS),
]:
    with OUTPUT_PATH.joinpath(f"{split_name}_domains.yaml").open("wt") as fout:
        yaml.dump(split_domains, fout, default_flow_style=False)