# Summary

---

# Imports

In [1]:
import urllib
import urllib.parse

In [2]:
# Execute the shared imports notebook. Its output below shows it also sets
# several environment variables. Names used later without a local import
# (e.g. `Path`, `pickle`, `os`) presumably come from here — verify.
%run _imports.ipynb

Setting the PACKAGE_NAME environment variable.
Setting the PACKAGE_VERSION environment variable.
Setting the DOCS_SECRET_KEY environment variable.
Setting the PYTHON_VERSION environment variable.
Setting the SPARK_MASTER environment variable.
Setting the SPARK_ARGS environment variable.
Setting the DB_TYPE environment variable.
Setting the DB_PORT environment variable.


2018-01-25 17:44:58.774187


In [3]:
# Execute the shared settings notebook. It prints the adjacency-matrix parquet
# path; presumably this is where ADJACENCY_MATRIX_PATH is defined — verify.
%run _settings.ipynb

/home/kimlab2/database_data/databin/uniparc_domain/0.1/adjacency_matrix.parquet


In [4]:
# Execute the shared Spark setup notebook; presumably creates the `spark`
# session object used by the SQL queries further down — verify.
%run _spark.ipynb

# Parameters

In [5]:
# Outputs of this notebook are kept in a directory named after the notebook,
# located directly under the current working directory.
NOTEBOOK_NAME = 'threshold_by_pc_identity'
NOTEBOOK_PATH = Path.cwd() / NOTEBOOK_NAME

# Create the output directory up front; this is a no-op when it already exists.
NOTEBOOK_PATH.mkdir(exist_ok=True, parents=True)

# Load datasets

## Gene3D domains

In [6]:
# Load the pickled Gene3D domain collection (path is relative to the working
# directory; the file is presumably written by the `generate_datasets`
# notebook — verify).
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
with open('generate_datasets/gene3d_domains.pickle', 'rb') as fin:
    GENE3D_DOMAINS = pickle.load(fin)

## Training / validation / test domains

In [7]:
# Load the pickled domain collection of each data subset. The three files
# differ only in the subset name, so the path is built from it instead of
# repeating the same stanza three times.
# (Files are presumably written by the `generate_datasets` notebook — verify.)
# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source.
TRAINING_DOMAINS, VALIDATION_DOMAINS, TEST_DOMAINS = (
    pickle.loads(Path(f'generate_datasets/{subset}_domains.pickle').read_bytes())
    for subset in ('training', 'validation', 'test')
)

## Training / validation / test parquet files

In [8]:
# Load the pickled parquet-file listing of each data subset. The three files
# differ only in the subset name, so the path is built from it instead of
# repeating the same stanza three times.
# (Files are presumably written by the `generate_datasets` notebook — verify.)
# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source.
TRAINING_PARQUET_FILES, VALIDATION_PARQUET_FILES, TEST_PARQUET_FILES = (
    pickle.loads(Path(f'generate_datasets/{subset}_parquet_files.pickle').read_bytes())
    for subset in ('training', 'validation', 'test')
)

## SQL where strings

In [9]:
# SQL `IN (...)` list for the training subset.
# Entries are expected to be URL-quoted partition names such as
# 'database_id=G3DSA%3A2.40.128.20': unquote each one and drop the leading
# 'database_id=' so only the bare id ('G3DSA:2.40.128.20') is quoted into the list.
TRAINING_DOMAINS_STRING = "('{}')".format("', '".join(
    urllib.parse.unquote(d)[len('database_id='):] for d in TRAINING_DOMAINS
))

In [10]:
# SQL `IN (...)` list for the validation subset.
# Entries are expected to be URL-quoted partition names of the form
# 'database_id=<domain id>': unquote each one and drop the leading
# 'database_id=' so only the bare domain id is quoted into the list.
VALIDATION_DOMAINS_STRING = "('{}')".format("', '".join(
    urllib.parse.unquote(d)[len('database_id='):] for d in VALIDATION_DOMAINS
))

In [11]:
# SQL `IN (...)` list for the test subset.
# Entries are expected to be URL-quoted partition names of the form
# 'database_id=<domain id>': unquote each one and drop the leading
# 'database_id=' so only the bare domain id is quoted into the list.
TEST_DOMAINS_STRING = "('{}')".format("', '".join(
    urllib.parse.unquote(d)[len('database_id='):] for d in TEST_DOMAINS
))

In [12]:
# Sanity check: show only the first 600 characters of the generated SQL list
# rather than dumping the whole string.
print(TRAINING_DOMAINS_STRING[:600])

('G3DSA:2.40.128.20', 'G3DSA:3.50.40.10', 'G3DSA:2.60.40.830', 'G3DSA:2.20.50.20', 'G3DSA:2.40.50.240', 'G3DSA:1.10.569.10', 'G3DSA:2.40.40.10', 'G3DSA:4.10.1240.10', 'G3DSA:4.10.1080.10', 'G3DSA:3.90.1170.20', 'G3DSA:3.30.70.80', 'G3DSA:4.10.20.10', 'G3DSA:1.20.1520.10', 'G3DSA:3.30.1120.40', 'G3DSA:3.90.1650.10', 'G3DSA:1.10.250.10', 'G3DSA:1.10.1390.10', 'G3DSA:3.40.1550.10', 'G3DSA:2.60.490.10', 'G3DSA:1.10.530.40', 'G3DSA:3.30.910.10', 'G3DSA:1.20.1480.10', 'G3DSA:3.30.1390.20', 'G3DSA:1.10.150.170', 'G3DSA:1.10.238.80', 'G3DSA:3.90.1360.10', 'G3DSA:1.20.90.10', 'G3DSA:3.30.920.20', 'G3DS


# Run

In [13]:
# Peek at ten entries of the adjacency-matrix directory; as the output shows,
# each entry is a URL-quoted `database_id=<domain id>` partition folder.
# os.listdir returns entries in arbitrary order, so this sample may vary.
os.listdir(ADJACENCY_MATRIX_PATH)[:10]

['database_id=G3DSA%3A1.10.10.400',
 'database_id=G3DSA%3A2.40.128.120',
 'database_id=G3DSA%3A4.10.800.10',
 'database_id=G3DSA%3A1.20.10.10',
 'database_id=G3DSA%3A3.40.50.1260',
 'database_id=G3DSA%3A2.60.40.1200',
 'database_id=G3DSA%3A1.20.5.1010',
 'database_id=G3DSA%3A1.20.900.10',
 'database_id=G3DSA%3A1.20.5.210',
 'database_id=G3DSA%3A1.20.5.260']

## Run SQL queries

In [14]:
# NOTE(review): this export step is disabled. When enabled it writes, for each
# (subset, cutoff) pair, the adjacency-matrix rows of that subset's domains
# with pc_identity above the cutoff to
# NOTEBOOK_PATH/adjacency_matrix_{subset}_gt{cutoff}.parquet, partitioned by
# database_id and overwriting any existing output. Presumably it was commented
# out after the exports had been written once — confirm before re-enabling.
# domain_strings = {
#     'training': TRAINING_DOMAINS_STRING,
#     'validation': VALIDATION_DOMAINS_STRING,
#     'test': TEST_DOMAINS_STRING,
# }

# seen = {
# }

# for subset in ['training', 'validation', 'test']:
#     for cutoff in [0, 40, 60, 80]:
#         print(subset, cutoff, flush=True)
#         if (subset, cutoff) in seen:
#             print("skipping...")
#             continue
#         domain_string = domain_strings[subset]
#         output_path = NOTEBOOK_PATH.joinpath(f'adjacency_matrix_{subset}_gt{cutoff}.parquet').absolute()
#         output_path.mkdir(parents=True, exist_ok=True)
#         query = spark.sql(f"""\
#             SELECT *
#             FROM parquet.`{ADJACENCY_MATRIX_PATH}`
#             WHERE database_id in {domain_string}
#             AND pc_identity > {cutoff}
#         """)
#         query.write.parquet(
#             output_path.as_posix(),
#             mode='overwrite',
#             partitionBy='database_id',
#         )

## Validate results

In [15]:
# Check that each thresholded parquet export under NOTEBOOK_PATH holds exactly
# as many rows as the equivalent filter selects from the full adjacency matrix.
domain_strings = {
    'training': TRAINING_DOMAINS_STRING,
    'validation': VALIDATION_DOMAINS_STRING,
    'test': TEST_DOMAINS_STRING,
}

# (subset, cutoff) combinations listed here are skipped, i.e. NOT validated.
# NOTE(review): ('training', 0) is excluded — presumably it was already checked
# in an earlier session; remove it to validate every combination.
seen = {
    ('training', 0)
}

for subset in ['training', 'validation', 'test']:
    for cutoff in [0, 40, 60, 80]:
        print(subset, cutoff, flush=True)
        if (subset, cutoff) in seen:
            print("skipping...")
            continue
        domain_string = domain_strings[subset]
        # Deliberately no mkdir here: validation must not create the export
        # directory, otherwise a missing export is masked by an empty folder.
        output_path = NOTEBOOK_PATH.joinpath(f'adjacency_matrix_{subset}_gt{cutoff}.parquet').absolute()
        # Number of rows the filter selects from the source data.
        count1 = spark.sql(f"""\
            SELECT COUNT(*)
            FROM parquet.`{ADJACENCY_MATRIX_PATH}`
            WHERE database_id in {domain_string}
            AND pc_identity > {cutoff}
        """).take(1)
        # Number of rows actually present in the export.
        count2 = spark.sql(f"""\
            SELECT COUNT(*)
            FROM parquet.`{output_path}`
        """).take(1)
        # Read the single aggregate positionally so the check does not depend
        # on the auto-generated result column name ('count(1)').
        n_expected, n_exported = count1[0][0], count2[0][0]
        assert n_expected == n_exported, (
            f"Row count mismatch for {subset} (pc_identity > {cutoff}): "
            f"expected {n_expected}, found {n_exported} in {output_path}"
        )

training 0
training 40
training 60
training 80
validation 0
validation 40
validation 60
validation 80
test 0
test 40
test 60
test 80
