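# Summary

Add the ungapped query-sequence length (`qseq_length`) and its 20-wide bin (`qseq_length_bin`) to each thresholded adjacency-matrix dataset (training / validation / test), and write the result as parquet partitioned by `qseq_length_bin`.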

---

# Imports

In [1]:
import urllib
import urllib.parse

In [2]:
%run _imports.ipynb

Setting the PACKAGE_NAME environment variable.
Setting the PACKAGE_VERSION environment variable.
Setting the DOCS_SECRET_KEY environment variable.
Setting the PYTHON_VERSION environment variable.
Setting the SPARK_MASTER environment variable.
Setting the SPARK_ARGS environment variable.
Setting the DB_TYPE environment variable.
Setting the DB_PORT environment variable.


2018-01-25 18:50:44.737987


In [3]:
%run _settings.ipynb

/home/kimlab2/database_data/databin/uniparc_domain/0.1/adjacency_matrix.parquet


In [4]:
%run _spark.ipynb

# Parameters

In [5]:
# Name of this notebook; also used as the name of its output directory.
NOTEBOOK_NAME = 'group_by_sequence_length'

# Absolute output directory, anchored at the current working directory.
NOTEBOOK_PATH = Path.cwd().joinpath(NOTEBOOK_NAME)
NOTEBOOK_PATH.mkdir(parents=True, exist_ok=True)

# Load Datasets

## Gene3D domains

In [6]:
# Unpickle the Gene3D domain collection stored under `generate_datasets/`.
# NOTE(review): unpickling runs arbitrary code — only load files this project wrote.
gene3d_pickle = Path('generate_datasets/gene3d_domains.pickle')
GENE3D_DOMAINS = pickle.loads(gene3d_pickle.read_bytes())

## Training / validation / test domains

In [7]:
# Unpickle the domain collection of each data split from `generate_datasets/`.
# NOTE(review): unpickling runs arbitrary code — only load files this project wrote.
TRAINING_DOMAINS = pickle.loads(
    Path('generate_datasets/training_domains.pickle').read_bytes())

VALIDATION_DOMAINS = pickle.loads(
    Path('generate_datasets/validation_domains.pickle').read_bytes())

TEST_DOMAINS = pickle.loads(
    Path('generate_datasets/test_domains.pickle').read_bytes())

## Training / validation / test parquet files

In [8]:
# Unpickle the parquet-file collection of each data split from `generate_datasets/`.
# NOTE(review): unpickling runs arbitrary code — only load files this project wrote.
TRAINING_PARQUET_FILES = pickle.loads(
    Path('generate_datasets/training_parquet_files.pickle').read_bytes())

VALIDATION_PARQUET_FILES = pickle.loads(
    Path('generate_datasets/validation_parquet_files.pickle').read_bytes())

TEST_PARQUET_FILES = pickle.loads(
    Path('generate_datasets/test_parquet_files.pickle').read_bytes())

## Query where strings

In [9]:
# SQL `IN (...)` literal for the training domains: URL-decode each entry, drop its
# first 12 characters (presumably a `database_id=` prefix — TODO confirm) and quote it.
_training_ids = [urllib.parse.unquote(domain)[12:] for domain in TRAINING_DOMAINS]
TRAINING_DOMAINS_STRING = "('" + "', '".join(_training_ids) + "')"

In [10]:
# SQL `IN (...)` literal for the validation domains: URL-decode each entry, drop its
# first 12 characters (presumably a `database_id=` prefix — TODO confirm) and quote it.
_validation_ids = [urllib.parse.unquote(domain)[12:] for domain in VALIDATION_DOMAINS]
VALIDATION_DOMAINS_STRING = "('" + "', '".join(_validation_ids) + "')"

In [11]:
# SQL `IN (...)` literal for the test domains: URL-decode each entry, drop its
# first 12 characters (presumably a `database_id=` prefix — TODO confirm) and quote it.
_test_ids = [urllib.parse.unquote(domain)[12:] for domain in TEST_DOMAINS]
TEST_DOMAINS_STRING = "('" + "', '".join(_test_ids) + "')"

In [12]:
# Sanity check: print only the first 600 characters of the training filter string.
training_preview = TRAINING_DOMAINS_STRING[:600]
print(training_preview)

('G3DSA:2.40.128.20', 'G3DSA:3.50.40.10', 'G3DSA:2.60.40.830', 'G3DSA:2.20.50.20', 'G3DSA:2.40.50.240', 'G3DSA:1.10.569.10', 'G3DSA:2.40.40.10', 'G3DSA:4.10.1240.10', 'G3DSA:4.10.1080.10', 'G3DSA:3.90.1170.20', 'G3DSA:3.30.70.80', 'G3DSA:4.10.20.10', 'G3DSA:1.20.1520.10', 'G3DSA:3.30.1120.40', 'G3DSA:3.90.1650.10', 'G3DSA:1.10.250.10', 'G3DSA:1.10.1390.10', 'G3DSA:3.40.1550.10', 'G3DSA:2.60.490.10', 'G3DSA:1.10.530.40', 'G3DSA:3.30.910.10', 'G3DSA:1.20.1480.10', 'G3DSA:3.30.1390.20', 'G3DSA:1.10.150.170', 'G3DSA:1.10.238.80', 'G3DSA:3.90.1360.10', 'G3DSA:1.20.90.10', 'G3DSA:3.30.920.20', 'G3DS


# Group by sequence length

## Test query (validation domains)

In [13]:
# Trial query: every adjacency-matrix column for the validation domains, plus
# `qseq_length_bin` — the length of `qseq` with '-' characters removed, floored
# to a multiple of 20.
# FLOOR (rather than ROUND) is used so that the bin edges match the ones
# produced by the partitioning query in the "Partition by sequence length"
# section; with ROUND, bin 160 would cover lengths 150-169 instead of 160-179.
query = spark.sql(f"""\
SELECT
    am.*,
    CAST(FLOOR(LENGTH(translate(am.qseq, '-', '')) / 20) * 20 AS INT) qseq_length_bin
FROM parquet.`{ADJACENCY_MATRIX_PATH}` am
WHERE database_id in {VALIDATION_DOMAINS_STRING}
""")

In [14]:
# Bring only the first two rows to the driver and show them as a pandas frame.
two_rows = query.limit(2)
df = two_rows.toPandas()
df

Unnamed: 0,uniparc_id,sequence,database,interpro_name,interpro_id,domain_start,domain_end,__index_level_0__,domain_length,structure_id,model_id,chain_id,pc_identity,alignment_length,mismatches,gap_opens,q_start,q_end,s_start,s_end,evalue_log10,bitscore,qseq,sseq,a2b,b2a,residue_idx_1,residue_idx_2,residue_id_1,residue_id_2,residue_aa_1,residue_aa_2,residue_idx_1_corrected,residue_idx_2_corrected,database_id,qseq_length_bin
0,UPI0006697D87,AAAGALAVTLFAGVFLLPLAVILLSSLSKQWNGLLPTGFTFAHFVNAFRGAAWDSLFSSLMVGFCASLLALLCGMWAALALRQYGATLQKYLGLAFYLPSAIPSVSVGLGILVAFS...,Gene3D,MetI-like superfamily,IPR035906,10,265,849493775,256,2ONK,0.0,E,26.82,179.0,119.0,5.0,39.0,206.0,33.0,210.0,-5.69897,49.3,FTFAHFVNAFRG-AAWDSLFSSLMVGFCASLLALLCGMWAALALRQYGATLQKYLGLAFYLPSAIPSVSVGLGILVAF-------SQGPLQ-MNGTFWIVLAAHFVLISAFTFSNV...,FNFDEFLKAASDPAVWKVVLTTYYAALISTLIAVIFGTPLAYILARKSFPGKSVVEGIVDLPVVIPHTVAGIALLVVFGSSGLIGSFSPLKFVDALPGIVVAMLFVSVPIY-INQA...,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, None, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0...","[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0...","[32.0, 32.0, 33.0, 33.0, 33.0, 33.0, 33.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 35.0, 35.0, 35.0, 35.0, 35.0, 3...","[33.0, 34.0, 32.0, 34.0, 35.0, 36.0, 37.0, 32.0, 33.0, 35.0, 36.0, 37.0, 38.0, 39.0, 33.0, 34.0, 36.0, 37.0, 38.0, 3...","[56.0, 56.0, 57.0, 57.0, 57.0, 57.0, 57.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 59.0, 59.0, 59.0, 59.0, 59.0, 5...","[57.0, 58.0, 56.0, 58.0, 59.0, 60.0, 61.0, 56.0, 57.0, 59.0, 60.0, 61.0, 62.0, 63.0, 57.0, 58.0, 60.0, 61.0, 62.0, 6...","[F, F, N, N, N, N, N, F, F, F, F, F, F, F, D, D, D, D, D, D, D, E, E, E, E, E, E, E, E, F, F, F, F, F, F, F, F, L, L...","[N, F, F, F, D, E, F, F, N, D, E, F, L, K, N, F, E, F, L, K, A, N, F, D, F, L, K, A, A, N, F, D, E, L, K, A, A, F, D...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, ...","[1.0, 2.0, 0.0, 2.0, 3.0, 4.0, 5.0, 0.0, 1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, ...",G3DSA:1.10.3720.10,160
1,UPI000851A9D7,AAAKLLLLALVAAVVVPVLHGRWGGGIWPEALTADLSAPLGEVTDWIVSNRDSHPLFLYFFGHISNAVVLSVRGVYLVLLALGWAGVTVFGAAVAWRVAGIRLALTAGVSFLLCGL...,Gene3D,MetI-like superfamily,IPR035906,29,339,234660127,311,4YMU,0.0,C,26.14,153.0,106.0,3.0,126.0,271.0,22.0,174.0,-1.431798,36.2,TLALMVVAVLASVVLGLLLGLAAGLSDRVFRIL-RPVLDTMQVLPAFAYLLPV---VLVFGIGVPG---AVLATVVYAAPPMARLTALGLRGADSGVMEAVTSLGATGRQRLLSAR...,TLKLTFLAVTIGVLMGLFIALMKMSSIKPIKLVASSYIEVIRGTPLLVQLLLIYNGLMQFGMNIPAFTAGVSALAINSSAYVAEIIRAGIQAVDPGQNEAARSLGMTHAMAMRYVI...,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0...","[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0...","[21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 23.0, 23.0, 2...","[22.0, 23.0, 24.0, 25.0, 26.0, 156.0, 159.0, 169.0, 170.0, 171.0, 21.0, 23.0, 24.0, 25.0, 26.0, 27.0, 156.0, 21.0, 2...","[22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 24.0, 24.0, 2...","[23.0, 24.0, 25.0, 26.0, 27.0, 157.0, 160.0, 170.0, 171.0, 172.0, 22.0, 24.0, 25.0, 26.0, 27.0, 28.0, 157.0, 22.0, 2...","[T, T, T, T, T, T, T, T, T, T, L, L, L, L, L, L, L, K, K, K, K, K, K, K, K, L, L, L, L, L, L, L, L, L, L, L, L, L, L...","[L, K, L, T, F, L, S, D, L, T, T, K, L, T, F, L, L, T, L, L, T, F, L, A, F, T, L, K, T, F, L, A, V, F, G, V, L, S, V...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...","[1.0, 2.0, 3.0, 4.0, 5.0, 128.0, 131.0, 141.0, 142.0, 143.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 128.0, 0.0, 1.0, 3.0, 4.0...",G3DSA:1.10.3720.10,140


## Partition by sequence length

In [25]:
# For every (subset, identity cutoff) pair, read the thresholded adjacency
# matrix from `threshold_by_pc_identity/`, add
#   * `qseq_length`     — length of `qseq` with '-' characters removed, and
#   * `qseq_length_bin` — that length floored to a multiple of 20,
# and write the result under NOTEBOOK_PATH, partitioned by `qseq_length_bin`.
#
# A pair is skipped when its output already carries the `_SUCCESS` marker that
# Spark's default output committer writes after a completed job. Checking the
# marker on disk (instead of a `seen` set left over in the kernel) lets the
# cell run on a fresh kernel and resume correctly after an interrupted run:
# a half-written output has no marker and is simply overwritten.
for subset in ['training', 'validation', 'test']:
    for cutoff in [0, 40, 60, 80]:
        print(subset, cutoff, end='')
        input_path = op.abspath(f'threshold_by_pc_identity/adjacency_matrix_{subset}_gt{cutoff}.parquet')
        output_path = NOTEBOOK_PATH.joinpath(f'adjacency_matrix_{subset}_gt{cutoff}_gbseqlen.parquet').absolute()
        if output_path.joinpath('_SUCCESS').is_file():
            print("\t\tskiping...", flush=True)
            continue
        print(flush=True)
        query = spark.sql(dedent(f"""\
            SELECT
                am.*,
                LENGTH(translate(am.qseq, '-', '')) qseq_length,
                CAST(FLOOR(LENGTH(translate(am.qseq, '-', '')) / 20) * 20 AS INT) qseq_length_bin
            FROM parquet.`{input_path}` am
            """))
        # No need to pre-create `output_path`: the overwrite-mode write creates
        # (or replaces) the directory itself.
        query.write.parquet(
            output_path.as_posix(),
            mode='overwrite',
            partitionBy='qseq_length_bin',
        )

training 0		skiping...
training 40		skiping...
training 60		skiping...
training 80		skiping...
validation 0		skiping...
validation 40		skiping...
validation 60		skiping...
validation 80		skiping...
test 0		skiping...
test 40
test 60
test 80
