# Summary



# Imports

In [1]:
import os
import subprocess
import shlex
import io
import re
import itertools
from pathlib import Path
from pprint import pprint

In [2]:
import psutil
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
# Let wide frames render (up to 1000 columns) instead of being elided.
pd.options.display.max_columns = 1000

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Parameters

In [5]:
# Output directory for this notebook: $OUTPUT_DIR (current directory when unset)
# joined with 'homology_modeling_dataset'; created if it does not exist yet.
NOTEBOOK_PATH = Path(os.environ.get('OUTPUT_DIR', '.'), 'homology_modeling_dataset').resolve()
NOTEBOOK_PATH.mkdir(parents=True, exist_ok=True)
NOTEBOOK_PATH

PosixPath('/home/kimlab2/database_data/biological-data-warehouse/adjacency-net/notebooks/scop_dataset')

In [6]:
# DATA_DIR must be set in the environment; a missing variable raises KeyError here.
_data_dir = os.environ['DATA_DIR']
DATA_PATH = Path(_data_dir).resolve()
DATA_PATH

PosixPath('/home/kimlab1/database_data')

In [7]:
# DATABIN_DIR must be set in the environment; a missing variable raises KeyError here.
_databin_dir = os.environ['DATABIN_DIR']
DATABIN_PATH = Path(_databin_dir).resolve()
DATABIN_PATH

PosixPath('/home/kimlab2/database_data/databin')

# Spark

In [8]:
import pyspark
from pyspark.sql import SparkSession

In [9]:
# Local Spark session named after the notebook's output directory.
#
# psutil.cpu_count() returns None when the CPU count cannot be determined; in that
# case use Spark's "local[*]" master (all available cores) instead of building the
# invalid URL "local[None]".
n_cpus = psutil.cpu_count()
spark = (
    SparkSession
    .builder
    .master(f"local[{n_cpus or '*'}]")
    .appName(NOTEBOOK_PATH.name)
    # Only the storage fraction is overridden; spark.driver.memory,
    # spark.executor.memory and spark.driver.maxResultSize are not set here.
    .config('spark.memory.storageFraction', 0)
    .getOrCreate()
)

In [10]:
# Display the session object as the cell output.
spark

# Functions

# Databases

## `uniparc`

In [12]:
# List the parquet tables available in the uniparc v0.1.0 release directory.
!ls {DATABIN_PATH}/uniparc/v0.1.0/

chain.parquet		    uniparc_xref2component.parquet
component.parquet	    uniparc_xref2gene_name.parquet
gene_name.parquet	    uniparc_xref2ncbi_gi.parquet
ncbi_gi.parquet		    uniparc_xref2ncbi_taxonomy_id.parquet
ncbi_taxonomy_id.parquet    uniparc_xref2protein_name.parquet
protein_name.parquet	    uniparc_xref2proteome_id.parquet
proteome_id.parquet	    uniparc_xref2uniprot_kb_accession.parquet
uniparc_domain.parquet	    uniparc_xref.parquet
uniparc.parquet		    _uniprot_kb_accession.parquet
uniparc_xref2chain.parquet


### `uniparc_xref.parquet`

#### Sample

In [12]:
# Sample of 10 rows from the uniparc_xref table, read straight from the parquet file.
query = f"""\
select *
from parquet.`{DATABIN_PATH}/uniparc/v0.1.0/uniparc_xref.parquet`
limit 10
"""
ds = spark.sql(query)

In [13]:
# Collect the query result to the driver as a pandas DataFrame.
df = ds.toPandas()

In [14]:
# Display the collected rows.
df

Unnamed: 0,uniparc_id,idx,db_type,db_id,version_i,active,version,created,last,__index_level_0__
0,UPI0000000011,1,PRF,3119392DP,1,Y,,2006-02-13,2009-09-01,0
1,UPI0000000011,2,PRF,3119392TK,1,Y,,2006-02-13,2009-09-01,1
2,UPI0000000011,3,PRF,3121328BV,1,Y,,2006-02-13,2009-09-01,2
3,UPI0000000011,4,PRF,3216358CT,1,Y,,2007-01-01,2009-09-01,3
4,UPI0000000011,5,PRF,3216358FBB,1,Y,,2007-01-01,2009-09-01,4
5,UPI0000000011,6,PRF,3315290CD,1,Y,,2007-12-07,2009-09-01,5
6,UPI0000000011,7,EMBL,AAA48061,1,Y,1.0,2003-03-12,2017-05-26,6
7,UPI0000000011,8,EMBL,AAB96437,1,Y,1.0,2003-03-12,2017-05-26,7
8,UPI0000000011,9,EMBL,AAQ93171,1,Y,1.0,2006-05-04,2017-05-26,8
9,UPI0000000011,10,EMBL,AAR17917,1,Y,1.0,2006-05-04,2017-05-26,9


#### Databases

In [15]:
# Distinct cross-reference database types present in uniparc_xref.
query = f"""\
select distinct(db_type)
from parquet.`{DATABIN_PATH}/uniparc/v0.1.0/uniparc_xref.parquet`
"""
ds = spark.sql(query)

In [16]:
# Collect the query result to the driver as a pandas DataFrame.
df = ds.toPandas()

In [17]:
# Display the collected rows.
df

Unnamed: 0,db_type
0,PATRIC
1,EPO
2,EnsemblProtists
3,PRF
4,TROME
5,UniProtKB/TrEMBL
6,UniProtKB/Swiss-Prot protein isoforms
7,WBParaSite
8,TAIR
9,H-InvDB


#### PDB head

In [18]:
# First 10 uniparc_xref rows whose cross-reference database is PDB.
query = f"""\
select *
from parquet.`{DATABIN_PATH}/uniparc/v0.1.0/uniparc_xref.parquet`
where db_type = 'PDB'
limit 10
"""
ds = spark.sql(query)

In [19]:
# Collect the query result to the driver as a pandas DataFrame.
df = ds.toPandas()

In [20]:
# Display the collected rows.
df

Unnamed: 0,uniparc_id,idx,db_type,db_id,version_i,active,version,created,last,__index_level_0__
0,UPI0000000066,2,PDB,3j7p,1,Y,,2014-12-05,2017-09-15,101
1,UPI0000000066,3,PDB,3j7r,1,Y,,2014-12-05,2017-09-15,102
2,UPI0000000066,4,PDB,4ug0,1,Y,,2015-06-05,2017-09-15,103
3,UPI0000000066,5,PDB,4v6x,1,Y,,2014-12-05,2017-09-15,104
4,UPI0000000066,6,PDB,5a2q,1,Y,,2015-07-10,2017-09-15,105
5,UPI0000000066,7,PDB,5lks,1,Y,,2017-04-21,2017-09-15,106
6,UPI0000000066,8,PDB,5lzs,1,Y,,2016-11-25,2017-09-15,107
7,UPI0000000066,9,PDB,5lzt,1,Y,,2016-11-25,2017-09-15,108
8,UPI0000000066,10,PDB,5lzu,1,Y,,2016-11-25,2017-09-15,109
9,UPI0000000066,11,PDB,5lzv,1,Y,,2016-11-25,2017-09-15,110


### `uniparc_domain.parquet`

#### Sample

In [21]:
# Sample of 10 rows from the uniparc_domain table.
query = f"""\
select *
from parquet.`{DATABIN_PATH}/uniparc/v0.1.0/uniparc_domain.parquet`
limit 10
"""
ds = spark.sql(query)

In [22]:
# Collect the query result to the driver as a pandas DataFrame.
df = ds.toPandas()

In [23]:
# Display the collected rows.
df

Unnamed: 0,uniparc_id,database,database_id,interpro_name,interpro_id,domain_start,domain_end,__index_level_0__
0,UPI0000000011,ProDom,PD012198,Poxvirus I5,IPR006803,11,75,0
1,UPI0000000011,Pfam,PF04713,Poxvirus I5,IPR006803,3,75,1
2,UPI0000000011,PIRSF,PIRSF003768,Poxvirus I5,IPR006803,1,79,2
3,UPI0000000066,Gene3D,G3DSA:3.10.290.10,RNA-binding S4 domain superfamily,IPR036986,35,120,3
4,UPI0000000066,Gene3D,G3DSA:3.10.290.10,RNA-binding S4 domain superfamily,IPR036986,137,185,4
5,UPI0000000066,HAMAP,MF_00485,Ribosomal protein S4e,IPR000876,7,238,5
6,UPI0000000066,ProDom,PD002667,"Ribosomal protein S4e, central region",IPR013845,87,181,6
7,UPI0000000066,Pfam,PF00467,KOW,IPR005824,178,211,7
8,UPI0000000066,Pfam,PF00900,"Ribosomal protein S4e, central region",IPR013845,95,169,8
9,UPI0000000066,Pfam,PF01479,RNA-binding S4 domain,IPR002942,44,90,9


#### Distinct databases

In [24]:
# Distinct domain-annotation databases present in uniparc_domain.
query = f"""\
select distinct(database)
from parquet.`{DATABIN_PATH}/uniparc/v0.1.0/uniparc_domain.parquet`
"""
ds = spark.sql(query)

In [25]:
# Collect the query result to the driver as a pandas DataFrame.
df = ds.toPandas()

In [26]:
# Display the collected rows.
df

Unnamed: 0,database
0,PRINTS
1,PROSITE
2,PANTHER
3,ProDom
4,TIGRFAMs
5,SUPFAM
6,SFLD
7,Gene3D
8,SMART
9,Pfam


### `chain.parquet`

In [29]:
# Sample of 100 rows from the chain property table.
query = f"""\
select *
from parquet.`{DATABIN_PATH}/uniparc/v0.1.0/chain.parquet`
limit 100
"""
ds = spark.sql(query)

In [30]:
# Collect the query result to the driver as a pandas DataFrame.
df = ds.toPandas()

In [31]:
# Display the collected rows.
df

Unnamed: 0,uniparc_id,name,idx,value,__index_level_0__
0,UPI0000000066,chain,1,3j7pSE,0
1,UPI0000000066,chain,2,3j7rSE,1
2,UPI0000000066,chain,3,4ug0SE,2
3,UPI0000000066,chain,4,4v6xAE,3
4,UPI0000000066,chain,5,5a2qE,4
5,UPI0000000066,chain,6,5lksSE,5
6,UPI0000000066,chain,7,5lzsEE,6
7,UPI0000000066,chain,8,5lztEE,7
8,UPI0000000066,chain,9,5lzuEE,8
9,UPI0000000066,chain,10,5lzvEE,9


### `uniparc_xref2chain`

In [68]:
# Resolve active PDB cross-references to their chain values:
# uniparc_xref -> uniparc_xref2chain (on uniparc_id + xref idx)
#              -> chain             (on uniparc_id + property idx).
query = f"""\
select 
    ux.uniparc_id, ux.db_id pdb_id,
    x2c.uniparc_xref_idx, x2c.property_idx, c.value pdb_id_with_chain
from parquet.`{DATABIN_PATH}/uniparc/v0.1.0/uniparc_xref.parquet` ux
join parquet.`{DATABIN_PATH}/uniparc/v0.1.0/uniparc_xref2chain.parquet` x2c ON (
    ux.uniparc_id = x2c.uniparc_id and ux.idx = x2c.uniparc_xref_idx)
join parquet.`{DATABIN_PATH}/uniparc/v0.1.0/chain.parquet` c ON (
    x2c.uniparc_id = c.uniparc_id and x2c.property_idx = c.idx)
where ux.db_type = "PDB"
    and ux.active = "Y"
limit 100
"""
ds = spark.sql(query)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34119)
Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 0))



Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:34119)

In [67]:
# Collect the join result to the driver as a pandas DataFrame.
# Requires a live Spark session and `ds` from the preceding query cell.
df = ds.toPandas()

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34119)
Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34119)
Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of 

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34119)
Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-67-d81406fbb00c>", line 1, in <module>
 

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34119)
Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-67-d81406fbb00c>", line 1, in <module>
 

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34119)
Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-67-d81406fbb00c>", line 1, in <module>
 

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34119)
Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-67-d81406fbb00c>", line 1, in <module>
 

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34119)
Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/py4j/java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kimlab1/strokach/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-67-d81406fbb00c>", line 1, in <module>
 

Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:34119)

In [63]:
# Display the collected rows.
df

Unnamed: 0,uniparc_id,pdb_id,uniparc_xref_idx,property_idx,value
0,UPI0000000066,3j7p,2,1,3j7pSE
1,UPI0000000066,3j7r,3,2,3j7rSE
2,UPI0000000066,4ug0,4,3,4ug0SE
3,UPI0000000066,4v6x,5,4,4v6xAE
4,UPI0000000066,5a2q,6,5,5a2qE
5,UPI0000000066,5lks,7,6,5lksSE
6,UPI0000000066,5lzs,8,7,5lzsEE
7,UPI0000000066,5lzt,9,8,5lztEE
8,UPI0000000066,5lzu,10,9,5lzuEE
9,UPI0000000066,5lzv,11,10,5lzvEE


## `uniparc_domain`

In [11]:
# List the parquet tables available in the uniparc_domain 0.1 release directory.
!ls {DATABIN_PATH}/uniparc_domain/0.1/

adjacency_matrix.parquet


### `adjacency_matrix.parquet`

In [12]:
# Sample of 10 rows from the adjacency_matrix table.
query = f"""\
select *
from parquet.`{DATABIN_PATH}/uniparc_domain/0.1/adjacency_matrix.parquet`
limit 10
"""
ds = spark.sql(query)

In [11]:
# Collect the query result to the driver as a pandas DataFrame.
# `ds` must already exist: run the preceding query cell first on a fresh kernel.
df = ds.toPandas()

NameError: name 'ds' is not defined

In [None]:
# Show only the first two collected rows.
df.head(2)

In [None]:
# Sample of 10 rows from the adjacency_matrix table
# (same query text as the earlier adjacency_matrix sample cell).
query = f"""\
select *
from parquet.`{DATABIN_PATH}/uniparc_domain/0.1/adjacency_matrix.parquet`
limit 10
"""
ds = spark.sql(query)

In [None]:
# Collect the query result to the driver as a pandas DataFrame.
df = ds.toPandas()

In [None]:
# Show only the first two collected rows.
df.head(2)

## Combine

In [None]:
# Combine active PDB cross-references with UniParc domains and adjacency-matrix rows.
#
# * The PDB identifier of a cross-reference lives in `uniparc_xref.db_id`
#   (the table has no `id` column).
# * `uniparc_xref` and `uniparc_domain` are matched on their shared `uniparc_id`.
#
# TODO(review): the `adjacency_matrix` join has no ON clause because that table's key
# columns are not shown in this notebook. As written it is a cartesian product with
# the rows above; add the proper key (presumably uniparc_id plus the domain
# coordinates -- confirm against the table schema) before collecting any results.
uniparc_domains_with_structures_ds = spark.sql(f"""\
    select
    ux.uniparc_id, ux.db_id pdb_id,
    am.domain_start, am.domain_end, am.domain_length, am.structure_id, am.model_id, am.chain_id, am.pc_identity,
    am.__index_level_0__
    from parquet.`{DATABIN_PATH}/uniparc/v0.1.0/uniparc_xref.parquet` ux
    join parquet.`{DATABIN_PATH}/uniparc/v0.1.0/uniparc_domain.parquet` ud
        on (ux.uniparc_id = ud.uniparc_id)
    join parquet.`{DATABIN_PATH}/uniparc_domain/0.1/adjacency_matrix.parquet` am
    where ux.db_type = "PDB"
    and ux.active = "Y"
""")