## Imports

In [1]:
import gzip
import shlex
import subprocess
import tempfile
from pathlib import Path
import contextlib

import pandas as pd
from tqdm.notebook import tqdm

import proteinsolver
import pyarrow as pa
import pyarrow.parquet as pq
from kmbio import PDB
from kmtools import structure_tools

## Parameters

In [19]:
# Output directory named after this notebook. The path is relative, so it is created in
# the current working directory; files written further down are placed inside it.
NOTEBOOK_NAME = "07_protein_analysis_experimental"
NOTEBOOK_PATH = Path(NOTEBOOK_NAME)
NOTEBOOK_PATH.mkdir(exist_ok=True)  # exist_ok: re-running the cell must not fail
NOTEBOOK_PATH

PosixPath('07_protein_analysis_experimental')

In [2]:
# Identifier of the model run to analyse; it is one of the keys of BEST_STATE_FILES.
# The alternatives are kept commented out so they can be switched in by hand.
UNIQUE_ID = "191f05de"  # No attention
# UNIQUE_ID = "0007604c"  # 5-layer graph-conv with attention, batch_size=1
# UNIQUE_ID = "91fc9ab9"  # 4-layer graph-conv with attention, batch_size=4

In [3]:
# Maps a run identifier (see UNIQUE_ID) to the relative path of a saved `.state` file.
# NOTE(review): presumably the best checkpoint of each training run — confirm.
BEST_STATE_FILES = {
    #
    "191f05de": "protein_train/191f05de/e53-s1952148-d93703104.state"
}

In [4]:
# Root of the data-package directory. `~` is expanded and symlinks are resolved, so the
# displayed value is the absolute location actually used on this machine.
DATAPKG_DATA_DIR = Path("~/datapkg_data_dir").expanduser().resolve()
DATAPKG_DATA_DIR

PosixPath('/home/kimlab1/database_data/datapkg_output_dir')

## Designs

In [5]:
# Designed sequences, one tuple per design. The five fields are, in order:
# (filename, domain_id, selection_method, score, sequence) — the same labels that are used
# as column names when this list is converted to a DataFrame.
designs = [
    # First batch
    (
        "1n5u_DopeNormalized.pdb",
        "1n5uA03",
        "DopeNormalized",
        10,
        "GSGTEEFLKFAKERLSRRFPEASEEEVERLTEIETKVKECCRAGDEKTCKECRSNLASYVCSNRDLLSDDLKECCEKPTSDISSCLSNVTKN",
    ),
    (
        "4beu_DopeHD.pdb",
        "4beuA02",
        "DopeHD",
        7,
        "MEAFRAALEAFRAALDAGVRLAALLRRAARARGLRALVPELLARGLPAVGVDSLAEARAVREAGFRGDVLFVRSATLEEIAAALATGLAVVVGSDEQAAALAALAAEAGRALAVELGLSVAGPARDGLDLSTAEGVARAAALARVAGLAVRGILDHLPDEDAERVRARLEAFRAAAALLLEAAGLPRDGLAVHVGDSRAARALPEARPDLVYVGAEL",
    ),
    (
        "4unu_Rosetta.pdb",
        "4unuA00",
        "Rosetta",
        0,
        "MAFTHPAAATATPGGTVRVACTGDKETVADRPDITWHLRRAGRAVVPVVTGTGALPAGVSPRLTGSEGGNVATLDVEGVTAEDAGVYYCSTRLTDGDVLEGPGTRVTVG",
    ),
    (
        "4z8j_DopeHD.pdb",
        "4z8jA00",
        "DopeHD",
        0,
        "MPYKVTFEKADDGFGFDVVGFAEKGGEKVDIKGEKWEPVVTVTSVKPGGAAEKAGIKEGEIIRKVKGSEVIGASFEEVVALLESGDGVIELDVIDV",
    ),
    (
        "4beu_Comments.pdb",
        "4beuA02",
        "Comments",
        4,
        "MEAFRAALEAFRAALDAEVRLAALLRRAARARGLRALVPELLARGLPAVGVDSLAEARAVREAGFRGDVLFHRSATLEEIAAALATGLAVVVGSDEQAAALAALAAEAGRALRVELGLSVAGPARDGLDLDTAEGRARAAALARVAGLEVRGILDHLPDEDAERVRARLERFRAAAALLLEAAGLPRDGLAVHVGDSRAARALPEARPDLVYVGDEL",
    ),
    (
        "4unu_Comments.pdb",
        "4unuA00",
        "Comments",
        0,
        "AAFTHPATATATPGGTVRVACTGDRETVADRPDITWHLRRAGRAVTPVVEGTGALPAGVSPRLTGSEGGNVATLDVEGVTAEDAGVYYCSTRLTDGDVLEGPGTRVTVG",
    ),
    (
        "4z8j_Comments.pdb",
        "4z8jA00",
        "Comments",
        4,
        "MPYKVTFEKADDGFGFDVVGFAEKGGEKVDIKGKKWEPVVTVKSVKPGGAAEKAGIKEGEIIRKVKGSEVRGASFEEVVALLESGDGVIELDVIDV",
    ),
    (
        "4unu_FilteredRosetta.pdb",
        "4unuA00",
        "FilteredRosetta",
        0,
        "SAFTHPAAATARPGGTVRVACTGDKETVADRPDITWHLRRAGRAVTPVVEGTGALPAGVSPRLTGSEGGNVATLDVEGVTAEDAGVYYCSTRLDDGDVLEGPGTRVTVG",
    ),
    (
        "4unu_FilteredMolPDF3.pdb",
        "4unuA00",
        "FilteredMolPDF3",
        0,
        "MPFTHPASATARPGGTVRVACRGDRETVADRPDITWHLRRAGRAVTPVVEGTGALPAGVSPRLTGSEGGNVATLDVEGVTAEDAGVYYCSTRLTDGDVLEGPGTRVTVG",
    ),
    (
        "4z8j_FilteredDopeNormalized.pdb",
        "4z8jA00",
        "FilteredDopeNormalized",
        0,
        "MPYKVTFEKTDDGFGFDVVGFKEKGGEKVDIKGKKWEPVVTVKSVKPGGAAEKAGIKEGEIIRKVKGSEVRGASFEEVVALLESGDGVIELDVIDV",
    ),
    (
        "4z8j_FilteredDopeHR2.pdb",
        "4z8jA00",
        "FilteredDopeHR2",
        0,
        "MPYKVTFEKTDDGFGFDVVGFKEKGGAKVDIKGKKWEPVVAVTSVKPGGAAEKAGIKEGEIIRKVKGSEVRGASFEEVVALLESGDGVIELDVIDV",
    ),
]

In [6]:
# Tabulate the designs (one row per tuple) and order the rows by the domain they belong to.
designs_df = pd.DataFrame(
    designs,
    columns=["filename", "domain_id", "selection_method", "score", "sequence"],
).sort_values(by="domain_id")

In [7]:
# Show the design table after sorting by domain_id.
designs_df

Unnamed: 0,filename,domain_id,selection_method,score,sequence
0,1n5u_DopeNormalized.pdb,1n5uA03,DopeNormalized,10,GSGTEEFLKFAKERLSRRFPEASEEEVERLTEIETKVKECCRAGDE...
1,4beu_DopeHD.pdb,4beuA02,DopeHD,7,MEAFRAALEAFRAALDAGVRLAALLRRAARARGLRALVPELLARGL...
4,4beu_Comments.pdb,4beuA02,Comments,4,MEAFRAALEAFRAALDAEVRLAALLRRAARARGLRALVPELLARGL...
2,4unu_Rosetta.pdb,4unuA00,Rosetta,0,MAFTHPAAATATPGGTVRVACTGDKETVADRPDITWHLRRAGRAVV...
5,4unu_Comments.pdb,4unuA00,Comments,0,AAFTHPATATATPGGTVRVACTGDRETVADRPDITWHLRRAGRAVT...
7,4unu_FilteredRosetta.pdb,4unuA00,FilteredRosetta,0,SAFTHPAAATARPGGTVRVACTGDKETVADRPDITWHLRRAGRAVT...
8,4unu_FilteredMolPDF3.pdb,4unuA00,FilteredMolPDF3,0,MPFTHPASATARPGGTVRVACRGDRETVADRPDITWHLRRAGRAVT...
3,4z8j_DopeHD.pdb,4z8jA00,DopeHD,0,MPYKVTFEKADDGFGFDVVGFAEKGGEKVDIKGEKWEPVVTVTSVK...
6,4z8j_Comments.pdb,4z8jA00,Comments,4,MPYKVTFEKADDGFGFDVVGFAEKGGEKVDIKGKKWEPVVTVKSVK...
9,4z8j_FilteredDopeNormalized.pdb,4z8jA00,FilteredDopeNormalized,0,MPYKVTFEKTDDGFGFDVVGFKEKGGEKVDIKGKKWEPVVTVKSVK...


In [8]:
# Directory inside the installed proteinsolver package that holds the input structures.
Path(proteinsolver.__path__[0]) / "data" / "inputs"

PosixPath('/home/kimlab1/strokach/workspace/proteinsolver/proteinsolver/data/inputs')

In [9]:
# Sequence extracted from every `*.pdb` file shipped in proteinsolver's `data/inputs`
# directory, keyed by file stem. The stems are what `domain_id` is looked up against when
# sequence identity is computed further down.
inputs_dir = Path(proteinsolver.__path__[0]).joinpath("data", "inputs")

domain_id_to_sequence = {}
# Sort the paths: glob order depends on the filesystem, and a fixed order keeps the
# dictionary (and everything printed from it) identical between runs and machines.
for file in sorted(inputs_dir.glob("*.pdb")):
    structure = PDB.load(file)
    # presumably model 0, chain "A" — TODO confirm against kmtools.structure_tools
    sequence = structure_tools.extract_aa_sequence(structure, 0, "A")
    domain_id_to_sequence[file.stem] = sequence

domain_id_to_sequence

{'1n5uA03': 'KFGERAFKAWAVARLSQRFPKAEFAEVSKLVTDLTKVHTECCHGDLLECADDRADLAKYICENQDSISSKLKECCEKPLLEKSHCIAEVEND',
 '4beuA02': 'LGQFQSNIEQFKSHMNANTKICAIMKADAYGNGIRGLMPTIIAQGIPCVGVASNAEARAVRESGFKGELIRVRSASLSEMSSALDLNIEELIGTHQQALDLAELAKQSGKTLKVHIALNDGGMGRNGIDMTTEAGKKEAVSIATQPSLSVVGIMTHFPNYNADEVRAKLAQFKESSTWLMQQANLKREEITLHVANSYTALNVPEAQLDMVRPGGVL',
 '4unuA00': 'SALTQPPSASGSLGQSVTISCTGTSSDVGGYNYVSWYQQHAGKAPKVIIYEVNKRPSGVPDRFSGSKSGNTASLTVSGLQAEDEADYYCSSYEGSDNFVFGTGTKVTVL',
 '4z8jA00': 'SPRVVRIVKSESGYGFNVRGQVSEGGQLRSINGELYAPLQHVSAVLPGGAADRAGVRKGDRILEVNGVNVEGATHKQVVDLIRAGEKELILTVLSV',
 '3fndA02': 'FYGYSWEESLQGAVDDVRGIRYSGILKHLGNEAADKDNIGKTYY'}

In [10]:
# Report the length of each extracted sequence, one "<id> <length>" line per entry.
for domain_id, reference_sequence in domain_id_to_sequence.items():
    print(domain_id, len(reference_sequence))

1n5uA03 92
4beuA02 217
4unuA00 109
4z8jA00 96
3fndA02 44


In [11]:
def calculate_sequence_identity(row):
    """Return the fraction of positions where a design matches its reference sequence.

    Args:
        row: Object exposing ``domain_id`` and ``sequence`` attributes (for example a
            row produced by ``DataFrame.itertuples()``).

    Returns:
        Number of identical positions divided by the length of the reference sequence.

    Raises:
        KeyError: If ``row.domain_id`` is not a key of ``domain_id_to_sequence``.
        AssertionError: If the design and the reference differ in length; the comparison
            is position-by-position, so no alignment is attempted.
    """
    reference = domain_id_to_sequence[row.domain_id]
    designed = row.sequence
    assert len(reference) == len(designed), row

    num_matches = 0
    for designed_aa, reference_aa in zip(designed, reference):
        if designed_aa == reference_aa:
            num_matches += 1
    return num_matches / len(reference)

In [12]:
# Score every design against the sequence of its domain and store the result as a new column.
designs_df["seq_identity"] = list(map(calculate_sequence_identity, designs_df.itertuples()))

In [13]:
# Show the design table again, now including the seq_identity column.
designs_df

Unnamed: 0,filename,domain_id,selection_method,score,sequence,seq_identity
0,1n5u_DopeNormalized.pdb,1n5uA03,DopeNormalized,10,GSGTEEFLKFAKERLSRRFPEASEEEVERLTEIETKVKECCRAGDE...,0.423913
1,4beu_DopeHD.pdb,4beuA02,DopeHD,7,MEAFRAALEAFRAALDAGVRLAALLRRAARARGLRALVPELLARGL...,0.37788
4,4beu_Comments.pdb,4beuA02,Comments,4,MEAFRAALEAFRAALDAEVRLAALLRRAARARGLRALVPELLARGL...,0.373272
2,4unu_Rosetta.pdb,4unuA00,Rosetta,0,MAFTHPAAATATPGGTVRVACTGDKETVADRPDITWHLRRAGRAVV...,0.357798
5,4unu_Comments.pdb,4unuA00,Comments,0,AAFTHPATATATPGGTVRVACTGDRETVADRPDITWHLRRAGRAVT...,0.357798
7,4unu_FilteredRosetta.pdb,4unuA00,FilteredRosetta,0,SAFTHPAAATARPGGTVRVACTGDKETVADRPDITWHLRRAGRAVT...,0.366972
8,4unu_FilteredMolPDF3.pdb,4unuA00,FilteredMolPDF3,0,MPFTHPASATARPGGTVRVACRGDRETVADRPDITWHLRRAGRAVT...,0.348624
3,4z8j_DopeHD.pdb,4z8jA00,DopeHD,0,MPYKVTFEKADDGFGFDVVGFAEKGGEKVDIKGEKWEPVVTVTSVK...,0.385417
6,4z8j_Comments.pdb,4z8jA00,Comments,4,MPYKVTFEKADDGFGFDVVGFAEKGGEKVDIKGKKWEPVVTVKSVK...,0.375
9,4z8j_FilteredDopeNormalized.pdb,4z8jA00,FilteredDopeNormalized,0,MPYKVTFEKTDDGFGFDVVGFKEKGGEKVDIKGKKWEPVVTVKSVK...,0.375


In [14]:
# The 20 one-letter residue codes treated as valid. The commented-out export cell below
# strips these characters from each sequence to detect sequences containing anything else.
amino_acids = "GVALICMFWPDESTYQNKRH"

In [15]:
# output_filename = DATAPKG_DATA_DIR.joinpath("deep-protein-gen", "training_data_v2.fasta.gz")
# with gzip.open(output_filename, "wb") as fout:
#     num_rows = 0
#     num_rows_final = 0
#     for i in range(10):
#         filename = DATAPKG_DATA_DIR.joinpath("deep-protein-gen", f"training_data_rs{i}.parquet")
#         pq_file = pq.ParquetFile(filename)
#         for row_group in tqdm(range(pq_file.num_row_groups), desc=f"{i}", leave=True):
#             df = pq_file.read_row_group(
#                 row_group, columns=["uniparc_id", "database_id", "structure_id", "sequence"]
#             ).to_pandas()
#             num_rows += len(df)
#             df = df[df["sequence"].str.strip(amino_acids).str.len() == 0]
#             num_rows_final += len(df)
#             for row in df.itertuples():
#                 fout.write(f">{row.uniparc_id}|{row.database_id}|{row.structure_id}\n{row.sequence}\n".encode("utf-8"))
#         print(f"Lost {num_rows - num_rows_final} rows ({(num_rows - num_rows_final) / num_rows:%}).")

In [26]:
# Select the design to search with. `iloc[1]` is positional, so this is the second row of
# the table *after* it was sorted by domain_id, not the second entry of `designs`.
row = designs_df.iloc[1]
row

filename                                              4beu_DopeHD.pdb
domain_id                                                     4beuA02
selection_method                                               DopeHD
score                                                               7
sequence            MEAFRAALEAFRAALDAGVRLAALLRRAARARGLRALVPELLARGL...
seq_identity                                                  0.37788
Name: 1, dtype: object

In [27]:
# Target passed to `mmseqs easy-search` below. It is derived from DATAPKG_DATA_DIR instead
# of being a hard-coded absolute path, so it follows wherever the data directory resolves
# to on the current machine. Kept as a plain string because it is interpolated into the
# command line.
target_db = str(DATAPKG_DATA_DIR.joinpath("deep-protein-gen", "mmseqs2", "training_data_v2"))

target_db

'/home/kimlab1/database_data/datapkg_output_dir/deep-protein-gen/mmseqs2/training_data_v2'

In [28]:
# Search the selected design (`row`) against `target_db` with `mmseqs easy-search` and keep
# the raw result text in `results`. All scratch paths are registered on one ExitStack so
# they are removed when the block exits, whether or not the search succeeds.
with contextlib.ExitStack() as stack:
    temp_dir = stack.enter_context(tempfile.TemporaryDirectory())
    in_file = stack.enter_context(tempfile.NamedTemporaryFile(suffix=".fasta"))
    out_file = stack.enter_context(tempfile.NamedTemporaryFile(suffix=".m8"))

    # Write the query as a single-record FASTA file; the header carries filename and domain.
    with open(in_file.name, "wt") as fout:
        fout.write(f">{row.filename}|{row.domain_id}\n{row.sequence}\n")

    system_command = (
        f"mmseqs easy-search '{in_file.name}' '{target_db}' '{out_file.name}' '{temp_dir}' "
        "-s 7.5 -e 10000 --max-seqs 10000000 --alignment-mode 3 --rescore-mode 3"
    )
    print(system_command)

    cp = subprocess.run(
        shlex.split(system_command), stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
    )
    # Fail loudly: without this check a failed search leaves an empty output file and
    # `results` would silently become an empty string. `cp` is already assigned at this
    # point, so its stdout/stderr remain available for inspection after the error.
    if cp.returncode != 0:
        raise RuntimeError(f"mmseqs exited with status {cp.returncode}:\n{cp.stderr}")

    # Read the hits before the temporary output file is deleted on exit.
    with open(out_file.name, "rt") as fin:
        results = fin.read()

mmseqs easy-search '/tmp/strokach/tmpby04od7c.fasta' '/home/kimlab1/database_data/datapkg_output_dir/deep-protein-gen/mmseqs2/training_data_v2' '/tmp/strokach/tmpvvhz1rdt.m8' '/tmp/strokach/tmp91h67yox' -s 7.5 -e 10000 --max-seqs 10000000 --alignment-mode 3 --rescore-mode 3


In [29]:
# Show what the mmseqs process wrote to standard output (captured by subprocess.run).
print(cp.stdout)

/tmp/strokach/tmpvvhz1rdt.m8 exists and will be overwritten.
createdb /tmp/strokach/tmpby04od7c.fasta /tmp/strokach/tmp91h67yox/8816038521352333406/query --dbtype 0 --shuffle 1 --createdb-mode 0 --id-offset 0 --compressed 0 -v 3 

Converting sequences
[
Time for merging to query_h: 0h 0m 0s 1ms
Time for merging to query: 0h 0m 0s 1ms
Database type: Aminoacid
Time for merging to query.lookup: 0h 0m 0s 0ms
Time for processing: 0h 0m 0s 6ms
Tmp /tmp/strokach/tmp91h67yox/8816038521352333406/search_tmp folder does not exist or is not a directory.
prefilter /tmp/strokach/tmp91h67yox/8816038521352333406/query /home/kimlab1/database_data/datapkg_output_dir/deep-protein-gen/mmseqs2/training_data_v2.idx /tmp/strokach/tmp91h67yox/8816038521352333406/search_tmp/12175266970068841675/pref_0 --sub-mat nucl:nucleotide.out,aa:blosum62.out --seed-sub-mat nucl:nucleotide.out,aa:VTML80.out -k 0 --k-score 2147483647 --alph-size 21 --max-seq-len 65535 --max-seqs 10000000 --split 0 --split-mode 2 --split-mem

In [30]:
# Show the design that the search was run for.
row

filename                                              4beu_DopeHD.pdb
domain_id                                                     4beuA02
selection_method                                               DopeHD
score                                                               7
sequence            MEAFRAALEAFRAALDAGVRLAALLRRAARARGLRALVPELLARGL...
seq_identity                                                  0.37788
Name: 1, dtype: object

In [37]:
# Save the raw search hits inside the notebook's output directory. The file name is built
# from the searched design's filename so it cannot drift out of sync with `row`.
with NOTEBOOK_PATH.joinpath(f"{row.filename}.m8").open("wt") as fout:
    fout.write(results)

In [36]:
# Show `row` once more (same display as earlier in the notebook).
row

filename                                              4beu_DopeHD.pdb
domain_id                                                     4beuA02
selection_method                                               DopeHD
score                                                               7
sequence            MEAFRAALEAFRAALDAGVRLAALLRRAARARGLRALVPELLARGL...
seq_identity                                                  0.37788
Name: 1, dtype: object

In [34]:
# List the files currently present in the notebook's output directory.
!ls {NOTEBOOK_PATH}

1n5u_DopeNormalized_matches.m8


In [38]:
# Print the saved result file in full to check what was written to disk.
!cat {NOTEBOOK_PATH}/4beu_DopeHD.pdb.m8


4beu_DopeHD.pdb|4beuA02	UPI00093D71EC|1.20.120.530|2DI3	0.423	59	33	0	160	217	6	64	7.177E+00	35
4beu_DopeHD.pdb|4beuA02	UPI00011648E8|3.40.50.10320|2IXD	0.410	106	61	0	112	215	16	121	7.177E+00	35
4beu_DopeHD.pdb|4beuA02	UPI000779B0EB|3.90.870.10|3AJE	0.383	73	44	0	137	209	8	80	1.700E+01	34
4beu_DopeHD.pdb|4beuA02	UPI00077947ED|3.90.870.10|3AJE	0.383	73	44	0	137	209	8	80	1.700E+01	34
4beu_DopeHD.pdb|4beuA02	UPI000324865F|3.90.870.10|3AJE	0.375	72	44	0	137	208	10	81	4.019E+01	33
4beu_DopeHD.pdb|4beuA02	UPI000779A4ED|3.90.870.10|3AJE	0.418	73	42	0	137	209	8	80	5.353E+01	33
4beu_DopeHD.pdb|4beuA02	UPI000779A4ED|3.90.870.10|3AJE	0.418	73	42	0	137	209	8	80	5.353E+01	33
4beu_DopeHD.pdb|4beuA02	UPI000250508B|3.20.20.240|3REQ	0.406	64	37	0	146	208	239	302	7.128E+01	32
4beu_DopeHD.pdb|4beuA02	UPI00022D6B66|1.10.600.10|3AQ0	0.428	35	19	0	130	164	393	427	9.490E+01	32
4beu_DopeHD.pdb|4beuA02	UPI000627F003|3.90.190.20|4QF5	0.433	53	29	0	2	54	26	78	1.263E+02	31
4beu_DopeHD.pdb|4beuA02	UPI00056ACBD0|3

In [32]:
# Print the in-memory search results (the text read back from the mmseqs output file).
print(results)

4beu_DopeHD.pdb|4beuA02	UPI00093D71EC|1.20.120.530|2DI3	0.423	59	33	0	160	217	6	64	7.177E+00	35
4beu_DopeHD.pdb|4beuA02	UPI00011648E8|3.40.50.10320|2IXD	0.410	106	61	0	112	215	16	121	7.177E+00	35
4beu_DopeHD.pdb|4beuA02	UPI000779B0EB|3.90.870.10|3AJE	0.383	73	44	0	137	209	8	80	1.700E+01	34
4beu_DopeHD.pdb|4beuA02	UPI00077947ED|3.90.870.10|3AJE	0.383	73	44	0	137	209	8	80	1.700E+01	34
4beu_DopeHD.pdb|4beuA02	UPI000324865F|3.90.870.10|3AJE	0.375	72	44	0	137	208	10	81	4.019E+01	33
4beu_DopeHD.pdb|4beuA02	UPI000779A4ED|3.90.870.10|3AJE	0.418	73	42	0	137	209	8	80	5.353E+01	33
4beu_DopeHD.pdb|4beuA02	UPI000779A4ED|3.90.870.10|3AJE	0.418	73	42	0	137	209	8	80	5.353E+01	33
4beu_DopeHD.pdb|4beuA02	UPI000250508B|3.20.20.240|3REQ	0.406	64	37	0	146	208	239	302	7.128E+01	32
4beu_DopeHD.pdb|4beuA02	UPI00022D6B66|1.10.600.10|3AQ0	0.428	35	19	0	130	164	393	427	9.490E+01	32
4beu_DopeHD.pdb|4beuA02	UPI000627F003|3.90.190.20|4QF5	0.433	53	29	0	2	54	26	78	1.263E+02	31
4beu_DopeHD.pdb|4beuA02	UPI00056ACBD0|3

In [None]:
# NOTE(review): this cell has no execution count and looks like an earlier draft of the
# ExitStack-based search cell above — confirm whether it is still needed.
# NOTE(review): `target_fasta_file` and `system_command_template` are not defined anywhere
# in the visible part of this notebook; unless they are defined elsewhere, running this
# cell raises NameError.
with tempfile.NamedTemporaryFile() as in_file, tempfile.NamedTemporaryFile() as out_file:
    # Write the selected design as a single-record FASTA query.
    with open(in_file.name, "wt") as fout:
        fout.write(f">{row.filename}|{row.domain_id}\n{row.sequence}\n")

    # NOTE(review): unlike the executed search cell, no temporary directory is passed as a
    # fourth positional argument here — confirm that mmseqs accepts this invocation.
    system_command = f"mmseqs easy-search '{in_file.name}' '{target_fasta_file}' '{out_file.name}' --alignment-mode 3"
    cp = subprocess.run(
        shlex.split(system_command), stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
    )

system_command_template

In [17]:
# Show the resolved data-package directory.
DATAPKG_DATA_DIR

PosixPath('/home/kimlab1/database_data/datapkg_output_dir')