# Data downloading

## 1. Downloading allosteric site data from shsmu.edu.cn

> `data/allosteric_site_shsmu.json` 
> 
> contains data crawled from shsmu.edu.cn (home -> feature -> site).

In [None]:
# There are 1928 allosteric sites in shsmu, but the 1928th one (3GVU) cannot be downloaded.

from pathlib import Path

from utils.data_download import download_shsmu_as, download_rcsb, unzip

allosteric_site_index = 'data/allosteric_site_shsmu.json'
download_dir = Path('data/shsmu_allosteric_site/')


def _remove_archives(pattern):
    """Delete the files in ``download_dir`` whose name matches ``pattern``.

    Pure-Python replacement for ``!rm``: it works on any platform and is a
    silent no-op when nothing matches.
    """
    for archive in download_dir.glob(pattern):
        if archive.is_file():
            archive.unlink()


# Step 1: fetch the allosteric-site archives, unpack them, then drop the archives
# so they are not unpacked a second time by the unzip call in step 2.
download_shsmu_as(allosteric_site_path=allosteric_site_index, outpath='data/shsmu_allosteric_site/')
unzip(origin_dir='data/shsmu_allosteric_site/', outdir='data/shsmu_allosteric_site/allosteric_site/')
_remove_archives('*.pdb.gz')

# Step 2: fetch the matching archives from RCSB into the same directory.
download_rcsb(allosteric_site_path=allosteric_site_index, outpath='data/shsmu_allosteric_site/')

# Discard the 3GVU archive before unpacking (see the note at the top of the cell).
_remove_archives('3gvu.pdb.gz')

unzip(origin_dir='data/shsmu_allosteric_site/', outdir='data/shsmu_allosteric_site/rscb_pdb/')
_remove_archives('*.pdb.gz')

> `data/ASD_Release_201909_AS.txt` 
>
> is the description file of the allosteric sites, but it is hard to read.

In [None]:
from utils.data_process import transform_txt_to_csv

# Source text file and the CSV file that transform_txt_to_csv is asked to write.
txt_path, csv_path = 'data/ASD_Release_201909_AS.txt', 'data/ASD_Release_201909_AS.csv'

transform_txt_to_csv(
    path=txt_path,
    outpath=csv_path,
)

## 2. Download pretraining data from RCSB

> `data/pretrain/list_file_protein_xray_max3A_total.txt`
>
> includes the PDB IDs of structures with a resolution of 0.5–3 Å obtained by X-ray experiments.

In [None]:
# Download logs are stored in succeed.log, timeout.log and wrang.log.

from utils.data_download import (
    download_pretarining_data,
    redownload_error_pretarining_data,
)

# Directory that receives the downloads, and the list of PDB IDs to fetch.
output_path = 'data/pretrain/'
pdblist = 'data/pretrain/list_file_protein_xray_max3A_total.txt'

download_pretarining_data(
    list_path=pdblist,
    outpath=output_path,
)

> If there are links in timeout.log or wrang.log, run redownload_error_pretarining_data() to redownload the failed PDBs.

In [None]:
# Retry the downloads recorded in the timeout / wrong-download logs.
#
# The import and `output_path` are set here so that this cell does not depend
# on names left behind by other cells: `output_path` is rebound to the
# tokenizer directory further down, so re-running this cell afterwards would
# otherwise download into the wrong place.
from utils.data_download import redownload_error_pretarining_data

output_path = 'data/pretrain/'
timeout_path = 'data/pretrain/timeout.log'
wrang_path = 'data/pretrain/wrang.log'

redownload_error_pretarining_data(timeout_path, wrang_path, output_path)

In [None]:
from utils.data_download import unzip

# NOTE(review): these are absolute, machine-specific locations, and the source
# directory differs from 'data/pretrain/' used by the download cell — confirm
# the archives were moved/linked there before running.
out_path = '/mnt/g/Little-LL/pretrain_pdb/'
dir_path = '/mnt/g/Little-LL/pretrain/'

# Arguments stay positional: source directory first, destination second.
unzip(
    dir_path,
    out_path,
)

# Data processing

In [None]:
# Fixed-column slices of a sample PDB ATOM line.
s = "ATOM      1  N   PHE A   3     127.591  57.948 -15.781  0.50 28.16           N  "

record_type = s[:4]                # "ATOM"
chain_id = s[21]                   # "A"
residue_name = s[17:20]            # "PHE"
residue_number_field = s[22:26]    # 4 characters wide, space-padded on the left in this sample

print(
    record_type,
    chain_id,
    residue_name,
    len(residue_number_field),
    len(residue_number_field.strip()),
)

## 1. Build a BERT tokenizer for residues

In [None]:
# Optional step.

from utils.data_process import build_tokenizer_dataset, tokenizer_json_to_txt

# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
# This also rebinds `output_path`, which the pretraining download cells use
# with a different value.
output_path ='/mnt/g/Little-LL/pretrain_tokenizer/'
pdb_paths = '/mnt/g/Little-LL/pretrain_pdb/'

build_tokenizer_dataset(
    path=pdb_paths,
    outpath=output_path,
)
tokenizer_json_to_txt(output_path)

Then change into `models/tokenizer` and run `python tokenizer_building.py`.

The output tokenizer config is in `models/tokenizer/residue`.

## 2. Build the allosteric site dataset

In [5]:
from utils.data_process import build_allosteric_dataset

# Original PDB files -> JSON data, as (source directory, output directory) pairs.
# The first pair produces the "input" data, the second the "target" data.
#
# The first source directory is spelled 'rscb_pdb' on purpose: that is the
# directory the download cell unzips the RCSB archives into and the one the
# fpocket cells read from. The previous 'rcsb_pdb' spelling pointed at a
# directory that nothing in this notebook creates.
conversion_jobs = [
    ('data/shsmu_allosteric_site/rscb_pdb/', 'data/allosteric_site/input/'),
    ('data/shsmu_allosteric_site/allosteric_site/', 'data/allosteric_site/target/'),
]

# `pdb_dir` / `out_dir` keep their original names and end up bound to the last
# pair, exactly as after the original two sequential assignments.
for pdb_dir, out_dir in conversion_jobs:
    build_allosteric_dataset(pdb_dir, out_dir)

 57%|█████▋    | 1049/1831 [00:19<00:12, 64.03it/s]

pdbid: 3TUV	chain: B	error: 'UNK'
pdbid: 3TUV	chain: B	error: 'UNK'
pdbid: 3TUV	chain: B	error: 'UNK'


 86%|████████▌ | 1577/1831 [00:28<00:04, 60.40it/s]

pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'


100%|██████████| 1831/1831 [00:33<00:00, 54.49it/s]
100%|██████████| 1927/1927 [00:01<00:00, 1471.80it/s]
pdbid:  1W0F_pocket10_atm
chain:  A
L V S L F F V S F I L S
12
['47', '225', '222', '221', '226', '46', '225', '222', '226', '50', '221', '222']
12
[{'x': '75.902', 'y': '98.775', 'z': '17.279'}, {'x': '73.812', 'y': '98.033', 'z': '13.789'}, {'x': '77.212', 'y': '93.433', 'z': '10.360'}, {'x': '74.020', 'y': '92.171', 'z': '14.280'}, {'x': '77.165', 'y': '99.923', 'z': '10.649'}, {'x': '75.534', 'y': '104.430', 'z': '14.306'}, {'x': '74.383', 'y': '100.304', 'z': '12.884'}, {'x': '75.041', 'y': '94.485', 'z': '10.554'}, {'x': '77.822', 'y': '98.674', 'z': '9.482'}, {'x': '73.595', 'y': '96.036', 'z': '17.116'}, {'x': '73.341', 'y': '94.266', 'z': '12.388'}, {'x': '74.544', 'y': '93.444', 'z': '11.445'}]


In [1]:
from utils.pre_data import pre_single_a, pre_data

# Output file first, then the two directories it is built from.
data_json = 'data/allosteric_site/data_all.json'
pdb_path = 'data/allosteric_site/input/'
as_path = 'data/allosteric_site/target/'

# Alternative entry point, currently disabled:
# pre_single_a(target_dir=as_path, pdb_dir=pdb_path, output_json=data_json)
pre_data(
    target_dir=as_path,
    pdb_dir=pdb_path,
    output_json=data_json,
)

100%|██████████| 1927/1927 [00:01<00:00, 990.21it/s] 


In [1]:
from utils.pre_data import transform_data, split_train_test

# NOTE(review): this cell reads data_a.json, whereas the preceding cell writes
# data_all.json — confirm which file is intended and how data_a.json is produced.
data_json = 'data/allosteric_site/data_a.json'

# Destination files for the two splits.
data_train = 'data/allosteric_site/data_train.json'
data_test = 'data/allosteric_site/data_test.json'

inputs, targets = transform_data(data_path=data_json)

split_train_test(
    inputs,
    targets,
    train_file=data_train,
    test_file=data_test,
)

100%|██████████| 1813/1813 [00:01<00:00, 1227.74it/s]


In [1]:
from utils.data_process import extract_from_fpocket

# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
npy_save_path = '/home/little-ll/Demo/python/allosteric-site/data/shsmu_allosteric_site/rscb_pdb/fpocket_out_pocketNPY'
fpocket_path = '/home/little-ll/Demo/python/allosteric-site/data/shsmu_allosteric_site/rscb_pdb/fpocket_out_pocketPDB'

pockets_all = extract_from_fpocket(
    fpocket_path=fpocket_path,
    save_path=npy_save_path,
)


In [9]:
import numpy as np
import operator

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = '/home/little-ll/Demo/python/allosteric-site/data/shsmu_allosteric_site/rscb_pdb/fpocket_out_pocketNPY/1A3W_out.npy'

# allow_pickle=True unpickles Python objects from the file, which can execute
# arbitrary code — only use it on files generated by this project.
# .item() unwraps the single stored object, which is then indexed like a dict.
load_dict = np.load(
    path,
    allow_pickle=True,
).item()

print(load_dict['Pockets'])

[{'id': '1A3W_pocket2_atm', 'B': ['THR407', 'GLY490', 'THR424', 'THR403', 'HIS491', 'VAL423', 'GLY484', 'THR406', 'LEU401', 'ARG459', 'TRP452', 'SER492', 'SER404', 'ARG425', 'SER402']}, {'id': '1A3W_pocket35_atm', 'A': ['THR372', 'LYS413', 'ALA290', 'SER377', 'LEU289', 'PRO373', 'PRO375', 'THR376', 'GLU380']}, {'id': '1A3W_pocket37_atm', 'B': ['THR26', 'GLY28', 'ALA336', 'HIS54', 'ASN51', 'ILE27', 'PRO29']}, {'id': '1A3W_pocket3_atm', 'A': ['SER332', 'ASP84', 'GLY265', 'THR298', 'MET330', 'SER53', 'GLU89', 'ARG49', 'GLU242', 'ASP266', 'HIS54', 'SER213', 'ASN51', 'MET261', 'LYS240', 'ALA263', 'PHE214', 'ARG91']}, {'id': '1A3W_pocket17_atm', 'A': ['HIS434', 'SER22', 'TYR436', 'ARG19', 'PHE440', 'ARG41', 'ARG20', 'ASN46', 'GLY44', 'LYS42', 'ARG77', 'THR21', 'ALA43', 'VAL439']}, {'id': '1A3W_pocket30_atm', 'A': ['ARG369', 'ALA388', 'THR372', 'TYR365', 'SER385', 'PRO373', 'ALA384', 'THR376', 'THR381', 'TYR414']}, {'id': '1A3W_pocket26_atm', 'A': ['LEU222', 'GLU226', 'ARG225', 'VAL221', 'VAL

In [16]:
import json

with open('data/allosteric_site/data_test.json', 'r') as f:
    data = json.load(f)

# Length of every input sequence in the test set.
seq_lengths = [len(item['input']['sequence']) for item in data]

# Longest sequence (0 when the file holds no items), then the lengths split
# at the 1024 threshold: strictly shorter vs. 1024 and longer.
max_len = max(seq_lengths, default=0)
len_1 = [n for n in seq_lengths if n < 1024]
len_2 = [n for n in seq_lengths if not n < 1024]

for value in (max_len, len(len_1), len(len_2)):
    print(value)

5292
361
2
