# Data downloading

## 1. Downloading allosteric site data from shsmu.edu.cn

> `data/allosteric_site_shsmu.json` 
> 
> contains data crawled from shsmu.edu.cn (Home -> Feature -> Site).

In [None]:
# Download the allosteric-site files listed in the shsmu index and the matching
# RCSB structures, unpacking each set into its own sub-directory.
# There are 1928 allosteric sites in shsmu, but the 1928th (3GVU) cannot be downloaded.

from utils.data_download import download_shsmu_as, download_rcsb, unzip

allosteric_site_index = 'data/allosteric_site_shsmu.json'

# Step 1: shsmu download, unpacked into allosteric_site/.
download_shsmu_as(allosteric_site_path=allosteric_site_index, outpath='data/shsmu_allosteric_site/')
unzip(origin_dir='data/shsmu_allosteric_site/', outdir='data/shsmu_allosteric_site/allosteric_site/')

# Remove the .pdb.gz archives; the RCSB download below writes into the same
# directory, and the second unzip reads from it again.
!rm -r data/shsmu_allosteric_site/*.pdb.gz

# Step 2: RCSB download for the same index file.
download_rcsb(allosteric_site_path=allosteric_site_index, outpath='data/shsmu_allosteric_site/')

# Drop the 3gvu archive before unpacking (presumably the failed 3GVU download
# mentioned above — TODO confirm).
!rm -r data/shsmu_allosteric_site/3gvu.pdb.gz

# NOTE(review): the output directory is spelled 'rscb_pdb' (not 'rcsb_pdb');
# every cell that reads these files must use the same spelling.
unzip(origin_dir='data/shsmu_allosteric_site/', outdir='data/shsmu_allosteric_site/rscb_pdb/')

# Final cleanup of the remaining .pdb.gz archives.
!rm -r data/shsmu_allosteric_site/*.pdb.gz

> `data/ASD_Release_201909_AS.txt` 
>
> is the description file of the allosteric sites, but it is hard to read, so it is converted to CSV below.

In [None]:
# Convert the ASD release description file from .txt to .csv using the
# project helper transform_txt_to_csv.
from utils.data_process import transform_txt_to_csv

txt_path = 'data/ASD_Release_201909_AS.txt'  # input description file
csv_path = 'data/ASD_Release_201909_AS.csv'  # output written next to the input
transform_txt_to_csv(path=txt_path, outpath=csv_path)

## 2. Download pretraining data from RCSB

> `data/pretrain/list_file_protein_xray_max3A_total.txt`
>
> lists the PDB IDs of structures with a resolution of 0.5–3 Å obtained by X-ray experiments.

In [None]:
# Download the pretraining structures named in the PDB ID list.
# Download logs are stored in succeed.log, timeout.log and wrang.log.

from utils.data_download import download_pretarining_data, redownload_error_pretarining_data

pdblist = 'data/pretrain/list_file_protein_xray_max3A_total.txt'  # list of PDB IDs to fetch
output_path = 'data/pretrain/'  # download target directory

download_pretarining_data(list_path=pdblist, outpath=output_path)

> If there are links in `timeout.log` or `wrang.log`, run `redownload_error_pretarining_data()` to re-download the failed PDB files.

In [None]:
# Retry the downloads recorded in timeout.log and wrang.log.
# The import and the output directory are set in this cell so it can be re-run
# on its own: a later cell rebinds the global `output_path` to a different
# directory, which would otherwise send the re-downloads to the wrong place.
from utils.data_download import redownload_error_pretarining_data

timeout_path = 'data/pretrain/timeout.log'
wrang_path = 'data/pretrain/wrang.log'
output_path = 'data/pretrain/'  # same directory the initial download writes to

redownload_error_pretarining_data(timeout_path, wrang_path, output_path)

In [None]:
# Unpack the downloaded pretraining archives into a separate directory.
from utils.data_download import unzip

# NOTE(review): these are machine-specific absolute paths, and dir_path differs
# from the 'data/pretrain/' directory used by the download cells — presumably
# the archives were moved there first; confirm before running.
dir_path = '/mnt/g/Little-LL/pretrain/'
out_path = '/mnt/g/Little-LL/pretrain_pdb/'

unzip(dir_path, out_path)

# Data processing

## 1. Build a BERT tokenizer for residues

In [None]:
# Optional step: prepare the tokenizer dataset from the unpacked PDB files.

from utils.data_process import build_tokenizer_dataset, tokenizer_json_to_txt

pdb_paths = '/mnt/g/Little-LL/pretrain_pdb/'
# NOTE(review): `output_path` is also the name of the download directory in the
# data-downloading cells; running this cell rebinds it in the kernel namespace.
output_path ='/mnt/g/Little-LL/pretrain_tokenizer/'

build_tokenizer_dataset(path=pdb_paths, outpath=output_path)
# Runs on the same directory that build_tokenizer_dataset was given as outpath.
tokenizer_json_to_txt(output_path)

Then `cd` into `models/` and run `python tokenizer_building.py`.

The resulting tokenizer config is written to `models/tokenizer/`.

## 2. Build the allosteric site dataset

In [None]:
# Convert both the full structures and the allosteric-site files to JSON,
# keeping repeated chains in each PDB file (save_repeat_chhain=True).
from utils.data_process import build_allosteric_dataset

# Original PDB files -> input data (JSON).
# The directory name must match the one the RCSB archives were unzipped into
# in the download step, which is spelled 'rscb_pdb'.
pdb_dir = 'data/shsmu_allosteric_site/rscb_pdb/'
out_dir = 'data/allosteric_site/input/'
build_allosteric_dataset(pdb_dir, out_dir, save_repeat_chhain=True)

# Allosteric-site PDB files -> target data (JSON).
pdb_dir = 'data/shsmu_allosteric_site/allosteric_site/'
out_dir = 'data/allosteric_site/target/'
build_allosteric_dataset(pdb_dir, out_dir, save_repeat_chhain=True)

In [None]:
# Combine the target and input JSON directories into a single data_all.json.
from utils.pre_data import pre_single_a, pre_data

as_path = 'data/allosteric_site/target/'   # allosteric-site JSON directory
pdb_path = 'data/allosteric_site/input/'   # full-structure JSON directory
data_json = 'data/allosteric_site/data_all.json'

# Alternative entry point with the same arguments, kept for reference:
# pre_single_a(target_dir=as_path, pdb_dir=pdb_path, output_json=data_json)
pre_data(target_dir=as_path, pdb_dir=pdb_path, output_json=data_json)

In [None]:
# Load data_all.json as (inputs, targets) and write the train/test split files.
from utils.pre_data import transform_data, split_train_test

data_json = 'data/allosteric_site/data_all.json'
data_test = 'data/allosteric_site/data_test.json'
data_train = 'data/allosteric_site/data_train.json'

inputs, targets = transform_data(data_path=data_json)

# The split ratio and any shuffling are decided inside split_train_test.
split_train_test(inputs, targets, train_file=data_train, test_file=data_test)

## 3. Build the dataset for pretraining ResidueRobertaMLM

> Transform the .pdb files to JSON. (Repeated chains within a PDB file are not saved.)

In [None]:
# Convert the pretraining PDB files to JSON with the same helper used for the
# allosteric dataset, this time with save_repeat_chhain=False.
from utils.data_process import build_allosteric_dataset

# Original PDB files -> input data (JSON).
pdb_dir = '/mnt/g/Little-LL/pretrain_pdb/'
out_dir = '/mnt/g/Little-LL/pretrain_input/'
build_allosteric_dataset(pdb_dir, out_dir, save_repeat_chhain=False)

In [None]:
# Gather the per-structure JSON files from the pretraining input directory.
from utils.pre_data import pre_data_rcsb

rcsb_path = '/mnt/g/Little-LL/pretrain_input/'
rcsb_json = '/mnt/g/Little-LL/rcsb_all.json'

# NOTE(review): split=10000 presumably controls how many entries go into each
# output file, and the following step reads from
# '/mnt/g/Little-LL/pretrain_rcsb_all/' rather than this path — confirm how
# pre_data_rcsb names and places its outputs.
pre_data_rcsb(rcsb_dir=rcsb_path, output_json=rcsb_json, split=10000)

In [None]:
# Turn the collected RCSB JSON files into pretraining inputs.
from utils.pre_data import transform_pretrain_data

rcsb_json_dir = '/mnt/g/Little-LL/pretrain_rcsb_all/'      # directory passed as the source
rcsb_input_dir = '/mnt/g/Little-LL/pretrain_rcsb_inputs/'  # directory passed as the destination

transform_pretrain_data(rcsb_json_dir, rcsb_input_dir)

## 4. Pocket detection

In [None]:
# Extract pocket data from the fpocket output directory and save it under
# npy_save_path.
from utils.data_process import extract_from_fpocket

# NOTE(review): machine-specific absolute paths; both sit under the
# 'rscb_pdb' directory of the shsmu download.
fpocket_path = '/home/little-ll/Demo/python/allosteric-site/data/shsmu_allosteric_site/rscb_pdb/fpocket_out_pocketPDB'
npy_save_path = '/home/little-ll/Demo/python/allosteric-site/data/shsmu_allosteric_site/rscb_pdb/fpocket_out_pocketNPY'

pockets_all = extract_from_fpocket(fpocket_path=fpocket_path, save_path=npy_save_path)


In [None]:
import numpy as np
import operator

# Spot-check one saved pocket file by printing its 'Pockets' entry.
# NOTE: allow_pickle=True unpickles arbitrary objects, so only load .npy files
# that were produced locally by this pipeline.
path = '/home/little-ll/Demo/python/allosteric-site/data/shsmu_allosteric_site/rscb_pdb/fpocket_out_pocketNPY/1A3W_out.npy'
pocket_archive = np.load(path, allow_pickle=True)
# The file holds a single pickled object; .item() unwraps it from the 0-d array.
load_dict = pocket_archive.item()
print(load_dict['Pockets'])

# Test