# Data downloading

## 1. Downloading allosteric site data from shsmu.edu.cn

> `data/allosteric_site_shsmu.json` 
> 
> contains data crawled from shsmu.edu.cn (home -> feature -> site).

In [None]:
# there are 1928 allosteric site in shsmu, but 1928th 3GVU can not download.

from utils.data_download import download_shsmu_as, download_rcsb, unzip

allosteric_site_index = 'data/allosteric_site_shsmu.json'

download_shsmu_as(allosteric_site_path=allosteric_site_index, outpath='data/shsmu_allosteric_site/')
unzip(origin_dir='data/shsmu_allosteric_site/', outdir='data/shsmu_allosteric_site/allosteric_site/')

!rm -r data/shsmu_allosteric_site/*.pdb.gz

download_rcsb(allosteric_site_path=allosteric_site_index, outpath='data/shsmu_allosteric_site/')

!rm -r data/shsmu_allosteric_site/3gvu.pdb.gz

unzip(origin_dir='data/shsmu_allosteric_site/', outdir='data/shsmu_allosteric_site/rscb_pdb/')

!rm -r data/shsmu_allosteric_site/*.pdb.gz

> `data/ASD_Release_201909_AS.txt` 
>
> is the description file of the allosteric sites, but it is hard to read, so it is converted to CSV below.

In [None]:
from utils.data_process import transform_txt_to_csv

# Convert the ASD release description file (plain text) into a CSV file
# written next to it in the `data/` directory.
txt_path = 'data/ASD_Release_201909_AS.txt'
csv_path = 'data/ASD_Release_201909_AS.csv'
transform_txt_to_csv(path=txt_path, outpath=csv_path)

## 2. Download pretraining data from RCSB

> `data/pretrain/list_file_protein_xray_max3A_total.txt`
>
> includes the PDB IDs of structures with a resolution of 0.5–3 Å obtained by X-ray experiments.

In [None]:
# Download logs are stored in succeed.log, timeout.log and wrang.log
# (the re-download cell reads the latter two from 'data/pretrain/').

from utils.data_download import download_pretarining_data, redownload_error_pretarining_data

# List of PDB IDs to download, and the directory the download function writes into.
pdblist = 'data/pretrain/list_file_protein_xray_max3A_total.txt'
output_path = 'data/pretrain/'

download_pretarining_data(list_path=pdblist, outpath=output_path)

> If there are links in `timeout.log` or `wrang.log`, run `redownload_error_pretarining_data()` to re-download the failed PDB files.

In [None]:
# Retry the downloads recorded as failed in the two log files.
# The import and `output_path` are declared here so this cell does not depend on
# kernel state: `output_path` is reassigned to the tokenizer directory by a later
# cell, which would silently redirect the retried downloads if this cell were
# re-run afterwards.
from utils.data_download import redownload_error_pretarining_data

output_path = 'data/pretrain/'
timeout_path = 'data/pretrain/timeout.log'
wrang_path = 'data/pretrain/wrang.log'

redownload_error_pretarining_data(timeout_path, wrang_path, output_path)

In [None]:
from utils.data_download import unzip

# NOTE(review): these are absolute, machine-specific paths, while the download
# cells above write to 'data/pretrain/' — confirm the archives were moved here
# (or adjust the paths) before running.
dir_path = '/mnt/g/Little-LL/pretrain/'
out_path = '/mnt/g/Little-LL/pretrain_pdb/'

# Extract the downloaded files from `dir_path` into `out_path`
# (presumably source first, destination second — verify against `unzip`'s signature).
unzip(dir_path, out_path)

# Data processing

In [None]:
# Sanity check of the fixed-column slices applied to a PDB ATOM record line.
s = "ATOM      1  N   PHE A   3     127.591  57.948 -15.781  0.50 28.16           N  "

record_name = s[:4]               # record type, here "ATOM"
chain_id = s[21]                  # single-character chain identifier
residue_name = s[17:20]           # three-letter residue name
residue_number_field = s[22:26]   # 4-character, space-padded residue number

# Show the padded and the stripped length of the residue-number field.
print(
    record_name,
    chain_id,
    residue_name,
    len(residue_number_field),
    len(residue_number_field.strip()),
)

## 1. Build a BERT tokenizer for residues

In [None]:
# Optional step: prepare the data used to build the residue tokenizer.

from utils.data_process import build_tokenizer_dataset, tokenizer_json_to_txt

# NOTE(review): absolute, machine-specific paths. This cell also reassigns the
# global `output_path` that the pretraining download cells set to 'data/pretrain/'.
pdb_paths = '/mnt/g/Little-LL/pretrain_pdb/'
output_path ='/mnt/g/Little-LL/pretrain_tokenizer/'

# Build the tokenizer dataset from the PDB files, then post-process the output
# directory (presumably JSON -> TXT, going by the helper's name — TODO confirm).
build_tokenizer_dataset(path=pdb_paths, outpath=output_path)
tokenizer_json_to_txt(output_path)

Then `cd models/tokenizer` and run `python tokenizer_building.py`.

The output tokenizer config is in `models/tokenizer/residue`.

## 2. Build the allosteric site dataset

In [1]:
from utils.data_process import build_allosteric_dataset

# Each job is (directory of original PDB files, directory receiving the generated data).
# The first job turns the original PDB files into the input data (json);
# the second does the same for the allosteric-site files, producing the targets.
conversion_jobs = (
    ('data/shsmu_allosteric_site/rcsb_pdb/', 'data/allosteric_site/input/'),
    ('data/shsmu_allosteric_site/allosteric_site/', 'data/allosteric_site/target/'),
)

for pdb_dir, out_dir in conversion_jobs:
    build_allosteric_dataset(pdb_dir, out_dir)

 57%|█████▋    | 1049/1831 [00:19<00:12, 64.03it/s]

pdbid: 3TUV	chain: B	error: 'UNK'
pdbid: 3TUV	chain: B	error: 'UNK'
pdbid: 3TUV	chain: B	error: 'UNK'


 86%|████████▌ | 1577/1831 [00:28<00:04, 60.40it/s]

pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'
pdbid: 4P02	chain: D	error: 'UNK'


100%|██████████| 1831/1831 [00:33<00:00, 54.49it/s]
100%|██████████| 1927/1927 [00:01<00:00, 1471.80it/s]


In [2]:
from utils.pre_data import pre_single_a

# Directories written by the dataset-building cell: `target/` was generated from the
# allosteric-site files and `input/` from the RCSB PDB files.
as_path = 'data/allosteric_site/target/'
pdb_path = 'data/allosteric_site/input/'
# JSON file that the later load/split cell reads back.
data_json = 'data/allosteric_site/data_a.json'

# Presumably combines each target with its input and writes the result to `data_json`
# — TODO confirm against utils.pre_data.
pre_single_a(target_dir=as_path, pdb_dir=pdb_path, output_json=data_json)

100%|██████████| 1927/1927 [00:01<00:00, 1568.43it/s]


In [1]:
from utils.pre_data import load_data, split_train_test

data_json = 'data/allosteric_site/data_a.json'

# Load the paired inputs/targets and split them into a training and a test set.
inputs, targets = load_data(data_path=data_json)
train_set, test_set = split_train_test(inputs, targets)

# For each subset, show its last element followed by its size
# (training set first, then test set).
for subset in (train_set, test_set):
    print(subset[-1])
    print(len(subset))

ModuleNotFoundError: No module named 'sklearn'