# Data downloading

## 1. Downloading allosteric site data from shsmu.edu.cn

> `data/allosteric_site_shsmu.json` 
> 
> contains data crawled from shsmu.edu.cn (home -> feature -> site).

In [None]:
# ShSMU lists 1928 allosteric sites, but the 1928th entry (3GVU) cannot be downloaded.

from utils.data_download import download_shsmu_as, download_rcsb, unzip

# Crawled ShSMU index file; it drives both download calls below.
allosteric_site_index = 'data/allosteric_site_shsmu.json'

# Step 1: fetch the allosteric-site archives and extract them into their own sub-directory.
download_shsmu_as(allosteric_site_path=allosteric_site_index, outpath='data/shsmu_allosteric_site/')
unzip(origin_dir='data/shsmu_allosteric_site/', outdir='data/shsmu_allosteric_site/allosteric_site/')

# Remove the .pdb.gz archives once extracted; the next download writes into the same
# directory, so this presumably keeps the second unzip pass from re-processing them.
!rm -r data/shsmu_allosteric_site/*.pdb.gz

# Step 2: fetch the structures for the same index via download_rcsb.
download_rcsb(allosteric_site_path=allosteric_site_index, outpath='data/shsmu_allosteric_site/')

# Drop the 3GVU archive before extraction.
# NOTE(review): presumably because this is the entry that fails to download — confirm.
!rm -r data/shsmu_allosteric_site/3gvu.pdb.gz

unzip(origin_dir='data/shsmu_allosteric_site/', outdir='data/shsmu_allosteric_site/rscb_pdb/')

# Clean up the second batch of archives as well.
!rm -r data/shsmu_allosteric_site/*.pdb.gz

> `data/ASD_Release_201909_AS.txt` 
>
> is the description file of the allosteric sites, but it is hard to read.

In [None]:
from utils.data_process import transform_txt_to_csv

# Convert the ASD description file to CSV; the output is written next to the
# original under data/ with the same base name.
txt_path = 'data/ASD_Release_201909_AS.txt'
csv_path = 'data/ASD_Release_201909_AS.csv'
transform_txt_to_csv(path=txt_path, outpath=csv_path)

## 2. Download pretraining data from RCSB

> `data/pretrain/list_file_protein_xray_max3A_total.txt`
>
> lists the PDB IDs of structures solved by X-ray experiments with a resolution of 0.5–3 Å.

In [None]:
# Download logs are stored in succeed.log, timeout.log and wrang.log
# (the retry cell below reads the latter two from data/pretrain/).

from utils.data_download import download_pretarining_data, redownload_error_pretarining_data

# List of PDB IDs to fetch, and the directory the downloads are written to.
# NOTE: output_path is a notebook-level name that the retry cell below also reads.
pdblist = 'data/pretrain/list_file_protein_xray_max3A_total.txt'
output_path = 'data/pretrain/'

download_pretarining_data(list_path=pdblist, outpath=output_path)

> If there are links in `timeout.log` or `wrang.log`, run `redownload_error_pretarining_data()` to re-download the failed PDB files.

In [None]:
# Retry the pretraining downloads recorded as failed in timeout.log / wrang.log.
#
# The import and the output directory are restated here so this cell does not
# depend on hidden notebook state: `output_path` is rebound to the tokenizer
# directory by a later cell, so re-running this cell afterwards would otherwise
# send the retried downloads to the wrong directory.
from utils.data_download import redownload_error_pretarining_data

timeout_path = 'data/pretrain/timeout.log'
wrang_path = 'data/pretrain/wrang.log'
output_path = 'data/pretrain/'

redownload_error_pretarining_data(timeout_path, wrang_path, output_path)

In [None]:
from utils.data_download import unzip

# Extract the downloaded pretraining archives.
# NOTE(review): these are absolute, machine-specific paths, while the download
# cell above writes to 'data/pretrain/' — confirm the archives live here before running.
dir_path = '/mnt/g/Little-LL/pretrain/'
out_path = '/mnt/g/Little-LL/pretrain_pdb/'

# Arguments are passed positionally here; other calls in this notebook name them
# origin_dir / outdir (presumably in this order — verify against utils.data_download).
unzip(dir_path, out_path)

# Data processing

In [None]:
# Scratch check of the fixed-width slices used to read a PDB ATOM record.
s = "ATOM      1  N   PHE A   3     127.591  57.948 -15.781  0.50 28.16           N  "
print(
    s[:4],                  # record name
    s[21],                  # chain identifier
    s[17:20],               # residue name
    len(s[22:26]),          # raw width of the residue-number field
    len(s[22:26].strip()),  # width once the padding is stripped
)

## 1. Build a BERT tokenizer for residues

In [None]:
# Optional step: build the dataset that the residue tokenizer is trained on.

from utils.data_process import build_tokenizer_dataset, tokenizer_json_to_txt

# NOTE(review): absolute, machine-specific paths. pdb_paths is the same directory
# the pretraining archives are unzipped into earlier in this notebook.
pdb_paths = '/mnt/g/Little-LL/pretrain_pdb/'
# NOTE: this rebinds output_path, which the pretraining download cells also use.
output_path ='/mnt/g/Little-LL/pretrain_tokenizer/'

# Build the tokenizer dataset from the PDB directory, then run the JSON-to-text
# conversion on that same output directory.
build_tokenizer_dataset(path=pdb_paths, outpath=output_path)
tokenizer_json_to_txt(output_path)

Then `cd models/tokenizer` and run `python tokenizer_building.py`.

The output tokenizer config is written to `models/tokenizer/residue`.

In [1]:
from utils.data_process import extract_residue_avg

# Inspect what extract_residue_avg returns for a single allosteric-site PDB file.
results, positions, orders = extract_residue_avg('data/shsmu_allosteric_site/allosteric_site/1KFL.pdb')

# The three return values are indexed in parallel: item i of each belongs to the
# same chain. Indexing (rather than zip) keeps a length mismatch loud instead of
# silently truncating the output.
for i, entry in enumerate(results):
    # entry holds an identifier, the chain, then one item per residue.
    print('pdbid: ', entry[0])
    print('chain: ', entry[1])
    print(' '.join(entry[2:]))
    print(len(orders[i]))
    print(orders[i])
    print(len(positions[i]))
    print(positions[i])

pdbid:  A_ASD0002_1_1KFL_1
chain:  A
R I P Q A D L G L S C F S V K V
16
['40', '148', '150', '151', '154', '155', '175', '178', '179', '180', '181', '209', '211', '212', '214', '221']
16
[{'x': '9.599', 'y': '53.466', 'z': '78.754'}, {'x': '17.486', 'y': '38.644', 'z': '68.225'}, {'x': '15.143', 'y': '42.596', 'z': '71.879'}, {'x': '12.309', 'y': '42.556', 'z': '68.128'}, {'x': '11.141', 'y': '45.556', 'z': '74.613'}, {'x': '7.076', 'y': '45.614', 'z': '74.256'}, {'x': '21.054', 'y': '47.272', 'z': '70.986'}, {'x': '17.434', 'y': '51.542', 'z': '70.011'}, {'x': '16.637', 'y': '49.988', 'z': '73.155'}, {'x': '12.406', 'y': '52.048', 'z': '73.487'}, {'x': '14.449', 'y': '51.720', 'z': '77.091'}, {'x': '21.473', 'y': '46.827', 'z': '65.080'}, {'x': '18.388', 'y': '41.775', 'z': '63.692'}, {'x': '15.131', 'y': '40.312', 'z': '63.228'}, {'x': '8.781', 'y': '42.614', 'z': '64.684'}, {'x': '18.593', 'y': '48.306', 'z': '62.582'}]
pdbid:  A_ASD0002_1_1KFL_1
chain:  B
N D D L R I I
7
['5', '6',