layout | title | date | categories | tags |
---|---|---|---|---|
post |
Python脚本抓取网上PDB文件 |
2015-06-11 11:29:30 -0700 |
CompCB |
CompBiol Python Internet |
- urllib2库抓文件
- gzip库解压
- 这里定义的是函数方便调用,形参为pdb-id,返回文件名.
- python利用urllib2抓网页内容并输出就可以了,没啥含量.
这里抓的是压缩包并解压,多了些操作.
#!/usr/bin/env python
#
# Provides simple functionality to download pdb files using python.
# Returns the path to the downloaded file
import os, urllib2, gzip
def get_pdb(pdb_id):
    """Download a PDB entry from RCSB and return the path of the local copy.

    The gzip-compressed entry is fetched into ``pdb/<pdb_id>.pdb.gz``,
    decompressed to ``pdb/<pdb_id>.pdb`` and the archive is then removed.
    If ``pdb/<pdb_id>.pdb`` already exists it is returned without any
    network access.

    Args:
        pdb_id: PDB identifier used to build both the URL and the file name.

    Returns:
        The relative path ``'pdb/<pdb_id>.pdb'``.

    Raises:
        Whatever ``urllib2.urlopen``, ``gzip`` or the file operations raise;
        in that case neither the archive nor a partial ``.pdb`` is left
        behind.
    """
    fname = 'pdb/' + pdb_id + '.pdb'
    # Reuse a previously downloaded copy.
    if os.path.exists(fname):
        return fname
    # Make sure the target directory exists.
    if not os.path.exists('pdb/'):
        os.makedirs('pdb')

    gz_name = fname + '.gz'
    chunk_size = 64 * 1024
    complete = False
    try:
        # Download the archive. It is binary data, so it must be written in
        # 'wb' mode: text mode corrupts it where newlines are translated.
        response = urllib2.urlopen(
            "http://www.rcsb.org/pdb/files/" + pdb_id + ".pdb.gz")
        try:
            with open(gz_name, 'wb') as gz_out:
                while True:
                    packet = response.read(chunk_size)
                    if not packet:
                        break
                    gz_out.write(packet)
        finally:
            response.close()

        # Decompress in chunks so the whole file never sits in memory.
        with gzip.open(gz_name, 'rb') as archive:
            with open(fname, 'wb') as pdb_out:
                while True:
                    packet = archive.read(chunk_size)
                    if not packet:
                        break
                    pdb_out.write(packet)
        complete = True
    finally:
        # The archive is only an intermediate artifact.
        if os.path.exists(gz_name):
            os.remove(gz_name)
        # A truncated output would be mistaken for a cached copy next time.
        if not complete and os.path.exists(fname):
            os.remove(fname)
    return fname
- 更简单的办法是, 直接替换抓取地址为
"http://rcsb.org/pdb/files/"+pdb_id+".pdb"
,
并替换g打开的保存文件名为g = open(fname,'w')
,不需要后续的解压和删文件了
#!/usr/bin/env python
#
# Author: Platinhom; Last Updated: 2015.6.12
# Provides simple functionality to download pdb files using python.
# Support to download many PDB by command line.
import os, sys, urllib2
def main(argv):
    """Download every PDB ID given on the command line into ``pdb/``.

    IDs are processed in order. An ID whose file ``pdb/<id>.pdb`` already
    exists is skipped, so one cached entry no longer aborts the rest of
    the batch. Processing stops at the first ID that is not exactly four
    characters long.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first,
            then the PDB IDs).

    Returns:
        0 on success; 1 when no ID was given or an ID has the wrong length.

    Raises:
        Whatever ``urllib2.urlopen`` or the file operations raise; a
        partially written ``.pdb`` file is removed before propagating.
    """
    # print(...) with a single string argument emits the same text under
    # the Python 2 print statement and the Python 3 print function.
    if len(argv) < 2:
        print("Please give the PDB ID!")
        return 1

    for pdb_id in argv[1:]:
        # Check the ID: it must be exactly 4 characters.
        if len(pdb_id) != 4:
            print("The PDB ID: " + pdb_id + " is incorrect!")
            return 1
        fname = 'pdb/' + pdb_id + '.pdb'
        # Already downloaded: skip this ID but keep processing the others.
        if os.path.exists(fname):
            continue
        # Make sure the target directory exists.
        if not os.path.exists('pdb/'):
            os.makedirs('pdb')

        # Open the URL before creating the file so that a failed request
        # never leaves an empty file behind.
        response = urllib2.urlopen(
            "http://www.rcsb.org/pdb/files/" + pdb_id + ".pdb")
        complete = False
        try:
            # 'wb' stores the payload byte-for-byte, with no newline
            # translation.
            with open(fname, 'wb') as pdb_out:
                while True:
                    packet = response.read(64 * 1024)
                    if not packet:
                        break
                    pdb_out.write(packet)
            complete = True
        finally:
            response.close()
            # A truncated file would be treated as present on the next run.
            if not complete and os.path.exists(fname):
                os.remove(fname)
        print("Got PDB: " + pdb_id + "!")
    return 0


if (__name__ == '__main__'):
    sys.exit(main(sys.argv))
#end main
- 更快的当然是直接用Shell命令wget啦.
wget "http://rcsb.org/pdb/files/${pdbnum}.pdb"