layout	title	date	categories	tags
post	Python脚本抓取网上PDB文件	2015-06-11 11:29:30 -0700	CompCB	CompBiol Python Internet

Get PDB file online

urllib2库抓文件
gzip库解压

这里定义的是函数方便调用,形参为pdb-id,返回文件名.
python利用urllib2抓网页内容并输出就可以了,没啥含量.
这里抓的是压缩包并解压,多了些操作.

#!/usr/bin/env python
#
# Provides simple functionality to download pdb files using python.
# Returns the path to the downloaded file
 
import os, urllib2, gzip
 
def get_pdb(pdb_id):
    """Download one PDB entry into the local ``pdb/`` directory.

    The gzip-compressed entry is fetched from rcsb.org, decompressed to
    ``pdb/<pdb_id>.pdb`` and the temporary ``.gz`` archive is removed.
    If ``pdb/<pdb_id>.pdb`` already exists, nothing is downloaded.

    Args:
        pdb_id: PDB identifier used to build both the URL and the file name.

    Returns:
        The relative path ``'pdb/<pdb_id>.pdb'``.

    Raises:
        Errors from ``urllib2.urlopen``, ``gzip`` or file I/O propagate to
        the caller; partially written files are removed first.
    """
    fname = 'pdb/' + pdb_id + '.pdb'
    # Reuse a previously downloaded file.
    if os.path.exists(fname):
        return fname

    # Make sure the target directory exists.
    if not os.path.exists('pdb/'):
        os.makedirs('pdb')

    gz_name = fname + '.gz'
    chunk_size = 64 * 1024  # stream in chunks instead of one huge read
    complete = False
    try:
        # Download pdb.gz. The payload is binary, so the file must be opened
        # in 'wb'; text mode corrupts it on newline-translating platforms.
        response = urllib2.urlopen(
            "http://www.rcsb.org/pdb/files/" + pdb_id + ".pdb.gz")
        try:
            with open(gz_name, 'wb') as gz_out:
                while True:
                    packet = response.read(chunk_size)
                    if not packet:
                        break
                    gz_out.write(packet)
        finally:
            response.close()

        # Unzip, again in binary mode so the bytes are written unchanged.
        with gzip.open(gz_name, 'rb') as archive:
            with open(fname, 'wb') as pdb_out:
                while True:
                    packet = archive.read(chunk_size)
                    if not packet:
                        break
                    pdb_out.write(packet)
        complete = True
    finally:
        # The archive is only a temporary artifact; always drop it.
        if os.path.exists(gz_name):
            os.remove(gz_name)
        # A truncated .pdb would be mistaken for a cached download on the
        # next call, so remove it when anything above failed.
        if not complete and os.path.exists(fname):
            os.remove(fname)

    return fname

更简单的办法是, 直接替换抓取地址为

"http://rcsb.org/pdb/files/"+pdb_id+".pdb",

并替换g打开的保存文件名为g = open(fname,'w'),不需要后续的解压和删文件了

#!/usr/bin/env python
#
# Author: Platinhom; Last Updated: 2015.6.12
# Provides simple functionality to download pdb files using python.
# Support to download many PDB by command line.
 
import os, sys, urllib2
if __name__ == '__main__':
    # Command-line entry point: download every PDB ID given as an argument
    # to pdb/<id>.pdb, skipping entries that are already on disk.
    if len(sys.argv) < 2:
        print("Please give the PDB ID!")
        sys.exit(1)  # non-zero status so callers can detect the usage error

    for pdb_id in sys.argv[1:]:
        # Check the ID: it must be exactly 4 characters.
        if len(pdb_id) != 4:
            print("The PDB ID: " + pdb_id + " is incorrect!")
            sys.exit(1)

        fname = 'pdb/' + pdb_id + '.pdb'
        # Already downloaded: move on to the next ID. Exiting here would
        # silently drop every ID that follows on the command line.
        if os.path.exists(fname):
            continue

        # Make sure the target directory exists.
        if not os.path.exists('pdb/'):
            os.makedirs('pdb')

        # Download the pdb file, streaming it to disk in chunks.
        response = urllib2.urlopen(
            "http://www.rcsb.org/pdb/files/" + pdb_id + ".pdb")
        complete = False
        try:
            # 'wb' writes the received bytes unchanged on every platform.
            with open(fname, 'wb') as out:
                while True:
                    packet = response.read(64 * 1024)
                    if not packet:
                        break
                    out.write(packet)
            complete = True
        finally:
            response.close()
            # Do not leave a truncated file that a later run would treat
            # as an existing download.
            if not complete and os.path.exists(fname):
                os.remove(fname)
        print("Got PDB: " + pdb_id + "!")
#end main

更快的当然是直接用Shell命令wget啦.
wget "http://rcsb.org/pdb/files/${pdbnum}.pdb"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

2015-06-12-Get-PDB.md

2015-06-12-Get-PDB.md

Get PDB file online

Files

2015-06-12-Get-PDB.md

Latest commit

History

2015-06-12-Get-PDB.md

File metadata and controls

Get PDB file online