-
Notifications
You must be signed in to change notification settings - Fork 1
/
ncbi_ftp_get.py
76 lines (67 loc) · 2.57 KB
/
ncbi_ftp_get.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
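List genome files on the NCBI FTP server and print a wget command for each.
Input (command-line options):
1. -n: Bacterial NCBI directory 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/'
2. -b: File with the bacteria list, one species name per line
3. -a: Add_string appended after the species name ('latest_assembly_versions/')
4. -k: Number of genomes to take per species ('all' by default)
5. -i: Substring that marks the wanted file ('.faa.gz' by default)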
"""
#init_string = "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Ralstonia_solanacearum/latest_assembly_versions/"
import urllib2
import os
import optparse
import random
def getstrainlist(init_string):
    """Return the strain names found in the listing at ``init_string``.

    The listing is fetched with ``urllib2.urlopen``, split on whitespace, and
    every 11th token starting at index 8 is returned.

    NOTE(review): this presumably relies on each listing entry having exactly
    11 whitespace-separated fields with the entry name in the 9th one (as in
    an ``ls -l`` style symlink line) -- confirm against the server output.

    init_string -- URL of the directory to list.
    Returns a list of tokens (one per listing entry under that assumption).
    Errors raised by ``urllib2.urlopen`` / ``read`` propagate to the caller.
    """
    hfo = urllib2.urlopen(init_string)
    try:
        tokens = hfo.read().split()
    finally:
        # Always release the connection, even if the read fails.
        hfo.close()
    # Same selection as "(i - 8) % 11 == 0" over all indices i >= 0.
    return tokens[8::11]
def sample_reduce(strainlist, sample_volume):
    """Return at most ``sample_volume`` randomly chosen items of ``strainlist``.

    strainlist    -- list of items to sample from; it is never modified.
    sample_volume -- number of items wanted: an int, a numeric string such as
                     "5" (what optparse delivers for an untyped option), or
                     the string "all" to keep everything.

    Returns ``strainlist`` itself when everything is kept ("all", or the list
    has no more than ``sample_volume`` items); otherwise a new list of
    distinct items in random order. A non-positive volume on a non-empty list
    gives an empty list.
    Raises ValueError if ``sample_volume`` is a non-numeric string other than
    "all".
    """
    if sample_volume == "all":
        return strainlist
    wanted = int(sample_volume)
    if len(strainlist) <= wanted:
        return strainlist
    if wanted <= 0:
        return []
    # random.sample picks distinct positions without touching the input,
    # which avoids the out-of-range index that the inclusive upper bound of
    # randint(0, remaining_length) could produce.
    return random.sample(strainlist, wanted)
def faa_get(strainlist, init_string, sample_volume, file_identifier):
    """Print a ``wget`` command for the first matching file of each strain.

    strainlist      -- strain directory names; a copy is passed to
                       ``sample_reduce`` so the caller's list is not touched.
    init_string     -- base URL that each strain name is appended to.
    sample_volume   -- forwarded to ``sample_reduce`` to limit the strains.
    file_identifier -- substring a file name must contain to be selected.

    For every sampled strain the directory listing is fetched and split on
    CRLF; the 9th whitespace-separated field of each line is treated as the
    file name. Names containing "_from_" are skipped unless
    ``file_identifier`` itself contains "_from_". Only the first match per
    strain is reported.

    NOTE: the command is only printed; nothing is downloaded here.
    Errors from ``urllib2.urlopen`` / ``read`` propagate to the caller.
    """
    chosen = sample_reduce(strainlist[:], sample_volume)
    for strain in chosen:
        url_dir = init_string + strain + '/'
        hfo = urllib2.urlopen(url_dir)
        try:
            listing = hfo.read().split('\r\n')
        finally:
            # Always release the connection, even if the read fails.
            hfo.close()
        for line in listing:
            fields = line.split()
            # Lines with fewer than 9 fields (e.g. the trailing empty line)
            # carry no file name; skip them explicitly instead of relying on
            # a catch-all exception handler.
            if len(fields) < 9:
                continue
            name = fields[8]
            if file_identifier not in name:
                continue
            if "_from_" in name and "_from_" not in file_identifier:
                continue
            print("wget\t" + url_dir + name)
            # os.system("wget\t"+url_dir + file[8])
            break
def download_bacs(ncbi_dir, baclist_file, add_string, sample_volume, file_identifier):
    """Process every species listed in ``baclist_file``.

    ncbi_dir        -- base URL of the NCBI directory.
    baclist_file    -- path to a text file with one species name per line;
                       blank lines are ignored.
    add_string      -- subdirectory appended after the species name.
    sample_volume   -- forwarded to ``faa_get``.
    file_identifier -- forwarded to ``faa_get``.

    For each species a directory of that name is created in the current
    working directory (an existing one is reused), made current while
    ``getstrainlist`` and ``faa_get`` run, and the starting directory is
    restored afterwards -- also when one of those calls raises.

    Raises OSError if the species directory cannot be created or entered;
    errors from the fetch helpers propagate.
    """
    with open(baclist_file) as handle:
        baclist = [line.strip() for line in handle]
    start_dir = os.getcwd()
    for bac in baclist:
        if not bac:
            # An empty name would make mkdir/chdir fail on ''.
            continue
        init_string = ncbi_dir + bac + '/' + add_string
        try:
            os.mkdir(bac)
        except OSError:
            # Only an already-existing directory is acceptable; anything else
            # (permissions, bad name, ...) is a real failure.
            if not os.path.isdir(bac):
                raise
        os.chdir(bac)
        try:
            strainlist = getstrainlist(init_string)
            faa_get(strainlist, init_string, sample_volume, file_identifier)
        finally:
            os.chdir(start_dir)
# Command-line entry: parse the options and process every listed species.
parser = optparse.OptionParser()
parser.add_option("-n", "--ncbi_dir", help="Directory on the NCBI FTP server ", default='ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/')
parser.add_option("-b", "--baclist", help="Bacterial species list", default="baclist.txt")
parser.add_option("-a", "--add_string", help="Subdirectory name", default='latest_assembly_versions/')
parser.add_option("-k", "--sample_volume", help="Number of downloaded genomes", default="all")
parser.add_option("-i", "--file_identifier", help="String that marks the file with needed data, def=.faa.gz", default = ".faa.gz")
opt, args = parser.parse_args()
# optparse delivers "-k 5" as the string "5"; convert it here so the sampling
# code compares numbers with numbers. The default "all" is passed through.
sample_volume = opt.sample_volume
if sample_volume != "all":
    try:
        sample_volume = int(sample_volume)
    except ValueError:
        parser.error("--sample_volume must be an integer or 'all', got %r" % sample_volume)
download_bacs(opt.ncbi_dir, opt.baclist, opt.add_string, sample_volume, opt.file_identifier)
#strainlist = getstrainlist(init_string)
#faa_get(strainlist)