/
Rakefile
186 lines (181 loc) · 6.72 KB
/
Rakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# rake tasks to automate archaeaEST project
require "net/ftp"
require "mkmf"
require "nokogiri"
require "bio"
# require "parallel"
ncbiftp = "ftp.ncbi.nih.gov"
desc "Check for prerequisites"
task :check do
# check for various tools in PATH
# we need seq, GNU parallel, wget, fastasplitn, blat, blastn, blastdbcmd
cmds = {
"seq" => "seq is part of coreutils http://www.gnu.org/software/coreutils/",
"parallel" => "GNU Parallel is at http://www.gnu.org/software/parallel/",
"wget" => "GNU Wget is at https://www.gnu.org/software/wget/",
"fastasplitn" => "fastasplitn can be compiled from the src/ directory",
"blat" => "BLAT is at http://users.soe.ucsc.edu/~kent/src/",
"blastn" => "The latest BLAST+ is at ftp://ftp.ncbi.nih.gov/blast/executables/LATEST",
"blastdbcmd" => "The latest BLAST+ is at ftp://ftp.ncbi.nih.gov/blast/executables/LATEST"
}
cmds.each_pair do |k, v|
f = find_executable k
if f.nil?
puts "#{k} not found in path; #{v}"
end
end
end
namespace :db do
namespace :est do
desc "Fetch and uncompress human EST BLAST db"
task :fetch do
ftp = Net::FTP.new(ncbiftp)
ftp.login
ftp.chdir("blast/db")
est = ftp.list("est_human*.tar.gz")
est = est.map { |f| f.split(" ").last }
Dir.chdir(File.expand_path("../../../db/est", __FILE__))
File.open("urls.txt", "w+") do |f|
est.each do |gz|
f.puts "#{ncbiftp}::blast/db/#{gz}"
end
end
# use rsync instead
system("cat urls.txt | parallel -j10 rsync --partial --progress -av {} .")
# now need to uncompress
system("find . -name 'est_human.*.tar.gz' -exec tar zxvf {} \\;")
puts "Finished; files are in #{Dir.pwd}"
end
desc "Dump human EST BLAST db to fasta, splitting for BLAT"
task :split do
# code to dump fasta and split
Dir.chdir(File.expand_path("../../../db/est", __FILE__))
ENV['SPLITFRAGTEMPLATE'] = "est_human%3.3d"
system("blastdbcmd -db est_human -entry all -line_length 60 | fastasplitn - 20")
puts "Finished; files are in #{Dir.pwd}"
end
desc "Dump GIs in data/est_hits_gi.txt to fasta"
task :hitdump do
Dir.chdir(File.expand_path("../../..", __FILE__))
system("blastdbcmd -db db/est/est_human -entry_batch data/est_hits_gi.txt -line_length 60 -out data/est_hits.fa")
puts "Finished; sequences are in #{Dir.pwd}/data/est_hits.fa"
end
end
namespace :nt do
desc "Fetch and uncompress nt BLAST db"
task :fetch do
# fetch code here
ftp = Net::FTP.new(ncbiftp)
ftp.login
ftp.chdir("blast/db")
nt = ftp.list("nt.*.tar.gz")
nt = nt.map { |f| f.split(" ").last }
Dir.chdir(File.expand_path("../../../db/nt", __FILE__))
File.open("urls.txt", "w+") do |f|
nt.each do |gz|
f.puts "#{ncbiftp}::blast/db/#{gz}"
end
end
# use rsync instead
system("cat urls.txt | parallel -j10 rsync --partial --progress -av {} .")
# now need to uncompress
system("find . -name 'nt.*.tar.gz' -exec tar zxvf {} \\;")
puts "Finished; files are in #{Dir.pwd}"
end
end
namespace :archaea do
desc "Fetch archaea genome ffn files"
task :fetch do
Dir.chdir(File.expand_path("../../../db/archaea", __FILE__))
archaea = []
ftp = Net::FTP.new(ncbiftp)
ftp.login
ftp.chdir("genomes/GENOME_REPORTS")
ftp.gettextfile("prokaryotes.txt", nil) do |line|
line = line.chomp
fields = line.split("\t")
if fields[4] =~ /[Aa]rchae/ && fields[23] != "-"
puts "Archaea FTP path = #{fields[23]}"
archaea << fields[23]
end
end
ftp.chdir("/genomes/ASSEMBLY_BACTERIA")
File.open("urls.txt", "w+") do |f|
archaea.each do |sp|
ls = ftp.list("#{sp}/*.ffn*")
ls.each do |ffn|
fa = ffn.split(" ").last
puts "Writing #{fa} to file"
# f.puts "ftp://#{ncbiftp}/genomes/ASSEMBLY_BACTERIA/#{fa}"
# changed for rsync
f.puts "#{ncbiftp}::genomes/ASSEMBLY_BACTERIA/#{fa}"
end
end
end
# this is clunky but anything else seems to upset the FTP server
# system("for u in `cat urls.txt`; do wget -N $u; done")
# let's rsync instead!
system("cat urls.txt | parallel -j10 rsync --partial --progress -av {} .")
puts "Finished; files are in #{Dir.pwd}"
end
desc "Uncompress archaea *.scaffold.ffn.tgz files; combine with *.ffn files"
task :concat do
# uncompress/concat code here
Dir.chdir(File.expand_path("../../../db/archaea", __FILE__))
puts "Extracting tgz files to NZ.ffn..."
system("find . -name '*.tgz' -exec tar zxfO {} > NZ.ffn \\;")
puts "Combining NZ.ffn + NC*.ffn files..."
system("cat NZ.ffn NC_0*.ffn > archaea.ffn")
puts "Finished; files are in #{Dir.pwd}"
end
end
end
namespace :search do
namespace :blat do
desc "Run BLAT of human EST fasta files (query) vs archaea.ffn (subject)"
task :run do
Dir.chdir(File.expand_path("../../../", __FILE__))
system("parallel -j10 'var=$(printf '%03d' {}); blat db/archaea/archaea.ffn db/est/est_human$var -ooc=data/11.ooc data/psl/est_human$var.psl' ::: $(seq 1 10)")
system("parallel -j10 'var=$(printf '%03d' {}); blat db/archaea/archaea.ffn db/est/est_human$var -ooc=data/11.ooc data/psl/est_human$var.psl' ::: $(seq 11 20)")
puts "Finished; output PSL files are in #{Dir.pwd}/data/psl"
end
desc "Parse PSL files, extract list of unique human EST GI, write to file"
task :parse do
Dir.chdir(File.expand_path("../../../data", __FILE__))
system("cat psl/*.psl | cut -f10 | grep ^gi | cut -d '|' -f2 | sort -u > est_hits_gi.txt")
puts "Finished; GIs in file #{Dir.pwd}/est_hits_gi.txt"
end
end
namespace :blast do
desc "BLAST search human EST BLAT hits to archaea vs nt database"
task :run do
Dir.chdir(File.expand_path("../../../", __FILE__))
format = "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids"
system("blastn -db db/nt/nt -query data/est_hits.fa -out data/est_v_nt.tsv -max_target_seqs 1 -outfmt '#{format}' -num_threads 4")
puts "Finished; output in #{Dir.pwd}/data/est_v_nt.tsv"
end
desc "Parse EST v nt BLAST output; use taxid to retrieve species and kingdom"
task :parse do
Bio::NCBI.default_email = "me@me.com"
taxids = Hash.new
Dir.chdir(File.expand_path("../../../", __FILE__))
File.open("data/est_v_nt.csv", "w") do |f|
File.readlines("data/est_v_nt.tsv").each do |line|
line = line.chomp
fields = line.split("\t").map(&:strip)
if taxids.has_key? fields[12]
fields = fields + taxids[fields[12]]
else
xml = Bio::NCBI::REST::EFetch.taxonomy(fields[12], "xml")
doc = Nokogiri::XML(xml)
names = doc.xpath("//Taxon/ScientificName").children.map { |child| child.inner_text }
taxids[fields[12]] = [names[0], names[2]]
fields = fields + taxids[fields[12]]
end
f.write(fields.join(",") + "\n")
end
end
puts "Finished; output in #{Dir.pwd}/data/est_v_nt.csv"
end
end
end