/
tree.py
408 lines (345 loc) · 16.4 KB
/
tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
"""
Build a tree using a variety of methods.
"""
import os
import shutil
import sys
import time
import uuid
import Bio
from Bio import Phylo
import numpy as np
from treetime.vcf_utils import read_vcf
from pathlib import Path
from .utils import run_shell_command, nthreads_value, shquote, load_mask_sites
def find_executable(names, default = None):
"""
Return the path to the first executable found in PATH from the given list
of names.
Raises a (hopefully helpful) error if no executable is found. Provide a
value for the "default" parameter to instead return a value.
"""
exe = next(filter(shutil.which, names), default)
if exe is None:
print("Unable to find any of %s in PATH=%s" % (names, os.environ["PATH"]))
print("\nHint: You can install the missing program using conda or homebrew or apt-get.\n")
raise Exception
return exe
def build_raxml(aln_file, out_file, clean_up=True, nthreads=1, tree_builder_args=""):
'''
build tree using RAxML with parameters '-f d -m GTRCAT -c 25 -p 235813 -n tre"
'''
raxml = find_executable([
# Users who symlink/install as "raxml" can pick a specific version,
# otherwise we have our own search order based on expected parallelism.
# The variants we look for are not exhaustive, but based on what's
# provided by conda and Ubuntu's raxml packages. This may want to be
# adjusted in the future depending on use.
"raxml",
"raxmlHPC-PTHREADS-AVX2",
"raxmlHPC-PTHREADS-AVX",
"raxmlHPC-PTHREADS-SSE3",
"raxmlHPC-PTHREADS",
"raxmlHPC-AVX2",
"raxmlHPC-AVX",
"raxmlHPC-SSE3",
"raxmlHPC",
])
# RAxML outputs files appended with this random string:
# RAxML_bestTree.4ed91a, RAxML_info.4ed91a, RAxML_parsimonyTree.4ed91a, RAxML_result.4ed91a
random_string = uuid.uuid4().hex[0:6]
call = [raxml,"-T",str(nthreads)," -f d -m GTRCAT -c 25 -p 235813 -n %s -s"%(random_string), shquote(aln_file), tree_builder_args, "> RAxML_log.%s"%(random_string)]
cmd = " ".join(call)
print("Building a tree via:\n\t" + cmd +
"\n\tStamatakis, A: RAxML Version 8: A tool for Phylogenetic Analysis and Post-Analysis of Large Phylogenies."
"\n\tIn Bioinformatics, 2014\n")
try:
run_shell_command(cmd, raise_errors = True)
shutil.copy("RAxML_bestTree.%s"%(random_string), out_file)
T = Phylo.read(out_file, 'newick')
if clean_up:
os.remove("RAxML_bestTree.%s"%(random_string))
os.remove("RAxML_info.%s"%(random_string))
os.remove("RAxML_parsimonyTree.%s"%(random_string))
os.remove("RAxML_result.%s"%(random_string))
except:
print("ERROR: TREE BUILDING FAILED")
if os.path.isfile("RAxML_log.%s"%(random_string)):
print("Please see the log file for more details: {}".format("RAxML_log.%s"%(random_string)))
T=None
return T
def build_fasttree(aln_file, out_file, clean_up=True, nthreads=1, tree_builder_args=""):
'''
build tree using fasttree with parameters "-nt"
'''
log_file = out_file + ".log"
fasttree = find_executable([
# Search order is based on expected parallelism and accuracy
# (double-precision versions).
"FastTreeDblMP",
"FastTreeDbl",
"FastTreeMP",
"fasttreeMP", # Ubuntu lowercases
"FastTree",
"fasttree"
])
# By default FastTree with OpenMP support uses all available cores.
# However, it respects the standard OpenMP environment variable controlling
# this as described at <http://www.microbesonline.org/fasttree/#OpenMP>.
#
# We always set it, regardless of it the found FastTree executable contains
# "MP" in the name, because the generic "FastTree" or "fasttree" variants
# might be OpenMP-enabled too.
extra_env = {
"OMP_NUM_THREADS": str(nthreads),
}
call = [fasttree, "-nosupport", "-nt", shquote(aln_file), tree_builder_args, "1>", shquote(out_file), "2>", shquote(log_file)]
cmd = " ".join(call)
print("Building a tree via:\n\t" + cmd +
"\n\tPrice et al: FastTree 2 - Approximately Maximum-Likelihood Trees for Large Alignments." +
"\n\tPLoS ONE 5(3): e9490. https://doi.org/10.1371/journal.pone.0009490\n")
try:
run_shell_command(cmd, raise_errors = True, extra_env = extra_env)
T = Phylo.read(out_file, 'newick')
except:
print("ERROR: TREE BUILDING FAILED")
if os.path.isfile(log_file):
print("Please see the log file for more details: {}".format(log_file))
T=None
return T
def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nthreads=1, tree_builder_args=""):
'''
build tree using IQ-Tree with parameters "-fast"
arguments:
aln_file file name of input aligment
out_file file name to write tree to
'''
iqtree = find_executable([
"iqtree2",
"iqtree"
])
# create a dictionary for characters that IQ-tree changes.
# we remove those prior to tree-building and reinstantiate later
def random_string(n):
from string import ascii_uppercase as letters
return "".join([letters[i] for i in np.random.randint(len(letters), size=n)])
prefix = "DELIM"
escape_dict = {c:f'_{prefix}-{random_string(20)}_' for c in '/|()*'}
reverse_escape_dict = {v:k for k,v in escape_dict.items()}
# IQ-tree messes with taxon names. Hence remove offending characters, reinstaniate later
tmp_aln_file = aln_file.replace(".fasta", "-delim.fasta")
log_file = tmp_aln_file.replace(".fasta", ".iqtree.log")
num_seqs = 0
with open(tmp_aln_file, 'w', encoding='utf-8') as ofile, open(aln_file, encoding='utf-8') as ifile:
for line in ifile:
tmp_line = line
if line.startswith(">"):
num_seqs += 1
for c,v in escape_dict.items():
tmp_line = tmp_line.replace(c,v)
ofile.write(tmp_line)
# For compat with older versions of iqtree, we avoid the newish -fast
# option alias and instead spell out its component parts:
#
# -ninit 2
# -n 2
# -me 0.05
#
# This may need to be updated in the future if we want to stay in lock-step
# with -fast, although there's probably no particular reason we have to.
# Refer to the handling of -fast in utils/tools.cpp:
# https://github.com/Cibiv/IQ-TREE/blob/44753aba/utils/tools.cpp#L2926-L2936
fast_opts = [
"-ninit", "2",
"-n", "2",
"-me", "0.05"
]
# Use IQ-TREE's auto-scaling of threads when the user has requested more
# threads than there are sequences. This approach avoids an error from
# IQ-TREE when num_seq < nthreads (as when users request `-nthreads auto` on
# a machine with many cores and fewer input sequences) and also avoids
# requesting as many threads as there are sequences when there may be fewer
# available threads on the current machine.
if num_seqs < nthreads:
nthreads = "AUTO"
print(
"WARNING: more threads requested than there are sequences; falling back to IQ-TREE's `-nt AUTO` mode.",
file=sys.stderr
)
if substitution_model.lower() != "auto":
call = [iqtree, *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file),
"-m", substitution_model, tree_builder_args, ">", log_file]
else:
call = [iqtree, *fast_opts, "-nt", str(nthreads), "-s", shquote(tmp_aln_file), tree_builder_args, ">", shquote(log_file)]
cmd = " ".join(call)
print("Building a tree via:\n\t" + cmd +
"\n\tNguyen et al: IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies."
"\n\tMol. Biol. Evol., 32:268-274. https://doi.org/10.1093/molbev/msu300\n")
if substitution_model.lower() == "auto":
print(f"Conducting a model test... see '{shquote(log_file)}' for the result. You can specify this with --substitution-model in future runs.")
try:
run_shell_command(cmd, raise_errors = True)
T = Phylo.read(tmp_aln_file+".treefile", 'newick')
shutil.copyfile(tmp_aln_file+".treefile", out_file)
for n in T.find_clades(terminal=True):
tmp_name = n.name
for v,c in reverse_escape_dict.items():
tmp_name = tmp_name.replace(v,c)
n.name = tmp_name
#this allows the user to check intermediate output, as tree.nwk will be
if clean_up:
if os.path.isfile(tmp_aln_file):
os.remove(tmp_aln_file)
for ext in [".bionj",".ckp.gz",".iqtree",".mldist",".model.gz",".treefile",".uniqueseq.phy",".model"]:
if os.path.isfile(tmp_aln_file + ext):
os.remove(tmp_aln_file + ext)
except:
print("ERROR: TREE BUILDING FAILED")
if os.path.isfile(log_file):
print("Please see the log file for more details: {}".format(log_file))
T=None
return T
def write_out_informative_fasta(compress_seq, alignment, stripFile=None):
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
sequences = compress_seq['sequences']
ref = compress_seq['reference']
positions = compress_seq['positions']
#If want to exclude sites from initial treebuild, read in here
strip_pos = load_mask_sites(stripFile) if stripFile else []
#Get sequence names
seqNames = list(sequences.keys())
#Check non-ref sites to see if informative
printPositionMap = False #If true, prints file mapping Fasta position to real position
sites = []
pos = []
for key in positions:
if key not in strip_pos:
pattern = []
for k in sequences.keys():
#looping try/except is faster than list comprehension
try:
pattern.append(sequences[k][key])
except KeyError:
pattern.append(ref[key])
origPattern = list(pattern)
if '-' in pattern or 'N' in pattern:
#remove gaps/Ns to see if otherwise informative
pattern = [value for value in origPattern if value != '-' and value != 'N']
un = np.unique(pattern, return_counts=True)
#If not all - or N, not all same base, and >1 differing base, append
if len(un[0])!=0 and len(un[0])!=1 and not (len(un[0])==2 and min(un[1])==1):
sites.append(origPattern)
pos.append("\t".join([str(len(pos)+1),str(key)]))
#Rotate and convert to SeqRecord
sites = np.asarray(sites)
align = np.rot90(sites)
seqNamesCorr = list(reversed(seqNames))
toFasta = [ SeqRecord(id=seqNamesCorr[i], seq=Seq("".join(align[i])), description='') for i in range(len(sequences.keys()))]
fasta_file = os.path.join(os.path.dirname(alignment), 'informative_sites.fasta')
#now output this as fasta to read into raxml or iqtree
SeqIO.write(toFasta, fasta_file, 'fasta')
#If want a position map, print:
if printPositionMap:
with open(fasta_file+".positions.txt", 'w', encoding='utf-8') as the_file:
the_file.write("\n".join(pos))
return fasta_file
def mask_sites_in_multiple_sequence_alignment(alignment_file, excluded_sites_file):
"""Creates a new multiple sequence alignment FASTA file from which the given
excluded sites have been removed and returns the filename of the new
alignment.
Parameters
----------
alignment_file : str
path to the original multiple sequence alignment file
excluded_sites_file : str
path to a text file containing each nucleotide position to exclude with one position per line
Returns
-------
str
path to the new FASTA file from which sites have been excluded
"""
# Load zero-based excluded sites.
excluded_sites = load_mask_sites(excluded_sites_file)
# Return the original alignment file, if no excluded sites were found.
if len(excluded_sites) == 0:
return alignment_file
# Load alignment as FASTA generator to prevent loading the whole alignment
# into memory.
alignment = Bio.SeqIO.parse(alignment_file, "fasta")
# Write the masked alignment to disk one record at a time.
alignment_file_path = Path(alignment_file)
masked_alignment_file = str(alignment_file_path.parent / ("masked_%s" % alignment_file_path.name))
with open(masked_alignment_file, "w", encoding='utf-8') as oh:
for record in alignment:
# Convert to a mutable sequence to enable masking with Ns.
sequence = record.seq.tomutable()
# Replace all excluded sites with Ns.
for site in excluded_sites:
sequence[site] = "N"
record.seq = sequence
Bio.SeqIO.write(record, oh, "fasta")
# Return the new alignment FASTA filename.
return masked_alignment_file
def register_arguments(parser):
parser.add_argument('--alignment', '-a', required=True, help="alignment in fasta or VCF format")
parser.add_argument('--method', default='iqtree', choices=["fasttree", "raxml", "iqtree"], help="tree builder to use")
parser.add_argument('--output', '-o', type=str, help='file name to write tree to')
parser.add_argument('--substitution-model', default="GTR",
help='substitution model to use. Specify \'auto\' to run ModelTest. Currently, only available for IQTREE.')
parser.add_argument('--nthreads', type=nthreads_value, default=1,
help="number of threads to use; specifying the value 'auto' will cause the number of available CPU cores on your system, if determinable, to be used")
parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
parser.add_argument('--exclude-sites', type=str, help='file name of one-based sites to exclude for raw tree building (BED format in .bed files, second column in tab-delimited files, or one position per line)')
parser.add_argument('--tree-builder-args', type=str, default='', help='extra arguments to be passed directly to the executable of the requested tree method (e.g., --tree-builder-args="-czb")')
def run(args):
# check alignment type, set flags, read in if VCF
is_vcf = False
ref = None
if any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
# Prepare a multiple sequence alignment from the given variants VCF and
# reference FASTA.
if not args.vcf_reference:
print("ERROR: a reference Fasta is required with VCF-format alignments")
return 1
compress_seq = read_vcf(args.alignment, args.vcf_reference)
sequences = compress_seq['sequences']
ref = compress_seq['reference']
is_vcf = True
aln = sequences
elif args.exclude_sites:
# Mask excluded sites from the given multiple sequence alignment.
aln = mask_sites_in_multiple_sequence_alignment(args.alignment, args.exclude_sites)
else:
# Use the multiple sequence alignment as is.
aln = args.alignment
start = time.time()
if args.output:
tree_fname = args.output
else:
tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '.nwk'
# construct reduced alignment if needed
if is_vcf:
variable_fasta = write_out_informative_fasta(compress_seq, args.alignment, stripFile=args.exclude_sites)
fasta = variable_fasta
else:
fasta = aln
if args.substitution_model and not args.method=='iqtree':
print("Cannot specify model unless using IQTree. Model specification ignored.")
if args.method=='raxml':
T = build_raxml(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=args.tree_builder_args)
elif args.method=='iqtree':
T = build_iqtree(fasta, tree_fname, args.substitution_model, nthreads=args.nthreads, tree_builder_args=args.tree_builder_args)
elif args.method=='fasttree':
T = build_fasttree(fasta, tree_fname, nthreads=args.nthreads, tree_builder_args=args.tree_builder_args)
else:
print("ERROR: unknown tree builder provided to --method: %s" % args.method, file = sys.stderr)
return 1
end = time.time()
print("\nBuilding original tree took {} seconds".format(str(end-start)))
if T:
import json
tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
else:
return 1