
evaluate confidence intervals of internal node positions, pipe tree and alignment output into temp files, and add a logger to tree; this should be done for frequencies as well
1 parent 5470539 · commit 75c959faa1abe8608bcb98a690b1a9b8679210f3 · @rneher committed Jan 1, 2017
Showing with 41 additions and 26 deletions.
  1. +4 −3 base/process.py
  2. +1 −1 base/sequences.py
  3. +29 −17 base/tree.py
  4. +3 −3 flu/seasonal_flu.py
  5. +4 −2 zika/zika.py
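
The headline change sits in base/tree.py: when confidence is requested, TreeTime is run with do_marginal=True and every node gets a 90% interval for its date alongside the point estimate. A minimal sketch of that logic, lifted out of tree.timetree() for readability — the function name attach_date_confidence is hypothetical, while the TreeTime attributes it relies on (date2dist.to_numdate, numdate, marginal_inverse_cdf) are exactly the ones used in the diff below:

    def attach_date_confidence(tt, lower=0.05, upper=0.95):
        # convert TreeTime's internal branch-length scale back to calendar dates
        to_numdate = tt.date2dist.to_numdate
        for node in tt.tree.find_clades():
            if not hasattr(node, 'attr'):
                node.attr = {}
            # point estimate of the node date
            node.attr['num_date'] = node.numdate
            # 5th-95th percentiles of the marginal date distribution,
            # sorted so the interval reads [lower, upper] in calendar time
            node.attr['num_date_confidence'] = sorted(
                to_numdate(node.marginal_inverse_cdf([lower, upper])))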
base/process.py
@@ -19,7 +19,7 @@ class process(object):
def __init__(self, input_data_path = 'data/test',
store_data_path = 'store/test',
- build_data_path = 'build/test', **kwargs):
+ build_data_path = 'build/test', verbose=2, **kwargs):
super(process, self).__init__()
print("Initializing process")
for p in [input_data_path, store_data_path, build_data_path]:
@@ -28,6 +28,7 @@ def __init__(self, input_data_path = 'data/test',
self.input_data_path = input_data_path
self.store_data_path = store_data_path
self.build_data_path = build_data_path
+ self.verbose=verbose
self.kwargs = kwargs
self.data_filenames()
@@ -235,7 +236,7 @@ def estimate_tree_frequencies(self, region='global', pivots=24):
tps = np.array([x.attributes['num_date'] for x in self.seqs.seqs.values()])
self.pivots=make_pivots(pivots, tps)
else:
- print('estimate_tree_frequencies: using self.pivots')
+ print('estimate_tree_frequencies: using self.pivots',3)
if not hasattr(self, 'tree_frequencies'):
self.tree_frequencies = {}
self.tree_frequency_confidence = {}
@@ -259,7 +260,7 @@ def build_tree(self, infile=None, nodefile=None, root='best', debug=False):
if infiles are None, the tree is build from scratch. Otherwise
the tree is loaded from file
'''
- self.tree = tree(aln=self.seqs.aln, proteins = self.proteins)
+ self.tree = tree(aln=self.seqs.aln, proteins = self.proteins, verbose=self.verbose)
if infile is None:
self.tree.build(root=root, debug=debug)
else:
base/sequences.py
@@ -259,7 +259,7 @@ def align(self, debug=False):
out_seqs = self.seqs.values() + [self.reference_seq]
print("align: reference in set",ref_in_set)
SeqIO.write(out_seqs, "temp_in.fasta", "fasta")
- os.system("mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta > temp_out.fasta")
+ os.system("mafft --anysymbol --thread " + str(self.nthreads) + " temp_in.fasta 1> temp_out.fasta 2>mafft_stderr")
tmp_aln = AlignIO.read('temp_out.fasta', 'fasta')
self.sequence_lookup = {seq.id:seq for seq in tmp_aln}
base/tree.py
@@ -27,13 +27,14 @@ def resolve_polytomies(tree):
class tree(object):
"""tree builds a phylgenetic tree from an alignment and exports it for web visualization"""
- def __init__(self, aln, proteins=None, **kwarks):
+ def __init__(self, aln, proteins=None, verbose=2, logger=None, **kwarks):
super(tree, self).__init__()
self.aln = aln
self.nthreads = 2
self.sequence_lookup = {seq.id:seq for seq in aln}
self.nuc = kwarks['nuc'] if 'nuc' in kwarks else True
self.dump_attr = []
+ self.verbose = verbose
if proteins!=None:
self.proteins = proteins
else:
@@ -43,6 +44,12 @@ def __init__(self, aln, proteins=None, **kwarks):
self.run_dir = '_'.join(['temp', time.strftime('%Y%m%d-%H%M%S',time.gmtime()), str(random.randint(0,1000000))])
else:
self.run_dir = kwarks['run_dir']
+ if logger is None:
+ def f(x,y):
+ if y>self.verbose: print(x)
+ self.logger = f
+ else:
+ self.logger=logger
def dump(self, treefile, nodefile):
@@ -67,9 +74,7 @@ def build(self, root='midpoint', raxml=True, raxml_time_limit=0.5, raxml_bin='ra
tree_cmd = ["fasttree"]
if self.nuc: tree_cmd.append("-nt")
- tree_cmd.append("temp.fasta")
- tree_cmd.append(">")
- tree_cmd.append("initial_tree.newick")
+ tree_cmd.extend(["temp.fasta","1>","initial_tree.newick", "2>", "fasttree_stderr"])
os.system(" ".join(tree_cmd))
out_fname = "tree_infer.newick"
@@ -83,10 +88,12 @@ def build(self, root='midpoint', raxml=True, raxml_time_limit=0.5, raxml_bin='ra
resolve_polytomies(tmp_tree)
Phylo.write(tmp_tree,'initial_tree.newick', 'newick')
AlignIO.write(self.aln,"temp.phyx", "phylip-relaxed")
- print( "RAxML tree optimization with time limit", raxml_time_limit, "hours")
+ self.logger( "RAxML tree optimization with time limit %d hours"%raxml_time_limit,2)
# using exec to be able to kill process
end_time = time.time() + int(raxml_time_limit*3600)
- process = subprocess.Popen("exec " + raxml_bin + " -f d -T " + str(self.nthreads) + " -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
+ process = subprocess.Popen("exec " + raxml_bin + " -f d -T " + str(self.nthreads) +
+ " -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987"
+ " -t initial_tree.newick > raxml1_stdout", shell=True)
while (time.time() < end_time):
if os.path.isfile('RAxML_result.topology'):
break
@@ -105,12 +112,12 @@ def build(self, root='midpoint', raxml=True, raxml_time_limit=0.5, raxml_bin='ra
shutil.copy("initial_tree.newick", 'raxml_tree.newick')
try:
- print("RAxML branch length optimization")
+ self.logger("RAxML branch length optimization",2)
os.system(raxml_bin + " -f e -T " + str(self.nthreads)
- + " -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick")
+ + " -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick > raxml2_stdout")
shutil.copy('RAxML_result.branches', out_fname)
except:
- print("RAxML branch length optimization failed")
+ self.logger("RAxML branch length optimization failed")
shutil.copy('raxml_tree.newick', out_fname)
else:
shutil.copy('initial_tree.newick', out_fname)
@@ -124,10 +131,10 @@ def tt_from_file(self, infile, root='best', nodefile=None):
from treetime import TreeTime
from treetime import utils
self.is_timetree=False
- print('Reading tree from file',infile)
+ self.logger('Reading tree from file '+infile,2)
dates = {seq.id:seq.attributes['num_date']
for seq in self.aln if 'date' in seq.attributes}
- self.tt = TreeTime(dates=dates, tree=infile, gtr='Jukes-Cantor', aln = self.aln, verbose=4)
+ self.tt = TreeTime(dates=dates, tree=infile, gtr='Jukes-Cantor', aln = self.aln, verbose=self.verbose)
if root:
self.tt.reroot(root=root)
self.tree = self.tt.tree
@@ -144,7 +151,7 @@ def tt_from_file(self, infile, root='best', nodefile=None):
node.attr = {}
if nodefile is not None:
- print('reading node properties from file:',nodefile)
+ self.logger('reading node properties from file: '+nodefile,2)
with myopen(nodefile, 'r') as infile:
from cPickle import load
node_props = load(infile)
@@ -153,7 +160,7 @@ def tt_from_file(self, infile, root='best', nodefile=None):
for attr in node_props[n.name]:
n.__setattr__(attr, node_props[n.name][attr])
else:
- print("No node properties found for ", n.name)
+ self.logger("No node properties found for "+n.name,2)
def ancestral(self, **kwarks):
@@ -164,16 +171,21 @@ def ancestral(self, **kwarks):
node.attr = {}
- def timetree(self, Tc=0.01, infer_gtr=True, reroot='best', resolve_polytomies=True, max_iter=2, **kwarks):
- self.tt.run(infer_gtr=infer_gtr, root=reroot, Tc=Tc,
+ def timetree(self, Tc=0.01, infer_gtr=True, reroot='best', resolve_polytomies=True, max_iter=2, confidence=False, **kwarks):
+ self.logger('estimating time tree...',2)
+ self.tt.run(infer_gtr=infer_gtr, root=reroot, Tc=Tc, do_marginal=confidence,
resolve_polytomies=resolve_polytomies, max_iter=max_iter)
- print('estimating time tree...')
+ self.logger('estimating time tree...done',3)
self.dump_attr.extend(['numdate','date','sequence'])
+ to_numdate = self.tt.date2dist.to_numdate
for node in self.tree.find_clades():
if hasattr(node,'attr'):
node.attr['num_date'] = node.numdate
else:
node.attr = {'num_date':node.numdate}
+ if confidence:
+ node.attr["num_date_confidence"] = sorted(to_numdate(node.marginal_inverse_cdf([0.05,0.95])))
+
self.is_timetree=True
@@ -204,7 +216,7 @@ def geo_inference(self, attr):
places = sorted(places)
nc = len(places)
if nc<2 or nc>180:
- print("geo_inference: can't have less than 2 or more than 180 places!")
+ self.logger("geo_inference: can't have less than 2 or more than 180 places!",1)
return
alphabet = {chr(65+i):place for i,place in enumerate(places)}
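
A note on the logger added above: the default logger defined in tree.__init__ takes a message and an integer level and, as written in the diff, prints the message only when the level exceeds self.verbose. The calls introduced in this commit use level 1 for the geo_inference warning, 2 for progress messages, and 3 for fine-grained status. A standalone sketch of that behaviour (the module-level verbose and logger names are stand-ins for the attributes the class keeps on self):

    verbose = 2

    def logger(msg, level):
        # mirror of the default logger in tree.__init__:
        # print only when the message level exceeds the verbosity threshold
        if level > verbose:
            print(msg)

    logger("RAxML branch length optimization", 2)   # not printed at verbose=2
    logger("estimating time tree...done", 3)        # printed at verbose=2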
flu/seasonal_flu.py
@@ -50,7 +50,7 @@
"A/India/6352/2012", "A/HuNan/01/2014", "A/Helsinki/942/2013", "A/Guam/AF2771/2011", "A/Chile/8266/2003",
"A/Busan/15453/2009", "A/Nepal/142/2011", "A/Kenya/155/2011", "A/Guam/AF2771/2011", "A/Michigan/82/2016",
"A/Ohio/27/2016", "A/Ohio/28/2016", "A/Michigan/83/2016", "A/Michigan/84/2016", "A/Jiangsu-Tianning/1707/2013",
- "A/HuNan/1/2014", "A/Iran/227/2014", "A/Iran/234/2014", "A/Iran/140/2014"],
+ "A/HuNan/1/2014", "A/Iran/227/2014", "A/Iran/234/2014", "A/Iran/140/2014", "A/Jiangsu-Chongchuan/1830/2014"],
'h1n1pdm': [],
'vic':[],
"yam":[]
@@ -308,6 +308,7 @@ def plot_frequencies(flu, gene, mutation=None, plot_regions=None, all_muts=False
parser.add_argument('-t', '--time_interval', nargs=2, help='specify time interval rather than use --years_back')
parser.add_argument('-l', '--lineage', type = str, default = 'h3n2', help='flu lineage to process')
parser.add_argument('--new_auspice', default = False, action="store_true", help='file name for new augur')
+ parser.add_argument('--confidence', default = False, action="store_true", help='evaluate confidence intervals of internal node timing')
parser.add_argument('--load', action='store_true', help = 'recover from file')
parser.add_argument('--no_tree', default=False, action='store_true', help = "don't build a tree")
params = parser.parse_args()
@@ -385,8 +386,7 @@ def plot_frequencies(flu, gene, mutation=None, plot_regions=None, all_muts=False
flu.align()
flu.build_tree()
flu.clock_filter(n_iqd=3, plot=True)
- flu.tree.tt.debug=True
- flu.annotate_tree(Tc=0.005, timetree=True, reroot='best')
+ flu.annotate_tree(Tc=0.005, timetree=True, reroot='best', confidence=params.confidence)
flu.tree.geo_inference('region')
flu.estimate_tree_frequencies()
zika/zika.py
@@ -69,6 +69,8 @@
parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization')
parser.add_argument('-v', '--viruses_per_month', type = int, default = 100, help='number of viruses sampled per month')
parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run')
+ parser.add_argument('--verbose', type = int, default = 2, help='throttle treetime output')
+ parser.add_argument('--confidence', default = False, action="store_true", help='evaluate confidence intervals of internal node timing')
parser.add_argument('--load', action='store_true', help = 'recover from file')
params = parser.parse_args()
@@ -84,7 +86,7 @@
lat_long_fname='../fauna/source-data/geo_lat_long.tsv',
proteins=['CA', 'PRO', 'MP', 'ENV', 'NS1', 'NS2A',
'NS2B', 'NS3', 'NS4A', 'NS4B', 'NS5'],
- method='SLSQP')
+ method='SLSQP', verbose=params.verbose)
if params.load:
zika.load()
else:
@@ -107,7 +109,7 @@
zika.build_tree()
zika.dump()
zika.clock_filter(n_iqd=3, plot=True)
- zika.annotate_tree(Tc=0.02, timetree=True, reroot='best')
+ zika.annotate_tree(Tc=0.02, timetree=True, reroot='best', confidence=params.confidence)
for geo_attr in geo_attributes:
zika.tree.geo_inference(geo_attr)
zika.export(controls = attribute_nesting, geo_attributes = geo_attributes,
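
With the new command-line options wired in, a Zika build that requests the node-date confidence intervals and sets the verbosity explicitly could be invoked roughly as below (the working directory and the unchanged defaults are assumed):

    python zika.py --confidence --verbose 3

The flu build gains the same --confidence switch via flu/seasonal_flu.py above.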
