diff --git a/base/fetch_outgroups.py b/base/fetch_outgroups.py new file mode 100644 index 000000000..4600151ec --- /dev/null +++ b/base/fetch_outgroups.py @@ -0,0 +1,42 @@ +from Bio import Entrez, SeqIO +from StringIO import StringIO + +Entrez.email = "richard.neher@tuebingen.mpg.de" # Always tell NCBI who you are + +outgroups = {#'H3N2':'U26830', + #'H3N2_77':'CY113261.1', + #'H1N1pdm':'AF455680', + #'Vic':'CY018813', + #'Yam':'CY019707', + #'zika':'KX369547.1' + #'vic_pb1':'CY018819', + #'vic_pb2':'CY018820', + #'vic_pa': 'CY018818', + #'vic_ha': 'CY018813', + #'vic_np': 'CY018816', + #'vic_na': 'CY018815', + #'vic_ma': 'CY018814', + #'vic_ns': 'CY018817', + 'h1n1pdm_pb1':'AF455728', + 'h1n1pdm_pb2':'AF455736', + 'h1n1pdm_pa': 'AF455720', + 'h1n1pdm_ha': 'AF455680', + 'h1n1pdm_np': 'AF455704', + 'h1n1pdm_na': 'AF455696', + 'h1n1pdm_ma': 'AF455688', + 'h1n1pdm_ns': 'AF455712', + 'h3n2_pb1':'CY113683', + 'h3n2_pb2':'CY113684', + 'h3n2_pa': 'CY113682', + 'h3n2_ha': 'CY113677', + 'h3n2_np': 'CY113680', + 'h3n2_na': 'CY113679', + 'h3n2_ma': 'CY113678', + 'h3n2_ns': 'CY113681', + } + +for virus, genbank_id in outgroups.iteritems(): + handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb") + seq= SeqIO.read(StringIO(handle.read()), format = 'genbank') + SeqIO.write(seq, 'flu/metadata/'+virus+'_outgroup.gb', format='genbank') + diff --git a/base/tree.py b/base/tree.py index d4433cddf..776a02221 100644 --- a/base/tree.py +++ b/base/tree.py @@ -182,7 +182,7 @@ def geo_inference(self, attr): infer a "mugration" model by pretending each region corresponds to a sequence state and repurposing the GTR inference and ancestral reconstruction ''' - from treetime.gtr import GTR + from treetime import GTR # Determine alphabet and store reconstructed ancestral sequences places = set() nuc_seqs = {} @@ -228,7 +228,7 @@ def geo_inference(self, attr): tmp_use_mutation_length = self.tt.use_mutation_length self.tt.use_mutation_length=False self.tt.infer_ancestral_sequences(method='ml', infer_gtr=True, - store_compressed=False, pc=5.0, marginal=True) + store_compressed=False, pc=5.0, marginal=True, normalized_rate=False) # restore the nucleotide sequence and mutations to maintain expected behavior self.tt.geogtr = self.tt.gtr diff --git a/ebola/ebola.py b/ebola/ebola.py index ec8491149..a54d8e793 100644 --- a/ebola/ebola.py +++ b/ebola/ebola.py @@ -2,6 +2,7 @@ from collections import defaultdict import sys sys.path.append('') # need to import from base +sys.path.append('/home/richard/Projects') # need to import from base from base.io_util import make_dir, remove_dir, tree_to_json, write_json, myopen from base.sequences import sequence_set, num_date from base.tree import tree @@ -140,8 +141,21 @@ ebola.load() ebola.clock_filter(n_iqd=10, plot=True) - ebola.annotate_tree(Tc=0.0005, timetree=True, reroot='best', resolve_polytomies=True) + ebola.annotate_tree(Tc=0.002, timetree=True, reroot='best', resolve_polytomies=True) for geo_attr in geo_attributes: ebola.tree.geo_inference(geo_attr) ebola.export(controls = attribute_nesting, geo_attributes = geo_attributes, color_options=color_options, panels=panels) + + + # plot an approximate skyline + from matplotlib import pyplot as plt + T = ebola.tree.tt + plt.figure() + skyline = T.merger_model.skyline(n_points = 20, gen = 50/T.date2dist.slope, + to_numdate = T.date2dist.to_numdate) + plt.ticklabel_format(useOffset=False) + plt.plot(skyline.x, skyline.y) + plt.ylabel('effective population size') + plt.xlabel('year') + plt.savefig('ebola_skyline.png') \ No newline at end of file diff --git a/flu/seasonal_flu.py b/flu/seasonal_flu.py index 8e85c8c91..70bd51092 100644 --- a/flu/seasonal_flu.py +++ b/flu/seasonal_flu.py @@ -10,23 +10,52 @@ regions = ['africa', 'south_asia', 'europe', 'china', 'north_america', 'china', 'south_america', 'japan_korea', 'oceania', 'southeast_asia'] +region_cmap = [ + ["africa", "#5097BA"], + ["south_america", "#60AA9E"], + ["west_asia", "#75B681"], + ["oceania", "#8EBC66"], + ["europe", "#AABD52"], + ["japan_korea", "#C4B945"], + ["north_america", "#D9AD3D"], + ["southeast_asia", "#E59637"], + ["south_asia", "#E67030"], + ["china", "#DF4327"] +]; + region_groups = {'NA':'north_america', 'AS':['china', 'japan_korea', 'south_asia', 'southeast_asia'], 'OC':'oceania', 'EU':'europe'} attribute_nesting = {'geographic location':['region', 'country', 'city'],} + + color_options = { "country":{"key":"country", "legendTitle":"Country", "menuItem":"country", "type":"discrete"}, - "region":{"key":"region", "legendTitle":"Region", "menuItem":"region", "type":"discrete"}, + "region":{"key":"region", "legendTitle":"Region", "menuItem":"region", "type":"discrete", "color_map":region_cmap}, "num_date":{"key":"num_date", "legendTitle":"Sampling date", "menuItem":"date", "type":"continuous"}, "ep":{"key":"ep", "legendTitle":"Epitope Mutations", "menuItem":"epitope mutations", "type":"continuous"}, "ne":{"key":"ne", "legendTitle":"Non-epitope Mutations", "menuItem":"nonepitope mutations", "type":"continuous"}, "rb":{"key":"rb", "legendTitle":"Receptor Binding Mutations", "menuItem":"RBS mutations", "type":"continuous"}, - "gt":{"key":"genotype", "legendTitle":"Genotype", "menuItem":"genotype", "type":"discrete"} + "gt":{"key":"genotype", "legendTitle":"Genotype", "menuItem":"genotype", "type":"discrete"}, + "cHI":{"key":"cHI", "legendTitle":"Antigenic advance", "menuItem":"Antigenic", "type":"continuous"} } panels = ['tree', 'entropy', 'frequencies'] +outliers = { +'h3n2':["A/Sari/388/2006", "A/SaoPaulo/36178/2015", "A/Pennsylvania/40/2010", "A/Pennsylvania/14/2010", + "A/Pennsylvania/09/2011", "A/OSAKA/31/2005", "A/Ohio/34/2012", "A/Kenya/170/2011", "A/Kenya/168/2011", + "A/Indiana/21/2013", "A/Indiana/13/2012", "A/Indiana/11/2013", "A/Indiana/08/2012", "A/Indiana/06/2013", + "A/India/6352/2012", "A/HuNan/01/2014", "A/Helsinki/942/2013", "A/Guam/AF2771/2011", "A/Chile/8266/2003", + "A/Busan/15453/2009", "A/Nepal/142/2011", "A/Kenya/155/2011", "A/Guam/AF2771/2011", "A/Michigan/82/2016", + "A/Ohio/27/2016", "A/Ohio/28/2016", "A/Michigan/83/2016", "A/Michigan/84/2016", "A/Jiangsu-Tianning/1707/2013", + "A/HuNan/1/2014", "A/Iran/227/2014", "A/Iran/234/2014", "A/Iran/140/2014"], +'h1n1pdm': [], +'vic':[], +"yam":[] +} + clade_designations = {"h3n2":{ "3c3.a":[('HA1',128,'A'), ('HA1',142,'G'), ('HA1',159,'S')], @@ -270,30 +299,36 @@ def plot_frequencies(flu, gene, mutation=None, plot_regions=None, all_muts=False plt.ion() parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') - parser.add_argument('-v', '--viruses_per_month', type = int, default = 10, help='number of viruses sampled per month') parser.add_argument('-y', '--years_back', type = str, default = 3, help='number of years back to sample') parser.add_argument('--resolution', type = str, help ="outfile suffix, can determine -v and -y") + parser.add_argument('-v', '--viruses_per_month_seq', type = int, default = 10, help='number of viruses sampled per month') + parser.add_argument('-w', '--viruses_per_month_tree', type = int, default = 10, help='number of viruses sampled per month') parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') parser.add_argument('-d', '--download', action='store_true', default = False, help='load from database') parser.add_argument('-t', '--time_interval', nargs=2, help='specify time interval rather than use --years_back') parser.add_argument('-l', '--lineage', type = str, default = 'h3n2', help='flu lineage to process') + parser.add_argument('--new_auspice', default = False, action="store_true", help='file name for new augur') parser.add_argument('--load', action='store_true', help = 'recover from file') parser.add_argument('--no_tree', default=False, action='store_true', help = "don't build a tree") params = parser.parse_args() # default values for --viruses_per_month and --years_back from resolution if params.resolution == "2y": - params.viruses_per_month = 15 - params.years_back = 2 + params.viruses_per_month_tree = 15 + params.viruses_per_month_seq = 20 + params.years_back = 2 elif params.resolution == "3y": - params.viruses_per_month = 7 - params.years_back = 3 + params.viruses_per_month_tree = 7 + params.viruses_per_month_seq = 20 + params.years_back = 3 elif params.resolution == "6y": - params.viruses_per_month = 3 - params.years_back = 6 + params.viruses_per_month_tree = 3 + params.viruses_per_month_seq = 10 + params.years_back = 6 elif params.resolution == "12y": - params.viruses_per_month = 2 - params.years_back = 12 + params.viruses_per_month_tree = 2 + params.viruses_per_month_seq = 10 + params.years_back = 12 # construct time_interval from years_back if not params.time_interval: @@ -301,13 +336,18 @@ def plot_frequencies(flu, gene, mutation=None, plot_regions=None, all_muts=False date_str = "{:%Y-%m-%d}".format(datetime.today() - timedelta(days=365.25 * params.years_back)) params.time_interval = [date_str, today_str] + if params.new_auspice: + fname_prefix = "flu_"+params.lineage + else: + fname_prefix = params.lineage + input_data_path = '../fauna/data/'+params.lineage if params.resolution: - store_data_path = 'store/'+params.lineage + '_' + params.resolution +'_' - build_data_path = 'build/'+params.lineage + '_' + params.resolution +'_' + store_data_path = 'store/'+ fname_prefix + '_' + params.resolution +'_' + build_data_path = 'build/'+ fname_prefix + '_' + params.resolution +'_' else: - store_data_path = 'store/'+params.lineage + '_' - build_data_path = 'build/'+params.lineage + '_' + store_data_path = 'store/'+ fname_prefix + '_' + build_data_path = 'build/'+ fname_prefix + '_' ppy = 12 flu = flu_process(input_data_path = input_data_path, store_data_path = store_data_path, @@ -330,19 +370,22 @@ def plot_frequencies(flu, gene, mutation=None, plot_regions=None, all_muts=False flu.seqs.filter(lambda s: s.attributes['date']>=time_interval[0] and s.attributes['date']=900) + flu.seqs.filter(lambda s: s.name not in outliers[params.lineage]) - flu.subsample(params.viruses_per_month) + flu.subsample(params.viruses_per_month_seq) flu.align() flu.dump() # first estimate frequencies globally, then region specific flu.estimate_mutation_frequencies(region="global", pivots=pivots) - for region in region_groups.iteritems(): - flu.estimate_mutation_frequencies(region=region) + # for region in region_groups.iteritems(): + # flu.estimate_mutation_frequencies(region=region) if not params.no_tree: + flu.subsample(params.viruses_per_month_tree, repeated=True) flu.align() flu.build_tree() flu.clock_filter(n_iqd=3, plot=True) + flu.tree.tt.debug=True flu.annotate_tree(Tc=0.005, timetree=True, reroot='best') flu.tree.geo_inference('region') @@ -356,3 +399,15 @@ def plot_frequencies(flu, gene, mutation=None, plot_regions=None, all_muts=False flu.export(extra_attr=['serum'], controls=attribute_nesting, color_options=color_options, panels=panels) flu.HI_export() + + # plot an approximate skyline + from matplotlib import pyplot as plt + T = flu.tree.tt + plt.figure() + skyline = T.merger_model.skyline(n_points=20, gen = 50/T.date2dist.slope, + to_numdate = T.date2dist.to_numdate) + plt.ticklabel_format(useOffset=False) + plt.plot(skyline.x, skyline.y, lw=2) + plt.ylabel('effective population size') + plt.xlabel('year') + plt.savefig('%s_%s_skyline.png'%(params.lineage, params.resolution)) diff --git a/flu/seasonal_flu_all_segments.py b/flu/seasonal_flu_all_segments.py new file mode 100644 index 000000000..3301ef36a --- /dev/null +++ b/flu/seasonal_flu_all_segments.py @@ -0,0 +1,125 @@ +from __future__ import division, print_function +from collections import defaultdict +import sys +sys.path.append('') # need to import from base +from base.process import process +import numpy as np +from datetime import datetime +from base.io_util import myopen + +regions = ['africa', 'south_asia', 'europe', 'china', 'north_america', + 'china', 'south_america', 'japan_korea', 'oceania', 'southeast_asia'] + +region_groups = {'NA':'north_america', + 'AS':['china', 'japan_korea', 'south_asia', 'southeast_asia'], + 'OC':'oceania', 'EU':'europe'} + +attribute_nesting = {'geographic location':['region', 'country', 'city'],} + + +def sampling_category(x): + return (x.attributes['region'], + x.attributes['date'].year, + x.attributes['date'].month) + + +def sampling_priority(seq): + return len(seq.seq)*0.0001 - 0.01*np.sum([seq.seq.count(nuc) for nuc in 'NRWYMKSHBVD']) + + +if __name__=="__main__": + import argparse + import matplotlib.pyplot as plt + plt.ion() + + parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') + parser.add_argument('-v', '--viruses_per_month', type = int, default = 10, help='number of viruses sampled per month') + parser.add_argument('-y', '--resolution', type = str, default = '3y', help='outfile suffix') + parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') + parser.add_argument('-d', '--download', action='store_true', default = False, help='load from database') + parser.add_argument('-t', '--time_interval', nargs=2, default=('2012-01-01', '2016-01-01'), + help='time interval to sample sequences from: provide dates as YYYY-MM-DD') + parser.add_argument('-l', '--lineage', type = str, default = 'h3n2', help='flu lineage to process') + parser.add_argument('--load', action='store_true', help = 'recover from file') + parser.add_argument('--no_tree', default=False, action='store_true', help = "don't build a tree") + params = parser.parse_args() + + ppy = 12 + time_interval = [datetime.strptime(x, '%Y-%m-%d').date() for x in params.time_interval] + pivots = np.arange(time_interval[0].year+(time_interval[0].month-1)/12.0, + time_interval[1].year+time_interval[1].month/12.0, 1.0/ppy) + + # load data from all segments + segment_names = ['pb1', 'pb2', 'pa', 'ha', 'np', 'na', 'm', 'ns'] + segments = {} + viruses = defaultdict(list) + for seg in segment_names: + input_data_path = '../fauna/data/'+params.lineage+'_'+seg + if seg=='m': + input_data_path+='p' + store_data_path = 'store/'+params.lineage + '_' + params.resolution + '_' + seg + '_' + build_data_path = 'build/'+params.lineage + '_' + params.resolution + '_' + seg + '_' + flu = process(input_data_path = input_data_path, store_data_path = store_data_path, + build_data_path = build_data_path, reference='flu/metadata/'+params.lineage + '_' + seg +'_outgroup.gb', + proteins=['SigPep', 'HA1', 'HA2'], + method='SLSQP', inertia=np.exp(-1.0/ppy), stiffness=2.*ppy) + + flu.load_sequences(fields={0:'strain', 2:'isolate_id', 3:'date', 4:'region', + 5:'country', 7:"city", 12:"subtype",13:'lineage'}) + + print("## loading data for segment %s, found %d number of sequences"%(seg, len(flu.seqs.all_seqs))) + for sequence in flu.seqs.all_seqs: + viruses[sequence].append(seg) + + segments[seg] = flu + + # determine strains that are complete + complete_strains = filter(lambda x:len(viruses[x])==len(segment_names), viruses.keys()) + # determine filter every segment down to the sequences for which all other segments exist + segments['ha'].seqs.filter(lambda s: s.name in complete_strains) + segments['ha'].seqs.filter(lambda s:s.attributes['date']>=time_interval[0] and s.attributes['date']=time_interval[0] and s.attributes['date']