/
export.py
449 lines (372 loc) · 17.4 KB
/
export.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
import os
import re
import time
import numpy as np
from Bio import Phylo
from collections import defaultdict
from .utils import read_metadata, read_node_data, write_json, read_config, read_lat_longs, read_colors
def convert_tree_to_json_structure(node, metadata, div=0, nextflu_schema=False, strains=None):
    """
    Convert a Biopython tree structure to a dictionary that can be written
    to file as a JSON. Called recursively; creates the strain property &
    cumulative divergence on each node.

    input
        node -- node for which the top-level dict is produced.
        metadata -- per-strain info; metadata[name]['mutation_length'] must
                    exist for every child so divergence can accumulate.
        div -- cumulative divergence (root = 0)
        nextflu_schema -- use nexflu schema (e.g. node["attr"]). This is deprecated.
        strains -- accumulator list of strain names; leave as None at the top call.
    returns
        (tree in JSON structure, list of strains)
    """
    if nextflu_schema:
        node_struct = {
            'attr': {"div": div},
            'strain': node.name,
            'clade': node.clade
        }
        # copy layout/treetime attributes only when present on the node;
        # a sentinel keeps attributes whose stored value is None
        absent = object()
        for attr_name in ('branch_length', 'tvalue', 'yvalue', 'xvalue'):
            attr_value = getattr(node, attr_name, absent)
            if attr_value is not absent:
                node_struct[attr_name] = attr_value
    else:
        node_struct = {
            'strain': node.name,
            'div': div
        }

    # accumulate strain names in pre-order (parent before children)
    if strains is None:
        strains = [node_struct["strain"]]
    else:
        strains.append(node_struct["strain"])

    if node.clades:
        children = node_struct["children"] = []
        for child in node.clades:
            child_div = div + metadata[child.name]['mutation_length']
            subtree, _ = convert_tree_to_json_structure(
                child, metadata, div=child_div,
                nextflu_schema=nextflu_schema, strains=strains)
            children.append(subtree)

    return (node_struct, strains)
def recursively_decorate_tree_json_nextflu_schema(node, node_metadata, decorations):
    """
    This function is deprecated and is used to produce the nextflu-compatable JSON format

    For given decorations, add information from node_metadata to
    each node in the tree.
    * decorations must have property "key" which is the key used to insert
      into the node and the default key used to access node_metadata
    * if decorations has property "lookup_key", this is used to access
      node meta_data instead
    * if decorations has property "is_attr" (and it's value is True)
      then the result is inserted into node["attr"]

    returns Null
    """
    try:
        metadata = node_metadata[node["strain"]]
        metadata["strain"] = node["strain"]
    except KeyError:
        # BUG FIX: previously formatted the undefined name `n` here, which
        # raised a NameError instead of this informative message
        raise Exception("ERROR: node %s is not found in the node metadata."%node.get("strain"))
    for data in decorations:
        val = None
        insert_key = data["key"]
        try:
            # "lookup_key" overrides which metadata field is read
            if "lookup_key" in data:
                val = metadata[data["lookup_key"]]
            else:
                val = metadata[insert_key]
        except KeyError:
            pass  # this node simply lacks the field; skip the decoration
        if val is not None:
            if "is_attr" in data and data["is_attr"]:
                node["attr"][insert_key] = val
            else:
                node[insert_key] = val
    if "children" in node:
        for child in node["children"]:
            recursively_decorate_tree_json_nextflu_schema(child, node_metadata, decorations)
def tree_layout(T):
    """
    Calculate tree layout: assign each clade a sequential `clade` index
    (postorder) and a `yvalue` (terminals numbered top-down, internal
    nodes centered between their children's extremes).

    This function is deprecated, and only used for the nextflu JSON format.
    """
    next_terminal_y = T.count_terminals()
    for index, clade_node in enumerate(T.find_clades(order='postorder')):
        clade_node.clade = index
        if clade_node.is_terminal():
            clade_node.yvalue = next_terminal_y
            next_terminal_y -= 1
        else:
            # postorder guarantees children already carry a yvalue
            child_ys = [child.yvalue for child in clade_node]
            clade_node.yvalue = 0.5 * (np.min(child_ys) + np.max(child_ys))
def process_colorings(jsn, color_mapping, nodes=None, node_metadata=None, nextflu=False):
    """
    Build the colorings/color_options block of the output JSON.

    For the deprecated nextflu schema, `jsn` is the auspice-config dict and
    its "color_options" are filled in; for schema 2.0, `jsn` is a list of
    trait names. Color scales from `color_mapping` (lower-cased TSV data)
    are re-cased to match the values actually present on the tree.
    Returns None (after a warning) when nothing is defined.
    """
    if nextflu:
        if "color_options" not in jsn:
            print("WARNING: no color options were defined")
            return
        colorings = jsn["color_options"]
    else:
        if jsn is None or len(jsn)==0:
            print("WARNING: no colorings were defined")
            return
        colorings = {trait_name: {'type': 'categorical'} for trait_name in jsn}
        # genotype & sampling date colorings are always available
        colorings['gt'] = {'title': 'Genotype', 'type': 'ordinal'}
        # figure out how to see if num_date is there? Is it possible to not have?
        colorings['num_date'] = {'title': 'Sampling date', 'type': 'continuous'}

    for trait, options in colorings.items():
        if nextflu:
            options.setdefault("legendTitle", trait)
            options.setdefault("menuItem", trait)
            options.setdefault("key", trait)
        else:
            options.setdefault("title", trait)
            if "type" not in options:
                raise Exception("coloring {} missing type...".format(trait))
        if trait.lower() in color_mapping:
            # the color maps (from the TSV) are lower case, but exported values
            # must keep the casing seen on the tree
            if nodes:
                tree_values = {n[trait] for n in nodes.values() if trait in n}
            else:
                tree_values = {info["traits"][trait]["value"] for info in node_metadata.values()}
            original_case = {v.lower(): v for v in tree_values}
            recased = [(original_case[low], color)
                       for low, color in color_mapping[trait.lower()]
                       if low in original_case]
            if nextflu:
                options["color_map"] = recased
            else:
                options["scale"] = dict(recased)
    return colorings
def process_geographic_info(jsn, lat_long_mapping, nextflu=False, node_metadata=None, nodes=None):
    """
    Map each geographic trait to {deme: lat/long} for the demes actually
    present on the tree. Lookups into `lat_long_mapping` are performed in
    lower case; demes without an entry are reported and skipped.
    Returns {} when no geographic traits are configured.
    """
    if nextflu:
        if "geo" not in jsn:
            return {}
        traits = jsn["geo"]
    else:
        if jsn is None or len(jsn) == 0:
            return {}
        traits = jsn

    geo = defaultdict(dict)
    for trait in traits:
        if nextflu:
            demes_in_tree = {n[trait] for n in nodes.values() if trait in n}
        else:
            demes_in_tree = {info["traits"][trait]["value"] for info in node_metadata.values()}
        for deme in demes_in_tree:
            lookup = (trait.lower(), deme.lower())
            if lookup in lat_long_mapping:
                geo[trait][deme] = lat_long_mapping[lookup]
            else:
                print("Error. {}->{} did not have an associated lat/long value (matching performed in lower case)".format(trait, deme))
    return geo
def process_annotations(node_data):
    """Return the genome annotations treetime attached to node_data,
    or {} when the tree hasn't been run through treetime."""
    return node_data.get("annotations", {})
def process_panels(user_panels, j, nextflu=False):
    """
    Decide which auspice panels to export.

    Starts from j["panels"] (default: tree/map/entropy), overridden by a
    non-empty `user_panels`; "entropy" is dropped when there are no genome
    annotations and "map" when there are no geographic traits.

    Returns a NEW list. BUG FIX: previously the user-supplied list (or
    j["panels"]) was mutated in place via panels.remove(); we now copy it
    so callers' arguments are left untouched.
    """
    try:
        panels = list(j["panels"])
    except KeyError:
        panels = ["tree", "map", "entropy"]
    if user_panels is not None and len(user_panels) != 0:
        panels = list(user_panels)
    if nextflu:
        geoTraits = j["geo"].keys()
        annotations = j["annotations"].keys()
    else:
        geoTraits = j["geographic_info"].keys()
        annotations = j["genome_annotations"].keys()
    if "entropy" in panels and len(annotations) == 0:
        panels.remove("entropy")
    if "map" in panels and len(geoTraits) == 0:
        panels.remove("map")
    return panels
def add_tsv_metadata_to_nodes(nodes, meta_tsv, meta_json, extra_fields=['authors', 'url', 'accession']):
    """
    Only used for nextflu schema compatability

    Add the relevant fields from meta_tsv to the nodes (both are
    dictionaries keyed off of strain names). The relevant fields are the
    color options (minus "gt") and geo traits from the meta json, plus
    `extra_fields`. Existing node fields are never overwritten.
    """
    relevant_fields = [k for k in meta_json["color_options"].keys() if k != "gt"]
    relevant_fields += meta_json["geo"]
    relevant_fields += extra_fields
    for strain, node in nodes.items():
        strain_meta = meta_tsv.get(strain)
        if strain_meta is None:
            continue
        for field in relevant_fields:
            if field not in node and field in strain_meta:
                node[field] = strain_meta[field]
def collect_strain_info(node_data, tsv_path):
    """
    Integrate TSV metadata into the per-node metadata structure.

    Merges every TSV column for matching strains into node_data["nodes"]
    (mutating it in place) and returns the merged dict.
    """
    strain_info = node_data["nodes"]
    meta_tsv, _ = read_metadata(tsv_path)
    for strain, node in strain_info.items():
        if strain in meta_tsv:
            node.update(meta_tsv[strain])
    return strain_info
def construct_author_info_nexflu(metadata, tree, nodes):
    """
    author info maps the "authors" property present on tree nodes
    to further information about the paper etc

    Only authors attached to terminal nodes of `tree` are included; counts
    ("n") and paper attributes come from the metadata TSV rows.
    """
    authorsInTree = set()
    for node in tree.find_clades(order='postorder'):
        # BUG FIX: was `node.is_terminal` (a bound method — always truthy);
        # the call is needed so only terminal nodes contribute authors
        if node.is_terminal() and node.name in nodes and "authors" in nodes[node.name]:
            authorsInTree.add(nodes[node.name]["authors"])

    author_info = defaultdict(lambda: {"n": 0})
    for strain, data in metadata.items():
        if "authors" not in data:
            # BUG FIX: previously formatted the undefined name `n` (NameError)
            print("Error - {} had no authors".format(strain))
            continue
        if data["authors"] not in authorsInTree:
            continue
        authors = data["authors"]
        author_info[authors]["n"] += 1
        # add in extra attributes if they're present in the meta TSV (for this strain...)
        for attr in ["title", "journal", "paper_url"]:
            if attr in data:
                if attr in author_info[authors] and data[attr].strip() != author_info[authors][attr].strip():
                    print("Error - {} had contradictory {}(s): {} vs {}".format(authors, attr, data[attr], author_info[authors][attr]))
                author_info[authors][attr] = data[attr].strip()
    return author_info
def construct_author_info_and_make_keys(node_metadata, raw_strain_info):
    """
    For each node, gather the relevant author information (returned)
    and create a unique key inside node_metadata.

    The key is surname + publication year (parsed out of the journal
    string) + first word of the title, all lower-cased. Strains without
    an "authors" entry in raw_strain_info are skipped.
    """
    author_info = {}
    for strain, node in node_metadata.items():
        try:
            authors = raw_strain_info[strain]["authors"]
        except KeyError:
            continue # internal node / terminal node without authors
        raw = raw_strain_info[strain]
        data = {
            "authors": authors
        }
        if "title" in raw:
            data["title"] = raw["title"].strip()
        if "journal" in raw:
            data["journal"] = raw["journal"].strip()
        # BUG FIX: previously tested `"url" in raw` but read raw["paper_url"],
        # raising KeyError whenever "url" existed without "paper_url"
        if "paper_url" in raw:
            data["url"] = raw["paper_url"].strip()
        # make unique key...
        key = authors.split()[0].lower()
        if "journal" in data:
            # extract the year out of the journal
            matches = re.findall(r'\([0-9A-Z-]*(\d{4})\)', data["journal"])
            if matches:
                key += matches[-1]
        if "title" in data:
            key += raw["title"].strip().split()[0].lower()
        node["authors"] = key
        if key not in author_info:
            author_info[key] = data
        elif author_info[key] != data:
            print("WARNING: Contradictory author information: {} vs {}".format(author_info[key], data))
    return author_info
def transfer_metadata_to_strains(strains, raw_strain_info, traits):
    """
    Build the per-strain node metadata (schema 2.0) by selectively copying
    fields out of raw_strain_info.

    input
        strains -- list of strain names (every one must appear in raw_strain_info)
        raw_strain_info -- dict of per-strain data from node_data + TSV
        traits -- trait names to transfer (with any _confidence/_entropy)
    returns
        dict of strain name -> node metadata
    raises
        Exception when a strain is missing from raw_strain_info
    """
    node_metadata = {}
    for strain_name in strains:
        if strain_name not in raw_strain_info:
            raise Exception("ERROR: %s is not found in the node_data[\"nodes\"]"%strain_name)
        raw_data = raw_strain_info[strain_name]
        node = {"traits": {}}

        # mutations: nucleotide muts stored under "nuc", amino-acid muts
        # keyed by gene name (empty lists dropped)
        if "aa_muts" in raw_data or "muts" in raw_data:
            mutations = {}
            if "muts" in raw_data and len(raw_data["muts"]):
                mutations["nuc"] = raw_data["muts"]
            if "aa_muts" in raw_data:
                mutations.update({gene: muts for gene, muts in raw_data["aa_muts"].items() if len(muts)})
            node["mutations"] = mutations

        # node dates -- it's ok not to have temporal information
        if "numdate" in raw_data or "num_date" in raw_data:
            node["num_date"] = {
                "value": raw_data["num_date"] if "num_date" in raw_data else raw_data["numdate"]
            }
            if "num_date_confidence" in raw_data:
                node["num_date"]["confidence"] = raw_data["num_date_confidence"]

        # vaccine info / labels / hidden props / authors: not yet transferred here

        # generic properties
        for prop in ["url", "accession"]:
            if prop in raw_data:
                node[prop] = raw_data[prop]

        # traits, including confidence & entropy when available
        for trait in traits:
            if trait not in raw_data:
                continue
            trait_data = {"value": raw_data[trait]}
            for suffix in ("confidence", "entropy"):
                extra_key = "{}_{}".format(trait, suffix)
                if extra_key in raw_data:
                    trait_data[suffix] = raw_data[extra_key]
            node["traits"][trait] = trait_data

        node_metadata[strain_name] = node
    return node_metadata
def add_metadata_to_tree(node, metadata):
    """Recursively merge each strain's metadata dict into its tree-JSON node
    (keyed by the node's "strain" property)."""
    node.update(metadata[node["strain"]])
    for child in node.get("children", []):
        add_metadata_to_tree(child, metadata)
def run(args):
    """
    Entry point for the export command.

    Reads the newick tree and node-data JSON(s), then writes either the
    deprecated nextflu-schema tree+meta JSONs (when args.nextflu_schema is
    set) or a single unified schema-2.0 JSON.
    """
    T = Phylo.read(args.tree, 'newick')
    node_data = read_node_data(args.node_data) # args.node_data is an array of multiple files (or a single file)
    if args.nextflu_schema:
        # This schema is deprecated. It remains because:
        # (1) auspice can't use schema 2.0 yet, (2) nexflu doesn't use schema 2.0
        # export the tree JSON first
        meta_json = read_config(args.auspice_config)
        meta_tsv, _ = read_metadata(args.metadata)
        nodes = node_data["nodes"] # this is the per-node metadata produced by various augur modules
        add_tsv_metadata_to_nodes(nodes, meta_tsv, meta_json)
        # assign clade indices and y positions before converting to JSON
        tree_layout(T)
        tree_json, _ = convert_tree_to_json_structure(T.root, nodes, nextflu_schema=True)
        # now the messy bit about what decorations (e.g. "country", "aa_muts") do we want to add to the tree?
        # see recursively_decorate_tree_json to understand the tree_decorations structure
        tree_decorations = [
            {"key": "num_date", "lookup_key": "numdate", "is_attr": True},
            {"key": "muts", "is_attr": False},
            {"key": "aa_muts", "is_attr": False}
        ]
        # every other per-node field becomes an "attr" decoration
        traits_via_node_metadata = {k for node in nodes.values() for k in node.keys()}
        traits_via_node_metadata -= {'sequence', 'mutation_length', 'branch_length', 'numdate', 'mutations', 'muts', 'aa_muts'}
        for trait in traits_via_node_metadata:
            tree_decorations.append({"key": trait, "is_attr": True})
        recursively_decorate_tree_json_nextflu_schema(tree_json, nodes, decorations=tree_decorations)
        write_json(tree_json, args.output_tree, indent=2)
        # Export the metadata JSON
        lat_long_mapping = read_lat_longs(args.lat_longs)
        color_mapping = read_colors(args.colors)
        meta_json["updated"] = time.strftime("%d %b %Y")
        meta_json["virus_count"] = len(list(T.get_terminals()))
        meta_json["author_info"] = construct_author_info_nexflu(meta_tsv, T, nodes)
        meta_json["color_options"] = process_colorings(meta_json, color_mapping, nodes=nodes, nextflu=True)
        meta_json["geo"] = process_geographic_info(meta_json, lat_long_mapping, nodes=nodes, nextflu=True)
        meta_json["annotations"] = process_annotations(node_data)
        meta_json["panels"] = process_panels(None, meta_json, nextflu=True)
        write_json(meta_json, args.output_meta, indent=2)
        return 0

    ## SCHEMA v2.0 ##
    unified = {}
    unified['title'] = args.title
    unified['maintainers'] = [{'name': name, 'href':url} for name, url in zip(args.maintainers, args.maintainer_urls)]
    unified["version"] = "2.0"
    raw_strain_info = collect_strain_info(node_data, args.metadata)
    unified["tree"], strains = convert_tree_to_json_structure(T.root, raw_strain_info)
    # collect traits to transfer to the nodes (i.e. properties of node.traits)
    # is safe to assume all nodes have same traits that have been mapped to tree??
    all_traits = next(iter(node_data['nodes'].values()))
    all_traits = list(all_traits.keys())
    if args.extra_traits:
        all_traits.extend(args.extra_traits)
    # fields handled elsewhere (dates, branch lengths, mutations) are excluded
    entries = ['branch_length', 'num_date', 'numdate', 'clock_length', 'mutation_length', 'date', 'muts']
    traits = [x for x in all_traits if x not in entries]
    traits = [x for x in traits if '_confidence' not in x and '_entropy' not in x]
    node_metadata = transfer_metadata_to_strains(strains, raw_strain_info, traits)
    unified["author_info"] = construct_author_info_and_make_keys(node_metadata, raw_strain_info)
    add_metadata_to_tree(unified["tree"], node_metadata)

    #Is this ok if there's no author data? (I imagine not)
    unified['filters'] = traits + ['authors']

    color_mapping = read_colors(args.colors)
    unified["colorings"] = process_colorings(traits, color_mapping, node_metadata=node_metadata)
    lat_long_mapping = read_lat_longs(args.lat_longs)
    unified["geographic_info"] = process_geographic_info(args.geography_traits, lat_long_mapping, node_metadata=node_metadata)
    unified["updated"] = time.strftime('%Y-%m-%d')
    unified["genome_annotations"] = process_annotations(node_data)
    unified["panels"] = process_panels(args.panels, unified)
    write_json(unified, args.output_main, indent=2)