/
vcf_utils.py
715 lines (617 loc) · 30.1 KB
/
vcf_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
import gzip
import numpy as np
from collections import defaultdict
from textwrap import fill
from . import TreeTimeError
## Functions to read in and print out VCF files
def read_vcf(vcf_file, ref_file=None):
"""
Reads in a vcf/vcf.gz file and associated
reference sequence fasta (to which the VCF file is mapped).
Parses mutations, insertions, and deletions and stores them in a nested dict,
see 'returns' for the dict structure.
Calls with heterozygous values 0/1, 0/2, etc and no-calls (./.) are
replaced with Ns at the associated sites.
Positions are stored to correspond the location in the reference sequence
in Python (numbering is transformed to start at 0)
Parameters
----------
vcf_file : string
Path to the vcf or vcf.gz file to be read in
ref_file : string, optional
Path to the fasta reference file to be read in
Returns
--------
compress_seq : nested dict
In the format: ::
{
'reference':'AGCTCGA..A',
'sequences': { 'seq1':{4:'A', 7:'-'}, 'seq2':{100:'C'} },
'insertions': { 'seq1':{4:'ATT'}, 'seq3':{1:'TT', 10:'CAG'} },
'positions': [1,4,7,10,100...]
}
references : string
String of the reference sequence read from the Fasta, to which
the variable sites are mapped
sequences : nested dict
Dict containing sequence names as keys which map to dicts
that have position as key and the single-base mutation (or deletion)
as values
insertions : nested dict
Dict in the same format as the above, which stores insertions and their
locations. The first base of the insertion is the same as whatever is
currently in that position (Ref if no mutation, mutation in 'sequences'
otherwise), so the current base can be directly replaced by the bases held here.
positions : list
Python list of all positions with a mutation, insertion, or deletion.
"""
import re
ALT_CHARS = re.compile(r"^([ACGTNacgtn]+|\*|\.)$") # straight from the VCF 4.3 spec
#Programming Note:
# Note on VCF Format
# -------------------
# 'Insertion where there are also deletions' (special handling)
# Ex:
# REF ALT Seq1 Seq2
# GC GCC,G 1/1 2/2
# Insertions formatted differently - don't know how many bp match
# the Ref (unlike simple insert below). Could be mutations, also.
# 'Deletion'
# Ex:
# REF ALT
# GC G
# Alt does not have to be 1 bp - any length shorter than Ref.
# 'Insertion'
# Ex:
# REF ALT
# A ATT
# First base _does not_ always match REF, so there may be a SNP as well
# 'No indel'
# Ex:
# REF ALT
# A G
#define here, so that all sub-functions can access them
sequences = defaultdict(dict)
insertions = defaultdict(dict) #Currently not used, but kept in case of future use.
metadata = {
'meta_lines': [], # all the VCF meta_lines, i.e. those starting with '##' (they are left unparsed)
'chrom': None, # chromosome name (we only allow one)
'ploidy': None, # ploidy count -- encoded in how the GT calls are formatted
}
#TreeTime handles 2-3 base ambig codes, this will allow that.
def getAmbigCode(bp1, bp2, bp3=""):
bps = [bp1,bp2,bp3]
bps.sort()
key = "".join(bps)
return {
'CT': 'Y',
'AG': 'R',
'AT': 'W',
'CG': 'S',
'GT': 'K',
'AC': 'M',
'AGT': 'D',
'ACG': 'V',
'ACT': 'H',
'CGT': 'B'
}[key]
#Parses a 'normal' (not hetero or no-call) call depending if insertion+deletion, insertion,
#deletion, or single bp subsitution
def parse_homozygous_call(snps, ins, pos, ref, alt):
# Replace missing allele with N(s). See commentary in test_vcf.py::TestNoCallsOrMissing for more details
if alt=='*' or alt=='.':
alt = "N" * len(ref)
#Insertion where there are also deletions (special handling)
if len(ref) > 1 and len(alt)>len(ref):
## NOTE: the loop below contains a potential double-counting bug. For example,
## REF='TCG' ALT='TCAG' (Example 5.1.3 in the VCF 4.2 spec), then when i=2
## we'll add both snps[pos+2] = 'A' as well as ins[pos+2] = 'AG'. This has been
## detailed within `test_vcf.py`, as it may also be the expected way to encode
## insertions within TreeTime
for i in range(len(ref)):
#if the pos doesn't match, store in sequences
if ref[i] != alt[i]:
snps[pos+i] = alt[i]
#if about to run out of ref, store rest:
if (i+1) >= len(ref):
ins[pos+i] = alt[i:]
#Deletion
elif len(ref) > 1:
for i in range(len(ref)):
#if ref is longer than alt, these are deletion positions
if i+1 > len(alt):
snps[pos+i] = '-'
#if not, there may be mutations
else:
if ref[i] != alt[i]:
snps[pos+i] = alt[i]
#Insertion + single-base ref
elif len(alt) > 1:
ins[pos] = alt
# If the first base of the allele doesn't match the ref then we _also_ have a mutation
if ref[0]!=alt[0]:
snps[pos] = alt[0]
#No indel
else:
snps[pos] = alt
#Parses a 'bad' (hetero or no-call) call depending on what it is
#TODO - consider the situation where the alternate allele base(s) is '*' (done for the homozygous case)
def parse_heterozygous_call(gen, snps, ins, pos, ref, ALT):
#Deletion
# REF ALT Seq1 Seq2 Seq3
# GCC G 1/1 0/1 ./.
# Seq1 (processed by parseCall, above) will become 'G--'
# Seq2 will become 'GNN'
# Seq3 will become 'GNN'
if len(ref) > 1:
#Deleted part becomes Ns
if gen[0] == '0' or gen[0] == '.':
if gen[0] == '0': #if het, get first bp
alt = str(ALT[int(gen[2])-1])
else: #if no-call, there is no alt, so just put Ns after 1st ref base
alt = ref[0]
for i in range(len(ref)):
#if ref is longer than alt, these are deletion positions
if i+1 > len(alt):
snps[pos+i] = 'N'
#if not, there may be mutations
else:
if ref[i] != alt[i]:
snps[pos+i] = (alt[i] if alt[i] != '.' else 'N') #'.' = no-call
#If not deletion, need to know call type
#if het, see if proposed alt is 1bp mutation
elif gen[0] == '0':
alt = str(ALT[int(gen[2])-1])
if len(alt)==1:
#alt = getAmbigCode(ref,alt) #if want to allow ambig
alt = 'N' #if you want to disregard ambig
snps[pos] = alt
#else a het-call insertion, so ignore.
#else it's a no-call; see if all alts have a length of 1
#(meaning a simple 1bp mutation)
elif len(ALT)==len("".join(ALT)):
alt = 'N'
snps[pos] = alt
#else a no-call insertion, so ignore.
def validate_alt(alt):
"""
from the VCF 4.3 spec:
> the ALT field must be a symbolic allele, or a breakend replacement string,
> or match the regular expression [see ALT_CHARS regex, above]
Since we only consider GT variation, the symbolic alleles and breakends aren't
relevant for us.
The spec also states:
> Tools processing VCF files are not required to preserve case in the allele String
Return the uppercase allele bases (string) _or_ None if it fails validation
"""
if ALT_CHARS.match(alt):
return alt.upper()
else:
print(f"WARNING: Encountered invalid allele base(s) {alt!r}. Skipping...")
return None
#House code is *much* faster than pyvcf because we don't care about all info
#about coverage, quality, counts, etc, which pyvcf goes to effort to parse
#(and it's not easy as there's no standard ordering). Custom code can completely
#ignore all of this.
from Bio import SeqIO
#Use different openers depending on whether compressed
opn = gzip.open if vcf_file.endswith(('.gz', '.GZ')) else open
with opn(vcf_file, mode='rt') as f:
current_block = "meta-information" # The start of VCF files is the meta-information (lines starting with ##)
header,samps,nsamp=None,None,None
for line in f:
if line.startswith("##"):
if current_block!='meta-information':
raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - all the meta-information (lines starting with ##) must appear at the top of the file.")
metadata['meta_lines'].append(line.strip())
elif line[0]=='#':
mandatory_fields = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
# Note that FORMAT isn't mandatory unless genotype data is present. But every VCF we deal with has genotype data - that's the point!
header_start = "#"+"\t".join(mandatory_fields)
if not line.startswith(header_start):
raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the header line must start with {header_start}")
if current_block!='meta-information':
raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the header must immediately follow the meta-information lines")
current_block = 'header'
header = line.strip().split('\t')
samps = [ x.strip() for x in header[9:] ] #ensure no leading/trailing spaces
nsamp = len(samps)
for sample_name in samps:
sequences[sample_name] = {}
else:
if current_block!='header' and current_block!='data-lines':
raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the data lines must follow the header line")
current_block='data-lines'
l = line.strip()
if not l: # empty line
continue
dat = l.split('\t')
chrom = str(dat[0])
if metadata['chrom'] is None:
metadata['chrom'] = chrom
elif metadata['chrom']!=chrom:
raise TreeTimeError(f"The VCF file {vcf_file!r} contains multiple chromosomes. TreeTime can not yet handle this.")
pos = int(dat[1])-1 # Convert VCF 1-based to python 0-based
REF = dat[3]
ALT = dat[4].split(',') # List of alternate alleles (strings)
calls = np.array(dat[9:])
if len(calls)!=nsamp:
raise TreeTimeError(f"Malformed VCF file {vcf_file!r} - the data lines have different number of calls than the number of samples defined in the header")
# Treetime only parses GT FORMAT calls. Moreover, "the first sub-field must always be the genotype (GT) if it is present"
# according to the VCF 4.2 spec. Note that if the VCF is for one sample only then the format's a bit different, but this'll
# raise an error above because the header differs from what we assert. Other variation (CN, BND etc) is not parsed by
# TreeTime, only GT.
FORMAT = dat[8]
if not FORMAT.startswith('GT'):
continue
#get samples that differ from Ref at this site
for sname, sa in zip(samps, calls):
gt = sa.split(':')[0] # May be multiple colon-separated subfields, depending on the line's FORMAT
# `gt` is the "index" of the alternate allele for this sample.
# The format of `gt` is quite varied:
# For haploid genomes, it's simply a single int _or_ "." (meaning a call cannot be made at the locus)
# For polyploid genomes, the format is a list of {int, '.'} separated by "/" or "|"
# ('/' means unphased, '|' means phased).
# If the index is an int, it's the 1-based (!) lookup index for ALT
# Split on the valid separators - if haploid, then we'll get a list len=1
gts = gt.split('|') if '|' in gt else gt.split('/')
ploidy = len(gts)
if metadata['ploidy'] is None:
metadata['ploidy'] = ploidy
elif metadata['ploidy']!=ploidy:
raise TreeTimeError(f"The VCF file {vcf_file!r} had genotype calls of ploidy {metadata['ploidy']} but sample {sname!r} has a genotype of ploidy {ploidy}")
# if polyploid, but homozygous, then treat as if haploid
if ploidy>1 and len(set(gts))==1:
gt = gts[0]
if gt.isdigit(): # haploid, and a call has been made (i.e. it's not gt='.')
gt = int(gt)
if gt==0:
continue # reference allele!
alt = validate_alt(ALT[gt-1]) # gt is the 1-based lookup, but ALT is 0-indexed
if alt:
parse_homozygous_call(sequences[sname],insertions[sname], pos, REF, alt)
continue
if gt=='.': # haploid "call cannot be made" identifier - replace REF with N(s)
for i in range(len(REF)):
sequences[sname][pos+i] = 'N'
continue
# ---- heterozygous polyploid call ----
parse_heterozygous_call(gt, sequences[sname],insertions[sname], pos, REF, ALT)
#Gather all variable positions
#NOTE: this does not consider positions of insertions
positions = set()
for seq, muts in sequences.items():
positions.update(muts.keys())
num_insertions = sum([len(list(ins.keys())) for ins in insertions.values()])
if len(positions)==0 and num_insertions==0:
raise TreeTimeError(f"VCF file {vcf_file!r} has no data-lines which we could extract genotype information from!")
#One or more seqs are same as ref! (No non-ref calls) So haven't been 'seen' yet
if nsamp > len(sequences):
missings = set(samps).difference(sequences.keys())
for s in missings:
sequences[s] = {}
if ref_file:
refSeq = SeqIO.read(ref_file, format='fasta')
refSeq = refSeq.upper() #convert to uppercase to avoid unknown chars later
refSeqStr = str(refSeq.seq)
else:
refSeqStr = None
compress_seq = {'reference':refSeqStr,
'sequences': sequences,
'insertions': insertions,
'positions': sorted(positions),
'metadata': metadata}
return compress_seq
def write_vcf(tree_dict, file_name, mask=None):#, compress=False):
"""
Writes out a VCF-style file (which seems to be minimally handleable
by vcftools and pyvcf) of the alignment. This is created from a dict
in a similar format to what's created by :py:meth:`treetime.vcf_utils.read_vcf`
Positions of variable sites are transformed to start at 1 to match
VCF convention.
Parameters
----------
tree_dict: nested dict
A nested dict with required keys of:
'sequences': maps sampleName -> pos (0-based) -> new base (SNP)
'reference': string of reference nuc sequence
'positions': sorted list of 0-based positions with variation in sequences
And optional keys:
'inferred_const_sites': list or set, 0-based positions to skip output for.
This input is often created by :py:meth:`treetime.TreeAnc.get_tree_dict`
'metadata': dict of information to influence VCF formatting. Only
the following keys are used:
'metadata.ploidy': int. Influences how genotype calls are formatted. (default
of 2 (diploid) used if not provided)
'metadata.chrom': str. The chromosome name (default of '1' used if not provided)
file_name: str
File to which the new VCF should be written out. File names ending with
'.gz' will result in the VCF automatically being gzipped.
mask : optional, str of 0 or 1
Calls at these positions will be skipped
"""
# Programming Logic Note:
#
# For a sequence like:
# Pos 1 2 3 4 5 6
# Ref A C T T A C
# Seq1 A C - - - G
#
# In a dict it is stored:
# Seq1:{3:'-', 4:'-', 5:'-', 6:'G'} (Numbering from 1 for simplicity)
#
# In a VCF it needs to be:
# POS REF ALT Seq1
# 2 CTTA C 1/1
# 6 C G 1/1
#
# If a position is deleted (pos 3), need to get invariable position preceeding it
#
# However, in alternative case, the base before a deletion is mutant, so need to check
# that next position isn't a deletion (as otherwise won't be found until after the
# current single bp mutation is written out)
#
# When deleted position found, need to gather up all adjacent mutant positions with deletions,
# but not include adjacent mutant positions that aren't deletions (pos 6)
#
# Don't run off the 'end' of the position list if deletion is the last thing to be included
# in the VCF file
sequences = tree_dict['sequences']
ref = tree_dict['reference']
positions = tree_dict['positions']
ploidy = tree_dict.get('metadata', {}).get('ploidy', 2)
chrom_name = tree_dict.get('metadata', {}).get('chrom', '1')
sample_names = list(sequences.keys())
inferred_const_sites = set(tree_dict.get('inferred_const_sites', []))
# For every variable site in sequences, flip the format around so
# we can have fast lookups later on.
alleles = {}
num_samples = len(sample_names)
for idx, name in enumerate(sample_names):
for posn, allele in sequences[name].items():
if posn not in alleles:
alleles[posn] = np.zeros(num_samples, dtype='U')
alleles[posn][idx] = allele
# fill in reference
for posn,bases in alleles.items():
bases[bases==''] = ref[posn]
def handleDeletions(i, pi, pos, ref, delete, pattern):
refb = ref[pi]
if delete: #Need to get the position before
i-=1 #As we'll next go to this position again
pi-=1
pos = pi+1
refb = ref[pi]
#re-get pattern
pattern = []
for k,v in sequences.items():
try:
pattern.append(sequences[k][pi])
except KeyError:
pattern.append(ref[pi])
pattern = np.array(pattern).astype('U')
sites = []
sites.append(pattern)
#Gather all positions affected by deletion - but don't run off end of position list
while (i+1) < len(positions) and positions[i+1] == pi+1:
i+=1
pi = positions[i]
pattern = []
for k,v in sequences.items():
try:
pattern.append(sequences[k][pi])
except KeyError:
pattern.append(ref[pi])
pattern = np.array(pattern).astype('U')
#Stops 'greedy' behaviour from adding mutations adjacent to deletions
if any(pattern == '-'): #if part of deletion, append
sites.append(pattern)
refb = refb+ref[pi]
else: #this is another mutation next to the deletion!
i-=1 #don't append, break this loop
#Rotate them into 'calls'
align = np.asarray(sites).T
#Get rid of '-', and put '0' for calls that match ref
#Only removes trailing '-'. This breaks VCF convension, but the standard
#VCF way of handling this* is really complicated, and the situation is rare.
#(*deletions and mutations at the same locations)
fullpat = []
for pt in align:
gp = len(pt)-1
while pt[gp] == '-':
pt[gp] = ''
gp-=1
pat = "".join(pt)
if pat == refb:
fullpat.append('0')
else:
fullpat.append(pat)
pattern = np.array(fullpat)
return i, pi, pos, refb, pattern
#prepare the header of the VCF & write out
header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+sample_names
opn = gzip.open if file_name.endswith(('.gz', '.GZ')) else open
out_file = opn(file_name, 'w')
out_file.write( "##fileformat=VCFv4.2\n"+
"##source=NextStrain\n"+
"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"+
f"##contig=<ID={chrom_name}>\n")
out_file.write("\t".join(header)+"\n")
vcfWrite = []
errorPositions = []
explainedErrors = 0
mask_skip_count = 0
#Why so basic? Because we sometimes have to back up a position!
i=0
while i < len(positions):
#Get the 'pattern' of all calls at this position.
#Look out specifically for current (this pos) or upcoming (next pos) deletions
#But also distinguish these two, as handled differently.
pi = positions[i]
pos = pi+1 #change numbering to match VCF, not python, for output
refb = ref[pi] #reference base at this position
delete = False #deletion at this position - need to grab previous base (invariable)
deleteGroup = False #deletion at next position (mutation at this pos) - do not need to get prev base
if mask and mask[pi] == '1':
mask_skip_count+=1
i+=1
continue
# patterns will be empty if there was no variation in the sequences
pattern = alleles[pi] if pi in alleles else np.array([]).astype('U')
pattern2 = alleles[pi+1] if pi+1 in alleles else np.array([]).astype('U')
#If a deletion here, need to gather up all bases, and position before
if any(pattern == '-'):
if pos != 1:
deleteGroup = True
delete = True
else:
#If theres a deletion in 1st pos, VCF files do not handle this well.
#Proceed keeping it as '-' for alt (violates VCF), but warn user to check output.
#(This is rare)
print(fill("WARNING: You have a deletion in the first position of your"
" alignment. VCF format does not handle this well. Please check"
" the output to ensure it is correct."))
else:
#If a deletion in next pos, need to gather up all bases
if any(pattern2 == '-'):
deleteGroup = True
#If deletion, treat affected bases as 1 'call':
if delete or deleteGroup:
if pattern.size==0: # no variation in sequences
pattern = np.full(num_samples, refb, dtype='U')
i, pi, pos, refb, pattern = handleDeletions(i, pi, pos, ref, delete, pattern)
#If no deletion, replace ref with '0' which means the reference base is unchanged
else:
pattern[pattern==refb] = '0'
#Get the list of ALTs - minus any '0' which are unchanged reference sequences!
uniques = np.unique(pattern)
uniques = uniques[np.where(uniques!='0')]
#Convert bases to the number that matches the ALT
j=1
for u in uniques:
pattern[np.where(pattern==u)[0]] = str(j)
j+=1
#What if there's no variation at a variable site??
#This can happen when sites are modified by TreeTime - see below.
#We don't print to VCF (because no variation!)
any_variation = len(uniques)!=0
if not any_variation:
#If we expect it (it was made constant by TreeTime), it's fine.
if pi in inferred_const_sites:
explainedErrors += 1
else:
#If we don't expect, raise an error
errorPositions.append(str(pi))
#Write it out - Increment positions by 1 so it's in VCF numbering
#If no longer variable, and explained, don't write it out
if any_variation:
calls = [ "/".join([j]*ploidy) for j in pattern ]
output = [chrom_name, str(pos), ".", refb, ",".join(uniques), ".", "PASS", ".", "GT"] + calls
vcfWrite.append("\t".join(output))
i+=1
#Note: The number of 'inferred_const_sites' passed back by TreeTime will often be longer
#than the number of 'site that were made constant' that prints below. This is because given the site:
# Ref Alt Seq
# G A AANAA
#This will be converted to 'AAAAA' and listed as an 'inferred_const_sites'. However, for VCF
#purposes, because the site is 'variant' against the ref, it is variant, as expected, and so
#won't be counted in the below list, which is only sites removed from the VCF.
if mask_skip_count:
print(f"{mask_skip_count} positions were skipped due to the provided mask")
if 'inferred_const_sites' in tree_dict and explainedErrors != 0:
print(fill("Sites that were constant except for ambiguous bases were made" +
" constant by TreeTime. This happened {} times. These sites are".format(explainedErrors) +
" now excluded from the VCF."))
if len(errorPositions) != 0:
print ("\n***WARNING: vcf_utils.py")
print(fill("\n{} sites were found that had no alternative bases.".format(str(len(errorPositions)))+
" If this data has been run through TreeTime and contains ambiguous bases,"
" try calling get_tree_dict with var_ambigs=True to see if this clears the error."))
print(fill("\nAlternative causes:"
"\n- Not all sequences in your alignment are in the tree"
" (if you are running TreeTime via commandline this is most likely)"
"\n- In TreeTime, can be caused by overwriting variants in tips with small branch lengths (debug)"
"\n\nThese are the positions affected (numbering starts at 0):"))
print(fill(", ".join(errorPositions)))
out_file.write("\n".join(vcfWrite))
out_file.close()
def process_sparse_alignment(aln, ref, ambiguous_char):
return process_alignment_dictionary(aln, ref, ambiguous_char)
def process_alignment_dictionary(aln, ref, ambiguous_char):
"""
prepare the dictionary specifying differences from a reference sequence
to construct the reduced alignment with variable sites only. NOTE:
- sites can be constant but different from the reference
- sites can be constant plus a ambiguous sites
assigns
-------
- self.nonref_positions: at least one sequence is different from ref
Returns
-------
reduced_alignment_const
reduced alignment accounting for non-variable postitions
alignment_patterns_const
dict pattern -> (pos in reduced alignment, list of pos in full alignment)
variable_positions
list of variable positions needed to construct remaining
"""
# number of sequences in alignment
nseq = len(aln)
inv_map = defaultdict(list)
for k,v in aln.items():
for pos, bs in v.items():
inv_map[pos].append(bs)
nonref_positions = np.sort(list(inv_map.keys()))
constant_up_to_ambiguous = []
nonref_const = []
nonref_alleles = []
ambiguous_const = []
variable_pos = []
for pos, bs in inv_map.items(): #loop over positions and patterns
bases = list(np.unique(bs))
if len(bs) == nseq: #every sequence is different from reference
if (len(bases)<=2 and ambiguous_char in bases) or len(bases)==1:
# all sequences different from reference, but only one state
# (other than ambiguous_char) in column
nonref_const.append(pos)
if len(bases)==1:
nonref_alleles.append(bases[0])
else:
nonref_alleles.append([x for x in bases if x!=ambiguous_char][0])
if ambiguous_char in bases: #keep track of sites 'made constant'
constant_up_to_ambiguous.append(pos)
else:
# at least two non-reference alleles
variable_pos.append(pos)
else: # not every sequence different from reference
if len(bases)==1 and bases[0]==ambiguous_char:
ambiguous_const.append(pos)
constant_up_to_ambiguous.append(pos) #keep track of sites 'made constant'
else:
# at least one non ambiguous non-reference allele not in
# every sequence
variable_pos.append(pos)
refMod = np.copy(ref)
# place constant non reference positions by their respective allele
refMod[nonref_const] = nonref_alleles
# mask variable positions
states = np.unique(refMod)
refMod[variable_pos] = '.'
# for each base in the gtr, make constant alignment pattern and
# assign it to all const positions in the modified reference sequence
constant_columns = []
constant_patterns = {}
for base in states:
if base==ambiguous_char:
continue
p = np.repeat(base, nseq)
pos = list(np.where(refMod==base)[0])
#if the alignment doesn't have a const site of this base, don't add! (ex: no '----' site!)
if len(pos):
constant_patterns["".join(p.astype('U'))] = [len(constant_columns), pos]
constant_columns.append(p)
return {"constant_columns": constant_columns,
"constant_patterns": constant_patterns,
"variable_positions": variable_pos,
"nonref_positions": nonref_positions,
"constant_up_to_ambiguous": constant_up_to_ambiguous}