-
Notifications
You must be signed in to change notification settings - Fork 89
/
CRISPRessoCORE.py
4759 lines (4097 loc) · 296 KB
/
CRISPRessoCORE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
CRISPResso2 - Kendell Clement and Luca Pinello 2020
Software pipeline for the analysis of genome editing outcomes from deep sequencing data
(c) 2020 The General Hospital Corporation. All Rights Reserved.
'''
import sys
running_python3 = False
if sys.version_info > (3, 0):
running_python3 = True
import argparse
from collections import defaultdict
from copy import deepcopy
from concurrent.futures import ProcessPoolExecutor, wait
from functools import partial
import errno
import gzip
import json
import zipfile
import os
import re
import subprocess as sb
import traceback
from CRISPResso2 import CRISPRessoCOREResources
from CRISPResso2 import CRISPRessoReport
from CRISPResso2 import CRISPRessoShared
from CRISPResso2 import CRISPRessoPlot
from CRISPResso2 import CRISPResso2Align
from CRISPResso2 import CRISPRessoMultiProcessing
from datetime import datetime
present = datetime.now()
#d1 = datetime.strptime('21/07/2019','%d/%m/%Y')
#if present > d1:
# print('\nYour version of CRISPResso2 is out of date. Please download a new version.\n')
# sys.exit(1)
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(CRISPRessoShared.LogStreamHandler())
error = logger.critical
warn = logger.warning
debug = logger.debug
info = logger.info
####Support functions###
_ROOT = os.path.abspath(os.path.dirname(__file__))
def check_library(library_name):
try:
return __import__(library_name)
except:
error('You need to install %s module to use CRISPResso!' % library_name)
sys.exit(1)
def which(program):
import os
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
fpath, fname = os.path.split(program)
if fpath:
if is_exe(program):
return program
else:
for path in os.environ["PATH"].split(os.pathsep):
path = path.strip('"')
exe_file = os.path.join(path, program)
if is_exe(exe_file):
return exe_file
return None
def check_program(binary_name,download_url=None):
if not which(binary_name):
error('You need to install and have the command #####%s##### in your PATH variable to use CRISPResso!\n Please read the documentation!' % binary_name)
if download_url:
error('You can download it here:%s' % download_url)
sys.exit(1)
def get_avg_read_length_fastq(fastq_filename):
cmd=('z' if fastq_filename.endswith('.gz') else '' ) +('cat < \"%s\"' % fastq_filename)+\
r''' | awk 'BN {n=0;s=0;} NR%4 == 2 {s+=length($0);n++;} END { printf("%d\n",s/n)}' '''
p = sb.Popen(cmd, shell=True, stdout=sb.PIPE)
return int(p.communicate()[0].strip())
def get_n_reads_fastq(fastq_filename):
p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < \"%s\" | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE)
return int(float(p.communicate()[0])/4.0)
def get_n_reads_bam(bam_filename,bam_chr_loc=""):
cmd = "samtools view -c " + bam_filename + " " + bam_chr_loc
p = sb.Popen(cmd, shell=True, stdout=sb.PIPE)
try:
retval = int(float(p.communicate()[0]))
except ValueError:
raise CRISPRessoShared.InstallationException('Error when running the command:' + cmd + '\nCheck that samtools is installed correctly.')
return retval
#import time
#start = time.time()
matplotlib=check_library('matplotlib')
#end = time.time()
#start = time.time()
from matplotlib import font_manager as fm
CRISPRessoPlot.setMatplotlibDefaults()
#end = time.time()
#start = time.time()
plt=check_library('pylab')
#end = time.time()
from matplotlib import font_manager as fm
import matplotlib.gridspec as gridspec
pd=check_library('pandas')
np=check_library('numpy')
check_program('flash')
#start = time.time()
sns=check_library('seaborn')
#end = time.time()
sns.set_context('poster')
sns.set(font_scale=2.2)
sns.set_style('white')
#########################################
def get_pe_scaffold_search(prime_edited_ref_sequence, prime_editing_pegRNA_extension_seq, prime_editing_pegRNA_scaffold_seq, prime_editing_pegRNA_scaffold_min_match_length):
"""
For prime editing, determines the scaffold string to search for (the shortest substring of args.prime_editing_pegRNA_scaffold_seq not in the prime-edited reference sequence)
params:
prime_edited_ref_sequence: reference sequence of the prime-edited sequence
prime_editing_extension_seq: RNA sequence of extension sequence
prime_editing_pegRNA_scaffold_seq: sequence of the scaffold sequence
prime_editing_pegRNA_scaffold_min_match_length: minimum number of bases required to match between scaffold and read to count as scaffold-incorporated
returns:
tuple of(
index of location in ref to find scaffold seq if it exists
shortest dna sequence to identify scaffold sequence
)
"""
info('Processing pegRNA scaffold sequence...')
#first, define the sequence we are looking for (extension plus the first base(s) of the scaffold)
scaffold_dna = CRISPRessoShared.reverse_complement(prime_editing_pegRNA_scaffold_seq)
extension_seq_dna_top_strand = prime_editing_pegRNA_extension_seq.upper().replace('U', 'T')
prime_editing_extension_seq_dna = CRISPRessoShared.reverse_complement(extension_seq_dna_top_strand)
scaffold_start_loc = prime_edited_ref_sequence.index(prime_editing_extension_seq_dna)+len(prime_editing_extension_seq_dna)
#next find min length scaffold that when combined with min extension sequence is not in edited sequence
len_scaffold_to_use = prime_editing_pegRNA_scaffold_min_match_length # length of the scaffold sequence to be added to the extension sequence, start at 1bp
scaffold_dna_search = prime_editing_extension_seq_dna + scaffold_dna[0:len_scaffold_to_use]
while scaffold_dna_search in prime_edited_ref_sequence:
if len_scaffold_to_use > len(scaffold_dna):
raise CRISPRessoShared.BadParameterException('The DNA scaffold provided is found in the unedited reference sequence. Please provide a longer scaffold sequence.')
len_scaffold_to_use += 1
scaffold_dna_search = prime_editing_extension_seq_dna + scaffold_dna[0:len_scaffold_to_use]
info('Searching for scaffold-templated reads with the sequence: \'' + str(scaffold_dna[0:len_scaffold_to_use]) +'\' starting at position '+ str(scaffold_start_loc) + ' in reads that align to the prime-edited sequence')
return (scaffold_start_loc, scaffold_dna[0:len_scaffold_to_use])
def get_new_variant_object(args, fastq_seq, refs, ref_names, aln_matrix, pe_scaffold_dna_info):
"""
Gets the payload object for a read that hasn't been seen in the cache yet
params:
args: CRISPResso2 args
fastq_seq: read sequence to align
refs: dict with info for all refs
ref_names: list of ref names
aln_matrix: alignment matrix for needleman wunsch
pe_scaffold_dna_info: for prime-editing tuple of(
index of location in ref to find scaffold seq if it exists
shortest dna sequence to identify scaffold sequence
)
returns:
variant payload
"""
aln_scores = []
best_match_score = -1
best_match_s1s = []
best_match_s2s = []
best_match_names = []
best_match_strands = []
ref_aln_details = []
for idx, ref_name in enumerate(ref_names):
#get alignment and score from cython
#score = 100 * #matchedBases / length(including gaps)
seed_i = 0
found_forward_count = 0
found_reverse_count = 0
aln_strand = '+'
while seed_i < args.aln_seed_count and seed_i < len(refs[ref_name]['fw_seeds']):
if refs[ref_name]['fw_seeds'][seed_i] in fastq_seq: #is forward
found_forward_count += 1
if refs[ref_name]['rc_seeds'][seed_i] in fastq_seq: #is rc
found_reverse_count += 1
seed_i += 1
if found_forward_count > args.aln_seed_min and found_reverse_count == 0:
fws1, fws2, fwscore=CRISPResso2Align.global_align(fastq_seq, refs[ref_name]['sequence'], matrix=aln_matrix, gap_incentive=refs[ref_name]['gap_incentive'], gap_open=args.needleman_wunsch_gap_open, gap_extend=args.needleman_wunsch_gap_extend,)
s1 = fws1
s2 = fws2
score = fwscore
elif found_forward_count == 0 and found_reverse_count > args.aln_seed_min:
rvs1, rvs2, rvscore=CRISPResso2Align.global_align(CRISPRessoShared.reverse_complement(fastq_seq), refs[ref_name]['sequence'], matrix=aln_matrix, gap_incentive=refs[ref_name]['gap_incentive'], gap_open=args.needleman_wunsch_gap_open, gap_extend=args.needleman_wunsch_gap_extend,)
s1 = rvs1
s2 = rvs2
score = rvscore
aln_strand = '-'
else:
fws1, fws2, fwscore=CRISPResso2Align.global_align(fastq_seq, refs[ref_name]['sequence'], matrix=aln_matrix, gap_incentive=refs[ref_name]['gap_incentive'], gap_open=args.needleman_wunsch_gap_open, gap_extend=args.needleman_wunsch_gap_extend,)
rvs1, rvs2, rvscore=CRISPResso2Align.global_align(CRISPRessoShared.reverse_complement(fastq_seq), refs[ref_name]['sequence'], matrix=aln_matrix, gap_incentive=refs[ref_name]['gap_incentive'], gap_open=args.needleman_wunsch_gap_open, gap_extend=args.needleman_wunsch_gap_extend,)
s1 = fws1
s2 = fws2
score = fwscore
if (rvscore > fwscore):
s1 = rvs1
s2 = rvs2
score = rvscore
aln_strand = '-'
# print "for " + ref_name + " got fws1: " + str(fws1) + " and fws2: " + str(fws2) + " score: " +str(fwscore)
aln_scores.append(score)
ref_aln_details.append((ref_name, s1, s2, score))
#reads are matched to the reference to which they best align. The 'min_aln_score' is calculated using only the changes in 'include_idxs'
if score > best_match_score and score > refs[ref_name]['min_aln_score']:
best_match_score = score
best_match_s1s = [s1]
best_match_s2s = [s2]
best_match_names = [ref_name]
best_match_strands = [aln_strand]
elif score == best_match_score:
best_match_s1s.append(s1)
best_match_s2s.append(s2)
best_match_names.append(ref_name)
best_match_strands.append(aln_strand)
if best_match_score > 0:
new_variant = {}
new_variant['count'] = 1
new_variant['aln_ref_names'] = best_match_names
new_variant['aln_scores'] = aln_scores
new_variant['ref_aln_details'] = ref_aln_details
new_variant['best_match_score'] = best_match_score
class_names = []
for idx in range(len(best_match_names)):
best_match_name = best_match_names[idx]
if args.use_legacy_insertion_quantification:
payload = CRISPRessoCOREResources.find_indels_substitutions_legacy(best_match_s1s[idx], best_match_s2s[idx], refs[best_match_name]['include_idxs'])
else:
payload = CRISPRessoCOREResources.find_indels_substitutions(best_match_s1s[idx], best_match_s2s[idx], refs[best_match_name]['include_idxs'])
payload['ref_name'] = best_match_name
payload['aln_scores'] = aln_scores
# If there is an insertion/deletion/substitution in the quantification window, the read is modified.
is_modified = False
if not args.ignore_deletions and payload['deletion_n'] > 0:
is_modified = True
elif not args.ignore_insertions and payload['insertion_n'] > 0:
is_modified = True
elif not args.ignore_substitutions and payload['substitution_n'] > 0:
is_modified = True
if is_modified:
class_names.append(best_match_name+"_MODIFIED")
payload['classification'] = 'MODIFIED'
else:
class_names.append(best_match_name+"_UNMODIFIED")
payload['classification'] = 'UNMODIFIED'
payload['aln_seq'] = best_match_s1s[idx]
payload['aln_ref'] = best_match_s2s[idx]
payload['aln_strand'] = best_match_strands[idx]
new_variant['variant_'+best_match_name] = payload
new_variant['class_name'] = "&".join(class_names)
else:
new_variant = {}
new_variant['count'] = 1
new_variant['aln_scores'] = aln_scores
new_variant['ref_aln_details'] = ref_aln_details
new_variant['best_match_score'] = best_match_score
return new_variant #return new variant with best match score of 0, but include the scores of insufficient alignments
#handle ambiguous alignments
if len(best_match_names) > 1:
if args.assign_ambiguous_alignments_to_first_reference: #if ambiguous, and this flag is set, just assign to the first amplicon
new_variant['class_name'] = class_names[0]
new_variant['aln_ref_names'] = [best_match_names[0]]
elif not args.expand_ambiguous_alignments: #got 'Ambiguous' -- don't count toward total (e.g. indels at each position for the ref)
new_variant['class_name'] = 'AMBIGUOUS'
#search for prime editing scaffold
#if it's there, assign this to be assigned to the reference '"Scaffold-incorporated"'
if args.prime_editing_pegRNA_scaffold_seq and 'Prime-edited' in best_match_names: #any scaffold extensions must be closer to the prime-edited sequence
pe_read_possible_scaffold_loc = new_variant['variant_Prime-edited']['ref_positions'].index(pe_scaffold_dna_info[0]-1) + 1
if new_variant['variant_Prime-edited']['aln_seq'][pe_read_possible_scaffold_loc:(pe_read_possible_scaffold_loc+len(pe_scaffold_dna_info[1]))] == pe_scaffold_dna_info[1]:
# print('comparingHERE ' + new_variant['variant_Prime-edited']['aln_seq'][pe_read_possible_scaffold_loc:(pe_read_possible_scaffold_loc+len(pe_scaffold_dna_info[1])+5)] + ' from ' + new_variant['variant_Prime-edited']['aln_seq'] + ' and ' + new_variant['variant_Prime-edited']['aln_ref'])
new_variant['aln_ref_names'] = ["Scaffold-incorporated"]
new_variant['class_name'] = "Scaffold-incorporated"
old_payload = deepcopy(new_variant['variant_Prime-edited']) #keep prime-edited allele and alignment
old_payload['ref_name'] = "Scaffold-incorporated"
new_variant['variant_'+"Scaffold-incorporated"] = old_payload
return new_variant
def process_fastq(fastq_filename, variantCache, ref_names, refs, args):
"""process_fastq processes each of the reads contained in a fastq file, given a cache of pre-computed variants
fastqIn: name of fastq (e.g. output of FLASH)
This file can be gzipped or plain text
variantCache: dict with keys: sequence
dict with keys:
'count' : number of time sequence was observed
'aln_ref_names' : names of reference it was aligned to
'aln_scores' : score of alignment to each reference
'aln_details' # details (seq1, seq2, score) of alignment to each other reference sequence
'class_name' : string with class names it was aligned to
'best_match_score' : score of best match (0 if no alignments matched above amplicon threshold)
for each reference, there is a key: variant_ref_name with a payload object
# payload object:
The payload is a dict with keys:
### from CRISPRessoCOREResources.find_indels_substitutions
# 'all_insertion_positions' #arr with 1's where there are insertions (including those outside of include_idxs quantification window)
# 'all_insertion_left_positions' #arr with 1's to the left of where the insertion occurs
# 'insertion_positions' # arr with 1's where there are insertions (1bp before and 1bp after insertion) that overlap with include_idxs quantification window
# 'insertion_coordinates' # one entry per insertion, tuple of (start,end)
# 'insertion_sizes'
# 'insertion_n'
# 'all_deletion_positions' #arr with 1's where there are insertions
# 'deletion_positions' #arr with 1's where there are insertions that overlap the include_idxs quantification window
# 'deletion_coordinates' # one entry per deletion
# 'deletion_sizes' # correspond to entries in 'deletion_coordinates'
# 'deletion_n'
# 'all_substitution_positions'
# 'substitution_positions'
# 'substitution_n'
# 'substitution_values'
# 'ref_positions'
### added in this function
# 'ref_name' # name of sequence that it most closely aligns to
# 'classification' # MODIFIED or UNMODIFIED or AMBIGUOUS
# 'aln_scores' # scores of alignment to each other reference sequence
# 'aln_seq' # NW-aligned sequence
# 'aln_ref' # NW-aligned sequence of corresponding reference (ref_name)
refNameList: list of reference names
refs: dictionary of sequences name>ref object
##ref object:
# 'name'
# 'sequence'
# 'sequence_length'
# 'min_aln_score' #sequence must align with at least this score
# 'gap_incentive' #incentive for gaps at each position of the reference - to force gaps at the cut points, the indices of these cut points are set to 1 i.e. gap_incentive[4] = 1 would incentivise alignments with insertions to the right of the 4th character in the reference, or deletions of the 4th character in the reference.
# 'contains_guide'
# 'sgRNA_intervals'
# 'sgRNA_sequences'
# 'sgRNA_cut_points'
# 'sgRNA_plot_cut_points'#whether cut points should be plotted (not plotted for base editing, prime editing flap)
# 'sgRNA_plot_idxs' #indices along reference sequence for which to plot the allele plot (allele frequency plot around sgRNA)
# 'sgRNA_names' #names of sgRNAs (in case there are multiple matches for a single sgRNA)
# 'sgRNA_mismatches' #indices along guide that are mismatched against a 'flexiguide_seq'
# 'sgRNA_orig_sequences' #original sgRNA sequences as passed in as parameters (e.g. not including flexiguide changes or case changes)
# 'contains_coding_seq'
# 'exon_positions'
# 'exon_intervals'
# 'exon_len_mods': the modification to the original exon length (if we copied the exon positions from another reference, this reference could introduce an indel, resulting in a non-zero length modification)
# 'splicing_positions'
# 'include_idxs' # sorted numpy array
# 'exclude_idxs'
# 'plot_idxs' #sorted numpy array
# 'plot_name' #unique plotting name
# 'idx_cloned_from' #if this reference didn't contain a guide (or exon sequence), it was aligned to 'idx_cloned_from' reference, and cut_points, plot_cut_points, gap_incentive, sgRNA_intervals, inculde_idx, ane exon information were cloned from it (at the appropriate indices)
Examples of these seqences can include:
-the amplicon sequence
-the repaired CRISPR expected output
-allelic varaints if two variants are known to exist
"""
N_TOT_READS = 0
N_CACHED_ALN = 0 # read was found in cache
N_CACHED_NOTALN = 0 #read was found in 'not aligned' cache
N_COMPUTED_ALN = 0 # not in cache, aligned to at least 1 sequence with min cutoff
N_COMPUTED_NOTALN = 0 #not in cache, not aligned to any sequence with min cutoff
aln_matrix_loc = os.path.join(_ROOT, args.needleman_wunsch_aln_matrix_loc)
CRISPRessoShared.check_file(aln_matrix_loc)
aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc)
pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold seq to search
if args.prime_editing_pegRNA_scaffold_seq != "":
pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length)
not_aln = {} #cache for reads that don't align
if fastq_filename.endswith('.gz'):
fastq_handle = gzip.open(fastq_filename, 'rt')
else:
fastq_handle=open(fastq_filename)
while(fastq_handle.readline()):
#read through fastq in sets of 4
fastq_seq = fastq_handle.readline().strip()
fastq_plus = fastq_handle.readline()
fastq_qual = fastq_handle.readline()
if (N_TOT_READS % 10000 == 0):
info("Processing reads; N_TOT_READS: %d N_COMPUTED_ALN: %d N_CACHED_ALN: %d N_COMPUTED_NOTALN: %d N_CACHED_NOTALN: %d"%(N_TOT_READS, N_COMPUTED_ALN, N_CACHED_ALN, N_COMPUTED_NOTALN, N_CACHED_NOTALN))
N_TOT_READS+=1
#if the sequence has been seen and can't be aligned, skip it
if (fastq_seq in not_aln):
N_CACHED_NOTALN += 1
continue
#if the sequence is already associated with a variant in the variant cache, pull it out
if (fastq_seq in variantCache):
N_CACHED_ALN+=1
variantCache[fastq_seq]['count'] += 1
#otherwise, create a new variant object, and put it in the cache
else:
new_variant = get_new_variant_object(args, fastq_seq, refs, ref_names, aln_matrix, pe_scaffold_dna_info)
if new_variant['best_match_score'] <= 0:
N_COMPUTED_NOTALN+=1
not_aln[fastq_seq] = 1
else:
N_COMPUTED_ALN+=1
variantCache[fastq_seq] = new_variant
fastq_handle.close()
info("Finished reads; N_TOT_READS: %d N_COMPUTED_ALN: %d N_CACHED_ALN: %d N_COMPUTED_NOTALN: %d N_CACHED_NOTALN: %d"%(N_TOT_READS, N_COMPUTED_ALN, N_CACHED_ALN, N_COMPUTED_NOTALN, N_CACHED_NOTALN))
aln_stats = {"N_TOT_READS" : N_TOT_READS,
"N_CACHED_ALN" : N_CACHED_ALN,
"N_CACHED_NOTALN" : N_CACHED_NOTALN,
"N_COMPUTED_ALN" : N_COMPUTED_ALN,
"N_COMPUTED_NOTALN" : N_COMPUTED_NOTALN
}
return(aln_stats)
def process_bam(bam_filename, bam_chr_loc, output_bam, variantCache, ref_names, refs, args):
"""
process_bam processes each of the reads contained in a bam file, given a cache of pre-computed variants
bam_filename: name of bam (e.g. output of bowtie2)
bam_chr_loc: positions in the input bam to read from
output_bam: name out bam to write to (includes crispresso alignment info)
variantCache: dict with keys: sequence dict with keys (described in process_fastq)
ref_names: list of reference names
refs: dictionary of sequences name>ref object
args: crispresso2 args
"""
N_TOT_READS = 0
N_CACHED_ALN = 0 # read was found in cache
N_CACHED_NOTALN = 0 #read was found in 'not aligned' cache
N_COMPUTED_ALN = 0 # not in cache, aligned to at least 1 sequence with min cutoff
N_COMPUTED_NOTALN = 0 #not in cache, not aligned to any sequence with min cutoff
aln_matrix_loc = os.path.join(_ROOT, args.needleman_wunsch_aln_matrix_loc)
CRISPRessoShared.check_file(aln_matrix_loc)
aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc)
pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold sequence
if args.prime_editing_pegRNA_scaffold_seq != "":
pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length)
not_aln = {} #cache for reads that don't align
output_sam = output_bam+".sam"
with open(output_sam, "w") as sam_out:
#first, write header to sam
proc = sb.Popen(['samtools', 'view', '-H', bam_filename], stdout=sb.PIPE, stderr=sb.PIPE, encoding='utf-8')
for line in proc.stdout:
sam_out.write(line)
crispresso_cmd_to_write = ' '.join(sys.argv)
sam_out.write('@PG\tID:crispresso2\tPN:crispresso2\tVN:'+CRISPRessoShared.__version__+'\tCL:"'+crispresso_cmd_to_write+'"\n')
if bam_chr_loc != "":
proc = sb.Popen(['samtools', 'view', bam_filename, bam_chr_loc], stdout=sb.PIPE, encoding='utf-8')
else:
proc = sb.Popen(['samtools', 'view', bam_filename], stdout=sb.PIPE, encoding='utf-8')
for sam_line in proc.stdout:
sam_line_els = sam_line.rstrip().split("\t")
fastq_seq = sam_line_els[9]
if (N_TOT_READS % 10000 == 0):
info("Processing reads; N_TOT_READS: %d N_COMPUTED_ALN: %d N_CACHED_ALN: %d N_COMPUTED_NOTALN: %d N_CACHED_NOTALN: %d"%(N_TOT_READS, N_COMPUTED_ALN, N_CACHED_ALN, N_COMPUTED_NOTALN, N_CACHED_NOTALN))
N_TOT_READS+=1
#if the sequence has been seen and can't be aligned, skip it
if (fastq_seq in not_aln):
N_CACHED_NOTALN += 1
sam_line_els.append(not_aln[fastq_seq]) #Crispresso2 alignment: NA
sam_out.write("\t".join(sam_line_els)+"\n")
continue
#if the sequence is already associated with a variant in the variant cache, pull it out
if (fastq_seq in variantCache):
N_CACHED_ALN+=1
variantCache[fastq_seq]['count'] += 1
#sam_line_els[5] = variantCache[fastq_seq]['sam_cigar']
sam_line_els.append(variantCache[fastq_seq]['crispresso2_annotation'])
sam_out.write("\t".join(sam_line_els)+"\n")
#otherwise, create a new variant object, and put it in the cache
else:
new_variant = get_new_variant_object(args, fastq_seq, refs, ref_names, aln_matrix, pe_scaffold_dna_info)
if new_variant['best_match_score'] <= 0:
N_COMPUTED_NOTALN+=1
crispresso_sam_optional_fields = "c2:Z:ALN=NA" +\
" ALN_SCORES=" + ('&'.join([str(x) for x in new_variant['aln_scores']])) +\
" ALN_DETAILS=" + ('&'.join([','.join([str(y) for y in x]) for x in new_variant['ref_aln_details']]))
sam_line_els.append(crispresso_sam_optional_fields)
not_aln[fastq_seq] = crispresso_sam_optional_fields
sam_out.write("\t".join(sam_line_els)+"\n")
else:
N_COMPUTED_ALN+=1
class_names = []
ins_inds = []
del_inds = []
sub_inds = []
edit_strings = []
# for idx, best_match_name in enumerate(best_match_names):
for idx, best_match_name in enumerate(new_variant['aln_ref_names']):
payload=new_variant['variant_'+best_match_name]
del_inds.append([str(x[0][0])+"("+str(x[1])+")" for x in zip(payload['deletion_coordinates'], payload['deletion_sizes'])])
ins_vals = []
for ins_coord,ins_size in zip(payload['insertion_coordinates'],payload['insertion_sizes']):
ins_start = payload['ref_positions'].index(ins_coord[0])
ins_vals.append(payload['aln_seq'][ins_start+1:ins_start+1+ins_size])
ins_inds.append([str(x[0][0])+"("+str(x[1])+"+"+x[2]+")" for x in zip(payload['insertion_coordinates'], payload['insertion_sizes'], ins_vals)])
sub_inds.append(payload['substitution_positions'])
edit_strings.append('D'+str(int(payload['deletion_n']))+';I'+str(int(payload['insertion_n']))+';S'+str(int(payload['substitution_n'])))
crispresso_sam_optional_fields = "c2:Z:ALN="+("&".join(new_variant['aln_ref_names'])) +\
" CLASS="+new_variant['class_name']+\
" MODS="+("&".join(edit_strings))+\
" DEL="+("&".join([';'.join(x) for x in del_inds])) +\
" INS="+("&".join([';'.join(x) for x in ins_inds])) +\
" SUB=" + ("&".join([';'.join([str(y) for y in x]) for x in sub_inds])) +\
" ALN_REF=" + ('&'.join([new_variant['variant_'+name]['aln_ref'] for name in new_variant['aln_ref_names']])) +\
" ALN_SEQ=" + ('&'.join([new_variant['variant_'+name]['aln_seq'] for name in new_variant['aln_ref_names']]))
sam_line_els.append(crispresso_sam_optional_fields)
#cigar strings are in reference to the given amplicon, not to the genomic sequence to which this read is aligned..
#first_variant = new_variant['variant_'+new_variant['aln_ref_names'][0]]
#sam_cigar = ''.join(CRISPRessoShared.unexplode_cigar(''.join([CRISPRessoShared.CIGAR_LOOKUP[x] for x in zip(first_variant['aln_seq'],first_variant['aln_ref'])])))
#sam_line_els[5] = sam_cigar
#new_variant['sam_cigar'] = sam_cigar
new_variant['crispresso2_annotation'] = crispresso_sam_optional_fields
sam_out.write("\t".join(sam_line_els)+"\n")
variantCache[fastq_seq] = new_variant
output_sam = output_bam+".sam"
cmd = 'samtools view -Sb '+output_sam + '>'+output_bam + ' && samtools index ' + output_bam
bam_status=sb.call(cmd, shell=True)
if bam_status:
raise CRISPRessoShared.BadParameterException(
'Bam creation failed. Command used: {0}'.format(cmd),
)
info("Finished reads; N_TOT_READS: %d N_COMPUTED_ALN: %d N_CACHED_ALN: %d N_COMPUTED_NOTALN: %d N_CACHED_NOTALN: %d"%(N_TOT_READS, N_COMPUTED_ALN, N_CACHED_ALN, N_COMPUTED_NOTALN, N_CACHED_NOTALN))
aln_stats = {"N_TOT_READS": N_TOT_READS,
"N_CACHED_ALN": N_CACHED_ALN,
"N_CACHED_NOTALN": N_CACHED_NOTALN,
"N_COMPUTED_ALN": N_COMPUTED_ALN,
"N_COMPUTED_NOTALN": N_COMPUTED_NOTALN,
}
return(aln_stats)
def process_fastq_write_out(fastq_input, fastq_output, variantCache, ref_names, refs, args):
"""
process_fastq_write_out processes each of the reads contained in a fastq input file, given a cache of pre-computed variants. All reads are read in, analyzed, and written to output with annotation
fastq_input: input fastq
fastq_output: fastq to write out
variantCache: dict with keys: sequence dict with keys (described in process_fastq)
ref_names: list of reference names
refs: dictionary of sequences name>ref object
args: crispresso2 args
"""
N_TOT_READS = 0
N_CACHED_ALN = 0 # read was found in cache
N_CACHED_NOTALN = 0 #read was found in 'not aligned' cache
N_COMPUTED_ALN = 0 # not in cache, aligned to at least 1 sequence with min cutoff
N_COMPUTED_NOTALN = 0 #not in cache, not aligned to any sequence with min cutoff
aln_matrix_loc = os.path.join(_ROOT, args.needleman_wunsch_aln_matrix_loc)
CRISPRessoShared.check_file(aln_matrix_loc)
aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc)
pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold sequence
if args.prime_editing_pegRNA_scaffold_seq != "":
pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length)
not_aln = {} #cache for reads that don't align
not_aln[''] = "" #add empty sequence to the not_aln in case the fastq has an extra newline at the end
if fastq_input.endswith('.gz'):
fastq_input_handle=gzip.open(fastq_input, 'rt')
else:
fastq_input_handle=open(fastq_input)
fastq_out_handle = gzip.open(fastq_output, 'wt')
fastq_id = fastq_input_handle.readline()
while(fastq_id):
#read through fastq in sets of 4
fastq_seq = fastq_input_handle.readline().strip()
fastq_plus = fastq_input_handle.readline().strip()
fastq_qual = fastq_input_handle.readline()
if (N_TOT_READS % 10000 == 0):
info("Processing reads; N_TOT_READS: %d N_COMPUTED_ALN: %d N_CACHED_ALN: %d N_COMPUTED_NOTALN: %d N_CACHED_NOTALN: %d"%(N_TOT_READS, N_COMPUTED_ALN, N_CACHED_ALN, N_COMPUTED_NOTALN, N_CACHED_NOTALN))
N_TOT_READS+=1
#if the sequence has been seen and can't be aligned, skip it
if fastq_seq in not_aln:
N_CACHED_NOTALN += 1
fastq_out_handle.write(fastq_id+fastq_seq+"\n"+fastq_plus+not_aln[fastq_seq]+"\n"+fastq_qual) #not_aln[fastq_seq] is alignment: NA
elif fastq_seq in variantCache: #if the sequence is already associated with a variant in the variant cache, pull it out
N_CACHED_ALN+=1
variantCache[fastq_seq]['count'] += 1
fastq_out_handle.write(fastq_id+fastq_seq+"\n"+fastq_plus+variantCache[fastq_seq]['crispresso2_annotation']+"\n"+fastq_qual)
#otherwise, create a new variant object, and put it in the cache
else:
new_variant = get_new_variant_object(args, fastq_seq, refs, ref_names, aln_matrix, pe_scaffold_dna_info)
if new_variant['best_match_score'] <= 0:
N_COMPUTED_NOTALN+=1
crispresso2_annotation = " ALN=NA" +\
" ALN_SCORES=" + ('&'.join([str(x) for x in new_variant['aln_scores']])) +\
" ALN_DETAILS=" + ('&'.join([','.join([str(y) for y in x]) for x in new_variant['ref_aln_details']]))
not_aln[fastq_seq] = crispresso2_annotation
fastq_out_handle.write(fastq_id+fastq_seq+"\n"+fastq_plus+crispresso2_annotation+"\n"+fastq_qual)
else:
N_COMPUTED_ALN+=1
ins_inds = []
del_inds = []
sub_inds = []
edit_strings = []
# for idx, best_match_name in enumerate(best_match_names):
for idx, best_match_name in enumerate(new_variant['aln_ref_names']):
payload=new_variant['variant_'+best_match_name]
del_inds.append([str(x[0][0])+"("+str(x[1])+")" for x in zip(payload['deletion_coordinates'], payload['deletion_sizes'])])
ins_vals = []
for ins_coord,ins_size in zip(payload['insertion_coordinates'],payload['insertion_sizes']):
ins_start = payload['ref_positions'].index(ins_coord[0])
ins_vals.append(payload['aln_seq'][ins_start+1:ins_start+1+ins_size])
ins_inds.append([str(x[0][0])+"("+str(x[1])+"+"+x[2]+")" for x in zip(payload['insertion_coordinates'], payload['insertion_sizes'], ins_vals)])
sub_inds.append(payload['substitution_positions'])
edit_strings.append('D'+str(int(payload['deletion_n']))+';I'+str(int(payload['insertion_n']))+';S'+str(int(payload['substitution_n'])))
crispresso2_annotation = " ALN="+("&".join(new_variant['aln_ref_names'])) +\
" ALN_SCORES=" + ('&'.join([str(x) for x in new_variant['aln_scores']])) +\
" ALN_DETAILS=" + ('&'.join([','.join([str(y) for y in x]) for x in new_variant['ref_aln_details']])) +\
" CLASS="+new_variant['class_name']+\
" MODS="+("&".join(edit_strings))+\
" DEL="+("&".join([';'.join(x) for x in del_inds])) +\
" INS="+("&".join([';'.join(x) for x in ins_inds])) +\
" SUB=" + ("&".join([';'.join([str(y) for y in x]) for x in sub_inds])) +\
" ALN_REF=" + ('&'.join([new_variant['variant_'+name]['aln_ref'] for name in new_variant['aln_ref_names']])) +\
" ALN_SEQ=" + ('&'.join([new_variant['variant_'+name]['aln_seq'] for name in new_variant['aln_ref_names']]))
new_variant['crispresso2_annotation'] = crispresso2_annotation
fastq_out_handle.write(fastq_id+fastq_seq+"\n"+fastq_plus+crispresso2_annotation+"\n"+fastq_qual)
variantCache[fastq_seq] = new_variant
#last step of loop = read next line
fastq_id = fastq_input_handle.readline()
fastq_input_handle.close()
fastq_out_handle.close()
info("Finished reads; N_TOT_READS: %d N_COMPUTED_ALN: %d N_CACHED_ALN: %d N_COMPUTED_NOTALN: %d N_CACHED_NOTALN: %d"%(N_TOT_READS, N_COMPUTED_ALN, N_CACHED_ALN, N_COMPUTED_NOTALN, N_CACHED_NOTALN))
aln_stats = {"N_TOT_READS": N_TOT_READS,
"N_CACHED_ALN": N_CACHED_ALN,
"N_CACHED_NOTALN": N_CACHED_NOTALN,
"N_COMPUTED_ALN": N_COMPUTED_ALN,
"N_COMPUTED_NOTALN": N_COMPUTED_NOTALN,
}
return(aln_stats)
def process_single_fastq_write_bam_out(fastq_input, bam_output, bam_header, variantCache, ref_names, refs, args):
"""
process_fastq_write_out processes each of the reads contained in a fastq input file, given a cache of pre-computed variants. All reads are read in, analyzed, and written to output with annotation
**Note that in this mode, refs must contain the following values for each reference:
aln_chr: location where reads will be aligned - this is the reference sequence
aln_start: alignment start position of reference sequence
aln_end: alignment end position of reference sequence
aln_strand: strand reference aligned to
**Note that we expect one input fastq file (R1 and R2 merged for paired sequencing). At some point, we may want to accept paired ends and record each read separately.
fastq_input: input fastq
bam_output: bam to write out
bam_header: bam header to write to output bam file (eg including crispresso version and command used)
variantCache: dict with keys: sequence dict with keys (described in process_fastq)
ref_names: list of reference names
refs: dictionary of sequences name>ref object
args: crispresso2 args
"""
N_TOT_READS = 0
N_CACHED_ALN = 0 # read was found in cache
N_CACHED_NOTALN = 0 # read was found in 'not aligned' cache
N_COMPUTED_ALN = 0 # not in cache, aligned to at least 1 sequence with min cutoff
N_COMPUTED_NOTALN = 0 # not in cache, not aligned to any sequence with min cutoff
aln_matrix_loc = os.path.join(_ROOT, args.needleman_wunsch_aln_matrix_loc)
CRISPRessoShared.check_file(aln_matrix_loc)
aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc)
pe_scaffold_dna_info = (0, None) # scaffold start loc, scaffold sequence
if args.prime_editing_pegRNA_scaffold_seq != "":
pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length)
not_aln = {} # cache for reads that don't align
not_aln[''] = "" # add empty sequence to the not_aln in case the fastq has an extra newline at the end
if fastq_input.endswith('.gz'):
fastq_input_handle = gzip.open(fastq_input, 'rt')
else:
fastq_input_handle = open(fastq_input)
sam_out = bam_output+".sam"
sam_out_handle = open(sam_out, 'wt')
# write sam output header
sam_out_handle.write(bam_header)
fastq_id = fastq_input_handle.readline().strip()[1:]
while(fastq_id):
# read through fastq in sets of 4
fastq_seq = fastq_input_handle.readline().strip()
fastq_plus = fastq_input_handle.readline().strip()
fastq_qual = fastq_input_handle.readline().strip()
if (N_TOT_READS % 10000 == 0):
info("Processing reads; N_TOT_READS: %d N_COMPUTED_ALN: %d N_CACHED_ALN: %d N_COMPUTED_NOTALN: %d N_CACHED_NOTALN: %d"%(N_TOT_READS, N_COMPUTED_ALN, N_CACHED_ALN, N_COMPUTED_NOTALN, N_CACHED_NOTALN))
N_TOT_READS += 1
# if the sequence has been seen and can't be aligned, skip it
if fastq_seq in not_aln:
N_CACHED_NOTALN += 1
new_sam_entry = not_aln[fastq_seq][:]
new_sam_entry[0] = fastq_id
new_sam_entry[10] = fastq_qual
sam_out_handle.write("\t".join(new_sam_entry)+"\n") # not_aln[fastq_seq] is alignment: NA
elif fastq_seq in variantCache: # if the sequence is already associated with a variant in the variant cache, pull it out
N_CACHED_ALN += 1
variantCache[fastq_seq]['count'] += 1
new_sam_entry = variantCache[fastq_seq]['sam_entry'][:]
new_sam_entry[0] = fastq_id
new_sam_entry[10] = fastq_qual
sam_out_handle.write("\t".join(new_sam_entry)+"\n") # write cached alignment with modified read id and qual
# otherwise, create a new variant object, and put it in the cache
else:
new_variant = get_new_variant_object(args, fastq_seq, refs, ref_names, aln_matrix, pe_scaffold_dna_info)
if new_variant['best_match_score'] <= 0:
N_COMPUTED_NOTALN += 1
new_sam_entry = [
fastq_id, # read id
'4', # flag = unmapped 0x4
'*', # aln chr
'0', # aln loc
'0', # quality
'*', # cigar
'*', # next
'0', # next
'0', # tlen
fastq_seq, # seq
fastq_qual # qual
]
crispresso_sam_optional_fields = "c2:Z:ALN=NA" +\
" ALN_SCORES=" + ('&'.join([str(x) for x in new_variant['aln_scores']])) +\
" ALN_DETAILS=" + ('&'.join([','.join([str(y) for y in x]) for x in new_variant['ref_aln_details']]))
new_sam_entry.append(crispresso_sam_optional_fields)
not_aln[fastq_seq] = new_sam_entry
sam_out_handle.write("\t".join(new_sam_entry)+"\n") # write cached alignment with modified read id and qual
else:
N_COMPUTED_ALN += 1
ins_inds = []
del_inds = []
sub_inds = []
edit_strings = []
# for idx, best_match_name in enumerate(best_match_names):
for idx, best_match_name in enumerate(new_variant['aln_ref_names']):
payload = new_variant['variant_'+best_match_name]
del_inds.append([str(x[0][0])+"("+str(x[1])+")" for x in zip(payload['deletion_coordinates'], payload['deletion_sizes'])])
ins_vals = []
for ins_coord, ins_size in zip(payload['insertion_coordinates'], payload['insertion_sizes']):
ins_start = payload['ref_positions'].index(ins_coord[0])
ins_vals.append(payload['aln_seq'][ins_start+1:ins_start+1+ins_size])
ins_inds.append([str(x[0][0])+"("+str(x[1])+"+"+x[2]+")" for x in zip(payload['insertion_coordinates'], payload['insertion_sizes'], ins_vals)])
sub_inds.append(payload['substitution_positions'])
edit_strings.append('D'+str(int(payload['deletion_n']))+';I'+str(int(payload['insertion_n']))+';S'+str(int(payload['substitution_n'])))
first_ref_name = new_variant['aln_ref_names'][0]
first_payload = new_variant['variant_' + first_ref_name]
exploded_cigar = ''.join([CRISPRessoShared.CIGAR_LOOKUP[x] for x in zip(first_payload['aln_seq'], first_payload['aln_ref'])])
sam_cigar_els = CRISPRessoShared.unexplode_cigar(exploded_cigar)
sam_cigar = ''.join(sam_cigar_els)
sam_flag = '0'
if first_payload['aln_strand'] == '-':
sam_flag = '16'
this_aln_loc = refs[first_ref_name]['aln_start']
# #if cigar starts with deletions
# if sam_cigar_els[0].endswith('D'):
# num = int(sam_cigar_els[0][0:-1])
# this_aln_loc += num
# elif sam_cigar_els[0].endswith('I'):
# num = int(sam_cigar_els[0][0:-1])
# this_aln_loc -= num
seq_to_write = fastq_seq
qual_to_write = fastq_qual
cigar_to_write = sam_cigar
if refs[first_ref_name]['aln_strand'] == '-':
if sam_flag == '16':
sam_flag = '0'
else:
sam_flag = '16'
cigar_to_write = ''.join(sam_cigar_els[::-1])
seq_to_write = CRISPRessoShared.reverse_complement(fastq_seq)
qual_to_write = fastq_qual[::-1]
new_sam_entry = [
fastq_id, # read id
sam_flag, # flag = mapped
refs[first_ref_name]['aln_chr'], # aln chr
str(this_aln_loc), # aln loc
str(int(new_variant['best_match_score'])), # quality
cigar_to_write, # cigar
'*', # next
'0', # next
'0', # tlen
seq_to_write, # seq
qual_to_write # qual
]
crispresso_sam_optional_fields = "c2:Z:ALN="+("&".join(new_variant['aln_ref_names'])) +\
" ALN_SCORES=" + ('&'.join([str(x) for x in new_variant['aln_scores']])) +\
" ALN_DETAILS=" + ('&'.join([','.join([str(y) for y in x]) for x in new_variant['ref_aln_details']])) +\
" CLASS="+new_variant['class_name']+\
" MODS="+("&".join(edit_strings))+\
" DEL="+("&".join([';'.join(x) for x in del_inds])) +\
" INS="+("&".join([';'.join(x) for x in ins_inds])) +\
" SUB=" + ("&".join([';'.join([str(y) for y in x]) for x in sub_inds])) +\
" ALN_REF=" + ('&'.join([new_variant['variant_'+name]['aln_ref'] for name in new_variant['aln_ref_names']])) +\
" ALN_SEQ=" + ('&'.join([new_variant['variant_'+name]['aln_seq'] for name in new_variant['aln_ref_names']]))
new_sam_entry.append(crispresso_sam_optional_fields)
new_variant['sam_entry'] = new_sam_entry
sam_out_handle.write("\t".join(new_sam_entry)+"\n") # write cached alignment with modified read id and qual
variantCache[fastq_seq] = new_variant
#last step of loop = read next line
fastq_id = fastq_input_handle.readline().strip()[1:]
fastq_input_handle.close()
sam_out_handle.close()
sort_and_index_cmd = 'samtools sort ' + sam_out + ' -o ' + bam_output + ' && samtools index ' + bam_output
sort_bam_status = sb.call(sort_and_index_cmd, shell=True)
if sort_bam_status:
raise CRISPRessoShared.BadParameterException(
'Bam sort failed. Command used: {0}'.format(sort_and_index_cmd)
)
if not args.debug:
os.remove(sam_out)
info("Finished reads; N_TOT_READS: %d N_COMPUTED_ALN: %d N_CACHED_ALN: %d N_COMPUTED_NOTALN: %d N_CACHED_NOTALN: %d"%(N_TOT_READS, N_COMPUTED_ALN, N_CACHED_ALN, N_COMPUTED_NOTALN, N_CACHED_NOTALN))
aln_stats = {"N_TOT_READS": N_TOT_READS,
"N_CACHED_ALN": N_CACHED_ALN,
"N_CACHED_NOTALN": N_CACHED_NOTALN,
"N_COMPUTED_ALN": N_COMPUTED_ALN,
"N_COMPUTED_NOTALN": N_COMPUTED_NOTALN,
}
return(aln_stats)
def split_interleaved_fastq(fastq_filename, output_filename_r1, output_filename_r2):
if fastq_filename.endswith('.gz'):
fastq_handle = gzip.open(fastq_filename, 'rt')
else:
fastq_handle=open(fastq_filename)
try:
fastq_splitted_outfile_r1 = gzip.open(output_filename_r1, 'wt')
fastq_splitted_outfile_r2 = gzip.open(output_filename_r2, 'wt')
[fastq_splitted_outfile_r1.write(line) if (i % 8 < 4) else fastq_splitted_outfile_r2.write(line) for i, line in enumerate(fastq_handle)]
except:
raise CRISPRessoShared.BadParameterException('Error in splitting read pairs from a single file')
return output_filename_r1, output_filename_r2
def normalize_name(name, fastq_r1, fastq_r2, bam_input):
"""Normalize the name according to the inputs and clean it.
Parameters
----------
name : str
The name optionally provided by the user.
fastq_r1 : str
The path to the first fastq file.
fastq_r2 : str
The path to the second fastq file.
bam_input : str
The path to the bam file.
Returns
-------
str
The normalized name.
"""
get_name_from_fasta = lambda x: os.path.basename(x).replace('.fastq', '').replace('.gz', '').replace('.fq', '')
get_name_from_bam = lambda x: os.path.basename(x).replace('.bam', '')
if not name:
if fastq_r2:
return '%s_%s' % (get_name_from_fasta(fastq_r1), get_name_from_fasta(fastq_r2))
elif fastq_r1:
return '%s' % get_name_from_fasta(fastq_r1)
elif bam_input != '':
return '%s' % get_name_from_bam(bam_input)
else:
clean_name=CRISPRessoShared.slugify(name)
if name!= clean_name:
warn('The specified name %s contained invalid characters and was changed to: %s' % (name, clean_name))
return clean_name
def main():
def print_stacktrace_if_debug():
debug_flag = False
if 'args' in vars() and 'debug' in args:
debug_flag = args.debug
if debug_flag:
traceback.print_exc(file=sys.stdout)
error(traceback.format_exc())
else:
debug(traceback.format_exc())
try:
start_time = datetime.now()
start_time_string = start_time.strftime('%Y-%m-%d %H:%M:%S')
description = ['~~~CRISPResso 2~~~', '-Analysis of genome editing outcomes from deep sequencing data-']