-
Notifications
You must be signed in to change notification settings - Fork 0
/
dupdemux.py
271 lines (226 loc) · 8.51 KB
/
dupdemux.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
import argparse
import gzip
import logging
import re
from collections import Counter
from enum import Enum
from pathlib import Path
import pysam
# Matches a barcode label as emitted by the basecaller: "barcodeNN" with
# exactly two digits, or the literal "unclassified".
BARCODE_RE = re.compile(r"(barcode\d\d|unclassified)")
class ReadType(Enum):
    # Values mirror the integer "dx" BAM tag written by the basecaller
    # (see main(): ReadType(read.get_tag("dx"))):
    #   1  -> duplex read (consensus of a template/complement pair)
    #  -1  -> simplex read that was folded into a duplex pair
    #   0  -> simplex read with no duplex offspring
    DUPLEX = 1
    SIMPLEX_WITH_OFFSPRING = -1
    SIMPLEX_NO_OFFSPRING = 0
def read_summary_file(summary_file: str) -> dict[str, str]:
    """Parse a sequencing summary TSV into a read_id -> barcode mapping.

    Args:
        summary_file: Path to a tab-separated file whose first (header) row
            contains at least the ``read_id`` and ``barcode`` columns.

    Returns:
        Dict mapping each read ID to its barcode string. Later duplicate
        read IDs overwrite earlier ones.

    Raises:
        ValueError: If the ``read_id`` or ``barcode`` column is missing.
    """
    with open(summary_file) as fp:
        # rstrip("\n") rather than strip(): strip() would also remove leading/
        # trailing tabs and thereby drop empty first/last TSV fields, shifting
        # column indices.
        headers = next(fp).rstrip("\n").split("\t")

        def require_column(name: str) -> int:
            # Locate a mandatory column, failing with a clear message.
            try:
                return headers.index(name)
            except ValueError:
                raise ValueError(f"{name} column not found in summary file") from None

        read_id_idx = require_column("read_id")
        barcode_idx = require_column("barcode")
        id2barcode: dict[str, str] = {}
        for line in fp:
            if not line.strip():
                # Tolerate blank (e.g. trailing) lines instead of IndexError-ing.
                continue
            fields = line.rstrip("\n").split("\t")
            id2barcode[fields[read_id_idx]] = fields[barcode_idx]
    return id2barcode
def extract_barcode(s: str) -> str:
    """Return the barcode label embedded in *s*.

    Recognises "barcodeNN" (two digits) or the literal "unclassified"
    anywhere in the string.

    Raises:
        ValueError: If *s* contains no recognisable barcode label.
    """
    match = re.search(r"(barcode\d\d|unclassified)", s)
    if match is None:
        raise ValueError(f"Invalid barcode: {s}")
    return match.group(1)
def to_fastq(read: pysam.AlignedSegment) -> str:
    """Render *read* as a single four-line FASTQ record (trailing newline kept).

    BAM tags are carried into the header line as space-separated
    "TAG=value" pairs; qualities are re-encoded as Phred+33 ASCII.
    """
    quality = "".join(chr(score + 33) for score in read.query_qualities)
    header_tags = " ".join(f"{tag}={value}" for tag, value in read.tags)
    return (
        f"@{read.query_name} {header_tags}\n"
        f"{read.query_sequence}\n"
        "+\n"
        f"{quality}\n"
    )
def main():
    """Demultiplex basecalled (duplex-capable) reads into per-barcode files.

    Reads a sequencing summary (read_id -> barcode) and a [BS]AM stream,
    resolves a barcode for every read — duplex reads get the consensus of
    their two parents' barcodes — and appends each read to one output file
    per barcode (BAM by default, FASTQ/FASTQ.gz with -f/-z).
    """
    parser = argparse.ArgumentParser(description="Duplex Read Demultiplexer")
    parser.add_argument("summary_file", help="Path to the sequencing summary file")
    parser.add_argument(
        "bam_file",
        help="Path to the [BS]AM file [default: stdin]. Stdin must be a BAM file.",
        default="-",
    )
    parser.add_argument(
        "-d", "--duplex", action="store_true", help="Keep only duplex reads"
    )
    parser.add_argument(
        "-o",
        "--outdir",
        help="Output barcodes files to this directory [default: demux]",
        default="demux",
        type=Path,
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Enable verbose logging"
    )
    parser.add_argument(
        "-U",
        "--greedy",
        action="store_true",
        help="If a duplex read's barcodes do not match, and one is unclassified, use the classified barcode",
    )
    parser.add_argument(
        "-f",
        "--fastq",
        action="store_true",
        help="Output fastq files instead of BAM files",
    )
    parser.add_argument(
        "-z",
        "--gzip",
        action="store_true",
        help="Compress output fastq files with gzip",
    )
    parser.add_argument(
        "-s",
        "--fastq-suffix",
        help="Suffix to add to fastq files [default: .fastq]",
        default=".fastq",
    )
    args = parser.parse_args()

    # Configure the root logger: level from --verbose, timestamped format.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG if args.verbose else logging.INFO)
    formatter = logging.Formatter(
        "[%(asctime)s] [%(levelname)s]: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    summary_file = args.summary_file
    # Only an explicit .sam path is opened as text; stdin and everything else
    # is treated as BAM.
    mode = "rb"
    if args.bam_file != "-" and args.bam_file.endswith(".sam"):
        mode = "r"
    # we don't check for sequences because these are generally unaligned BAMs
    bam_file = pysam.AlignmentFile(args.bam_file, mode=mode, check_sq=False)
    keep_duplex = args.duplex
    outdir = args.outdir.resolve()
    if outdir.exists() and not outdir.is_dir():
        raise ValueError(f"{outdir} exists and is not a directory")
    if not outdir.exists():
        outdir.mkdir(parents=True)
    # barcode -> open output handle, created lazily on first read per barcode
    handles = {}

    # Read the summary file
    id2barcode = read_summary_file(summary_file)
    # fix: was logging.info (root module call) — use the configured logger
    # like every other message in this function
    logger.info(f"Read {len(id2barcode)} reads from {summary_file}")

    # Iterate over the BAM file and write the output
    num_mismatched_barcodes = 0
    num_orphan_reads = 0
    barcode_counts = Counter()
    logger.info("Demultiplexing reads...")
    for read in bam_file:
        try:
            dx = read.get_tag("dx")
        except KeyError as err:
            raise KeyError("dx tag not found in BAM file") from err
        read_type = ReadType(dx)
        if keep_duplex and read_type != ReadType.DUPLEX:
            continue
        if read_type != ReadType.DUPLEX:
            # Simplex read: look its barcode up directly by read ID.
            barcode = id2barcode.get(read.query_name)
            if barcode is None:
                logger.debug(
                    f"Barcode not found for simplex read: {read.query_name} - "
                    "this is a known issue https://github.com/nanoporetech/dorado/issues/474 - "
                    "marking as unclassified"
                )
                barcode = "unclassified"
                num_orphan_reads += 1
            barcode = extract_barcode(barcode)
        else:
            # Duplex read: its ID is "<parent1>;<parent2>" — resolve both
            # parents' barcodes and require them to agree.
            if ";" not in read.query_name:
                raise ValueError(
                    f"Duplex read IDs should have a ';' delimiter: {read.query_name}"
                )
            try:
                id1, id2 = read.query_name.split(";")
            except ValueError as err:
                raise ValueError(
                    f"Duplex read IDs should have exactly two read IDs: {read.query_name}"
                ) from err
            barcode1 = id2barcode.get(id1)
            if barcode1 is None:
                logger.debug(
                    f"Barcode not found for duplex read: {id1} - "
                    "this is a known issue https://github.com/nanoporetech/dorado/issues/474 - "
                    "marking as unclassified"
                )
                barcode1 = "unclassified"
                num_orphan_reads += 1
            else:
                barcode1 = extract_barcode(barcode1)
            barcode2 = id2barcode.get(id2)
            if barcode2 is None:
                logger.debug(
                    f"Barcode not found for duplex read: {id2} - "
                    "this is a known issue https://github.com/nanoporetech/dorado/issues/474 - "
                    "marking as unclassified"
                )
                barcode2 = "unclassified"
                num_orphan_reads += 1
            else:
                # consistency fix: mirror the barcode1 branch. Behaviour is
                # unchanged — extract_barcode("unclassified") returns
                # "unclassified" — but now only classified labels are parsed.
                barcode2 = extract_barcode(barcode2)
            if barcode1 != barcode2:
                if "unclassified" in (barcode1, barcode2) and args.greedy:
                    # --greedy: trust the single classified parent.
                    if barcode1 == "unclassified":
                        barcode = barcode2
                    else:
                        barcode = barcode1
                    logger.debug(
                        f"Barcodes do not match for duplex read: {read.query_name} - {barcode1} != {barcode2} - "
                        "but one is unclassified - using classified barcode"
                    )
                else:
                    logger.debug(
                        f"Barcodes do not match for duplex read: {read.query_name} - {barcode1} != {barcode2} - marking as unclassified"
                    )
                    num_mismatched_barcodes += 1
                    barcode = "unclassified"
            else:
                barcode = barcode1
        read.set_tag("BC", barcode, value_type="Z")
        fp = handles.get(barcode)
        if fp is None:
            # First read for this barcode: open its output file.
            path = str(outdir / f"{barcode}")
            if args.fastq:
                path += args.fastq_suffix
                if args.gzip:
                    path += ".gz"
                    handles[barcode] = gzip.open(path, "wt")
                else:
                    handles[barcode] = open(path, "w")
            else:
                path += ".bam"
                handles[barcode] = pysam.AlignmentFile(
                    path, mode="wb", template=bam_file
                )
            fp = handles[barcode]
        if args.fastq:
            read = to_fastq(read)
        fp.write(read)
        barcode_counts[barcode] += 1

    logger.warning(
        f"Number of duplex reads with mismatched barcodes: {num_mismatched_barcodes} - use --verbose to see details"
    )
    logger.warning(
        f"Number of unknown read IDs: {num_orphan_reads} - use --verbose to see details"
    )
    logger.info(f"Total number of reads assessed: {sum(barcode_counts.values())}")
    bam_file.close()
    logger.info("Barcode counts:")
    # sort by barcode
    for barcode, count in sorted(barcode_counts.items(), key=lambda x: x[0]):
        logger.info(f"{barcode}\t{count}")
    # close all the files
    for handle in handles.values():
        handle.close()
    logger.info("Done")


if __name__ == "__main__":
    main()