/
RgiIO.py
158 lines (146 loc) · 6.98 KB
/
RgiIO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python
import csv
import re
import math
from .Interfaces import hAMRonizedResultIterator
from hAMRonization.constants import (
NUCLEOTIDE_VARIANT,
AMINO_ACID_VARIANT,
GENE_PRESENCE,
)
required_metadata = [
"analysis_software_version",
"reference_database_version",
"input_file_name",
]
class RgiIterator(hAMRonizedResultIterator):
def __init__(self, source, metadata):
metadata["analysis_software_name"] = "rgi"
metadata["reference_database_name"] = "CARD"
metadata["genetic_variation_type"] = "Gene presence detected"
self.metadata = metadata
with open(source) as fh:
header = next(fh)
# i.e. RGI-bwt
if "Resistomes & Variants: Observed in Genome(s)" in header.strip().split(
"\t"
):
self.field_mapping = {
"ARO Term": "gene_symbol",
"ARO Accession": "reference_accession",
"Reference Model Type": "genetic_variation_type",
"Reference DB": "reference_database_name",
"Alleles with Mapped Reads": None,
"Reference Allele(s) Identity "
"to CARD Reference Protein (%)": "sequence_identity",
"Resistomes & Variants: Observed in Genome(s)": None,
"Resistomes & Variants: Observed in Plasmid(s)": None,
"Resistomes & Variants: Observed Pathogen(s)": None,
"Completely Mapped Reads": None,
"Mapped Reads with Flanking Sequence": None,
"All Mapped Reads": None,
"Average Percent Coverage": "coverage_percentage",
"Average Length Coverage (bp)": "input_gene_length",
"Average MAPQ (Completely Mapped Reads)": None,
"Number of Mapped Baits": None,
"Number of Mapped Baits with Reads": None,
"Average Number of reads per Bait": None,
"Number of reads per Bait " "Coefficient of Variation (%)": None,
"Number of reads mapping to baits "
"and mapping to complete gene": None,
"Number of reads mapping to baits and "
"mapping to complete gene (%)": None,
"Mate Pair Linkage (# reads)": None,
"Reference Length": "reference_gene_length",
"AMR Gene Family": "gene_name",
"Drug Class": "drug_class",
"Resistance Mechanism": "resistance_mechanism",
}
else:
# normal RGI mode
self.field_mapping = {
"ORF_ID": None,
"Contig": "input_sequence_id",
"Start": "input_gene_start",
"Stop": "input_gene_stop",
"Orientation": "strand_orientation",
"Cut_Off": None,
"Pass_Bitscore": None,
"Best_Hit_Bitscore": None,
"Best_Hit_ARO": "gene_symbol",
"Best_Identities": "sequence_identity",
"ARO": "reference_accession",
"Model_type": "genetic_variation_type",
"SNPs_in_Best_Hit_ARO": None,
"_nucleotide_mutation": "nucleotide_mutation",
"_amino_acid_mutation": "amino_acid_mutation",
"Other_SNPs": None,
"Drug Class": "drug_class",
"Resistance Mechanism": "resistance_mechanism",
"AMR Gene Family": "gene_name",
"Predicted_DNA": None,
"Predicted_Protein": None,
"CARD_Protein_Sequence": None,
"Percentage Length of " "Reference Sequence": "coverage_percentage",
"ID": None,
"Model_ID": None,
"Nudged": None,
"Note": None,
}
# if RGI is run on ORFs then Contig should be None
# and input_sequence_id should the ORF_ID i.e., reverse of
# rgi run on contig input
# this checks for that
try:
first_result = next(fh)
first_result = first_result.strip().split('\t')
if first_result[1] == '':
self.field_mapping['ORF_ID'] = 'input_sequence_id'
self.field_mapping['Contig'] = None
except StopIteration:
pass
super().__init__(source, self.field_mapping, self.metadata)
def parse(self, handle):
"""
Read each and return it
"""
# skip any manually specified fields for later
reader = csv.DictReader(handle, delimiter="\t")
for result in reader:
# rgi-bwt mode doesn't support variant mutations
if "Model_type" not in result:
result["_nucleotide_mutation"] = None
result["_amino_acid_mutation"] = None
result["Reference Model Type"] = GENE_PRESENCE
# normal RGI model
else:
result["_nucleotide_mutation"] = None
result["_amino_acid_mutation"] = None
if result["SNPs_in_Best_Hit_ARO"] == "n/a":
result["SNPs_in_Best_Hit_ARO"] = None
hgvs_mutations = []
if (
result["Model_type"] == "protein variant model"
or result["Model_type"] == "protein overexpression model"
):
result["Model_type"] = AMINO_ACID_VARIANT
if result["SNPs_in_Best_Hit_ARO"]:
for mutation in result["SNPs_in_Best_Hit_ARO"].split(","):
hgvs_mutations.append(f"p.{mutation}")
result["_amino_acid_mutation"] = ",".join(hgvs_mutations)
elif result["Model_type"] == "rrna variant model":
result["Model_type"] = NUCLEOTIDE_VARIANT
if result["SNPs_in_Best_Hit_ARO"]:
for mutation in result["SNPs_in_Best_Hit_ARO"].split(","):
_, ref, pos, alt, _ = re.split(r"(\D+)(\d+)(\D+)", mutation)
hgvs_mutations.append(f"n.{pos}{ref}>{alt}")
result["_nucleotide_mutation"] = ",".join(hgvs_mutations)
else:
result["Model_type"] = GENE_PRESENCE
# round down average length of coverage so its comparable to other
# target lengths
if "Average Length Coverage (bp)" in result:
result["Average Length Coverage (bp)"] = math.floor(
float(result["Average Length Coverage (bp)"])
)
yield self.hAMRonize(result, self.metadata)