-
Notifications
You must be signed in to change notification settings - Fork 8
/
mass_calc.py
244 lines (207 loc) · 7.43 KB
/
mass_calc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import numpy as np
from typing import List, Tuple
from alphabase.constants.aa import (
calc_AA_masses,
calc_AA_masses_for_same_len_seqs,
calc_sequence_masses_for_same_len_seqs,
)
from alphabase.constants.modification import (
calc_modification_mass,
calc_modification_mass_sum,
calc_mod_masses_for_same_len_seqs,
)
from alphabase.constants.atom import MASS_H2O
def calc_diff_modification_mass(
    pep_len: int, mass_diffs: List[float], mass_diff_sites: List[int]
) -> np.ndarray:
    """
    Build the per-residue modification masses from open-search mass diffs.

    For open-search, modifications may be reported as mass diffs
    instead of modification names. This function places each mass diff
    on the residue given by its site.

    Parameters
    ----------
    pep_len : int
        Peptide length (nAA).

    mass_diffs : List[float]
        Mass diffs on the peptide.

    mass_diff_sites : List[int]
        Localized sites of the corresponding mass diffs.

        * `site=0` is added to the first residue
        * `site=-1` is added to the last residue
        * any other `site` is added at array index `site-1`

    Returns
    -------
    np.ndarray
        1-D array with length=`pep_len`.
        Masses of modifications (mass diffs) through the peptide,
        `0` if a site has no modifications. Several mass diffs on the
        same site are summed.
    """
    masses = np.zeros(pep_len)
    for site, mass in zip(mass_diff_sites, mass_diffs):
        # 0 and -1 are used directly as array indices (first/last residue);
        # every other site is shifted down by one to become an index.
        index = site if site in (0, -1) else site - 1
        masses[index] += mass
    return masses
def calc_mod_diff_masses_for_same_len_seqs(
    nAA: int, aa_mass_diffs_list: List[List[float]], mod_sites_list: List[List[int]]
) -> np.ndarray:
    """
    Calculate mass-diff modification masses for peptides of length `nAA`.

    For open-search, modifications may be reported as mass diffs
    instead of modification names. This function places the mass diffs
    of every peptide on the residues given by their sites.

    Parameters
    ----------
    nAA : int
        Peptide length.

    aa_mass_diffs_list : List[List[float]]
        One list of mass diffs per peptide.

    mod_sites_list : List[List[int]]
        One list of sites per peptide, corresponding
        to `aa_mass_diffs_list`.

        * `site=0` refers to an N-term modification
        * `site=-1` refers to a C-term modification
        * `1<=site<=nAA` refers to a normal modification

    Returns
    -------
    np.ndarray
        2-D array with shape=`(len(aa_mass_diffs_list), nAA)`,
        i.e. one row per peptide.
        Masses of modifications through all the peptides,
        `0` if a site has no modifications. Several mass diffs on the
        same site are summed.
    """
    masses = np.zeros((len(aa_mass_diffs_list), nAA))
    for i, (aa_mass_diffs, mod_sites) in enumerate(
        zip(aa_mass_diffs_list, mod_sites_list)
    ):
        for mod_diff, site in zip(aa_mass_diffs, mod_sites):
            # 0 (N-term) and -1 (C-term) are used directly as column
            # indices; every other site is shifted down by one.
            index = site if site in (0, -1) else site - 1
            masses[i, index] += mod_diff
    return masses
def calc_b_y_and_peptide_mass(
    sequence: str,
    mod_names: List[str],
    mod_sites: List[int],
    aa_mass_diffs: List[float] = None,
    aa_mass_diff_sites: List[int] = None,
) -> Tuple[np.ndarray, np.ndarray, float]:
    """
    Calculate b/y fragment masses and the peptide mass of a single peptide.

    It is highly recommended to use
    `calc_b_y_and_peptide_masses_for_same_len_seqs`
    instead, as it is much faster.

    Parameters
    ----------
    sequence : str
        Peptide sequence.

    mod_names : List[str]
        Modification names, passed to `calc_modification_mass`.

    mod_sites : List[int]
        Modification sites corresponding to `mod_names`.

    aa_mass_diffs : List[float], optional
        Mass diffs on the peptide. Skipped when `None`.

    aa_mass_diff_sites : List[int], optional
        Sites corresponding to `aa_mass_diffs`.

    Returns
    -------
    np.ndarray
        b masses: cumulative residue masses without the last entry.

    np.ndarray
        y masses: peptide mass minus the b masses.

    float
        Peptide mass: sum of all residue masses plus `MASS_H2O`.
    """
    n_residues = len(sequence)

    # Accumulate everything in place on the per-residue mass array.
    per_residue = calc_AA_masses(sequence)
    per_residue += calc_modification_mass(n_residues, mod_names, mod_sites)
    if aa_mass_diffs is not None:
        per_residue += calc_diff_modification_mass(
            n_residues, aa_mass_diffs, aa_mass_diff_sites
        )

    cumulative = np.cumsum(per_residue)
    # The last cumulative value covers all residues; water completes the peptide.
    pepmass = cumulative[-1] + MASS_H2O
    b_masses = cumulative[:-1]
    y_masses = pepmass - b_masses
    return b_masses, y_masses, pepmass
def calc_peptide_masses_for_same_len_seqs(
    sequences: np.ndarray, mod_list: List[str], mod_diff_list: List[str] = None
) -> np.ndarray:
    """
    Calculate peptide masses for peptide sequences with the same length.

    'same_len' is required because numpy can process AA sequences
    of equal length very fast.
    See `alphabase.aa.calc_sequence_masses_for_same_len_seqs`

    Parameters
    ----------
    sequences : np.ndarray
        Peptide sequences, passed to
        `calc_sequence_masses_for_same_len_seqs`.

    mod_list : List[str]
        One ';'-joined string of modifications per peptide,
        e.g. `['Oxidation@M;Phospho@S','Phospho@S;Deamidated@N']`.
        Empty strings are skipped.

    mod_diff_list : List[str], optional
        One ';'-joined string of mass diffs per peptide,
        e.g. `['15.9xx;79.9xxx','79.9xx;0.98xx']`.
        Empty strings are skipped; ignored entirely when `None`.

    Returns
    -------
    np.ndarray
        Sequence masses plus the summed modification masses per peptide
        (documented upstream as a 1-D array with H2O already added).
    """
    seq_masses = calc_sequence_masses_for_same_len_seqs(sequences)
    total_mod_masses = np.zeros_like(seq_masses)

    for idx, mod_str in enumerate(mod_list):
        if len(mod_str) == 0:
            continue
        total_mod_masses[idx] = calc_modification_mass_sum(mod_str.split(";"))

    if mod_diff_list is None:
        return seq_masses + total_mod_masses

    for idx, diff_str in enumerate(mod_diff_list):
        if len(diff_str) == 0:
            continue
        diffs = [float(mass) for mass in diff_str.split(";")]
        total_mod_masses[idx] += np.sum(diffs)

    return seq_masses + total_mod_masses
def calc_b_y_and_peptide_masses_for_same_len_seqs(
    sequences: np.ndarray,
    mod_list: List[List[str]],
    site_list: List[List[int]],
    mod_diff_list: List[List[float]] = None,
    mod_diff_site_list: List[List[int]] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Calculate b/y fragment masses and peptide masses
    for peptide sequences with the same length.

    'same_len' is required because numpy can process AA sequences
    of equal length very fast.

    Parameters
    ----------
    sequences : np.ndarray of str
        np.ndarray of peptide sequences with the same length.
        Must contain at least one sequence, because the peptide
        length is read from `sequences[0]`.

    mod_list : List[List[str]]
        list of modifications,
        e.g. `[['Oxidation@M','Phospho@S'],['Phospho@S','Deamidated@N']]`

    site_list : List[List[int]]
        list of modification sites
        corresponding to `mod_list`, e.g. `[[3,6],[4,17]]`

    mod_diff_list : List[List[float]], optional
        list of modification mass diffs,
        e.g. `[[15.994915,79.966331],[79.966331,0.984016]]`.
        Skipped when `None`.

    mod_diff_site_list : List[List[int]], optional
        list of modification mass diff sites
        corresponding to `mod_diff_list`, e.g. `[[3,6],[4,17]]`

    Returns
    -------
    np.ndarray
        neutral b fragment masses (2-D array)

    np.ndarray
        neutral y fragment masses (2-D array)

    np.ndarray
        neutral peptide masses (1-D array)
    """
    aa_masses = calc_AA_masses_for_same_len_seqs(sequences)
    nAA = len(sequences[0])

    mod_masses = calc_mod_masses_for_same_len_seqs(nAA, mod_list, site_list)
    if mod_diff_list is not None:
        mod_masses += calc_mod_diff_masses_for_same_len_seqs(
            nAA, mod_diff_list, mod_diff_site_list
        )

    aa_masses += mod_masses

    b_masses = np.cumsum(aa_masses, axis=1)
    # Keep the last column 2-D (`-1:`) so that it broadcasts against
    # the b masses when computing the y masses below.
    b_masses, pepmass = b_masses[:, :-1], b_masses[:, -1:]

    pepmass += MASS_H2O
    y_masses = pepmass - b_masses
    return b_masses, y_masses, pepmass.flatten()