/
heuristic_filters.py
299 lines (255 loc) · 11.3 KB
/
heuristic_filters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
"""
- Replacing occurrences of NN NN and NN NN NN with NN and vice versa.
- Replacing occurrences of JJ NN with NN and vice versa.
- Replacing occurrences of 'the TARGET', 'a TARGET' and 'an TARGET' with
  TARGET, and vice versa.
- Replacing occurrences of 'the NN', 'a NN' and 'an NN' with NN, and vice
  versa.
- Replacing occurrences of JJ JJ with JJ and vice versa. (For example, a
  wonderful beautiful place, a wonderful place)
- Replacing NNS with NN and vice versa, where NNS is the tag for plural
  nouns.
- Replacing occurrences of 'the' with 'a' and vice versa.
- Replacing occurrences of NN with NP and vice versa, where NP is the tag
  for Proper Nouns.
- Replacing occurrences of NP NP with NP and vice versa.
- Replacing occurrences of RB VV with VV, where RB is the tag for Adverbs.
  (For example: TARGET refers to efficiently studying for 5 hours, TARGET
  refers to studying for 5 hours.)
"""
import os
import sys
from collections import Counter
from typing import Any, Dict, List, Optional, Tuple
def unk_filter(
    intent_preds: List[int],
    slot_preds: List[List[str]]
) -> Tuple[List[int], List[List[str]]]:
    """
    Replace every "UNK" slot label with "O"; intents pass through unchanged.

    Args:
        intent_preds: One intent label per example.
        slot_preds: One list of slot labels per example.

    Returns:
        A pair of freshly built lists ``(intents, slots)``. Examples are paired
        with ``zip``, so the output is as long as the shorter input.
    """
    # Materialise the pairing once so both outputs cover the same examples.
    paired = list(zip(intent_preds, slot_preds))
    kept_intents = [intent for intent, _ in paired]
    cleaned_slots = [
        ["O" if tag == "UNK" else tag for tag in tags]
        for _, tags in paired
    ]
    return kept_intents, cleaned_slots
def common_filters(
    intent_preds: List[int],
    slot_preds: List[List[str]]
) -> Tuple[List[int], List[List[str]]]:
    """
    Apply the filters shared by every data type to the predictions.

    For each (intent, slot labels) pair:
      1. If the labels do not contain both a ``*TERM`` and a ``*DEF`` label,
         every slot becomes "O" and the intent becomes 0.
      2. Every "UNK" slot becomes "O".
      3. An "I-TERM" / "I-DEF" that is not directly preceded by a label of the
         same kind is promoted to "B-TERM" / "B-DEF".

    Args:
        intent_preds: One intent label per example.
        slot_preds: One list of slot labels per example.

    Returns:
        ``(intents, slots)`` as newly built lists; the inputs are not modified.
    """
    filtered_intents: List[int] = []
    filtered_slots: List[List[str]] = []

    for intent, tags in zip(intent_preds, slot_preds):
        has_term = any(tag.endswith("TERM") for tag in tags)
        has_def = any(tag.endswith("DEF") for tag in tags)

        if has_term and has_def:
            cleaned = ["O" if tag == "UNK" else tag for tag in tags]
        else:
            # A term without a definition (or the reverse) is discarded whole.
            intent = 0
            cleaned = ["O"] * len(tags)

        # Promote span-internal labels that open a span. The decision looks at
        # the previous label *before* any promotion, hence the separate list.
        repaired: List[str] = []
        previous = ""
        for tag in cleaned:
            if tag == "I-TERM" and not previous.endswith("TERM"):
                repaired.append("B-TERM")
            elif tag == "I-DEF" and not previous.endswith("DEF"):
                repaired.append("B-DEF")
            else:
                repaired.append(tag)
            previous = tag

        filtered_intents.append(intent)
        filtered_slots.append(repaired)

    return filtered_intents, filtered_slots
def _bridge_gaps(tags: List[str], kind: str, gap: int) -> List[str]:
    """Return a copy of ``tags`` where runs of exactly ``gap`` "O" labels that
    sit between two ``*kind`` labels are relabelled ``"I-" + kind``."""
    # Decisions are taken on the labels as they were before this pass, so a
    # fill made here can never trigger another fill within the same pass.
    bridged = list(tags)
    for start in range(len(tags) - gap - 1):
        end = start + gap + 1
        if (
            tags[start].endswith(kind)
            and tags[end].endswith(kind)
            and all(tag == "O" for tag in tags[start + 1:end])
        ):
            bridged[start + 1:end] = ["I-" + kind] * gap
    return bridged


def term_def_filters(
    intent_preds: List[int],
    slot_preds : List[List[str]],
) -> Tuple[List[int], List[List[str]]]:
    """
    Fill short "O" gaps inside term and definition spans.

    Three passes run in order on each example's slot labels:
      1. ``*TERM, O, *TERM``    -> the middle label becomes "I-TERM".
      2. ``*DEF, O, *DEF``      -> the middle label becomes "I-DEF".
      3. ``*DEF, O, O, *DEF``   -> both middle labels become "I-DEF".
    Each pass sees the output of the previous one. Intents are unchanged.

    Args:
        intent_preds: One intent label per example.
        slot_preds: One list of slot labels per example.

    Returns:
        ``(intents, slots)`` as newly built lists. The input label lists are
        left untouched (earlier versions rewrote them in place).
    """
    new_intent_preds: List[int] = []
    new_slot_preds: List[List[str]] = []
    for intent_pred, slot_pred in zip(intent_preds, slot_preds):
        filled = _bridge_gaps(slot_pred, "TERM", 1)
        filled = _bridge_gaps(filled, "DEF", 1)
        filled = _bridge_gaps(filled, "DEF", 2)
        new_intent_preds.append(intent_pred)
        new_slot_preds.append(filled)
    return new_intent_preds, new_slot_preds
def sym_nick_filters(
    intent_preds: List[int],
    slot_preds : List[List[str]],
    raw: List[List[str]]
) -> Tuple[List[int], List[List[str]]]:
    """
    Keep TERM labels only on tokens whose raw text contains "SYMBOL".

    For each example:
      1. A ``*TERM`` label on a token whose raw text does not contain the
         substring "SYMBOL" is replaced with "O".
      2. If, after that, the labels no longer contain both a ``*TERM`` and a
         ``*DEF`` label, every slot becomes "O" and the intent becomes 0.

    Args:
        intent_preds: One intent label per example.
        slot_preds: One list of slot labels per example.
        raw: Raw tokens per example; ``raw[i][j]`` is looked up for every
            TERM-labelled position ``j`` of example ``i``.

    Returns:
        ``(intents, slots)`` as newly built lists. The input label lists are
        left untouched (earlier versions rewrote them in place).
    """
    new_intent_preds: List[int] = []
    new_slot_preds: List[List[str]] = []
    for intent_pred, slot_pred, raw_data in zip(intent_preds, slot_preds, raw):
        # 1. [slot] Drop TERM labels that are not on a SYMBOL token. The raw
        #    token is only consulted for TERM-labelled positions.
        new_slot_pred = [
            "O" if tag.endswith("TERM") and 'SYMBOL' not in raw_data[sid] else tag
            for sid, tag in enumerate(slot_pred)
        ]

        # NOTE(review): the previous implementation also had a "change TERMs
        # in between DEFs" pass, but its DEF flag was cleared by the TERM token
        # itself before being tested, so it never rewrote a label. It is left
        # out here to keep the output identical; re-introduce it deliberately
        # if that rewrite is actually wanted.

        # 2. [slot + intent] Reset the example when a term or a definition is
        #    missing after filtering.
        term_exist = any(tag.endswith("TERM") for tag in new_slot_pred)
        def_exist = any(tag.endswith("DEF") for tag in new_slot_pred)
        new_intent_pred = intent_pred
        if not (term_exist and def_exist):
            new_slot_pred = ["O"] * len(slot_pred)
            new_intent_pred = 0

        new_intent_preds.append(new_intent_pred)
        new_slot_preds.append(new_slot_pred)
    return new_intent_preds, new_slot_preds
def _keeps_term_label(
    position: int,
    tag_count: int,
    raw_tokens: List[str],
    processed_tokens: List[str],
    query_string: str,
    symbol_length_threshold: int,
) -> bool:
    """Decide whether the TERM label at ``position`` survives the symbol checks."""
    # 1. The raw token must be a SYMBOL placeholder.
    if 'SYMBOL' not in raw_tokens[position]:
        return False
    # 1b. The token needs a neighbour on both sides; a symbol on either edge
    #     of the sequence is dropped without looking at neighbours.
    if not 0 < position < tag_count - 1:
        return False
    #     ...and at least one of those neighbours must be the query marker.
    if (
        raw_tokens[position - 1] != query_string
        and raw_tokens[position + 1] != query_string
    ):
        return False
    # 1c. The processed text must contain at least one alphabetic character.
    symbol_text = processed_tokens[position]
    if not any(char.isalpha() for char in symbol_text):
        return False
    # 1d. Overly long symbols are dropped.
    return len(symbol_text) <= symbol_length_threshold


def sym_nick_query_filters(
    intent_preds: List[int],
    slot_preds : List[List[str]],
    raw: List[List[str]],
    raw_processed : List[List[str]],
    query_string: str = '||||',
    symbol_length_threshold: int = 30
) -> Tuple[List[int], List[List[str]]]:
    """
    Keep TERM labels only on plausible, query-marked SYMBOL tokens.

    A ``*TERM`` label is replaced with "O" unless all of the following hold:
      1.  the raw token contains the substring "SYMBOL";
      1b. the token is not the first or last position, and the raw token
          directly before or directly after it equals ``query_string``;
      1c. the processed token contains at least one alphabetic character;
      1d. the processed token is at most ``symbol_length_threshold`` long.
    If the labels then no longer contain both a ``*TERM`` and a ``*DEF``
    label, every slot becomes "O" and the intent becomes 0.

    Args:
        intent_preds: One intent label per example.
        slot_preds: One list of slot labels per example.
        raw: Raw tokens per example, used for the SYMBOL / query-marker checks.
        raw_processed: Processed tokens per example, used for the alphabetic
            and length checks. Presumably aligned index-for-index with
            ``raw`` -- verify against the caller.
        query_string: Marker token expected next to the queried symbol.
        symbol_length_threshold: Maximum accepted length of a processed symbol.

    Returns:
        ``(intents, slots)`` as newly built lists. The input label lists are
        left untouched (earlier versions rewrote them in place).
    """
    new_intent_preds: List[int] = []
    new_slot_preds: List[List[str]] = []
    for intent_pred, slot_pred, raw_data, raw_processed_data in zip(
        intent_preds, slot_preds, raw, raw_processed
    ):
        tag_count = len(slot_pred)
        new_slot_pred: List[str] = []
        for sid, tag in enumerate(slot_pred):
            # Tokens are only inspected for TERM-labelled positions.
            if tag.endswith("TERM") and not _keeps_term_label(
                sid,
                tag_count,
                raw_data,
                raw_processed_data,
                query_string,
                symbol_length_threshold,
            ):
                tag = 'O'
            new_slot_pred.append(tag)

        # NOTE(review): the previous implementation also had a "change TERMs
        # in between DEFs" pass, but its DEF flag was cleared by the TERM token
        # itself before being tested, so it never rewrote a label. It is left
        # out here to keep the output identical; re-introduce it deliberately
        # if that rewrite is actually wanted.

        # [slot + intent] Reset the example when a term or a definition is
        # missing after filtering.
        term_exist = any(tag.endswith("TERM") for tag in new_slot_pred)
        def_exist = any(tag.endswith("DEF") for tag in new_slot_pred)
        new_intent_pred = intent_pred
        if not (term_exist and def_exist):
            new_slot_pred = ["O"] * len(slot_pred)
            new_intent_pred = 0

        new_intent_preds.append(new_intent_pred)
        new_slot_preds.append(new_slot_pred)
    return new_intent_preds, new_slot_preds
def abbr_exp_filters(
    intent_preds: List[int],
    slot_preds : List[List[str]],
) -> Tuple[List[int], List[List[str]]]:
    """
    Pass the predictions through unchanged.

    No filtering is implemented here: the very same ``intent_preds`` and
    ``slot_preds`` objects that were passed in are returned as a pair.
    """
    unchanged = (intent_preds, slot_preds)
    return unchanged
def heuristic_filters(
    intent_preds: Dict[str, List[int]],
    slot_preds: Dict[str, List[List[str]]],
    raw: List[List[str]],
    task : str,
    raw_processed: List[List[str]],
) -> Tuple[Dict[str, List[int]], Dict[str, List[List[str]]]]:
    """
    Run the heuristic filters that match each data type named in ``task``.

    ``task`` is split on "+" and each resulting name is used as a key into
    both prediction dicts. Every data type first goes through
    ``common_filters``. After that, names starting with
    "DocDefQueryInplace2" also go through ``sym_nick_query_filters`` (which
    additionally receives ``raw`` and ``raw_processed``), and the name "W00"
    also goes through ``term_def_filters``. Other names get no further
    filtering.

    Args:
        intent_preds: Intent predictions keyed by data type.
        slot_preds: Slot-label predictions keyed by data type.
        raw: Raw tokens per example, forwarded to ``sym_nick_query_filters``.
        task: "+"-separated data type names.
        raw_processed: Processed tokens per example, forwarded to
            ``sym_nick_query_filters``.

    Returns:
        The same two dict objects that were passed in, with the entries for
        the processed data types replaced by their filtered versions.
    """
    for name in task.split('+'):
        # Store the common-filter result straight away so the dicts reflect it
        # even if a later, type-specific filter fails.
        intents, slots = common_filters(intent_preds[name], slot_preds[name])
        intent_preds[name], slot_preds[name] = intents, slots

        # The two conditions are mutually exclusive, so their order is free.
        if name == 'W00':
            refined = term_def_filters(intent_preds[name], slot_preds[name])
        elif name.startswith('DocDefQueryInplace2'):
            refined = sym_nick_query_filters(
                intent_preds[name], slot_preds[name], raw, raw_processed
            )
        else:
            continue
        intent_preds[name], slot_preds[name] = refined

    return intent_preds, slot_preds