-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
47 lines (33 loc) · 1.22 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from collections import Counter
from tqdm import tqdm
from core.book.paragraph import Paragraph
from utils_ceb import CEBApi
from utils_gd import GuttenbergDialogApi
def iter_paragraphs_with_n_speakers(speakers, iter_paragraphs, n_speakers=1):
    """Yield the paragraphs that mention exactly ``n_speakers`` known speakers.

    The text of every paragraph is split on whitespace. Each token accepted
    by ``GuttenbergDialogApi.is_character`` loses one pair of enclosing curly
    braces (when both are present) and is recorded as a mention if
    ``CEBApi.speaker_variant_to_speaker`` maps it to a member of ``speakers``.
    Mentions are counted one by one, so a token that matches several times
    within the same paragraph is counted several times.

    A tqdm progress bar wraps ``iter_paragraphs`` and reports the running
    ``kept`` / ``total`` counters.

    :param speakers: set of values compared against the result of
        ``CEBApi.speaker_variant_to_speaker``.
    :param iter_paragraphs: iterable of ``Paragraph`` instances.
    :param n_speakers: number of matched mentions a paragraph must have
        in order to be yielded.
    :return: generator of ``(paragraph, mentions)`` pairs, where ``mentions``
        is the list of matched tokens with enclosing braces removed.
    """
    assert isinstance(speakers, set)

    stats = Counter()
    progress = tqdm(iter_paragraphs, "Iter Paragraphs")
    for paragraph in progress:
        assert isinstance(paragraph, Paragraph)

        # The bar shows the statistics gathered *before* this paragraph.
        progress.set_postfix({'kept': stats["kept"], 'total': stats['total']})
        stats["total"] += 1

        mentions = []
        for token in paragraph.Text.split():
            if GuttenbergDialogApi.is_character(token):
                # Drop the enclosing "{...}" markup, if any, before the lookup.
                wrapped = token[0] == '{' and token[-1] == '}'
                name = token[1:-1] if wrapped else token
                if CEBApi.speaker_variant_to_speaker(name) in speakers:
                    mentions.append(name)

        # Keep only paragraphs with the requested amount of mentions.
        if len(mentions) == n_speakers:
            stats["kept"] += 1
            yield paragraph, mentions