-
Notifications
You must be signed in to change notification settings - Fork 17
/
utils.py
74 lines (59 loc) · 2.7 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
Greynir: Natural language processing for Icelandic
Copyright (C) 2020 Miðeind ehf.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
"""
from typing import List, Tuple, Dict
import tokenizer
from reynir.binparser import BIN_Token, Tok
from reynir import bintokenizer
def prep_text_for_tokenizer(text: str) -> str:
return "[[ " + " ]] [[ ".join(text.split("\n")) + " ]]"
def index_text(text: str) -> Tuple[Dict[int, List[int]], Dict[int, str]]:
""" Segments contiguous (Icelandic) text into paragraphs and sentences
and returns:
dictionary of sentence indices to sentences
dictionary of paragraph index to constituent sentence indices"""
text = prep_text_for_tokenizer(text)
tok_stream = bintokenizer.tokenize(text)
pgs = tokenizer.paragraphs(tok_stream)
pg_idx_to_sent_idx = dict() # type: Dict[int, List[int]]
sent_idx_to_sent = dict() # type: Dict[int, str]
curr_sent_idx = 0
curr_pg_idx = 0
for pg in pgs:
sent_idxs = []
for _, sent in pg:
curr_sent = list(filter(BIN_Token.is_understood, sent)) # type: List[Tok]
curr_sent_text = tokenizer.normalized_text_from_tokens(curr_sent)
sent_idxs.append(curr_sent_idx)
sent_idx_to_sent[curr_sent_idx] = curr_sent_text
curr_sent_idx += 1
pg_idx_to_sent_idx[curr_pg_idx] = sent_idxs
curr_pg_idx += 1
return pg_idx_to_sent_idx, sent_idx_to_sent
def split_text(text: str) -> List[List[str]]:
""" Segments contiguous (Icelandic) text into paragraphs and sentences
and returns a list of lists
"""
text = prep_text_for_tokenizer(text)
tok_stream = bintokenizer.tokenize(text)
pgs = tokenizer.paragraphs(tok_stream)
data = [] # type: List[List[str]]
for pg in pgs:
pg_data = [] # type: List[str]
for _, sentence in pg:
sentence = list(filter(BIN_Token.is_understood, sentence))
sentence_text = tokenizer.normalized_text_from_tokens(sentence)
pg_data.append(sentence_text)
data.append(pg_data)
return data