-
Notifications
You must be signed in to change notification settings - Fork 0
/
segmenter.py
278 lines (233 loc) · 8.58 KB
/
segmenter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
"""
Script used for the segmentation of czech corpora into
Linear Dependency Segments
"""
import conll3
import pandas as pd
import re
def is_finite_verb(tree, i):
"""
Are considered finite verbs tokens with xpos:
- Vc
- Vs
- VB
- Vi
- And Vp when their parent is not an AUX (otherwise the auxiliary is head of the clause)
"""
if tree[i]["xpos"][:2] in ["Vc", "Vs", "VB", "Vi"]:
return True
elif tree[i]["xpos"][:2] == "Vp":
idgov, _ = tree.idgovRel(i)
if idgov == 0:
return True
else:
upos_gov = tree[idgov]["tag"]
if upos_gov != "AUX":
return True
else:
return False
elif tree[i]["tag"] == "AUX":
for k in tree[i]['kids']:
if tree[k]["xpos"][:2] == "Vp":
return True
return False
else:
return False
def is_finite_verb_or_sconj(tree,i):
"""
Clauses begin with a finite verbe (not preceded by sconj) or a SCONJ that governs a finite verb
"""
if tree[i]["tag"] == "SCONJ":
kids = tree[i]["kids"]
for k in kids:
if is_finite_verb(tree, k):
return True
return False
elif is_finite_verb(tree,i):
idgov, _ = tree.idgovRel(i)
if idgov == 0:
return True
elif tree[idgov]["tag"] == "SCONJ":
return False
else:
return True
def get_descendants(tree, i):
"""
Given a tree and an id finds all descendants (recursive)
in:
- object of type conll3.Tree
out:
[1,3,4]
"""
descendants = []
children = tree.get_kids(i)
descendants.extend(children)
if children:
for c in children:
x = get_descendants(tree, c)
if x:
descendants.extend(x)
return descendants
def get_descendants_until_Fin_Verb(tree, i):
"""
Given a tree and an id finds all descendants (recursive).
Stop when you encounter a Finite Verb
This is used to segment into clauses
in:
- object of type conll3.Tree
out:
[1,3,4]
"""
descendants = []
children = tree.get_kids(i)
children = [c for c in children if not is_finite_verb_or_sconj(tree, c)]
descendants.extend(children)
if children:
for c in children:
x = get_descendants_until_Fin_Verb(tree, c)
if x:
x = [z for z in x if not is_finite_verb_or_sconj(tree, z)]
descendants.extend(x)
return descendants
def clause_segmentation(tree):
clauses = []
tree.addkids()
fin_verbs = [i for i in tree if is_finite_verb_or_sconj(tree, i)]
if not fin_verbs:
return []
else:
for root in fin_verbs:
descendants = get_descendants_until_Fin_Verb(tree, root)
clause = sorted(descendants+[root])
clauses.append(clause)
return clauses
def is_syntactic_bigram(tree, id_1, id_2):
idgov_1, _ = tree.idgovRel(id_1)
if idgov_1 == id_2:
return True
idgov_2, _ = tree.idgovRel(id_2)
if idgov_2 == id_1:
return True
return False
def is_complete(tree):
for i in tree:
idgov, _ = tree.idgovRel(i)
if idgov == -1:
return False
return True
def syntactically_linked_ngrams_1(tree, clause):
"""
segments are cut off when one word isn't linked to its right neighbour
in the linear order of the clause
"""
# initialization
segments = [[clause[0]]]
for i in range(1, len(clause)):
# print(i)
if is_syntactic_bigram(tree, clause[i], clause[i-1]):
segments[-1].append(clause[i])
else:
segments.append([clause[i]])
return segments
def syntactically_linked_ngrams_2(tree, clause):
"""
segments are cut off when one word isn't linked to its right neighbour
in the linear order of the sentence
"""
# initialization
segments = [[clause[0]]]
for i in range(1, len(clause)):
if clause[i-1]+1 == clause[i] and is_syntactic_bigram(tree, clause[i], clause[i-1]):
segments[-1].append(clause[i])
else:
segments.append([clause[i]])
return segments
## 1 - Small example to show how the functions work
# conll_example = """
# # generator = UDPipe 2, https://lindat.mff.cuni.cz/services/udpipe
# # udpipe_model = czech-pdt-ud-2.6-200830
# # udpipe_model_licence = CC BY-NC-SA
# # newdoc
# # newpar
# # sent_id = 1
# # text = Včera jsem řekla, že Jana je nemocná.
# 1 Včera včera ADV Db------------- _ 3 advmod _ TokenRange=0:5
# 2 jsem být AUX VB-S---1P-AA--- Mood=Ind|Number=Sing|Person=1|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act 3 aux _ TokenRange=6:10
# 3 řekla říci VERB VpQW---XR-AA--- Aspect=Perf|Gender=Fem,Neut|Number=Plur,Sing|Polarity=Pos|Tense=Past|VerbForm=Part|Voice=Act 0 root _ SpaceAfter=No|TokenRange=11:16
# 4 , , PUNCT Z:------------- _ 8 punct _ TokenRange=16:17
# 5 že že SCONJ J,------------- _ 8 mark _ TokenRange=18:20
# 6 Jana Jana PROPN NNFS1-----A---- Case=Nom|Gender=Fem|NameType=Giv|Number=Sing|Polarity=Pos 8 nsubj _ TokenRange=21:25
# 7 je být AUX VB-S---3P-AA--- Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act 8 cop _ TokenRange=26:28
# 8 nemocná nemocný ADJ AAFS1----1A---- Case=Nom|Degree=Pos|Gender=Fem|Number=Sing|Polarity=Pos 3 ccomp _ SpaceAfter=No|TokenRange=29:36
# 9 . . PUNCT Z:------------- _ 3 punct _ SpaceAfter=No|TokenRange=36:37
# """
# # transform the conll into a tree
# tree_example = conll3.conll2tree(conll_example)
# # remove the punctuation
# tree_example = conll3.unpunctATree_2(tree_example)
# print sentence
# print(tree_example.sentence())
# # segment into clauses
# clauses = clause_segmentation(tree_example)
# print("clause segmentation: ", clauses)
# # segment the 2nd clause into linear dependency segments (method 1)
# segments = syntactically_linked_ngrams_1(tree_example, clauses[1])
# print("2nd clause segmented into LDS using method 1: ", segments)
# # segment the 2nd clause into linear dependency segments (method 2)
# segments = syntactically_linked_ngrams_2(tree_example, clauses[1])
# print("2nd clause segmented into LDS using method 2: ", segments)
## 2- Main program : create a tsv file with the clause segmentation and linear dependency segmentation
# # folder with the conllu files
# input_folder = "../czech-sud-merged/"
# # create the list of trees
# new_trees = conll3.conllFolder2trees_unpuncted(input_folder)
# # method 1 :
# output_name_1 = "segmentation_results_sud_method1_v2_pdt_fictree.tsv"
# results = []
# clause_c = 0
# segment_c = 0
# for t_id, t in enumerate(new_trees):
# if not is_complete(t):
# continue
# results.append(["sentence", t_id, "None", "None", t.sentence()])
# clauses = clause_segmentation(t)
# # this will print sentences with no clauses
# if not clauses:
# print(t.sentence())
# for c in clauses:
# results.append(["clause", t_id, clause_c, "None", " ".join([t[x]["t"] for x in c])])
# # syntactically linked bigrams (neighbours in clause)
# segments = syntactically_linked_ngrams_1(t, c)
# # print(segments)
# for s in segments:
# results.append(["segment", t_id, clause_c, segment_c, " ".join([t[x]["t"] for x in s])])
# segment_c += 1
# clause_c += 1
# # output the results of method 1
# df = pd.DataFrame.from_records(results, columns=["type", "sentence_id", "clause_id", "segment_id", "text"])
# df.to_csv(output_name_1, sep="\t", index=False)
# # method 2
# output_name_2 = "segmentation_results_sud_method2_v2_pdt_fictree.tsv"
# results = []
# clause_c = 0
# segment_c = 0
# for t_id, t in enumerate(new_trees):
# if not is_complete(t):
# continue
# results.append(["sentence", t_id, "None", "None", t.sentence()])
# clauses = clause_segmentation(t)
# # this will print sentences with no clauses
# if not clauses:
# print(t.sentence())
# for c in clauses:
# results.append(["clause", t_id, clause_c, "None", " ".join([t[x]["t"] for x in c])])
# # syntactically linked bigrams (neighbours in clause)
# segments = syntactically_linked_ngrams_2(t, c)
# # print(segments)
# for s in segments:
# results.append(["segment", t_id, clause_c, segment_c, " ".join([t[x]["t"] for x in s])])
# segment_c += 1
# clause_c += 1
# # output the results of method 2
# df = pd.DataFrame.from_records(results, columns=["type", "sentence_id", "clause_id", "segment_id", "text"])
# df.to_csv(output_name_2, sep="\t", index=False)