In [14]:
import re
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
import pandas as pd

In [15]:
# This file is part of UDPipe <http://github.com/ufal/udpipe/>.
#
# Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import ufal.udpipe
# ufal.udpipe.Model etc. are SWIG-magic and cannot be detected by pylint
# pylint: disable=no-member

class Model:
    """Convenience wrapper around a loaded ufal.udpipe model.

    Bundles tokenizing/reading, tagging, parsing and serialisation of
    ufal.udpipe.Sentence objects behind a small Python interface.
    """

    def __init__(self, path):
        """Load the UDPipe model stored in the file `path`.

        Raises:
            Exception: if the model cannot be loaded from `path`.
        """
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize the text and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the model has no tokenizer or reading fails.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load text in the given format (conllu|horizontal|vertical) and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if `in_format` cannot be created or reading fails.
        """
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        """Feed `text` to `input_format` and collect every sentence it yields.

        Raises:
            Exception: carrying the reader's error message if an error occurred.
        """
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        # nextSentence() fills the object passed to it, so a fresh Sentence is
        # allocated for every iteration; otherwise all list entries would be
        # the same object.
        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Write given ufal.udpipe.Sentence-s in the required format (conllu|horizontal|vertical).

        Returns:
            The serialised sentences followed by the document footer, as one string.

        Raises:
            Exception: if `out_format` cannot be created (mirrors `read`).
        """
        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        if not output_format:
            raise Exception("Cannot create output format '%s'" % out_format)

        # Collect the pieces and join once instead of growing a string with +=.
        chunks = [output_format.writeSentence(sentence) for sentence in sentences]
        chunks.append(output_format.finishDocument())
        return ''.join(chunks)

# Can be used as
#  model = Model('english-ud-1.2-160523.udpipe')
#  sentences = model.tokenize("Hi there. How are you?")
#  for s in sentences:
#      model.tag(s)
#      model.parse(s)
#  conllu = model.write(sentences, "conllu")

In [16]:
# Load the UDPipe model once for the whole notebook. The path is relative,
# so the .udpipe file must be present in the working directory.
model = Model('english-partut-ud-2.0-170801.udpipe')

In [17]:
def create_df_from_conllu(conllu):
    """Convert CoNLL-U text into a DataFrame with one row per token line.

    Comment lines (those starting with '#') are removed, the ten standard
    CoNLL-U column names are prepended as a header, and the result is parsed
    as tab-separated values. Blank sentence separators are skipped by pandas.

    Args:
        conllu: CoNLL-U formatted text.

    Returns:
        pandas.DataFrame with columns Id, Form, Lemma, UPosTag, XPosTag,
        Feats, Head, DepRel, Deps, Misc.
    """
    import csv  # local import: only needed for the quoting constant below

    # Strip comment lines only. Anchoring at the start of a line keeps token
    # rows that merely contain '#' (e.g. a literal "#" token) intact.
    body = re.sub(r'^#.*\n?', '', conllu, flags=re.MULTILINE)
    header = 'Id\tForm\tLemma\tUPosTag\tXPosTag\tFeats\tHead\tDepRel\tDeps\tMisc\n'
    return pd.read_csv(
        StringIO(header + body),
        sep="\t",
        # Quote characters are ordinary tokens here, not CSV quoting.
        quoting=csv.QUOTE_NONE,
        # Words such as "null", "NA" or "nan" are real tokens, not missing data.
        keep_default_na=False,
    )

In [18]:
def delete_from_df(df):
    """Return row positions of the component words of multiword tokens.

    A CoNLL-U multiword token is a row whose Id is a range such as "3-4",
    followed by one row per component word (Id 3, Id 4). The range row is
    kept; the rows that follow it are the ones reported here.

    Args:
        df: DataFrame with an 'Id' column, rows in CoNLL-U order.

    Returns:
        List of integer row positions (0-based, counted from the top of `df`)
        to drop. For each range the positions are listed from last to first.
    """
    to_drop = []
    # Work with positions rather than index labels: the caller feeds the
    # result to df.index[...], which is positional.
    for position, raw_id in enumerate(df['Id']):
        token_id = str(raw_id)
        if '-' not in token_id:
            continue
        first, last = token_id.split('-', 1)
        # A range a-b covers b - a + 1 words, which is not always two.
        span = int(last) - int(first) + 1
        to_drop.extend(range(position + span, position, -1))
    return to_drop

In [19]:
def parsing(text):
    """Run the UDPipe pipeline on raw text and return the CoNLL-U output.

    The text is tokenized with the module-level `model`; each resulting
    sentence is tagged and then parsed in place before serialisation.
    """
    parsed = model.tokenize(text)
    for sent in parsed:
        model.tag(sent)
        model.parse(sent)
    return model.write(parsed, "conllu")

In [38]:
def open_ann_find_end(ann):
    """Find the next unused ids in the text of an annotation file.

    Scans for lines starting with `T<number><TAB>` and `#<number><TAB>`.

    Args:
        ann: full text of the annotation file.

    Returns:
        Tuple `(next_t, next_note)`: one more than the highest T number and
        one more than the highest # number found. Each is 1 when no such id
        is present, including when `ann` is empty.
    """
    # MULTILINE anchoring also catches an id on the very first line, which a
    # pattern requiring a preceding newline would miss.
    t_ids = [int(num) for num in re.findall(r'^T([0-9]+)\t', ann, flags=re.MULTILINE)]
    note_ids = [int(num) for num in re.findall(r'^#([0-9]+)\t', ann, flags=re.MULTILINE)]
    next_t = (max(t_ids) if t_ids else 0) + 1
    next_note = (max(note_ids) if note_ids else 0) + 1
    return next_t, next_note

In [39]:
def opening(path, encoding='utf-8'):
    """Read a text file and return its whole content as a string.

    Args:
        path: location of the file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            that the result, and any character offsets computed from it, do
            not depend on the platform's default encoding.

    Returns:
        The decoded file content.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.read()

In [40]:
def df_changing(text):
    """Parse `text` and return its token table without the rows flagged for removal.

    The text goes through `parsing`, the CoNLL-U output is turned into a
    DataFrame, and the row positions reported by `delete_from_df` are dropped.
    The surviving rows keep their original index labels.
    """
    table = create_df_from_conllu(parsing(text))
    doomed_labels = table.index[delete_from_df(table)]
    return table.drop(doomed_labels)

In [41]:
# Load the essay text, its reference annotation, and the annotation file to extend.
# NOTE(review): the first two paths are absolute and machine-specific; the
# third is relative to the working directory.
text = opening('/Users/irene/Downloads/realec/2012-2014/esl_00206.txt')
ann_check = opening('/Users/irene/Downloads/realec/2012-2014/esl_00206.ann')
ann = opening('ann_withoutpos.ann')

In [42]:
# Build the token table for the essay via df_changing (parse, convert, drop flagged rows).
df = df_changing(text)

In [43]:
# Preview the first rows of the token table.
df.head()

Unnamed: 0,Id,Form,Lemma,UPosTag,XPosTag,Feats,Head,DepRel,Deps,Misc
0,1,The,the,DET,RD,Definite=Def|PronType=Art,3,det,_,_
1,2,first,first,ADJ,NO,Degree=Pos|NumType=Ord,3,amod,_,_
2,3,chart,chart,NOUN,S,Number=Sing,4,nsubj,_,_
3,4,demonstrates,demonstrate,VERB,V,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,0,root,_,_
4,5,that,that,SCONJ,CS,_,12,mark,_,_


In [44]:
# Next unused T id and # note id in the annotation file that will be extended.
t, grid = open_ann_find_end(ann)

In [45]:
def start_end(df, text, verbose=True):
    """Locate every token of `df` in `text` and return its character offsets.

    Tokens are searched for left to right; each search starts where the
    previous token ended, so repeated words map to successive occurrences.

    Args:
        df: DataFrame with a 'Form' column holding the tokens in text order.
        text: the raw text the tokens were taken from.
        verbose: when true (the default), print "form start end" per token.

    Returns:
        Tuple `(starts, ends)` of equal-length lists; `ends` are exclusive.

    Raises:
        ValueError: if a token cannot be found after the previous token.
            Continuing would silently shift every following offset.
    """
    starts = []
    ends = []
    cursor = 0
    for form in df['Form']:
        form = str(form)  # purely numeric columns are parsed as numbers
        # Searching from `cursor` avoids re-slicing the text for every token.
        start = text.find(form, cursor)
        if start == -1:
            raise ValueError(
                "Token %r not found in text after offset %d" % (form, cursor)
            )
        end = start + len(form)
        if verbose:
            print(form, start, end)
        starts.append(start)
        ends.append(end)
        cursor = end
    return starts, ends

In [46]:
# Character offsets of every token in the raw essay text (prints one line per token).
starts, ends = start_end(df, text)

The 0 3
first 4 9
chart 10 15
demonstrates 16 28
that 29 33
the 34 37
total 38 43
amount 44 50
of 51 53
waste 54 59
has 60 63
increased 64 73
from 74 78
800 79 82
thousand 83 91
tones 92 97
to 98 100
one 101 104
million 105 112
in 113 115
a 116 117
four 118 122
- 122 123
year 123 127
period 128 134
. 134 135
Thelowest 136 145
rate 146 150
was 151 154
in 155 157
the 158 161
year 162 166
2009 167 171
: 171 172
according 173 182
to 183 185
the 186 189
chart 190 195
, 195 196
it 197 199
was 200 203
only 204 208
700 209 212
thousand 213 221
tones 222 227
, 227 228
which 229 234
is 235 237
less 238 242
for 243 246
100 247 250
thousand 251 259
tones 260 265
than 266 270
the 271 274
rate 275 279
ofthe 280 285
previous 286 294
year 295 299
. 299 300
However 301 308
, 308 309
next 310 314
year 315 319
the 320 323
rate 324 328
reached 329 336
its 337 340
peak 341 345
- 346 347
it 348 350
was 351 354
1 355 356
200 357 360
thousand 361 369
tones 370 375
. 375 376
Thereby 377 384
, 384 385
the 386 3

In [47]:
# Attach the offsets as new columns; note that this mutates `df` in place.
df['START'] = starts
df['END'] = ends

In [48]:
def add_ann(ann, df, t, grid):
    """Append one T record and one lemma note per row of `df` to `ann`.

    For each row, two lines are produced:
        T<t>\\t<UPosTag> <START> <END>\\t<Form>
        #<grid>\\tAnnotatorNotes T<t>\\tlemma = '<Lemma>'
    with `t` and `grid` increasing by one per row.

    Args:
        ann: existing annotation text to extend (may be empty).
        df: DataFrame with columns UPosTag, START, END, Form and Lemma.
        t: first T id to use.
        grid: first # note id to use.

    Returns:
        The extended annotation text.
    """
    pieces = [ann]
    # Without a trailing newline the first new record would be glued onto
    # the last existing line.
    if ann and not ann.endswith('\n'):
        pieces.append('\n')

    rows = zip(df['UPosTag'], df['START'], df['END'], df['Form'], df['Lemma'])
    for offset, (upos, start, end, form, lemma) in enumerate(rows):
        t_id = t + offset
        # %s stringifies every field, so non-string forms/lemmas do not raise.
        pieces.append("T%s\t%s %s %s\t%s\n" % (t_id, upos, start, end, form))
        pieces.append(
            "#%s\tAnnotatorNotes T%s\tlemma = '%s'\n" % (grid + offset, t_id, lemma)
        )
    return ''.join(pieces)

In [49]:
# Inspect the first T id that will be assigned.
t

11

In [50]:
# Inspect the first # note id that will be assigned.
grid

1

In [51]:
# Extend the existing annotation text with one T record and one lemma note per token.
tx = add_ann(ann, df, t, grid)

In [52]:
# Save the extended annotation text to a new file in the working directory.
with open('esl_00206_test.ann', 'w') as f:
    f.write(tx)

In [None]:
# Print lines of the reference annotation that contain no '#' character.
# NOTE(review): the [126:-3] slice bounds are hard-coded, presumably tuned to
# this particular file — confirm before reusing with another document.
for x in ann_check.split('\n')[126:-3]:
    if '#' not in x:
        print(x)