In [1]:
# This file is part of UDPipe <http://github.com/ufal/udpipe/>.
#
# Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
# Mathematics and Physics, Charles University in Prague, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import ufal.udpipe
# ufal.udpipe.Model etc. are SWIG-magic and cannot be detected by pylint
# pylint: disable=no-member

class Model:
    """Thin convenience wrapper around a loaded ``ufal.udpipe.Model``.

    The wrapper exposes tokenization, reading of pre-formatted input,
    tagging, parsing and serialization, and turns failed factory calls into
    exceptions with a readable message.
    """

    def __init__(self, path):
        """Load the UDPipe model stored in the file ``path``.

        Raises:
            Exception: if ``ufal.udpipe.Model.load`` returns a falsy value.
        """
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize the text and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the model cannot provide a tokenizer, or if reading
                the sentences reports a processing error.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load text in the given format (conllu|horizontal|vertical) and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the input format cannot be created, or if reading
                the sentences reports a processing error.
        """
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        """Feed ``text`` to ``input_format`` and collect every sentence it yields.

        Shared by ``tokenize`` and ``read``; raises ``Exception`` carrying the
        reader's error message when a processing error occurred.
        """
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        # nextSentence fills the object passed in, so a fresh Sentence is
        # allocated for every iteration instead of reusing a single instance.
        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Write given ufal.udpipe.Sentence-s in the required format (conllu|horizontal|vertical).

        Returns:
            The serialized sentences followed by the format's document
            terminator, concatenated into one string.

        Raises:
            Exception: if the output format cannot be created.
        """
        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        # Mirror the check done in read(): without it an unknown format name
        # would only surface later as an attribute error on None.
        if not output_format:
            raise Exception("Cannot create output format '%s'" % out_format)

        pieces = [output_format.writeSentence(sentence) for sentence in sentences]
        pieces.append(output_format.finishDocument())
        return ''.join(pieces)

# Can be used as
#  model = Model('english-ud-1.2-160523.udpipe')
#  sentences = model.tokenize("Hi there. How are you?")
#  for s in sentences:
#      model.tag(s)
#      model.parse(s)
#  conllu = model.write(sentences, "conllu")

In [6]:
import re

In [20]:
def space(string):
    """Normalize spacing around sentence punctuation and colons.

    Three substitutions are applied in order:
      1. a space is inserted after every '.', '?' or '!' that directly
         follows an ASCII letter or a space;
      2. every ': ' becomes ' : ';
      3. runs of two or more spaces collapse into a single space.

    Note that a string ending in such punctuation gains a trailing space.

    Args:
        string: the text to normalize.

    Returns:
        The normalized text.
    """
    # Raw strings keep the regex syntax away from Python's own escape
    # processing ('\.' and '\?' are invalid escapes in a normal literal).
    string = re.sub(r'([a-zA-Z]| )([.?!])', r'\1\2 ', string)
    string = re.sub(': ', ' : ', string)
    string = re.sub('  +', ' ', string)
    return string

In [21]:
# Paths are collected here so they are easy to adjust on another machine.
MODEL_PATH = 'english-partut-ud-2.0-170801.udpipe'
OUTPUT_PATH = '/Users/irene/Downloads/AAl_1_1.conllu'

model = Model(MODEL_PATH)
# Alternative input: read the text from a file instead of the literal below.
#text = open('/Users/irene/Downloads/exam2014/AAl_1_1.txt','r')
#text = text.read()
text = 'That means: only previous qualifications and / or knowledge of each applicant is taken into consideration.'
text = space(text)
sentences = model.tokenize(text)
# Tagging and parsing both modify each sentence in place.
for s in sentences:
    model.tag(s)
    model.parse(s)
conllu = model.write(sentences, "conllu")
# The context manager closes the file even if the write fails.
with open(OUTPUT_PATH, 'w') as result:
    result.write(conllu)

In [22]:
# Echo the serialized CoNLL-U string; as a bare expression it is shown as its repr.
conllu

'# newdoc\n# newpar\n# sent_id = 1\n# text = That means : only previous qualifications and / or knowledge of each applicant is taken into consideration.\n1\tThat\tthat\tADJ\tA\tDegree=Pos\t2\tamod\t_\t_\n2\tmeans\tmean\tNOUN\tS\tNumber=Sing\t15\tobl\t_\t_\n3\t:\t:\tPUNCT\tFC\t_\t2\tpunct\t_\t_\n4\tonly\tonly\tADV\tB\t_\t5\tadvmod\t_\t_\n5\tprevious\tprevious\tADJ\tA\tDegree=Pos\t6\tamod\t_\t_\n6\tqualifications\tqualification\tNOUN\tS\tNumber=Plur\t15\tnsubj:pass\t_\t_\n7\tand\tand\tCCONJ\tCC\t_\t10\tcc\t_\t_\n8\t/\t/\tPUNCT\tFF\t_\t10\tpunct\t_\t_\n9\tor\tor\tCCONJ\tCC\t_\t10\tcc\t_\t_\n10\tknowledge\tknowledge\tNOUN\tS\tNumber=Sing\t6\tconj\t_\t_\n11\tof\tof\tADP\tE\t_\t13\tcase\t_\t_\n12\teach\teach\tDET\tDI\tNumber=Sing|PronType=Ind\t13\tdet\t_\t_\n13\tapplicant\tapplicant\tNOUN\tS\tNumber=Sing\t10\tnmod\t_\t_\n14\tis\tbe\tAUX\tVA\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t15\taux:pass\t_\t_\n15\ttaken\ttake\tVERB\tV\tTense=Past|VerbForm=Part\t0\troot\t_\t_\n16\tinto\t