In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import codecs

# UD_Dutch_LassySmall
data_dir = '/home/jvdzwaan/data/UD_Dutch-LassySmall/'
test = data_dir + 'nl_lassysmall-ud-test.conllu'
train = data_dir + 'nl_lassysmall-ud-train.conllu'

Spacy uses a different numbering of the depencency arcs than conllx for Dutch. To transform conll x to spacy format, the root word (with head 0) should be the index in the word list. And all other words should be decremented with 1.

In [4]:
def get_sentence(data):
    words = []
    heads = []
    deps = []
    parts = data.split('\n')
    if parts:
        for part in parts:
            if not part.startswith('#'):
                d = part.split()
                if len(d) > 3:
                    words.append(d[1])
                    heads.append(int(d[6]))
                    if d[7] == 'root':
                        deps.append('ROOT')
                    else:
                        deps.append(d[7])
    new_heads = []
    for indx, h in enumerate(heads):
        if h == 0:
            new_heads.append(indx)
        else:
            new_heads.append(h-1)
                
    return words, new_heads, deps
    

def read_connl(filepath):
    with codecs.open(filepath, encoding='utf-8') as f:
        data = f.read()
        
    sentences = data.split('\n\n')
    for sentence in sentences:
        yield get_sentence(sentence)

In [13]:
import sys
import random
data = []
for w, n, l in read_connl(train):
    data.append((w, n, l))
print len(data)
print data[random.randint(0, len(data))]

6642
([u'Zij', u'behoort', u'tot', u'de', u'socialistische', u'zuil', u'.'], [1, 1, 5, 5, 5, 1, 1], [u'nsubj', 'ROOT', u'case', u'det', u'amod', u'nmod', u'punct'])


In [17]:
# source: examples/training/train_parser.py
import random

import spacy
from spacy.pipeline import DependencyParser
from spacy.gold import GoldParse
from spacy.tokens import Doc

def train_parser(nlp, train_data, left_labels, right_labels):
    labels = set(left_labels+right_labels)
    parser = DependencyParser(
                nlp.vocab,
                labels=labels)
    for itn in range(1000):
        random.shuffle(train_data)
        loss = 0
        for words, heads, deps in train_data:
            print words
            print heads
            print deps
            doc = Doc(nlp.vocab, words=words)
            gold = GoldParse(doc, heads=heads, deps=deps)
            #print gold.cand_to_gold
            #print gold.gold_to_cand
            #print gold.orig_annot
            print gold.is_projective
            loss += parser.update(doc, gold)
    parser.model.end_training()
    return parser

nlp = spacy.load('nl', tagger=False, parser=False, entity=False, add_vectors=False)

left_labels = set()
right_labels = set()
for _, heads, deps in data:
    for i, (head, dep) in enumerate(zip(heads, deps)):
        if i < head:
            left_labels.add(dep)
        elif i > head:
            right_labels.add(dep)
#print sorted(left_labels)
#print sorted(right_labels)
parser = train_parser(nlp, data, sorted(left_labels), sorted(right_labels))

[u'Parijs-Roubaix', u'is', u'een', u'eendaagse', u'wielerwedstrijd', u'die', u'elk', u'voorjaar', u'wordt', u'verreden', u'in', u'het', u'noorden', u'van', u'Frankrijk', u'.']
[4, 4, 4, 4, 4, 9, 7, 9, 9, 4, 12, 12, 9, 14, 12, 4]
[u'nsubj', u'cop', u'det', u'amod', 'ROOT', u'nsubj', u'det', u'nmod', u'auxpass', u'acl', u'case', u'det', u'nmod', u'case', u'nmod', u'punct']
True


ValueError: Could not find a gold-standard action to supervise the dependency parser.
The GoldParse was projective.

In [None]:
from spacy.vocab import Vocab
from spacy.pipeline import DependencyParser
from spacy.tokens import Doc

vocab = Vocab()
parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct'])

doc = Doc(vocab, words=[u'Who', u'is', u'Shaka', u'Khan', u'?'])
parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'),
                    (1, 'punct')])

parser.model.end_training()

Spacy uses a different numbering of the depencency arcs than conllx for Dutch. To transform conll x to spacy format, the root word (with head 0) should be the index in the word list. And all other words should be decremented with 1.