In [1]:
import codecs
from spacy.munge import read_conll

data_dir = '/home/jvdzwaan/data/UD_Dutch/'

with codecs.open(data_dir+'nl-ud-train.conllu', 'rb', encoding='utf-8') as f:
    data = f.read()

In [44]:
def get_sentence(data):
    words = []
    tags = []
    parts = data.split('\n')
    if parts:
        for part in parts:
            if not part.startswith('#'):
                d = part.split()
                if len(d) > 3:
                    words.append(d[1])
                    tags.append(d[3])
                
    return words, tags
    

def read_connl(filepath):
    with codecs.open(filepath, encoding='utf-8') as f:
        data = f.read()
        
    sentences = data.split('\n\n')
    for sentence in sentences:
        yield get_sentence(sentence)

In [60]:
# create tag map
import json
import codecs

tags = set()
data = []
for w, t in read_connl(data_dir+'nl-ud-train.conllu'):
    tags.update(t)
    data.append((w,t))

tag_map = {}    
for t in tags:
    tag_map[t] = {'pos': t}
print tag_map

with codecs.open('/home/jvdzwaan/data/tmp/sherlock/spaCy/vocab/tag_map.json', 'wb', encoding='utf-8') as f:
    json.dump(tag_map, f, indent=2, encoding='utf-8')

{u'ADV': {'pos': u'ADV'}, u'NOUN': {'pos': u'NOUN'}, u'ADP': {'pos': u'ADP'}, u'PRON': {'pos': u'PRON'}, u'SCONJ': {'pos': u'SCONJ'}, u'PROPN': {'pos': u'PROPN'}, u'DET': {'pos': u'DET'}, u'SYM': {'pos': u'SYM'}, u'INTJ': {'pos': u'INTJ'}, u'PUNCT': {'pos': u'PUNCT'}, u'NUM': {'pos': u'NUM'}, u'AUX': {'pos': u'AUX'}, u'X': {'pos': u'X'}, u'CONJ': {'pos': u'CONJ'}, u'ADJ': {'pos': u'ADJ'}, u'VERB': {'pos': u'VERB'}}


In [61]:
with codecs.open('/home/jvdzwaan/data/tmp/sherlock/spaCy/vocab/tag_map.json', encoding='utf-8') as f:
    tag_map = json.load(f, encoding='utf-8')
print tag_map

{u'ADV': {u'pos': u'ADV'}, u'NOUN': {u'pos': u'NOUN'}, u'ADP': {u'pos': u'ADP'}, u'PUNCT': {u'pos': u'PUNCT'}, u'SCONJ': {u'pos': u'SCONJ'}, u'PROPN': {u'pos': u'PROPN'}, u'DET': {u'pos': u'DET'}, u'SYM': {u'pos': u'SYM'}, u'INTJ': {u'pos': u'INTJ'}, u'PRON': {u'pos': u'PRON'}, u'NUM': {u'pos': u'NUM'}, u'X': {u'pos': u'X'}, u'AUX': {u'pos': u'AUX'}, u'CONJ': {u'pos': u'CONJ'}, u'ADJ': {u'pos': u'ADJ'}, u'VERB': {u'pos': u'VERB'}}


In [62]:
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.tokens import Doc
from spacy.gold import GoldParse

import random

vocab = Vocab(tag_map=tag_map)
tagger = Tagger(vocab)

n_iter = 25
failed_sentences = 0
sentences = 0

for i in range(n_iter):
    print i,
    random.shuffle(data)
    for sample in data:
        sentences += 1
            
        try:
            doc = Doc(vocab, words=sample[0])
            gold = GoldParse(doc, tags=sample[1])
            
            tagger.update(doc, gold)
        except Exception:
             failed_sentences += 1

print
print 'number of sentences', sentences/n_iter
print 'number of sentences that failed', failed_sentences/n_iter

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
number of sentences 13001
number of sentences that failed 0


In [63]:
tagger.model.end_training()

In [64]:
import os
def create_dirs(fname):
    """Create (output) directories if they don't exist
    """
    if not os.path.exists(fname):
        os.makedirs(fname)
        
out_dirs = ['/home/jvdzwaan/data/tmp/sherlock/spaCy/', '/home/jvdzwaan/data/tmp/sherlock/spaCy/vocab/', '/home/jvdzwaan/data/tmp/sherlock/spaCy/pos/']
for o in out_dirs:
    print o
    create_dirs(o)

tagger.model.dump(os.path.join(out_dirs[0], 'pos', 'model'))
with codecs.open(os.path.join(out_dirs[0], 'vocab', 'strings.json'), 'wb', encoding='utf-8') as file_:
    tagger.vocab.strings.dump(file_)

/home/jvdzwaan/data/tmp/sherlock/spaCy/
/home/jvdzwaan/data/tmp/sherlock/spaCy/vocab/
/home/jvdzwaan/data/tmp/sherlock/spaCy/pos/


In [65]:
# save other relevant models (taken from spaCy/language.py end_training line 353)
from spacy.attrs import TAG
tagger_freqs = list(tagger.freqs[TAG].items())
print len(tagger_freqs)

17


In [66]:
with codecs.open(os.path.join(out_dirs[0], 'vocab', 'serializer.json'), 'wb', encoding='utf-8') as file_:
            file_.write(
                json.dumps([
                    (TAG, tagger_freqs)
                ]))

In [67]:
vocab.dump(os.path.join(out_dirs[0], 'vocab', 'lexemes.bin'))