In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# sources:
# https://spacy.io/docs/usage/training
# https://github.com/explosion/spaCy/blob/master/examples/training/train_tagger.py

import codecs
import json

from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.tokens import Doc
from spacy.gold import GoldParse

import random

# You need to define a mapping from your data's part-of-speech tag names to the
# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
# See here for the Universal Tag Set:
# http://universaldependencies.github.io/docs/u/pos/index.html
# You may also specify morphological features for your tags, from the universal
# scheme.
with codecs.open('/home/jvdzwaan/data/tmp/sherlock/nl_tag_map.json', 'rb', encoding='utf-8') as f:
    TAG_MAP = json.load(f)

print len(TAG_MAP)

202


In [3]:
# create vocab
vocab = Vocab(tag_map=TAG_MAP)
print len(vocab)

1


In [4]:
# create tagger

# The default_templates argument is where features are specified. See
# spacy/tagger.pyx for the defaults.
tagger = Tagger(vocab)

In [7]:
# Create training data
# data is stored in json files of approximately 1 Mb
# data format [(['list' 'of', 'words'], ['list' 'of' 'tags']), ..]

import glob
import uuid
import os
import sys

# Usually you'll read this in, of course. Data formats vary.
# Ensure your strings are unicode
files = glob.glob('/home/jvdzwaan/data/tmp/sherlock/nlwiki-json/*.json')
print len(files), 'files'

data = []
SIZE = 1 * 1000 * 1000

def write_to_file(data):
    fname = '{}.json'.format(str(uuid.uuid4()))
    with codecs.open(os.path.join('/home/jvdzwaan/data/tmp/sherlock/tagger-train/', fname), 'wb', encoding='utf-8') as f:
        json.dump(data, f, encoding='utf-8')
    

for fi in files:
    with codecs.open(fi, 'rb', encoding='utf-8') as f:
        saf = json.load(f)
    #print len(saf['tokens'])
    tokens = saf['tokens']
    sent = tokens[0]['sentence']
    s = []
    tags = []
    #print sent
    for token in tokens:
        if token['sentence'] != sent:
            # save s and tags
            data.append((s, tags))
            
            #print sys.getsizeof(json.dumps(data)), len(data)
            if sys.getsizeof(json.dumps(data)) >= SIZE:
                print 'write to file'
                write_to_file(data)
                
                data = []
             
            # reset s and tags
            s = []
            tags = []

            sent = token['sentence']
        
        words = token['word'].split('_')
        pos_tags = token['pos'].split('_')
        for w, p in zip(words, pos_tags):
            s.append(w)
            tags.append(p)
write_to_file(data)

999 files
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file
write to file


In [8]:
files = glob.glob('/home/jvdzwaan/data/tmp/sherlock/tagger-train/*.json')
print len(files), 'files'

n_iter = 25
failed_sentences = 0
sentences = 0

for i in range(n_iter):
    print i,
    random.shuffle(files)
    for fi in files:
        with codecs.open(fi, 'rb', encoding='utf-8') as f:
            data = json.load(f)
        random.shuffle(data)
        for sample in data:          
            sentences += 1
            
            try:
                doc = Doc(vocab, words=sample[0])
                gold = GoldParse(doc, tags=sample[1])
            
                tagger.update(doc, gold)
            except Exception:
                failed_sentences += 1

print
print 'number of sentences', sentences/n_iter
print 'number of sentences that failed', failed_sentences/n_iter

22 files
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
number of sentences 40303
number of sentences that failed 5


In [9]:
tagger.model.end_training()

In [58]:
type(tagger.model)

spacy.tagger.TaggerModel

In [10]:
def create_dirs(fname):
    """Create (output) directories if they don't exist
    """
    if not os.path.exists(fname):
        os.makedirs(fname)
        
out_dirs = ['/home/jvdzwaan/data/tmp/sherlock/spaCy/', '/home/jvdzwaan/data/tmp/sherlock/spaCy/vocab/', '/home/jvdzwaan/data/tmp/sherlock/spaCy/pos/']
for o in out_dirs:
    print o
    create_dirs(o)

tagger.model.dump(os.path.join(out_dirs[0], 'pos', 'model'))
with codecs.open(os.path.join(out_dirs[0], 'vocab', 'strings.json'), 'wb', encoding='utf-8') as file_:
    tagger.vocab.strings.dump(file_)

/home/jvdzwaan/data/tmp/sherlock/spaCy/
/home/jvdzwaan/data/tmp/sherlock/spaCy/vocab/
/home/jvdzwaan/data/tmp/sherlock/spaCy/pos/


In [11]:
# save other relevant models (taken from spaCy/language.py end_training line 353)
from spacy.attrs import TAG
tagger_freqs = list(tagger.freqs[TAG].items())
print len(tagger_freqs)

203


In [12]:
with codecs.open(os.path.join(out_dirs[0], 'vocab', 'serializer.json'), 'wb', encoding='utf-8') as file_:
            file_.write(
                json.dumps([
                    (TAG, tagger_freqs)
                ]))

In [13]:
vocab.dump(os.path.join(out_dirs[0], 'vocab', 'lexemes.bin'))