Skip to content

Commit

Permalink
RDLM training without editing bash scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
rsennrich committed Mar 20, 2015
1 parent 2271f29 commit b8ca33c
Show file tree
Hide file tree
Showing 6 changed files with 243 additions and 193 deletions.
38 changes: 23 additions & 15 deletions scripts/training/rdlm/README
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
RDLM: relational dependency language model
------------------------------------------

This is a language model for the string-to-tree decoder with a dependency grammar.
It should work with any corpus with projective dependency annotation in ConLL format,
converted into the Moses format with the script mosesdecoder/scripts/training/wrappers/conll2mosesxml.py
It depends on NPLM for neural network training and querying.
This is a language model for the string-to-tree decoder with a dependency
grammar. It should work with any corpus with projective dependency annotation in
CoNLL format, converted into the Moses format with the script
mosesdecoder/scripts/training/wrappers/conll2mosesxml.py. It depends on NPLM for
neural network training and querying.

Prerequisites
-------------
Expand All @@ -16,28 +17,35 @@ Install NPLM and compile moses with it. See the instructions in the Moses docume
Training
--------

RDLM is designed for string-to-tree decoding with dependency annotation on the target side.
If you have such a system, you can train RDLM on the target side of the same parallel corpus
that is used for training the translation model.
RDLM is designed for string-to-tree decoding with dependency annotation on the
target side. If you have such a system, you can train RDLM on the target side of
the same parallel corpus that is used for training the translation model.

To train the model on additional monolingual data, or test it on some held-out test/dev data,
parse and process it in the same way that the parallel corpus has been processed.
This includes tokenization, parsing, truecasing, compound splitting etc.
To train the model on additional monolingual data, or test it on some held-out
test/dev data, parse and process it in the same way that the parallel corpus has
been processed. This includes tokenization, parsing, truecasing, compound
splitting, etc.

RDLM is split into two neural network models, which can be trained with `train_model_head.sh` and `train_model_label.sh`
set the paths to NPLM, Moses, and the training/test files in the respective files, then execute:
RDLM is split into two neural network models, which can be trained with
`train_rdlm.py`. An example command for training follows:

./train_model_head.sh rdlm_head.nnlm working_dir_head
./train_model_label.sh rdlm_label.nnlm working_dir_label
mkdir working_dir_head
mkdir working_dir_label
./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise-samples 100
./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise-samples 50

For more options, run `train_rdlm.py --help`. Parameters you may want to adjust
include the vocabulary size of the label model (depending on the number of
dependency relations in the grammar), the size of the models, and the number of
training epochs.

Decoding
--------

To use RDLM during decoding, add the following line to your moses.ini config:

[feature]
RDLM path_head_lm=/path/to/rdlm_head.nnlm path_label_lm=/path/to/rdlm_label.nnlm context_up=2 context_left=3 context_right=0
RDLM path_head_lm=/path/to/output_directory/rdlm_head.model.nplm path_label_lm=/path/to/output_directory/rdlm_label.model.nplm context_up=2 context_left=3 context_right=0

[weight]
RDLM 0.1 0.1
Expand Down
43 changes: 29 additions & 14 deletions scripts/training/rdlm/extract_syntactic_ngrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,24 @@
from __future__ import print_function, unicode_literals, division
import sys
import codecs
import io
import argparse

# hack for python2/3 compatibility
from io import open
argparse.open = open

try:
from lxml import etree as ET
except ImportError:
from xml.etree import cElementTree as ET

def parse_arguments():
def create_parser():
parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")

parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
help='input file (default: standard input).')
parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH',
help='output file (default: standard output).')
parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
choices=['label', 'head'], required=True)
parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
Expand All @@ -40,7 +47,7 @@ def parse_arguments():
help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
parser.add_argument('--ptkvz', action='store_true',
help='special rule for German dependency trees: concatenate separable verb prefix and verb')
return parser.parse_args()
return parser

def escape_text(s):

Expand Down Expand Up @@ -203,7 +210,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
int_list.append(vocab.get(labels[i], 0))
int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))

sys.stdout.write(' '.join(map(str, int_list)) + '\n')
options.output.write(' '.join(map(str, int_list)) + '\n')

parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
parent_labels.append(vocab.get(labels[i], 0))
Expand All @@ -216,18 +223,11 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p

def load_vocab(path):
v = {}
for i,line in enumerate(io.open(path, encoding="UTF-8")):
for i,line in enumerate(open(path, encoding="UTF-8")):
v[line.strip()] = i
return v

if __name__ == '__main__':

if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)

options = parse_arguments()

def main(options):
vocab = load_vocab(options.vocab)

if options.output_vocab is None:
Expand All @@ -236,13 +236,17 @@ def load_vocab(path):
else:
output_vocab = load_vocab(options.output_vocab)

global start_head_idx
global start_label_idx
global stop_head_idx
global stop_label_idx
start_head_idx = vocab.get("<start_head>", 0)
start_label_idx = vocab.get("<start_label>", 0)
stop_head_idx = vocab.get("<stop_head>", 0)
stop_label_idx = vocab.get("<stop_label>", 0)

i = 0
for line in sys.stdin:
for line in options.input:
if i and not i % 50000:
sys.stderr.write('.')
if i and not i % 1000000:
Expand All @@ -260,3 +264,14 @@ def load_vocab(path):
xml = ET.fromstring(line)
get_syntactic_ngrams(xml, options, vocab, output_vocab)
i += 1

if __name__ == '__main__':

if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)

parser = create_parser()
options = parser.parse_args()

main(options)
60 changes: 33 additions & 27 deletions scripts/training/rdlm/extract_vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,19 @@
from __future__ import print_function, unicode_literals, division
import sys
import codecs
import io
import argparse
from collections import Counter

# hack for python2/3 compatibility
from io import open
argparse.open = open

try:
from lxml import etree as ET
except ImportError:
from xml.etree import cElementTree as ET

def parse_arguments():
def create_parser():

help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
help_text += " [PREFIX].special: around 40 symbols reserved for RDLM\n";
Expand All @@ -34,9 +37,7 @@ def parse_arguments():
parser.add_argument('--ptkvz', action="store_true",
help='special rule for German dependency trees: attach separable verb prefixes to verb')

args = parser.parse_args()

return args
return parser

def escape_text(s):

Expand All @@ -48,7 +49,7 @@ def escape_text(s):
return s

# deterministic heuristic to get head of subtree
def get_head(xml):
def get_head(xml, args):
head = None
preterminal = None
for child in xml:
Expand All @@ -70,11 +71,11 @@ def get_head(xml):

return head, preterminal

def get_vocab(xml):
def get_vocab(xml, args):

if len(xml):

head, preterminal = get_head(xml)
head, preterminal = get_head(xml, args)
if not head:
head = '<null>'
preterminal = '<null>'
Expand All @@ -89,18 +90,13 @@ def get_vocab(xml):
for child in xml:
if not len(child):
continue
get_vocab(child)



if __name__ == '__main__':
get_vocab(child, args)

if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
def main(args):

args = parse_arguments()
global heads
global preterminals
global nonterminals

heads = Counter()
preterminals = Counter()
Expand All @@ -115,39 +111,36 @@ def get_vocab(xml):
if line == '\n':
continue

# hack for older moses versions with inconsistent encoding of "|"
line = line.replace('&bar;', '&#124;')

xml = ET.fromstring(line)
get_vocab(xml)
get_vocab(xml, args)
i += 1

special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']

for i in range(30):
special_tokens.append('<null_{0}>'.format(i))

f = io.open(args.output + '.special', 'w', encoding='UTF-8')
f = open(args.output + '.special', 'w', encoding='UTF-8')
for item in special_tokens:
f.write(item + '\n')
f.close()

f = io.open(args.output + '.preterminals', 'w', encoding='UTF-8')
f = open(args.output + '.preterminals', 'w', encoding='UTF-8')
for item in sorted(preterminals, key=preterminals.get, reverse=True):
f.write(item + '\n')
f.close()

f = io.open(args.output + '.nonterminals', 'w', encoding='UTF-8')
f = open(args.output + '.nonterminals', 'w', encoding='UTF-8')
for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
f.write(item + '\n')
f.close()

f = io.open(args.output + '.terminals', 'w', encoding='UTF-8')
f = open(args.output + '.terminals', 'w', encoding='UTF-8')
for item in sorted(heads, key=heads.get, reverse=True):
f.write(item + '\n')
f.close()

f = io.open(args.output + '.all', 'w', encoding='UTF-8')
f = open(args.output + '.all', 'w', encoding='UTF-8')
special_tokens_set = set(special_tokens)
for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
if item not in special_tokens:
Expand All @@ -167,3 +160,16 @@ def get_vocab(xml):
i += 1
f.write(item + '\n')
f.close()



if __name__ == '__main__':

if sys.version_info < (3, 0):
sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

parser = create_parser()
args = parser.parse_args()
main(args)
65 changes: 0 additions & 65 deletions scripts/training/rdlm/train_model_head.sh

This file was deleted.

Loading

0 comments on commit b8ca33c

Please sign in to comment.