RDLM training without editing bash scripts

moses-smt · Mar 20, 2015 · b8ca33c · b8ca33c
1 parent 2271f29
commit b8ca33c
Show file tree

Hide file tree

Showing 6 changed files with 243 additions and 193 deletions.
diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README
@@ -1,10 +1,11 @@
 RDLM: relational dependency language model
 ------------------------------------------
 
-This is a language model for the string-to-tree decoder with a dependency grammar.
-It should work with any corpus with projective dependency annotation in ConLL format,
-converted into the Moses format with the script mosesdecoder/scripts/training/wrappers/conll2mosesxml.py
-It depends on NPLM for neural network training and querying.
+This is a language model for the string-to-tree decoder with a dependency
+grammar. It should work with any corpus with projective dependency annotation in
+CoNLL format, converted into the Moses format with the script
+mosesdecoder/scripts/training/wrappers/conll2mosesxml.py. It depends on NPLM for
+neural network training and querying.
 
 Prerequisites
 -------------
@@ -16,28 +17,35 @@ Install NPLM and compile moses with it. See the instructions in the Moses docume
 Training
 --------
 
-RDLM is designed for string-to-tree decoding with dependency annotation on the target side.
-If you have such a system, you can train RDLM on the target side of the same parallel corpus
-that is used for training the translation model.
+RDLM is designed for string-to-tree decoding with dependency annotation on the
+target side. If you have such a system, you can train RDLM on the target side of
+the same parallel corpus that is used for training the translation model.
 
-To train the model on additional monolingual data, or test it on some held-out test/dev data,
-parse and process it in the same way that the parallel corpus has been processed.
-This includes tokenization, parsing, truecasing, compound splitting etc.
+To train the model on additional monolingual data, or test it on some held-out
+test/dev data, parse and process it in the same way that the parallel corpus has
+been processed. This includes tokenization, parsing, truecasing, compound
+splitting etc.
 
-RDLM is split into two neural network models, which can be trained with `train_model_head.sh` and `train_model_label.sh`
-set the paths to NPLM, Moses, and the training/test files in the respective files, then execute:
+RDLM is split into two neural network models, which can be trained with
+`train_rdlm.py`. Example commands for training follow:
 
-  ./train_model_head.sh rdlm_head.nnlm working_dir_head
-  ./train_model_label.sh rdlm_label.nnlm working_dir_label
+  mkdir working_dir_head
+  mkdir working_dir_label
+  ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_head  --output-dir /path/to/output_directory --output-model rdlm_head  --mode head  --output-vocab-size 500000 --noise-samples 100
+  ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise-samples 50
 
+For more options, run `train_rdlm.py --help`. Parameters you may want to adjust
+include the vocabulary size of the label model (depending on the number of
+dependency relations in the grammar), the size of the models, and the number of
+training epochs.
 
 Decoding
 --------
 
 To use RDLM during decoding, add the following line to your moses.ini config:
 
   [feature]
-  RDLM path_head_lm=/path/to/rdlm_head.nnlm path_label_lm=/path/to/rdlm_label.nnlm context_up=2 context_left=3 context_right=0
+  RDLM path_head_lm=/path/to/output_directory/rdlm_head.model.nplm path_label_lm=/path/to/output_directory/rdlm_label.model.nplm context_up=2 context_left=3 context_right=0
 
   [weight]
   RDLM 0.1 0.1

diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py
@@ -9,17 +9,24 @@
 from __future__ import print_function, unicode_literals, division
 import sys
 import codecs
-import io
 import argparse
 
+# hack for python2/3 compatibility: make argparse.FileType open files via io.open
+from io import open
+argparse.open = open
+
 try:
     from lxml import etree as ET
 except ImportError:
     from xml.etree import cElementTree as ET
 
-def parse_arguments():
+def create_parser():
     parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")
 
+    parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
+                        help='input file (default: standard input).')
+    parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH',
+                        help='output file (default: standard output).')
     parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
                         choices=['label', 'head'], required=True)
     parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
@@ -40,7 +47,7 @@ def parse_arguments():
                         help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
     parser.add_argument('--ptkvz', action='store_true',
                         help='special rule for German dependency trees: concatenate separable verb prefix and verb')
-    return parser.parse_args()
+    return parser
 
 def escape_text(s):
 
@@ -203,7 +210,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
                 int_list.append(vocab.get(labels[i], 0))
                 int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))
 
-            sys.stdout.write(' '.join(map(str, int_list)) + '\n')
+            options.output.write(' '.join(map(str, int_list)) + '\n')
 
             parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
             parent_labels.append(vocab.get(labels[i], 0))
@@ -216,18 +223,11 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
 
 def load_vocab(path):
     v = {}
-    for i,line in enumerate(io.open(path, encoding="UTF-8")):
+    for i,line in enumerate(open(path, encoding="UTF-8")):
         v[line.strip()] = i
     return v
 
-if __name__ == '__main__':
-
-    if sys.version_info < (3, 0):
-        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
-        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
-
-    options = parse_arguments()
-
+def main(options):
     vocab = load_vocab(options.vocab)
 
     if options.output_vocab is None:
@@ -236,13 +236,17 @@ def load_vocab(path):
     else:
         output_vocab = load_vocab(options.output_vocab)
 
+    global start_head_idx
+    global start_label_idx
+    global stop_head_idx
+    global stop_label_idx
     start_head_idx = vocab.get("<start_head>", 0)
     start_label_idx = vocab.get("<start_label>", 0)
     stop_head_idx = vocab.get("<stop_head>", 0)
     stop_label_idx = vocab.get("<stop_label>", 0)
 
     i = 0
-    for line in sys.stdin:
+    for line in options.input:
         if i and not i % 50000:
             sys.stderr.write('.')
         if i and not i % 1000000:
@@ -260,3 +264,14 @@ def load_vocab(path):
         xml = ET.fromstring(line)
         get_syntactic_ngrams(xml, options, vocab, output_vocab)
         i += 1
+
+if __name__ == '__main__':
+
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+
+    parser = create_parser()
+    options = parser.parse_args()
+
+    main(options)
diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
@@ -7,16 +7,19 @@
 from __future__ import print_function, unicode_literals, division
 import sys
 import codecs
-import io
 import argparse
 from collections import Counter
 
+# hack for python2/3 compatibility: make argparse.FileType open files via io.open
+from io import open
+argparse.open = open
+
 try:
     from lxml import etree as ET
 except ImportError:
     from xml.etree import cElementTree as ET
 
-def parse_arguments():
+def create_parser():
 
     help_text =  "generate 5 vocabulary files from parsed corpus in moses XML format\n"
     help_text += "  [PREFIX].special: around 40 symbols reserved for RDLM\n";
@@ -34,9 +37,7 @@ def parse_arguments():
     parser.add_argument('--ptkvz', action="store_true",
                     help='special rule for German dependency trees: attach separable verb prefixes to verb')
 
-    args = parser.parse_args()
-
-    return args
+    return parser
 
 def escape_text(s):
 
@@ -48,7 +49,7 @@ def escape_text(s):
     return s
 
 # deterministic heuristic to get head of subtree
-def get_head(xml):
+def get_head(xml, args):
     head = None
     preterminal = None
     for child in xml:
@@ -70,11 +71,11 @@ def get_head(xml):
 
     return head, preterminal
 
-def get_vocab(xml):
+def get_vocab(xml, args):
 
     if len(xml):
 
-        head, preterminal = get_head(xml)
+        head, preterminal = get_head(xml, args)
         if not head:
             head = '<null>'
             preterminal = '<null>'
@@ -89,18 +90,13 @@ def get_vocab(xml):
         for child in xml:
             if not len(child):
                 continue
-            get_vocab(child)
-
-
-
-if __name__ == '__main__':
+            get_vocab(child, args)
 
-    if sys.version_info < (3, 0):
-        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
-        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
-        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+def main(args):
 
-    args = parse_arguments()
+    global heads
+    global preterminals
+    global nonterminals
 
     heads = Counter()
     preterminals = Counter()
@@ -115,39 +111,36 @@ def get_vocab(xml):
         if line == '\n':
             continue
 
-        # hack for older moses versions with inconsistent encoding of "|"
-        line = line.replace('&bar;', '&#124;')
-
         xml = ET.fromstring(line)
-        get_vocab(xml)
+        get_vocab(xml, args)
         i += 1
 
     special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
 
     for i in range(30):
       special_tokens.append('<null_{0}>'.format(i))
 
-    f = io.open(args.output + '.special', 'w', encoding='UTF-8')
+    f = open(args.output + '.special', 'w', encoding='UTF-8')
     for item in special_tokens:
         f.write(item + '\n')
     f.close()
 
-    f = io.open(args.output + '.preterminals', 'w', encoding='UTF-8')
+    f = open(args.output + '.preterminals', 'w', encoding='UTF-8')
     for item in sorted(preterminals, key=preterminals.get, reverse=True):
         f.write(item + '\n')
     f.close()
 
-    f = io.open(args.output + '.nonterminals', 'w', encoding='UTF-8')
+    f = open(args.output + '.nonterminals', 'w', encoding='UTF-8')
     for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
         f.write(item + '\n')
     f.close()
 
-    f = io.open(args.output + '.terminals', 'w', encoding='UTF-8')
+    f = open(args.output + '.terminals', 'w', encoding='UTF-8')
     for item in sorted(heads, key=heads.get, reverse=True):
         f.write(item + '\n')
     f.close()
 
-    f = io.open(args.output + '.all', 'w', encoding='UTF-8')
+    f = open(args.output + '.all', 'w', encoding='UTF-8')
     special_tokens_set = set(special_tokens)
     for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
         if item not in special_tokens:
@@ -167,3 +160,16 @@ def get_vocab(xml):
         i += 1
         f.write(item + '\n')
     f.close()
+
+
+
+if __name__ == '__main__':
+
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+    parser = create_parser()
+    args = parser.parse_args()
+    main(args)
diff --git a/scripts/training/rdlm/train_model_head.sh b/scripts/training/rdlm/train_model_head.sh