In [1]:
class Vocabulary(list):
  """A list of tokens: special tokens first, ordinary words after them.

  For every special-token name ``name`` two attributes are set on the
  instance: ``self.<name>`` holds the bracketed token ``'<name>'`` and
  ``self.<name>_id`` holds its index in the list.

  ``self.dictionary`` maps each non-special word to its index in the list.
  """

  def __init__(self, word_list, comma_split_special_tokens):
    """Build the vocabulary.

    Args:
      word_list: iterable of ordinary words, placed after the special tokens.
      comma_split_special_tokens: despite the name, a *colon*-separated
        string of special-token names, e.g. ``'sep:num:prp'``. Empty
        segments (and therefore an empty string) contribute no token.
    """
    # ''.split(':') is [''], which would register a bogus '<>' token and an
    # attribute with an empty name, so empty segments are dropped.
    names = [name for name in comma_split_special_tokens.split(':') if name]
    special_tokens = [f'<{name}>' for name in names]
    for token_id, name in enumerate(names):
      setattr(self, name, f'<{name}>')
      setattr(self, f'{name}_id', token_id)

    # list(...) lets callers pass any iterable, not only a list.
    super().__init__(special_tokens + list(word_list))

    special_token_set = set(special_tokens)
    self.dictionary = {
        word: index
        for index, word in enumerate(self)
        if word not in special_token_set
    }

In [2]:
class TokiPonaVocabulary(Vocabulary):
  """Vocabulary pre-loaded with a fixed Toki Pona word list.

  After construction three alternative spellings are added to
  ``self.dictionary`` so that they resolve to the index of an existing word:
  ``ale`` -> ``ali``, ``oko`` -> ``lukin`` and ``kin`` -> ``a``.
  """

  def __init__(self, comma_split_special_tokens = 'sep:num:prp'):
    """Create the vocabulary.

    Args:
      comma_split_special_tokens: colon-separated special-token names,
        forwarded unchanged to ``Vocabulary.__init__``.
    """
    word_rows = (
        'a akesi ala alasa ali anpa ante anu awen e en esun',
        'ijo ike ilo insa jaki jan jelo jo kala kalama kama kasi',
        'ken kepeken kili kiwen ko kon kule kulupu kute la lape laso',
        'lawa len lete li lili linja lipu loje lon luka lukin lupa',
        'ma mama mani meli mi mije moku moli monsi mu mun musi',
        'mute nanpa nasa nasin nena ni nimi noka o olin ona open',
        'pakala pali palisa pan pana pi pilin pimeja pini pipi poka poki',
        'pona pu sama seli selo seme sewi sijelo sike sin sina sinpin',
        'sitelen sona soweli suli suno supa suwi tan taso tawa telo tenpo',
        'toki tomo tu unpa uta utala walo wan waso wawa weka wile',
    )
    words = [word for row in word_rows for word in row.split()]
    super().__init__(words, comma_split_special_tokens=comma_split_special_tokens)

    # Alternative spellings share the index of the word they stand for.
    for alias, canonical in (('ale', 'ali'), ('oko', 'lukin'), ('kin', 'a')):
      self.dictionary[alias] = self.dictionary[canonical]

In [3]:
import re

class IloTunimiParsingException(Exception):
  """Raised by ``IloTunimi.convert`` for a token it cannot classify."""

class IloTunimi:
  """Tokenizer that turns raw text into entries of a ``TokiPonaVocabulary``.

  Calling an instance with a string returns a list of vocabulary entries:
  known words (aliases resolved to the entry they point at), and the special
  tokens for separators, numbers and proper names.
  """

  def __init__(self):
    self.vocab = TokiPonaVocabulary()
    # Proper-name shape: a capitalised first syllable followed by lowercase
    # syllables, each optionally closed by 'n'. The syllables ti, ji, wo and
    # wu are not matched by this pattern.
    self.prp_pattern = re.compile(r'^([AIUEO]|[KSNPML][aiueo]|[TJ][aueo]|W[aie])n?(([ksnpml][aiueo]|[tj][aueo]|w[aie])n?)*$')

  def convert(self, x):
    """Map one whitespace-delimited token to a vocabulary index.

    Args:
      x: a single token.

    Returns:
      ``vocab.sep_id`` for ``.``, ``!``, ``?`` or ``:``; the dictionary index
      for a known word; ``vocab.num_id`` for a run of decimal digits;
      ``vocab.prp_id`` for a token shaped like a proper name; ``None`` when
      nothing is left after removing non-alphanumeric characters.

    Raises:
      IloTunimiParsingException: if the token fits none of the above. The
        message names the offending token.
    """
    if x in {'.', '!', '?', ':'}:
      return self.vocab.sep_id

    # Everything except ASCII letters and digits (commas, quotes, ...) is dropped.
    word = re.sub('[^0-9A-Za-z]', '', x)
    if word == '':
      return None
    if word in self.vocab.dictionary:
      return self.vocab.dictionary[word]
    if word.isdecimal():
      return self.vocab.num_id
    # The pattern alone would accept 'n' followed by 'm' or 'n'; reject those.
    if self.prp_pattern.match(word) and ('nm' not in word) and ('nn' not in word):
      return self.vocab.prp_id
    raise IloTunimiParsingException(f'cannot tokenize {x!r}')

  def __call__(self, xs):
    """Tokenize a whole string.

    Args:
      xs: the text to tokenize.

    Returns:
      List of vocabulary entries, one per token that ``convert`` did not
      map to ``None``.

    Raises:
      IloTunimiParsingException: propagated from ``convert``.
    """
    # Pad sentence punctuation with spaces so each mark becomes its own token.
    padded = re.sub(r'([.!?:])', ' \\1 ', xs.strip())
    ys = []
    for piece in padded.split():
      token_id = self.convert(piece)
      # Compare against None explicitly: index 0 is a valid token id.
      if token_id is not None:
        ys.append(self.vocab[token_id])
    return ys

In [4]:
class Rule:
  """A single transition: from state ``src`` to state ``trg`` on any of ``terms``."""

  def __init__(self, src, trg, terms):
    """Store the transition.

    Args:
      src: source state.
      trg: target state.
      terms: iterable of terminals that trigger the transition; kept as a
        set, so repeated entries collapse.
    """
    self.src, self.trg, self.terms = src, trg, set(terms)

class RRG:
  """State machine over word sequences, driven by labelled transition rules.

  Several states may be active at once; a sentence is accepted when, after
  consuming every word, at least one active state is a final state.
  """

  def __init__(self):
    self.rules = []   # transition rules, in the order they were added
    self.finals = []  # names of accepting states

  def extend_finals(self, finals):
    """Register additional accepting states.

    Args:
      finals: an iterable of state names, or a single state name as a string.
    """
    # A bare string is one state name; extending a list with 'V1' directly
    # would add the characters 'V' and '1' instead.
    if isinstance(finals, str):
      finals = [finals]
    self.finals.extend(finals)

  def add(self, src, terms, trg):
    """Add a transition from ``src`` to ``trg`` on any terminal in ``terms``."""
    self.rules.append(Rule(src, trg, terms))

  def update_state_list(self, word, old_state_list):
    """Advance the active states by one word.

    Args:
      word: the terminal being consumed.
      old_state_list: states active before consuming ``word``.

    Returns:
      List of states reachable from any state in ``old_state_list`` via a
      rule whose terminals contain ``word``; each state appears once, in
      rule order. Empty when no rule applies.
    """
    new_state_list = []
    for rule in self.rules:
      # A rule only fires from its own source state.
      fires = rule.src in old_state_list and word in rule.terms
      if fires and rule.trg not in new_state_list:
        new_state_list.append(rule.trg)
    return new_state_list

  def __call__(self, sent, start='S'):
    """Run the machine over ``sent`` and report acceptance.

    Prints a one-line trace of the active states after each word.

    Args:
      sent: iterable of terminals (words).
      start: the initial state.

    Returns:
      True if any state active after the last word is in ``self.finals``.
    """
    state_list = [start]
    trace = ' '.join(state_list)
    for word in sent:
      state_list = self.update_state_list(word, state_list)
      trace += ' --{}--> '.format(word) + ' '.join(state_list)
    print(trace)
    return any(state in self.finals for state in state_list)

In [5]:
# Build a tokenizer and a four-rule grammar, then check one sentence.
tokenizer = IloTunimi()
G = RRG()
# Final states are given as a list so multi-character state names are safe.
G.extend_finals(['E'])
# Transitions: S --mi|sina--> MS --sona--> V1 --ala--> A1 --<sep>--> E
G.add('S', ['mi', 'sina'], 'MS')
G.add('MS', ['sona'], 'V1')
G.add('V1', ['ala'], 'A1')
G.add('A1', ['<sep>'], 'E')
# Last expression of the cell: prints the state trace and displays the result.
G(tokenizer('mi sona ala.'))

S --mi--> MS --sona--> V1 --ala--> A1 --<sep>--> E


True