<a href="https://colab.research.google.com/github/nelly-hateva/tardis/blob/master/notebooks/State_Regularized_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Prerequisites: access the data in Google Drive

In [0]:
from google.colab import drive

# Directory under which Google Drive is mounted; the trailing slash lets the
# folder paths below be formed by direct concatenation.
MOUNT_POINT = "/content/drive/"
# Folders inside the mounted drive used for the datasets and the saved models.
DATA_DIR = f"{MOUNT_POINT}My Drive/Colab Notebooks/Thesis-Data"
MODELS_DIR = f"{MOUNT_POINT}My Drive/Colab Notebooks/Thesis-Models"
drive.mount(MOUNT_POINT)

Import libraries, set random seed and device

In [0]:
import os
import random
import time
import pickle
import copy

import numpy
import torch
from torch import nn, optim, utils

def seed_torch(seed=666):
  """Seed every random number generator used by the notebook.

  Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
  devices) and puts cuDNN in deterministic mode.

  Args:
    seed: Integer seed shared by all generators. Default: ``666``
  """
  random.seed(seed)
  # NOTE: hash randomization is fixed when the interpreter starts, so this
  # value is only picked up by processes launched after this call.
  os.environ['PYTHONHASHSEED'] = str(seed)
  numpy.random.seed(seed)
  torch.manual_seed(seed)
  # Seed all GPUs rather than only the current one; this call is silently
  # ignored when CUDA is not available.
  torch.cuda.manual_seed_all(seed)
  # Trade cuDNN auto-tuning for run-to-run determinism.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

seed_torch()

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print('Using device:', device)

State Regularized RNN

In [0]:
class SRRNN(nn.Module):

  r""" https://arxiv.org/pdf/1901.08817.pdf
       https://github.com/deepsemantic/sr-rnns
  Applies a single-layer State Regularized RNN to an input sequence.

  If :attr:`mode` is ``'rnn'``, then for each element in the input sequence computes the function

    .. math::
        h_t' = \text{tanh}(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})

    where :math:`x_t` is the input at time `t`, :math:`h_{(t-1)}` is the hidden state
    at time `t-1` or the initial hidden state at time `0`.
    If :attr:`nonlinearity` is ``'relu'``, then `ReLU` is used instead of `tanh`.

  If :attr:`mode` is ``'gru'``, then for each element in the input sequence computes the function

    .. math::
      \begin{array}{ll} \\
        r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
        z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
        n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
        h_t' = (1 - z_t) * n_t + z_t * h_{(t-1)}
      \end{array}

    where :math:`x_t` is the input at time `t`, :math:`h_{(t-1)}` is the hidden state
    at time `t-1` or the initial hidden state at time `0`.
    :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
    :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.

  In both modes, if :attr:`number_of_states` is ``None``, then the hidden state
  at time `t` :math:`h_t` equals :math:`h_t'`.

  Otherwise,

    .. math::
      \begin{array}{ll} \\
        \alpha_{i} = \frac{\exp(\langle h_t', s_i \rangle / \tau)}{\sum_{j=1}^{k} \exp(\langle h_t', s_j \rangle / \tau)} \\
        h_t = {\sum_{i=1}^{k} \alpha_{i} s_i}
      \end{array}

    where :math:`\{s_1, s_2, ..., s_k\}` are the k learnable states,
    :math:`\langle \cdot, \cdot \rangle` is the dot product and :math:`\tau`
    is the :attr:`temperature`.
    :math:`\alpha_{i}` is the probability of the RNN to transition to state i
    given the vector :math:`h_t'` for which we write :math:`p_{h_t'}(i) = \alpha_{i}`

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
            Default: ``True``
        mode: The RNN mode to use.
          Can be either ``'rnn'`` or ``'gru'``. Default: ``'rnn'``
        nonlinearity: The non-linearity to use if :attr:`mode` is ``'rnn'``.
          Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
        number_of_states: The number of learnable finite states.
          If ``None`` then the stochastic component is not used. Default: ``None``
        temperature: Positive softmax temperature :math:`\tau` applied to the
          state scores; smaller values give sharper transition probabilities.
          Only used when :attr:`number_of_states` is set. Default: ``1.0``

    Inputs: input, h_0
        - **input** of shape `(batch, seq_len, input_size)`:
          tensor containing the features of the input sequence.
          The input can also be a packed variable length sequence.
          See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
          :func:`torch.nn.utils.rnn.pack_sequence` for details.
        - **h_0** of shape `(batch, hidden_size)`: tensor
          containing the initial hidden state for each element in the batch.

          If `h_0` is not provided, it defaults to zero.

    Outputs: output, h_n, transition_probabilities
        - **output** of shape `(batch, seq_len, hidden_size)`: tensor
          containing the output features `(h_t)` from the RNN for each `t`.
          If a :class:`torch.nn.utils.rnn.PackedSequence` has been
          given as the input, the output will also be a packed sequence.
        - **h_n** of shape `(batch, hidden_size)`: tensor
          containing the hidden state for `t = seq_len`.
        - **transition_probabilities** of shape
          `(batch, seq_len, number_of_states)`: tensor containing
          :math:`\alpha` for each `t` (a packed sequence for packed input),
          or ``None`` if :attr:`number_of_states` is not set.

    Attributes:
        If :attr:`mode` is ``'rnn'``
          weight_ih: the learnable input-hidden weights,
              of shape `(hidden_size, input_size)`
          weight_hh: the learnable hidden-hidden weights,
              of shape `(hidden_size, hidden_size)`
          bias_ih: the learnable input-hidden bias,
              of shape `(hidden_size)`
          bias_hh: the learnable hidden-hidden bias,
              of shape `(hidden_size)`
        If :attr:`mode` is ``'gru'``
          weight_ih : the learnable input-hidden weights
              (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)`
          weight_hh : the learnable hidden-hidden weights
              (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)`
          bias_ih : the learnable input-hidden bias
              (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
          bias_hh : the learnable hidden-hidden bias
              (b_hr|b_hz|b_hn), of shape `(3*hidden_size)`
        The weights and biases above live on the wrapped cell :attr:`rnn_cell`.
        states: the learnable finite number of states, of shape
            `(number_of_states, hidden_size)`

    .. note::
        The cell weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
        where :math:`k = \frac{1}{\text{hidden\_size}}`; the states are
        initialized from :math:`\mathcal{U}(-0.1, 0.1)`.

    Examples::
        >>> rnn = SRRNN(10, 20, number_of_states=5)
        >>> input = torch.randn(6, 3, 10)
        >>> h_0 = torch.randn(6, 20)
        >>> output, h_n, transition_probabilities = rnn(input, h_0)
    """

  def __init__(
    self, input_size, hidden_size, bias=True, mode='rnn', nonlinearity='tanh',
    number_of_states=None, temperature=1.00
  ):
    super(SRRNN, self).__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.bias = bias

    if mode == 'rnn':
      self.rnn_cell = nn.RNNCell(
        input_size, hidden_size, bias=bias, nonlinearity=nonlinearity
      )
    elif mode == 'gru':
      self.rnn_cell = nn.GRUCell(
        input_size, hidden_size, bias=bias
      )
    else:
      raise ValueError("Unknown model '{}'".format(mode))

    # Attribute name (including its spelling) is kept for compatibility.
    self.stohastic_component = False

    if number_of_states:
      if temperature <= 0:
        # The scores are divided by the temperature, so it must be positive.
        raise ValueError(
          "temperature must be positive, got {}".format(temperature)
        )

      self.stohastic_component = True
      self.number_of_states = number_of_states

      # Normalizes the per-state scores of shape (batch, number_of_states).
      self.softmax = nn.Softmax(dim=1)
      # Allocated uninitialized; filled in by reset_parameters().
      self.states = nn.Parameter(
        torch.empty(self.number_of_states, hidden_size)
      )
      self.temperature = temperature

    self.reset_parameters()

  def reset_parameters(self):
    """Initialize the learnable states; the cell keeps its own default init."""
    if self.stohastic_component:
      nn.init.uniform_(self.states, a=-0.1, b=0.1)

  def extra_repr(self):
    """Describe the stochastic component (if any) for ``print(module)``."""
    if not self.stohastic_component:
      return ''

    parts = ['number_of_states={}'.format(self.number_of_states)]
    # The temperature is only shown when it differs from the default.
    if self.temperature != 1.00:
      parts.append('temperature={}'.format(self.temperature))
    return ', '.join(parts)

  def permute_hidden(self, hidden, permutation, dim=0):
    """Reorder ``hidden`` along ``dim``; a ``None`` permutation is a no-op."""
    if permutation is None:
      return hidden
    return hidden.index_select(dim, permutation)

  def forward(self, input, h_0=None):
    """Run the RNN over a padded tensor or a ``PackedSequence``.

    Returns:
      ``(output, h_n, transition_probabilities)``; the last element is
      ``None`` when the stochastic component is disabled.
    """
    is_packed = isinstance(input, nn.utils.rnn.PackedSequence)

    if is_packed:
      input, batch_sizes, sorted_indices, unsorted_indices = input
      max_batch_size = int(batch_sizes[0])
    else:
      max_batch_size, sorted_indices = input.size(0), None

    if h_0 is None:
      h_0 = torch.zeros(
        max_batch_size, self.hidden_size, dtype=input.dtype, device=input.device
      )
    else:
      # Each batch of the hidden state should match the input sequence that
      # the user believes he/she is passing in.
      h_0 = self.permute_hidden(h_0, sorted_indices)

    if not is_packed:
      return self.forward_tensor(input, h_0)

    output, hidden, transition_probabilities = self.forward_packed(
      input, batch_sizes, h_0
    )

    if self.stohastic_component:
      transition_probabilities = nn.utils.rnn.PackedSequence(
        transition_probabilities, batch_sizes, sorted_indices, unsorted_indices
      )

    # Undo the length-sorting so h_n lines up with the caller's batch order.
    hidden = self.permute_hidden(hidden, unsorted_indices)
    output = nn.utils.rnn.PackedSequence(
      output, batch_sizes, sorted_indices, unsorted_indices
    )

    return output, hidden, transition_probabilities

  def forward_tensor(self, input, h_0):
    """Unroll the cell over a `(batch, seq_len, input_size)` tensor."""
    output, h_t = [], h_0

    if self.stohastic_component:
      transition_probabilities = []

    for t in range(input.size(1)):
      result = self.forward_impl(input[:,t,:], h_t)
      if self.stohastic_component:
        transition_probs, h_t = result
        transition_probabilities.append(transition_probs)
      else:
        h_t = result
      output.append(h_t)

    # Stacking yields (seq_len, batch, ...); permute back to batch-first.
    output = torch.stack(output).permute(1, 0, 2)

    if self.stohastic_component:
      return output, h_t, torch.stack(transition_probabilities).permute(1, 0, 2)
    return output, h_t, None

  def forward_packed(self, input, batch_sizes, h_0):
    """Unroll the cell over the flat data of a ``PackedSequence``.

    ``h_0`` and the returned ``h_n`` are in length-sorted batch order.
    """
    output, h_n, t, h_t = [], [], 0, h_0

    if self.stohastic_component:
      transition_probabilities = []

    for batch_size in batch_sizes:
      batch_size = int(batch_size)

      # Rows beyond batch_size belong to sequences that have already ended;
      # set their last hidden state aside and continue with the rest.
      h_t, h_n_ = h_t[:batch_size], h_t[batch_size:]
      h_n.append(h_n_)
      result = self.forward_impl(input[t : t + batch_size], h_t)

      if self.stohastic_component:
        transition_probs, h_t = result
        transition_probabilities.append(transition_probs)
      else:
        h_t = result

      output.append(h_t)
      t += batch_size

    # The shortest sequences were set aside first, so reversing restores the
    # sorted (longest first) batch order.
    h_n.append(h_t)
    h_n.reverse()

    output, h_n = torch.cat(output), torch.cat(h_n)

    if self.stohastic_component:
      return output, h_n, torch.cat(transition_probabilities)
    return output, h_n, None

  def forward_impl(self, input, h_t):
    """Advance one time step.

    Returns the new hidden state, or ``(transition_probs, hidden_state)``
    when the stochastic component is enabled.
    """
    h_t_ = self.rnn_cell(input, h_t)
    if not self.stohastic_component:
      return h_t_

    # Dot product between each candidate hidden state and each learnable
    # state, giving scores of shape (batch, number_of_states).
    scores = (self.states * h_t_.unsqueeze(1)).sum(-1)
    transition_probs = self.softmax(scores / self.temperature)
    # The regularized hidden state is the probability-weighted mix of states.
    return transition_probs, torch.matmul(transition_probs, self.states)

Test with packed padded sequences

In [0]:
# Randomized check that SRRNN gives consistent results for padded tensors and
# for packed sequences, with and without the stochastic component.
with torch.no_grad():
  for _ in range(1024):

    # Draw a random configuration and a random batch.
    batch_size = random.randint(1, 15)
    seq_length = random.randint(1, 15)
    input_size = random.randint(1, 5)
    hidden_size = random.randint(1, 5)
    bias = bool(random.getrandbits(1))
    nonlinearity = random.choice(['tanh', 'relu'])
    mode = random.choice(['rnn', 'gru'])
    number_of_states = random.randint(1, 15)
    temperature = random.choice([1.00, 0.5, 0.1])

    input = torch.rand(batch_size, seq_length, input_size).to(device)
    h_0 = torch.rand(batch_size, hidden_size).to(device)

    # pack_padded_sequence requires the lengths on the CPU, so they are not
    # moved to `device` even when the data lives on the GPU.
    lengths = torch.randint(1, seq_length + 1, (batch_size,))
    packed_sequence = torch.nn.utils.rnn.pack_padded_sequence(
      input, lengths, batch_first=True, enforce_sorted=False
    )

    # without stohastic component
    model = SRRNN(
      input_size, hidden_size, bias=bias, mode=mode, nonlinearity=nonlinearity
    )
    model.to(device)

    output, h_n, _ = model(input, h_0)

    assert h_n.size() == (batch_size, hidden_size)
    assert output.size() == (batch_size, seq_length, hidden_size)

    output_packed, h_n_packed, _ = model(packed_sequence, h_0)

    assert h_n_packed.size() == (batch_size, hidden_size)
    assert output_packed.data.size() == (lengths.sum().item(), hidden_size)

    # The final state of a packed sequence must equal the padded run's output
    # at that sequence's last valid time step.
    for i in range(batch_size):
      assert torch.allclose(h_n_packed[i,:], output[i, lengths[i].item() - 1, :], atol=1e-07)

    # with stohastic component
    model = SRRNN(
      input_size, hidden_size, bias=bias, mode=mode, nonlinearity=nonlinearity,
      number_of_states=number_of_states, temperature=temperature
    )
    model.to(device)

    output, h_n, transition_probabilities = model(input, h_0)

    assert h_n.size() == (batch_size, hidden_size)
    assert output.size() == (batch_size, seq_length, hidden_size)
    assert transition_probabilities.size() == (batch_size, seq_length, number_of_states)

    output_packed, h_n_packed, transition_probabilities_packed = model(packed_sequence, h_0)

    assert h_n_packed.size() == (batch_size, hidden_size)
    assert output_packed.data.size() == (lengths.sum().item(), hidden_size)
    assert transition_probabilities_packed.data.size() == (lengths.sum().item(), number_of_states)

    for i in range(batch_size):
      assert torch.allclose(h_n_packed[i,:], output[i, lengths[i].item() - 1, :], atol=1e-07)

In [0]:
class NLNN(nn.Module):
  """Binary sequence classifier: embedding -> SRRNN -> linear -> softmax.

  Args:
    params: Dictionary of hyper-parameters with the keys
      ``'num_embeddings'`` (alphabet size, excluding the padding index 0),
      ``'hidden_size'``, ``'bias'``, ``'mode'`` and ``'nonlinearity'``, and
      optionally ``'embedding_dim'`` (a frozen one-hot encoding is used when
      absent), ``'number_of_states'`` and ``'temperature'`` (defaults to
      ``1.0`` when absent).
  """

  def __init__(self, params):
    super(NLNN, self).__init__()

    # + 1 because of the padding with 0s
    num_embeddings = params['num_embeddings'] + 1

    if 'embedding_dim' in params:
      embedding_dim = params['embedding_dim']
      self.embeddings = nn.Embedding(
        num_embeddings,
        embedding_dim,
        padding_idx = 0
      )
      input_size = embedding_dim
    else:
      # one hot encoding
      self.embeddings = nn.Embedding(
          num_embeddings,
          num_embeddings,
          padding_idx = 0
      )
      nn.init.eye_(self.embeddings.weight.data)
      self.embeddings.weight.requires_grad = False
      input_size = num_embeddings

    # A missing 'number_of_states' disables the stochastic component, and a
    # missing 'temperature' falls back to the SRRNN default.
    self.rnn = SRRNN(
      input_size, params['hidden_size'], bias=params['bias'],
      mode=params['mode'], nonlinearity=params['nonlinearity'],
      number_of_states=params.get('number_of_states'),
      temperature=params.get('temperature', 1.00)
    )

    self.linear = nn.Linear(
      in_features=params['hidden_size'], out_features=2, bias=True
    )

    self.softmax = nn.Softmax(dim=1)

  def forward(self, x, length, return_probabilities=False):
    """Classify a batch of zero-padded index sequences.

    Args:
      x: Batch-first tensor of symbol indices, padded with 0.
      length: Valid length of each sequence (tensor or list of ints).
      return_probabilities: If ``True``, also return what the RNN reports as
        its state transition probabilities.

    Returns:
      The class probabilities of shape `(batch, 2)`, or a tuple
      ``(class_probabilities, transition_probs)`` when
      ``return_probabilities`` is set.
    """
    embedding_output = self.embeddings(x)
    # pack_padded_sequence requires the lengths on the CPU, while the data
    # loaders may hand them over on the GPU.
    lengths_cpu = length.cpu() if torch.is_tensor(length) else length
    packed_sequence = nn.utils.rnn.pack_padded_sequence(
      embedding_output, lengths_cpu, batch_first=True, enforce_sorted=False
    )
    rnn_output, h_n, transition_probs = self.rnn(packed_sequence)
    # Only the final hidden state of each sequence is classified.
    linear_output = self.linear(h_n)
    softmax_output = self.softmax(linear_output)

    if return_probabilities:
      return softmax_output, transition_probs

    return softmax_output

In [0]:
class NLDataset(utils.data.Dataset):
  """Dataset of (sequence, length, label) triples held as long tensors.

  Args:
    dataset: Triple ``(data, length, labels)`` of array-likes; each one is
      converted to a long tensor and moved to the global ``device``.
  """

  def __init__(self, dataset):
    def as_long_on_device(values):
      return torch.tensor(values).long().to(device)

    data, length, labels = dataset
    self.data, self.length, self.labels = (
      as_long_on_device(values) for values in (data, length, labels)
    )

  def __len__(self):
    """Number of examples, i.e. the size of the first data dimension."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return example ``idx`` as a dict with the keys 'x', 'length', 'y'."""
    return dict(
      x=self.data[idx],
      length=self.length[idx],
      y=self.labels[idx],
    )

def load_data(filename_data, filename_length, filename_labels):
  """Load the three NumPy files that make up one dataset split.

  Args:
    filename_data: Path of the file with the input sequences.
    filename_length: Path of the file with the sequence lengths.
    filename_labels: Path of the file with the labels.

  Returns:
    Tuple ``(data, length, labels)`` with the loaded contents, in that order.
  """
  paths = (filename_data, filename_length, filename_labels)
  return tuple(numpy.load(path, allow_pickle=True) for path in paths)

In [0]:
def __evaluate(predictions, labels):
  """Compute binary classification metrics, treating class 1 as positive.

  Args:
    predictions: Sequence of predicted classes.
    labels: Sequence of gold classes, of the same length.

  Returns:
    Tuple ``(tp, tn, fp, fn, precision, recall, f1, accuracy)``.
  """
  assert len(predictions) == len(labels)

  # Confusion counts keyed by (gold is positive, predicted is positive).
  counts = {
    (True, True): 0, (True, False): 0,
    (False, True): 0, (False, False): 0,
  }
  for prediction, label in zip(predictions, labels):
    counts[bool(label == 1), bool(prediction == 1)] += 1

  tp, fn = counts[True, True], counts[True, False]
  fp, tn = counts[False, True], counts[False, False]

  if tp:
    pr = tp / (tp + fp)
    r = tp / (tp + fn)
    f1 = 2 * ((pr * r) / (pr + r))
  elif fn or fp:
    # No true positives although positives were expected or predicted.
    pr, r, f1 = 0, 0, 0
  else:
    # No positives at all, neither gold nor predicted: a perfect score.
    pr, r, f1 = 1, 1, 1

  accuracy = (tp + tn) / (tp + tn + fp + fn)

  return tp, tn, fp, fn, pr, r, f1, accuracy

def _evaluate(data_loader, model):
  """Run the model over a data loader and score its predictions.

  The model is switched to evaluation mode and stays in it afterwards.

  Args:
    data_loader: Iterable of batches, each a dict with the keys 'x',
      'length' and 'y'.
    model: Model called as ``model(x, length)`` that returns one score per
      class for every example.

  Returns:
    The tuple produced by ``__evaluate``:
    ``(tp, tn, fp, fn, precision, recall, f1, accuracy)``.
  """
  model.eval()

  predictions, labels = [], []

  # No gradients are needed for scoring; skipping them saves memory and time.
  with torch.no_grad():
    for data in data_loader:
      result = model(data['x'], data['length'])
      # The predicted class is the one with the highest score.
      predictions.extend(result.argmax(dim=1).tolist())
      labels.extend(data['y'].tolist())

  return __evaluate(predictions, labels)

def evaluate_model(data_loader, model, set_name):
  """Evaluate the model on a data loader and print a one-line report.

  Args:
    data_loader: Batches to evaluate on.
    model: Model to evaluate.
    set_name: Label printed in front of the metrics (e.g. "train").
  """
  metrics = _evaluate(data_loader, model)
  template = "{} : TP : {} TN : {} FP : {} FN : {} Pr : {} R : {} F1: {} ACC : {} "
  print(template.format(set_name, *metrics))

def model_summary(model):
  """Print the model, its trainable parameter names and their total size.

  Args:
    model: The ``nn.Module`` to describe.
  """
  print(model)
  print()

  # Only parameters that receive gradients are reported.
  trainable = [
    (name, param) for name, param in model.named_parameters()
    if param.requires_grad
  ]

  print("Trainable parameters:")
  for name, _ in trainable:
    print(" ", name)
  print()

  total = sum(param.numel() for _, param in trainable)
  print("Number of trainable parameters {0:,}".format(total))
  print()

def train_model(
    model, params=None, params_path=None, model_path=None,
    train_loader=None, dev_loader=None
):
  """Train the model with Adam and optionally save it.

  When ``dev_loader`` is given, the model is evaluated on it after every
  epoch and the weights of the best epoch are restored before saving.

  Args:
    model: Model called as ``model(x, length)`` that returns class
      probabilities (softmax outputs).
    params: Hyper-parameter dict; ``'num_epochs'`` is required.
    params_path: If set (together with ``model_path``), ``params`` is pickled
      to ``MODELS_DIR + params_path``.
    model_path: If set, the state dict is saved to ``MODELS_DIR + model_path``.
    train_loader: Training batches (required), each a dict with the keys
      'x', 'length' and 'y'.
    dev_loader: Optional development batches used for model selection.
  """
  model.to(device)
  model_summary(model)

  # The model already outputs softmax probabilities. CrossEntropyLoss expects
  # raw logits (it applies log-softmax itself), so the negative
  # log-likelihood of the log-probabilities is used instead.
  loss_function = nn.NLLLoss()
  optimizer = optim.Adam(model.parameters(), lr=0.001)

  best_dev_accuracy, best_state_dict, best_epoch = 0, None, -1

  for epoch in range(params['num_epochs']):
    print('Epoch {}/{} : '.format(epoch, params['num_epochs']))
    t0 = time.time()

    model.train() # set the model to training mode
    for data in train_loader:
      model.zero_grad()
      probabilities = model(data['x'], data['length'])
      # Clamp before the log so an underflowed probability cannot give -inf.
      log_probabilities = torch.log(probabilities.clamp_min(1e-12))
      batch_loss = loss_function(log_probabilities, data['y'])
      batch_loss.backward()
      optimizer.step()

    if dev_loader is not None:
      _, _, _, _, _, _, _, dev_accuracy = _evaluate(dev_loader, model)

      if dev_accuracy > best_dev_accuracy:
        best_dev_accuracy = dev_accuracy
        # Snapshot the weights; state_dict() alone only holds references.
        best_state_dict = copy.deepcopy(model.state_dict())
        best_epoch = epoch

      print(
        "dev accuracy:{}\ttime:{:.2f}s"
        .format(dev_accuracy, time.time() - t0)
      )

    _, _, _, _, _, _, _, train_accuracy = _evaluate(train_loader, model)

    print(
      "train accuracy:{}\ttime:{:.2f}s"
      .format(train_accuracy, time.time() - t0)
    )

  # Continue with the best weights seen on the dev set, if any were recorded.
  if best_state_dict is not None:
    model.load_state_dict(best_state_dict)

  if model_path:
    if best_epoch >= 0:
      print("Best dev epoch {} and accuracy {}".format(best_epoch, best_dev_accuracy))
    torch.save(model.state_dict(), MODELS_DIR + model_path)
    if params_path:
      with open(MODELS_DIR + params_path, 'wb') as f:
        pickle.dump(params, f)

  evaluate_model(train_loader, model, "train")
  if dev_loader is not None:
    evaluate_model(dev_loader, model, "dev")


In [0]:
# Each split of the "numeral" dataset is stored as three parallel .npy files.
train_data = load_data(
  filename_data=DATA_DIR + "/numeral/train.data.npy",
  filename_length=DATA_DIR + "/numeral/train.length.npy",
  filename_labels=DATA_DIR + "/numeral/train.labels.npy",
)
dev_data = load_data(
  filename_data=DATA_DIR + "/numeral/dev.data.npy",
  filename_length=DATA_DIR + "/numeral/dev.length.npy",
  filename_labels=DATA_DIR + "/numeral/dev.labels.npy",
)
test_data = load_data(
  filename_data=DATA_DIR + "/numeral/test.data.npy",
  filename_length=DATA_DIR + "/numeral/test.length.npy",
  filename_labels=DATA_DIR + "/numeral/test.labels.npy",
)

# The pickled alphabet; only its size is needed here.
alphabet = {}
with open(DATA_DIR + '/numeral/alphabet.dict', 'rb') as f:
  alphabet = pickle.load(f)

# Hyper-parameters; enable the two commented entries to turn on the
# state-regularized (stochastic) component.
params = dict(
  batch_size=3,
  num_epochs=100,
  num_embeddings=len(alphabet),
  embedding_dim=3,
  mode='rnn',
  nonlinearity='tanh',
  hidden_size=100,
  bias=True,
  # number_of_states=5,
  # temperature=0.0001,
)

# Only the training batches are shuffled.
train_loader = utils.data.DataLoader(
  dataset=NLDataset(train_data), batch_size=params["batch_size"], shuffle=True
)
dev_loader = utils.data.DataLoader(
  dataset=NLDataset(dev_data), batch_size=params["batch_size"]
)
test_loader = utils.data.DataLoader(
  dataset=NLDataset(test_data), batch_size=params["batch_size"]
)

# NOTE(review): the files are named "gru" although 'mode' is 'rnn' - confirm.
model = NLNN(params)
train_model(
  model,
  params=params,
  params_path="/numeral/gru.model.params",
  model_path="/numeral/gru.model.pt",
  train_loader=train_loader,
  dev_loader=dev_loader,
)
#evaluate_model(test_loader, model, "test")

In [0]:
def load_model(params_path=None, model_path=None):
  """Rebuild an NLNN from pickled hyper-parameters and a saved state dict.

  Args:
    params_path: Path of the pickled ``params`` dict, relative to
      ``DATA_DIR`` (required despite the ``None`` default).
    model_path: Path of the saved state dict, relative to ``DATA_DIR``
      (required despite the ``None`` default).

  Returns:
    Tuple ``(model, params)`` with the model already moved to ``device``.
  """
  # NOTE(review): train_model saves under MODELS_DIR while this reads from
  # DATA_DIR - confirm that this is intended.
  with open(DATA_DIR + params_path, 'rb') as f:
    params = pickle.load(f)
  model = NLNN(params)
  # map_location lets a checkpoint saved on the GPU be loaded on a CPU-only
  # machine (and the other way round).
  state_dict = torch.load(DATA_DIR + model_path, map_location=device)
  model.load_state_dict(state_dict)
  model.to(device)
  return model, params

In [0]:
from graphviz import Digraph

def draw(transitions, is_final, start_state, pdf):
  """Render an automaton as a PDF with Graphviz.

  Args:
    transitions: Collection of ``(source, label, target)`` triples; it is
      iterated twice.
    is_final: Indexable by state; a value equal to 1 marks a final state.
    start_state: State that receives the incoming start arrow.
    pdf: Output file name, appended to ``DATA_DIR``.
  """
  states = set()
  for q_1, _, q_2 in transitions:
    states.update((q_1, q_2))

  g = Digraph('G', filename=DATA_DIR + pdf, format='pdf')
  g.attr(rankdir='LR', size='8,5')

  for state in states:
    # NOTE(review): a final start state is drawn exactly like any other final
    # state - confirm whether it was meant to get a distinct shape.
    if is_final[state] == 1:
      shape = 'doublecircle'
    elif state == start_state:
      shape = 'doubleoctagon'
    else:
      shape = 'circle'

    g.attr('node', shape=shape, style='filled', color='gray80')
    g.node(str(state))

    # Reset the node defaults so the unnamed source of the start arrow,
    # created by the edge below, is drawn as a point.
    g.attr('node', shape='point', color="black")

  g.edge('', str(start_state), label='', color="black")

  for q_1, l, q_2 in transitions:
    g.edge(str(q_1), str(q_2), label=str(l))

  g.render()
  print('DFA extracted at: ' + str(DATA_DIR + pdf + ".pdf"))


In [0]:
# Restore a previously trained model together with its hyper-parameters.
model, params = load_model(
  "/model/srrnn-light/model.params",
  "/model/srrnn-light/model.pt",
)
print(params)
evaluate_model(data_loader=train_loader, model=model, set_name="train")
evaluate_model(data_loader=dev_loader, model=model, set_name="dev")
# NOTE(review): from here on dev_loader iterates over the TRAINING data, one
# example per batch - confirm that the name is intentional.
dev_loader = utils.data.DataLoader(dataset=NLDataset(train_data), batch_size=1)

# inv_alphabet = { v : k for k, v in alphabet.items() }

# transitions = set()
# for data in dev_loader:
#   x, length = data['x'], data['length']
#   #print (x, length)
#   #print (data['y'].item(), "".join([inv_alphabet[c] for c in data['x'][0].cpu().numpy()[:data['length'][0]]]))
#   softmax_output, transition_probabilities = model(x, length, return_probabilities=True)
#   transition_probabilities, _ = nn.utils.rnn.pad_packed_sequence(transition_probabilities, batch_first=True)
#   first = x[0]
#   prev_state = 0
#   for t in range(length[0]):
#     l = inv_alphabet[first[t].item()]
#     next_state = torch.argmax(transition_probabilities[0, t, ]).item() + 1
#     transitions.add((prev_state, l, next_state))
#     prev_state = next_state

# h_0 = torch.zeros((1, params['hidden_size']), device=device)
# is_final = model.softmax(model.linear(torch.cat((h_0, model.rnn.states)))).argmax(dim=1).cpu().numpy()
# draw(transitions, is_final, 0, "/model.35.train")