In [17]:
import numpy as np
import pandas as pd
import json
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import lasagne
from lasagne.layers import *
from lasagne.layers import Layer
from lasagne.random import get_rng

from agentnet import Recurrence
from agentnet.resolver import ProbabilisticResolver
from agentnet.memory import *

In [18]:
filename = 'log'
log_filename = 'data/'+filename+'.csv'
json_filename = 'output/' + filename + ".json"
dot_filename = 'output/' + filename + ".dot"

In [19]:
class GumbelSigmoid:
    def __init__(self,
                 t=0.1,
                 discrete=True,
                 eps=1e-20):
        assert t != 0
        self.temperature=t
        self.eps=eps
        self.discrete=discrete
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
    def __call__(self,logits):
        """computes a gumbel softmax sample"""
                
        #sample from Gumbel(0, 1)
        uniform1 = self._srng.uniform(logits.shape,low=0,high=1)
        uniform2 = self._srng.uniform(logits.shape,low=0,high=1)
        
        noise = -T.log(T.log(uniform2 + self.eps)/T.log(uniform1 + self.eps) +self.eps)
        
        #draw a sample from the Gumbel-Sigmoid distribution
        gumbel_sigm = T.nnet.sigmoid((logits + noise) / self.temperature)
        
        if self.discrete:
            return theano.gradient.zero_grad(hard_sigm(logits + noise) - gumbel_sigm) + gumbel_sigm
        else:
            return gumbel_sigm

In [20]:
def hard_sigm(logits):
    """computes a hard indicator function. Not differentiable"""
    return T.switch(T.gt(logits,0),1,0)

In [21]:
def read_log(filename, sep=';', sheetname='Лист1'):
    if filename.lower().endswith('.csv'):
        df = pd.read_csv(filename,sep=sep)
    elif filename.lower().endswith('.xlsx'):
        xl = pd.ExcelFile(filename)
        df = xl.parse(sheetname)
    else:
        raise Exception("Некорректный тип лога")
    return df

In [22]:
def preprocessing(filename):
    df = read_log(log_filename)
    tracks = [list(track.sort_values(by=["timest"], ascending=True).activity.values) for track_id, track in df.groupby(df.trace)]    
    tokens = sorted(list(set(df.activity)) +['#'])
    tokens
    token_to_id = {t:i for i,t in enumerate(tokens)}
    id_to_token = {i:t for i,t in enumerate(tokens)}
    MAX_LEN = max(list(map(len, tracks)))
    tracks_ix = list(map(lambda track: list(map(token_to_id.get,track)), tracks))
    for i in range(len(tracks_ix)):
        if len(tracks_ix[i]) < MAX_LEN:
            tracks_ix[i] += [token_to_id['#']]*(MAX_LEN - len(tracks_ix[i]))
    tracks_ix = np.array(tracks_ix)
    return tokens, tracks_ix, id_to_token, token_to_id

In [23]:
def sample_batch(data, batch_size):
    rows = data[np.random.randint(0,len(data),size=batch_size)]
    return rows

In [24]:
def binary_state_to_id(binary_state):
    return str(int(sum(val*2**index for index, val in enumerate(binary_state))))

In [25]:
def model(tokens, token_to_id, neurons_num=5):
    sequence = T.matrix('token sequence','int64')
    inputs = sequence[:,:-1] # по символу предсказываю следующий, поэтому входные - все, кроме последнего
    targets = sequence[:,1:] # а выходные - все, кроме первого
    l_input_sequence = InputLayer(shape=(None, None),input_var=inputs)
    tau = theano.shared(np.float32(0.1))
    pseudo_sigmoid = GumbelSigmoid(t=tau)
    class step:    
        #inputs
        h_prev = InputLayer((None, neurons_num),name='previous rnn state')
        inp = InputLayer((None,),name='current character')
        emb = EmbeddingLayer(inp, len(tokens), 30, name='emb') # сопоставление при условии минимума фции потерь
        
        #recurrent part                 
        f_dense = DenseLayer(concat([h_prev, emb]), num_units=8, nonlinearity=T.nnet.relu)    # 0 .. +inf
        s_dense = DenseLayer(f_dense, num_units=neurons_num, nonlinearity=None) # -inf .. +inf
        
        next_state_probs = NonlinearityLayer(s_dense, T.nnet.sigmoid) # Вероятность битов состояния
        h_new = NonlinearityLayer(s_dense, pseudo_sigmoid) # Новое состояние - вектор битов

        next_token_probas = DenseLayer(s_dense, len(tokens),nonlinearity=T.nnet.softmax) # Вероятность токена - P(Ti|Ti-1....T0)
    batch_size = sequence.shape[0]
    initial_state = InputLayer((None, neurons_num), T.zeros((batch_size, neurons_num)))
    training_loop = Recurrence(
        state_variables={step.h_new:step.h_prev},
        state_init={step.h_new:initial_state},
        input_sequences={step.inp:l_input_sequence},
        tracked_outputs=[step.next_token_probas,],
        unroll_scan=False,
    )  
    weights = lasagne.layers.get_all_params(training_loop, trainable=True)    
    predicted_probabilities = lasagne.layers.get_output(training_loop[step.next_token_probas])
    xent = lasagne.objectives.categorical_crossentropy(predicted_probabilities.reshape((-1,len(tokens))),
                                                   targets.reshape((-1,))).reshape(targets.shape)
    mask = T.neq(inputs, token_to_id["#"]) # оставляем только значимые токены, т.к. незначимые появились после паддинга
    loss = (mask * xent).sum(axis=1).mean() # функция ошибки

    #<Loss function - a simple categorical crossentropy will do, maybe add some regularizer>
    updates = lasagne.updates.adam(loss, weights)
    train_step = theano.function([sequence], loss,
                             updates=training_loop.get_automatic_updates()+updates)
    
    h_deterministic = NonlinearityLayer(step.next_state_probs, lambda x: T.gt(x, 0.5).astype(x.dtype))
    validation_loop = Recurrence(
        state_variables={h_deterministic:step.h_prev},
        state_init={h_deterministic:initial_state},
        input_sequences={step.inp:l_input_sequence},
        tracked_outputs=[step.next_token_probas,],
        unroll_scan=False,
    )
    states_seq = get_output(validation_loop[h_deterministic], {l_input_sequence:sequence})
    infer_states = theano.function([sequence], states_seq, updates=None)
    return train_step, infer_states

In [26]:
def training(tokens, tracks_ix, token_to_id, neurons_num=5, n_epochs = 25, batches_per_epoch = 250, batch_size= 10):
    train_step, infer_states = model(tokens,token_to_id)    
    for epoch in range(n_epochs):
        avg_cost = 0;
        for _ in range(batches_per_epoch):
            avg_cost += train_step(sample_batch(tracks_ix, batch_size))
        print("\n\nEpoch {} average loss = {}".format(epoch, avg_cost / batches_per_epoch))
    return infer_states

In [27]:
def build_json(filename, inferred_states, tracks_ix, neurons_num=10): 
    graph = {"states":set(), "transitions":list(), "meta":{"isAccepting":set()}}
    all_states = []
    all_used_states = set()    

    for binary_states in inferred_states:
        binary_states = np.vstack((np.zeros(neurons_num), binary_states)) # add initial state
        states = list(map(binary_state_to_id, binary_states))
        graph["states"].update(set(states))
        all_states.append(states)    
    
    for states, track_ids in zip(all_states, tracks_ix):
        track = list(map(id_to_token.get, track_ids))
        for index, symbol in enumerate(track):
            transition = {"from":states[index], "to":states[index + 1], "track":symbol}
            all_used_states.add(transition["from"])
            all_used_states.add(transition["to"])
            if transition not in graph["transitions"]:
                graph["transitions"].append(transition)  
        graph["meta"]["isAccepting"].add(states[len(track)])
        
    graph["states"] = list(all_used_states)
    graph["meta"]["isAccepting"] = list(graph["meta"]["isAccepting"])
    graph["meta"]["tracksNum"] = len(tracks_ix)
    with open(filename, "w") as json_file:
        json.dump(graph, json_file)
    return graph

In [28]:
def build_dot(json_graph, filename):
    graph = "digraph test {\n"
    for state in json_graph["states"]:
        if state in json_graph["meta"]["isAccepting"]:
            graph += "\t" + state + " [shape=doublecircle];\n"
        else:
            graph += "\t" + state + ";\n"
    for transition in json_graph["transitions"]:
        graph += "\t" + transition["from"] + " -> " + transition["to"] 
        graph += " [label=\"" + transition["track"] + "\"];\n"
    graph += "}" 
    
    with open(filename, "w") as graph_file:
        print(graph, file=graph_file, end="")
        
    return graph

In [29]:
tokens, tracks_ix, id_to_token, token_to_id = preprocessing(filename)    
infer_states = training(tokens, tracks_ix, token_to_id, 10)
inferred_states = infer_states(tracks_ix)
graph = build_json(json_filename, inferred_states, tracks_ix)
dot_graph = build_dot(graph, dot_filename)
!C:\Users\9016danilov-as\Documents\graphviz-2.38\release\bin\dot output/log1.dot -Tpng -o output/log1.png



Epoch 0 average loss = 3.962332988550206


Epoch 1 average loss = 2.136803028987426


Epoch 2 average loss = 1.86175525344277


Epoch 3 average loss = 1.6590040576598115


Epoch 4 average loss = 1.5371119987343587


Epoch 5 average loss = 1.4736885807658635


Epoch 6 average loss = 1.425573951678219


Epoch 7 average loss = 1.4257997702321756


Epoch 8 average loss = 1.3781801830865053


Epoch 9 average loss = 1.3407805026852506


Epoch 10 average loss = 1.2271034258780726


Epoch 11 average loss = 1.1174788308087285


Epoch 12 average loss = 1.002400049335712


Epoch 13 average loss = 0.9117776113924964


Epoch 14 average loss = 0.8668075448652973


Epoch 15 average loss = 0.8211766729082657


Epoch 16 average loss = 0.782874127679791


Epoch 17 average loss = 0.7636623948403101


Epoch 18 average loss = 0.7569500446364912


Epoch 19 average loss = 0.7477379331150648


Epoch 20 average loss = 0.7429234214451412


Epoch 21 average loss = 0.7314433767337961


Epoch 22 average loss = 0

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 10 and the array at index 1 has size 5