# Train RNN with data extracted from HDFS system traces
(Example from: Property Checking with Interpretable Error Characterization)

In [None]:
#!pip install ml_dtypes==0.2.0

In [1]:
import sys
sys.path.append("../")
from pythautomata.model_comparators.wfa_partition_comparison_strategy import WFAPartitionComparator
from pythautomata.utilities.probability_partitioner import TopKProbabilityPartitioner
from pythautomata.model_exporters.partition_mapper import TopKPartitionMapper
from pythautomata.model_exporters.image_exporters.wfa_image_exporter_with_partition_mapper import WFAImageExporterWithPartitionMapper
from pythautomata.base_types.symbol import SymbolStr

from pymodelextractor.learners.observation_tree_learners.bounded_pdfa_quantization_n_ary_tree_learner import BoundedPDFAQuantizationNAryTreeLearner
from pymodelextractor.teachers.pac_batch_probabilistic_teacher import PACBatchProbabilisticTeacher

from utilities.neural_networks.model_definitions import Models 
from data.data_loaders.deeplog_data_loader import DeepLogDataLoader
from utilities.neural_networks.neural_nets_train_auxiliar import load_data_and_train_last_token_language_model_network
from  tensorflow.keras.optimizers import Adam
from IPython.display import Image



### Get DataLoader 

In [2]:
# Point the DeepLog data loader at the HDFS training set and keep its alphabet,
# which is used below to size the network vocabularies.
hdfs_train_path = "../data/deep_log/hdfs_train"
dataloader = DeepLogDataLoader(hdfs_train_path)
alphabet = dataloader.alphabet

### Select network architecture from predefined ones 

In [3]:
# Both vocabularies are the alphabet plus one extra entry.
has_embedding = False
alphabet_size = len(alphabet)
input_vocab_size = alphabet_size + 1   # +1 to consider the padding symbol
output_vocab_size = alphabet_size + 1  # +1 to consider the terminal symbol

# Build the predefined last-token LSTM architecture with those sizes.
keras_network_architecture = Models.last_token_lstm_model(
    input_vocab_size,
    output_vocab_size,
    has_embedding,
)

### Define data generation and training parameters

In [4]:
# Data-generation settings.
max_sequence_length_for_training = 20
generated_training_data_size = 10000
window_size = 40
seed = 28

# Special symbols handed to the (currently commented-out) training call.
padding_symbol = SymbolStr('x')
terminal_symbol = SymbolStr('$')

# NOTE(review): this path has no "../" prefix, whereas the pre-trained model is
# loaded from "../neural_networks/trained_models/..." -- confirm where the
# training helper is expected to write before re-enabling training.
output_path = "neural_networks/trained_models/hdfs_last_token_language_model/"

# Hyper-parameters forwarded to the training helper; 'criterion' holds the
# optimizer class (Adam), not a loss function.
training_params = {
    'patience': 10,
    'epochs': 50,
    'batch_size': 100,
    'learning_rate': 0.01,
    'criterion': Adam,
    'loss': 'categorical_crossentropy',
}

In [5]:
from utilities.neural_networks.rnn_language_models.last_token_language_model import LastTokenLanguageModel

# Load the previously trained language model from disk instead of retraining.
trained_model_dir = "../neural_networks/trained_models/hdfs_last_token_language_model/"
model = LastTokenLanguageModel(trained_model_dir, verbose=0)

The model has been successfully loaded


### Generate data and Train
(These abstractions are used for the sake of simplicity; any model and architecture is valid, as long as it implements the ProbabilisticModel interface.)

In [6]:
# Training is disabled here: the notebook uses the model loaded from disk in the
# earlier cell. Uncomment the line below to regenerate the data and retrain.
# NOTE(review): if re-enabled, this rebinds `model` and also binds the name
# `eval`, which shadows the Python builtin for the rest of the session.
#model, eval = load_data_and_train_last_token_language_model_network(dataloader, output_path, window_size, padding_symbol, terminal_symbol, keras_network_architecture, training_params, seed, not has_embedding)

### Define Probability Partitioner

In [7]:
# Partitioner shared by the comparator and the learner defined further down.
# NOTE(review): the argument 6 is presumably the K of "top-K" -- confirm against
# TopKProbabilityPartitioner; the export cell hard-codes the same value.
partitioner = TopKProbabilityPartitioner(6)

### Define a Synchronic-Model
The Synchronic-Model wraps the Model and is responsible for setting the transitions that fall below the top K to zero (if required, it is also able to mask the Model).

In [8]:
from utilities.syncronic_model_guided_language_model import SyncronicModelGuidedLanguageModel

# Wrap the loaded language model; no guiding model is supplied.
syncrhronic_model = SyncronicModelGuidedLanguageModel(
    model,
    guiding_model=None,
    model_name="HDFS_SYNCH",
    max_seq_length=10,
    normalize_outputs=True,
    top_k=2,
)

### Define a Learner, a Comparator, and a Teacher

In [19]:
from utilities.hypothesis_aware_sample_probabilistic_teacher import HypothesisAwareSampleProbabilisticTeacher

# The comparator uses the same partitioner as the learner.
comparator = WFAPartitionComparator(partitioner)

# Bounded learner: capped at 20 states and queries of length 5, with no time limit.
learner = BoundedPDFAQuantizationNAryTreeLearner(
    partitioner,
    max_states=20,
    max_query_length=5,
    max_seconds_run=None,
    generate_partial_hipothesis=True,
    pre_cache_queries_for_building_hipothesis=True,
    check_probabilistic_hipothesis=False,
    omit_zero_transitions=True,
)

# NOTE(review): the meaning of the positional arguments 100 and 5 is not visible
# here -- confirm against HypothesisAwareSampleProbabilisticTeacher.
teacher = HypothesisAwareSampleProbabilisticTeacher(
    syncrhronic_model,
    comparator,
    100,
    5,
)

In [20]:
# Sanity check: show the alphabet exposed by the loaded model.
print(model.alphabet)

frozenset({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29})


### Run extraction

In [21]:
# Run the extraction against the teacher (verbose logging on) and keep the
# model carried by the learning result.
learning_result = learner.learn(teacher, verbose=True)
extracted_model = learning_result.model

Size before update: 8
CE: 5,9,21,23,23
----update_node----
Old Node (new Leaf) 5,9,21
New Leaf 5,9,21,23
dict_keys([5, ϵ, 22, 5,9, 5,21, 5,9,4, 5,9,21])
dict_keys([5, ϵ, 22, 5,9, 5,21, 5,9,4, 5,9,21, 5,9,21,23])
--------
Size after update: 9
Size before update: 9
CE: 5,9,4,3,4
----update_node----
Old Node (new Leaf) 5,9,4
New Leaf 5,9,4,3
dict_keys([5, ϵ, 22, 5,9, 5,21, 5,9,4, 5,9,21, 5,9,21,23, 5,9,4,3,4])
dict_keys([5, ϵ, 22, 5,9, 5,21, 5,9,4, 5,9,21, 5,9,21,23, 5,9,4,3,4, 5,9,4,3])
--------
QueryLengthExceeded


In [None]:
# Output directory and base file name for the partition-mapped rendering.
path = 'learning_outputs/'
mapped_name = extracted_model.name+"_partitioned_top_6"

# Build the exporter first, then export the extracted model with it.
partition_image_exporter = WFAImageExporterWithPartitionMapper(
    TopKProbabilityPartitioner(6),
    TopKPartitionMapper(),
)
partition_image_exporter.export(extracted_model, path, mapped_name)


In [None]:
# Display the PNG written for the partition-mapped model.
mapped_image_file = path + mapped_name + '.png'
Image(mapped_image_file)

In [None]:
# Export the extracted model itself, then display its image.
# Assumes export() writes "<model name>.png" under `path` -- TODO confirm.
extracted_model.export(path)
model_image_file = path + extracted_model.name + '.png'
Image(model_image_file)

In [None]:
# Print the transition set of every weighted state of the extracted model.
for state in extracted_model.weighted_states:
    state_transitions = state.transitions_set
    print(state_transitions)