# TAYSIR competition - Track 1 Starter Kit

### Welcome!

This notebook shows the structure of the code needed to participate in the competition.

You can also check the baseline notebook (available in the same archive) for more details about the TAYSIR models and how to use them.

## Prepare your environment

In [None]:
%pip install --upgrade mlflow torch transformers

In [None]:
import torch
import mlflow
from utils import predict, PytorchInference
import sys
import pandas as pd

### Persisting results for logging

In [None]:
def persist_results(dataset, learning_result, max_extraction_time):
    """Log a summary of one extraction run to wandb and close the run.

    Args:
        dataset: Identifier of the dataset the model was extracted from;
            stored under the 'Instance' key.
        learning_result: Learner output. Its ``model.states`` is counted and
            its ``info`` mapping supplies the query counts and the duration.
        max_extraction_time: Time bound that was given to the learner;
            stored under the 'TimeBound' key.

    Side effects:
        Updates ``wandb.config`` with the summary and then calls
        ``wandb.finish()``, so ``wandb`` must already be imported and a run
        must be active when this is called.
    """
    state_count = len(learning_result.model.states)
    info = learning_result.info

    summary = {
        'Instance': dataset,
        'Number of Extracted States': state_count,
        'EquivalenceQuery': info['equivalence_queries_count'],
        'MembershipQuery': info['membership_queries_count'],
        'Duration': info['duration'],
        'TimeBound': max_extraction_time,
    }

    wandb.config.update(summary)
    wandb.finish()

    # TODO: persist the history as well (original note: "Soon: history!").

### Model extraction

In [None]:
import pickle
from utils import predict, PytorchInference
import numpy as np
from wrapper import MlflowDFA
from submit_tools_fix import save_function
from pythautomata.utilities.uniform_word_sequence_generator import UniformWordSequenceGenerator
from pythautomata.model_exporters.dot_exporters.dfa_dot_exporting_strategy import DfaDotExportingStrategy
from pymodelextractor.teachers.pac_comparison_strategy import PACComparisonStrategy
from pymodelextractor.teachers.general_teacher import GeneralTeacher
from pymodelextractor.factories.lstar_factory import LStarFactory
from pythautomata.base_types.alphabet import Alphabet
from utils import test_model
from pymodelextractor.learners.observation_table_learners.translators.partial_dfa_translator import PartialDFATranslator
import wandb

TRACK = 1  # always 1 for this track
DATASET = 7

# Extraction hyperparameters.
max_extraction_time = 60  # handed to the learner as its max_time
max_sequence_len = 80
min_sequence_len = 10
epsilon = 0.01
delta = 0.01

# Hyperparameters logged to wandb. The key is derived from DATASET so that the
# logged configuration stays correctly labelled when another dataset is chosen.
params = dict()
params[f'DATASET_{DATASET}'] = {
    "max_extraction_time": max_extraction_time,
    "max_sequence_len": max_sequence_len,
    "min_sequence_len": min_sequence_len,
    "epsilon": epsilon,
    "delta": delta,
}
# Initialize wandb
wandb.init(
    # Set the project where this run will be logged
    project="taysir_track_1",
    # Track hyperparameters and run metadata
    config=params,
)

counter = 0
# No previous observation table is supplied to the learner.
observation_table = None

# Load the network to extract from and switch it to evaluation mode.
model_name = f"models/1.{DATASET}.taysir.model"
model = mlflow.pytorch.load_model(model_name)
model.eval()

file = f"datasets/1.{DATASET}.taysir.valid.words"

# Number of symbols removed from the declared alphabet size.
# NOTE(review): presumably these are two reserved marker symbols
# (e.g. start/end of sequence) -- confirm against the dataset format.
empty_sequence_len = 2
with open(file) as f:
    # Only the header line is needed: per the dataset format it holds the
    # number of sequences followed by the alphabet size.
    a = f.readline()
    headline = a.split(' ')
    alphabet_size = int(headline[1].strip())
    alphabet = Alphabet.from_strings([str(x) for x in range(alphabet_size - empty_sequence_len)])

# Wrap the network so it can be queried by the teacher.
name = f"Track: {TRACK} - DataSet: {DATASET}-  partial n° {counter}"
target_model = PytorchInference(alphabet, model, name)

# Word generator used by the comparison strategy, bounded by the
# configured minimum and maximum sequence lengths.
sequence_generator = UniformWordSequenceGenerator(
    alphabet,
    max_seq_length=max_sequence_len,
    min_seq_length=min_sequence_len,
)

comparator = PACComparisonStrategy(
    target_model_alphabet=alphabet,
    epsilon=epsilon,
    delta=delta,
    sequence_generator=sequence_generator,
)

teacher = GeneralTeacher(target_model, comparator)

learner = LStarFactory.get_partial_dfa_lstar_learner(max_time=max_extraction_time)

res = learner.learn(teacher, observation_table)

# Logs the run summary to wandb; persist_results also finishes the run.
persist_results(DATASET, res, max_extraction_time)

# Kept as a safety net in case the run is still open at this point.
wandb.finish()

### Some quick metrics

In [None]:
# Show the learner's bookkeeping followed by the size of the extracted model.
extraction_info = res.info
extracted_states = res.model.states

print("Result info:", extraction_info)
print("---------------------------")
print("Number of extracted states:", len(extracted_states))

### Test with uniform length sequences

In [None]:
# Get validation sequence max length

In [None]:
# Run test_model on the target network and the extracted model, using 1000
# sequences with lengths bounded by 50 and 1000, then display the mean of
# what it returned.
# NOTE(review): presumably `result` holds per-sequence agreement scores --
# confirm against utils.test_model.
extracted_model = res.model
result = test_model(
    target_model,
    extracted_model,
    max_seq_len=1000,
    min_seq_len=50,
    sequence_amount=1000,
)
np.mean(result)

### Submission

In [None]:
# Give the extracted model a dataset-specific name before exporting it.
res.model.name = f"Dataset{DATASET}-1Acc"
res.model.export()

In [None]:
from fast_dfa_converter import FastDeterministicFiniteAutomatonConverter as Converter

# Convert the extracted model into the fast DFA representation used for submission.
converter = Converter()
fast_dfa = converter.to_fast_dfa(res.model)

In [None]:
from wrapper import MlflowDFA
from submit_tools_fix import save_function

# The fast DFA is handed to save_function directly; it is not wrapped in
# MlflowDFA here.
alphabet_length = len(res.model.alphabet)
save_function(fast_dfa, alphabet_length, target_model.name)