Imports and model names

In [None]:
from transformers import AutoTokenizer

# Model identifiers passed to `from_pretrained` when the tokenizers and models are loaded.
gpt2_model_id = "gpt2"
bert_model_id = "prajjwal1/bert-small"

Defining tokenizers

In [None]:
# Load the tokenizer matching each model id, requesting the fast implementation.
gpt2_tokenizer = AutoTokenizer.from_pretrained(
    gpt2_model_id,
    use_fast=True,
    add_prefix_space=False,
)
# NOTE(review): add_prefix_space is documented for byte-level BPE tokenizers
# such as GPT-2's; confirm it has any effect on the BERT tokenizer.
bert_tokenizer = AutoTokenizer.from_pretrained(
    bert_model_id,
    use_fast=True,
    add_prefix_space=False,
)

In [None]:
def decode_sequence(sequence, tokenizer):
    """Decode every element of ``sequence`` on its own.

    Unlike a single ``tokenizer.decode(sequence)`` call, this keeps one
    decoded value per token, so the token boundaries remain visible.

    Args:
        sequence: Iterable of tokens (e.g. the ids returned by
            ``tokenizer.encode``); each element is passed to
            ``tokenizer.decode`` individually.
        tokenizer: Object exposing a ``decode(token)`` method.

    Returns:
        A list with one decoded value per element of ``sequence``, in the
        same order. An empty ``sequence`` yields an empty list.
    """
    return [tokenizer.decode(token) for token in sequence]

In [None]:
def test_tokenizers(string, gpt2_tokenizer, bert_tokenizer):
    """Print how the GPT-2 and BERT tokenizers each handle ``string``.

    For each tokenizer the text is encoded and then shown twice: decoded
    as a whole, and decoded token by token via ``decode_sequence``.
    Nothing is returned; all output goes to stdout.

    Args:
        string: Text to tokenize.
        gpt2_tokenizer: Tokenizer used for the GPT-2 half of the report.
        bert_tokenizer: Tokenizer used for the BERT half of the report.
    """
    print(f"String: {string}")
    print(" ")

    # GPT-2 half of the report.
    gpt2_ids = gpt2_tokenizer.encode(string)
    gpt2_text = gpt2_tokenizer.decode(gpt2_ids)
    print(f"Decoded sequence using GPT-2 tokenizer: {gpt2_text}")
    gpt2_pieces = decode_sequence(gpt2_ids, gpt2_tokenizer)
    print(f"Decoded segmented sequence using GPT-2 tokenizer: {gpt2_pieces}")

    print("---------------------------------------------------------------------------------")

    # BERT half of the report.
    bert_ids = bert_tokenizer.encode(string)
    bert_text = bert_tokenizer.decode(bert_ids)
    print(f"Decoded sequence using BERT tokenizer: {bert_text}")
    bert_pieces = decode_sequence(bert_ids, bert_tokenizer)
    print(f"Decoded segmented sequence using BERT tokenizer: {bert_pieces}")

Testing with "medicine"

In [None]:
# Compare both tokenizers' output for the word "medicine".
test_tokenizers("medicine", gpt2_tokenizer, bert_tokenizer)

Testing with "astrophysics"

In [None]:
# Compare both tokenizers' output for the word "astrophysics".
test_tokenizers("astrophysics", gpt2_tokenizer, bert_tokenizer)

Create Guide Models

In [None]:
from mini_relm_resources.automata_examples.small_study_example import get_small_study_example
from pythautomata.base_types.symbol import SymbolStr

# Build one guide automaton per tokenizer from the same small study example,
# passing that tokenizer's end marker (EOS for GPT-2, SEP for BERT) as a symbol.
# NOTE(review): presumably this symbol acts as the terminal symbol of the
# automaton -- confirm in get_small_study_example.
gpt2_property_model = get_small_study_example(SymbolStr(gpt2_tokenizer.eos_token))
bert_property_model = get_small_study_example(SymbolStr(bert_tokenizer.sep_token))

Load GPT-2 and BERT Models

In [None]:
import torch
from transformers import AutoModelForCausalLM

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# GPT-2: its tokenizer's EOS token id doubles as the padding id.
gpt2_model = AutoModelForCausalLM.from_pretrained(
    gpt2_model_id,
    return_dict_in_generate=True,
    pad_token_id=gpt2_tokenizer.eos_token_id,
).to(device)

# BERT: its tokenizer's SEP token id doubles as the padding id.
# NOTE(review): BERT is loaded through the causal-LM auto class; the
# transformers docs state BERT needs is_decoder=True to behave as a
# decoder -- confirm the current configuration is intended.
bert_model = AutoModelForCausalLM.from_pretrained(
    bert_model_id,
    return_dict_in_generate=True,
    pad_token_id=bert_tokenizer.sep_token_id,
).to(device)

In [None]:
from case_studies.gpt2.gpt2_probabilistic_model_wrapper import GPT2_probabilistic_model_wrapper
from case_studies.bert_small.bert_small_probabilistic_model_wrapper import BERT_SMALL_probabilistic_model_wrapper
from mini_relm_resources.automata_examples.small_study_example import alphabet

# Wrap each model together with its tokenizer, the shared alphabet and the device.
# NOTE(review): the first positional argument (50) is a magic number whose meaning
# is defined by the wrapper constructors -- confirm and consider naming it.
gpt2_wrapper = GPT2_probabilistic_model_wrapper(50, alphabet, device, gpt2_model, gpt2_tokenizer)
bert_wrapper = BERT_SMALL_probabilistic_model_wrapper(50, alphabet, device, bert_model, bert_tokenizer)

In [None]:
from pythautomata.utilities.guiding_wfa_sequence_generator import GuidingWDFASequenceGenerator
# One guiding sequence generator per guide automaton; the second argument is left as None.
# NOTE(review): neither generator is referenced again in the visible cells --
# confirm they are still needed.
gpt2_guiding_generator = GuidingWDFASequenceGenerator(gpt2_property_model, None)
bert_guiding_generator = GuidingWDFASequenceGenerator(bert_property_model, None)

In [None]:
from pythautomata.model_exporters.dot_exporters.wfa_dot_exporting_strategy import WFADotExportingStrategy
from IPython.display import display

# Build a graph of the GPT-2 guide automaton and show it inline.
# `exporter` is reused by later cells to draw the other automata.
exporter = WFADotExportingStrategy()
graph = exporter.create_graph(gpt2_property_model)

display(graph)

In [None]:
# Same visualisation for the BERT guide automaton (overwrites `graph`).
graph = exporter.create_graph(bert_property_model)

display(graph)

In [None]:
from utilities.syncronic_model_guided_language_model import SyncronicModelGuidedLanguageModel

# Pair each language-model wrapper with its guide automaton.
# Both guided models share the same settings; only the wrapper, the guide
# automaton and the model name differ.
gpt2_syncrhronic_model = SyncronicModelGuidedLanguageModel(
    gpt2_wrapper,
    gpt2_property_model,
    model_name="GUIDED_GPT2",
    max_seq_length=10,
    normalize_outputs=True,
    top_k=3,
)
bert_syncrhronic_model = SyncronicModelGuidedLanguageModel(
    bert_wrapper,
    bert_property_model,
    model_name="GUIDED_BERT_SMALL",
    max_seq_length=10,
    normalize_outputs=True,
    top_k=3,
)

In [None]:
from utilities.hypothesis_aware_sample_probabilistic_teacher import HypothesisAwareSampleProbabilisticTeacher
from pymodelextractor.learners.observation_tree_learners.bounded_pdfa_quantization_n_ary_tree_learner import BoundedPDFAQuantizationNAryTreeLearner
from pythautomata.utilities.probability_partitioner import QuantizationProbabilityPartitionerPlus
from pythautomata.model_comparators.wfa_partition_comparison_strategy import WFAPartitionComparator
# Shared learning configuration used by the comparator, teachers and learner below.
# NOTE(review): 1000 is presumably the number of quantization partitions -- confirm.
partitioner = QuantizationProbabilityPartitionerPlus(1000)
# The comparator is built on the same partitioner.
comparator = WFAPartitionComparator(partitioner)
# Bounds handed to the learner constructor.
max_states = 50
max_query_length = 50


In [None]:
# One teacher per guided model, both sharing the same comparator.
# NOTE(review): the third argument (30) is a magic number -- presumably a
# sample size; confirm against HypothesisAwareSampleProbabilisticTeacher.
gpt2_teacher = HypothesisAwareSampleProbabilisticTeacher(gpt2_syncrhronic_model, comparator, 30)
bert_teacher = HypothesisAwareSampleProbabilisticTeacher(bert_syncrhronic_model, comparator, 30)

In [None]:
# Single learner built from the shared partitioner plus `max_states` and
# `max_query_length`; the fourth positional argument is left as None.
# The "hipothesis" spelling in the keyword names is deliberate: it has to
# match the constructor's parameter names (assumed -- verify against the
# pymodelextractor API).
learner = BoundedPDFAQuantizationNAryTreeLearner(
    partitioner,
    max_states,
    max_query_length,
    None,
    generate_partial_hipothesis=True,
    pre_cache_queries_for_building_hipothesis=True,
    check_probabilistic_hipothesis=False,
    omit_zero_transitions=True,
)

In [None]:
# Run the learner against the GPT-2 teacher with verbose output disabled.
gpt2_learning_result = learner.learn(gpt2_teacher, verbose=False)

In [None]:
# Run the learner against the BERT teacher with verbose output disabled.
# NOTE(review): this reuses the learner instance already used for GPT-2 --
# confirm that learn() does not carry state over between runs.
bert_learning_result = learner.learn(bert_teacher, verbose=False)

In [None]:
# Draw the model obtained from the GPT-2 learning run.
display(exporter.create_graph(gpt2_learning_result.model))

In [None]:
# Draw the model obtained from the BERT learning run.
display(exporter.create_graph(bert_learning_result.model))