In [None]:
import json
from difflib import SequenceMatcher
import itertools

In [None]:
def create_tokenizers(token_dict_1, token_dict_2):
    """Build a shared single-character tokenizer covering two vocabularies.

    Only the keys of the two mappings are used; their values are ignored.
    Each distinct key is assigned ``chr(rank)``, where ``rank`` is the key's
    position in the sorted union of both key sets. Sorting makes the
    assignment reproducible: enumerating a raw ``set`` of strings yields a
    different order (and therefore different tokens) on every kernel start
    because of string hash randomization.

    Args:
        token_dict_1: Mapping whose keys are node types.
        token_dict_2: Second mapping whose keys are node types.

    Returns:
        A ``(node_type_token_dict, token_node_type_dict)`` tuple: the first
        maps node type -> one-character token, the second is its inverse.
    """
    # key=str keeps the sort usable even if the keys are not all of one type.
    node_types = sorted(set(token_dict_1) | set(token_dict_2), key=str)

    node_type_token_dict = {
        node_type: chr(rank) for rank, node_type in enumerate(node_types)
    }
    token_node_type_dict = {
        token: node_type for node_type, token in node_type_token_dict.items()
    }
    return node_type_token_dict, token_node_type_dict

In [None]:
def tokenize_seqs(node_type_token_dict, seqs):
    """Encode every node-type sequence as a string of tokens.

    Args:
        node_type_token_dict: Mapping from node type to its token string.
        seqs: Iterable of sequences of node types.

    Returns:
        A list with one string per input sequence, formed by concatenating
        the tokens of its node types in order. A node type missing from the
        mapping raises ``KeyError``.
    """
    encoded = []
    for path in seqs:
        tokens = [node_type_token_dict[node_type] for node_type in path]
        encoded.append("".join(tokens))
    return encoded

In [None]:
def detokenize_seqs(token_node_type_dict, seqs):
    """Decode tokenized sequences back into lists of node types.

    Args:
        token_node_type_dict: Mapping from token to node type.
        seqs: Iterable of tokenized sequences (each an iterable of tokens,
            e.g. a string of one-character tokens).

    Returns:
        A list with one list of node types per input sequence, in the
        iteration order of ``seqs``. An unknown token raises ``KeyError``.
    """
    decoded = []
    for tokenized_seq in seqs:
        node_types = []
        for token in tokenized_seq:
            node_types.append(token_node_type_dict[token])
        decoded.append(node_types)
    return decoded

In [None]:
def _submatch_pass(seqs, min_match_len):
    """Run one reduction pass and return the reduced set of sequences.

    Every pair of distinct sequences is compared with ``SequenceMatcher``.
    A sequence that shares at least one block of ``min_match_len`` or more
    items with another sequence is replaced by the shared block(s).
    """
    unique = set(seqs)
    matched = set()  # sequences that share a long-enough block with another
    shared = set()  # the shared blocks themselves

    # Sorting fixes which sequence is "a" and which is "b" for SequenceMatcher,
    # so the result does not depend on set iteration order.
    # NOTE(review): SequenceMatcher's default autojunk heuristic kicks in for
    # sequences of 200+ items -- confirm inputs stay shorter or pass
    # autojunk=False.
    for first, second in itertools.combinations(sorted(unique), 2):
        for block in SequenceMatcher(None, first, second).get_matching_blocks():
            if block.size >= min_match_len:
                shared.add(first[block.a : block.a + block.size])
                matched.update((first, second))

    # Shared blocks are unioned in last, so a block that equals one of the
    # original sequences (one sequence fully contained in another) is kept
    # instead of being dropped together with the sequences it came from.
    return (unique - matched) | shared


def recursive_submatching(seqs, seq_len, min_match_len=4):
    """Repeatedly collapse sequences into the sub-sequences they share.

    Each pass replaces every sequence that shares a contiguous block of at
    least ``min_match_len`` items with another sequence by those shared
    blocks. Passes repeat for as long as a pass yields strictly fewer
    sequences than the previous count; the number of sequences after each
    pass is printed.

    Args:
        seqs: Collection of hashable, sliceable, mutually orderable sequences
            (e.g. token strings).
        seq_len: Count the first pass is compared against; callers pass
            ``len(seqs)``.
        min_match_len: Minimum length of a shared block for it to count as a
            match. The default of 4 matches the previous hard-coded
            ``size > 3`` test.

    Returns:
        The last collection whose pass did not shrink it further: ``seqs``
        itself if the first pass brings no reduction, otherwise a ``set``.
    """
    current, current_len = seqs, seq_len
    while True:
        reduced = _submatch_pass(current, min_match_len)
        print(len(reduced))
        if current_len - len(reduced) <= 0:
            # No further reduction: keep the input of this pass.
            return current
        current, current_len = reduced, len(reduced)

In [None]:
def get_unq_sequences(seqs):
    """Summarise one sequence-match result dict.

    Reads ``seqs['results']`` (mapping to collections of tokenized
    sequences), ``seqs['node_type_token_dict']``,
    ``seqs['token_node_type_dict']`` and ``seqs['total_path_pairs_analyzed']``.

    Each unique sequence has the 'input' token characters and then the
    'output' token characters stripped from both ends; only stripped
    sequences longer than two tokens are kept. A one-line summary is printed.

    Returns:
        ``(unique_sequences_stripped, unique_sequences_decoded,
        total_sequences)``: the set of kept token strings, the same sequences
        decoded to lists of node types, and the number of sequences across
        all result entries before de-duplication.
    """
    total_sequences = 0
    unique_sequences = set()
    for sequences in seqs['results'].values():
        total_sequences += len(sequences)
        unique_sequences.update(set(sequences))

    # Strip the boundary markers lazily, one sequence at a time.
    trimmed = (
        seq.strip(seqs['node_type_token_dict']['input']).strip(
            seqs['node_type_token_dict']['output']
        )
        for seq in unique_sequences
    )
    unique_sequences_stripped = {seq for seq in trimmed if len(seq) > 2}

    unique_sequences_decoded = [
        [seqs['token_node_type_dict'][token] for token in sequence]
        for sequence in unique_sequences_stripped
    ]

    print(f"Total Sequences: {total_sequences:,}, Total Unique Sequences: {len(unique_sequences_stripped):,}, Total Paths Compared: {seqs['total_path_pairs_analyzed']:,}")
    return unique_sequences_stripped, unique_sequences_decoded, total_sequences

# PyTorch

In [None]:
# Load the three PyTorch sequence-match result files: mismatch,
# correct-mismatch and test-mismatch.
with open('./onnx_parsing_results/torch_mismatch_seq_match_results.json', 'r') as f:
    mismatched_seq = json.load(f)

with open('./onnx_parsing_results/torch_correct_mismatch_seq_match_results.json', 'r') as f:
    mismatched_correct = json.load(f)

with open('./onnx_parsing_results/torch_test_mismatch_seq_match_results.json', 'r') as f:
    mismatched_test = json.load(f)

In [None]:
# Report how many node types each result file's tokenizer contains.
print("Mismatched Ops:", len(mismatched_seq['node_type_token_dict'].keys()))
print("Mismatched-Correct Ops:", len(mismatched_correct['node_type_token_dict'].keys()))
print("Mismatched-Test Ops:", len(mismatched_test['node_type_token_dict'].keys()))


In [None]:
# Node types in the mismatched vocabulary that are absent from the correct-mismatch vocabulary.
mismatched_seq['node_type_token_dict'].keys() - mismatched_correct['node_type_token_dict'].keys()

In [None]:
# Node types in the test-mismatch vocabulary that are absent from the correct-mismatch vocabulary.
mismatched_test['node_type_token_dict'].keys() - mismatched_correct['node_type_token_dict'].keys()


## Mismatched Sequences

In [None]:
# Unique stripped sequences (tokenized and decoded) plus the raw sequence
# count for the mismatched results.
(
    unique_mismatched_sequences,
    unique_mismatched_sequences_decoded,
    total_mismatched_sequences,
) = get_unq_sequences(mismatched_seq)

In [None]:
# Reduce the mismatched sequences to shared sub-sequences and show how many remain.
len(recursive_submatching(unique_mismatched_sequences, len(unique_mismatched_sequences)))

## Correct-Mismatched Sequences

In [None]:
# Unique stripped sequences (tokenized and decoded) plus the raw sequence
# count for the correct-mismatch results.
(
    unique_corr_mismatched_sequences,
    unique_corr_mismatched_sequences_decoded,
    total_corr_mismatched_sequences,
) = get_unq_sequences(mismatched_correct)

In [None]:
# Reduce the correct-mismatch sequences to shared sub-sequences and show how many remain.
len(recursive_submatching(unique_corr_mismatched_sequences, len(unique_corr_mismatched_sequences)))

### Create New Tokenizers and Calculate Non-Overlapping

In [None]:
# Build one tokenizer over the union of both vocabularies so sequences from
# the two result files are encoded with the same tokens.
node_type_token_dict, token_node_type_dict = create_tokenizers(
    mismatched_correct["node_type_token_dict"], mismatched_seq["node_type_token_dict"]
)

In [None]:
# Re-encode both decoded collections with the shared tokenizer; the sets drop duplicates.
mismatched_sequences = set(tokenize_seqs(node_type_token_dict, unique_mismatched_sequences_decoded))
mismatched_correct_sequences = set(tokenize_seqs(node_type_token_dict, unique_corr_mismatched_sequences_decoded))

#### Non-Overlapping Sequences

In [None]:
# Sequences that occur in the mismatched set but not in the correct-mismatch set.
seqs = mismatched_sequences - mismatched_correct_sequences
len(seqs)

In [None]:
# Display the non-overlapping sequences as lists of node types.
detokenize_seqs(token_node_type_dict, seqs)

In [None]:
# Collapse the non-overlapping sequences into the sub-sequences they share.
reduced_sequences = recursive_submatching(seqs, len(seqs))

In [None]:
# Display the reduced sequences as lists of node types.
detokenize_seqs(token_node_type_dict, reduced_sequences)

In [None]:
# Keep only the decoded reduced sequences with more than two ops, then show
# how many survive.
filtered = [
    ops
    for ops in detokenize_seqs(token_node_type_dict, reduced_sequences)
    if len(ops) > 2
]
len(filtered)

In [None]:
# Collect every distinct node type that appears in any of the filtered sequences.
filtered_ops = set()
for filt in filtered:
    filtered_ops = filtered_ops.union(filt)
filtered_ops

## Mismatched-Test Sequences

In [None]:
# Unique stripped sequences (tokenized and decoded) plus the raw sequence
# count for the test-mismatch results.
(
    unique_test_mismatched_sequences,
    unique_test_mismatched_sequences_decoded,
    total_test_mismatched_sequences,
) = get_unq_sequences(mismatched_test)

In [None]:
# Reduce the test-mismatch sequences to shared sub-sequences and show how many remain.
len(recursive_submatching(unique_test_mismatched_sequences, len(unique_test_mismatched_sequences)))

### Create New Tokenizers and Calculate Non-Overlapping

In [None]:
# Build a tokenizer over the union of the test-mismatch and mismatched
# vocabularies. This rebinds node_type_token_dict / token_node_type_dict,
# replacing the tokenizer pair created for the correct-mismatch comparison.
node_type_token_dict, token_node_type_dict = create_tokenizers(
    mismatched_test["node_type_token_dict"], mismatched_seq["node_type_token_dict"]
)

In [None]:
# Re-encode both decoded collections with the current tokenizer; the sets drop duplicates.
mismatched_sequences = set(tokenize_seqs(node_type_token_dict, unique_mismatched_sequences_decoded))
mismatched_test_sequences = set(tokenize_seqs(node_type_token_dict, unique_test_mismatched_sequences_decoded))

#### Non-Overlapping Sequences

In [None]:
# Sequences that occur in the mismatched set but not in the test-mismatch set.
seqs = mismatched_sequences - mismatched_test_sequences
len(seqs)

In [None]:
# Collapse the non-overlapping sequences into the sub-sequences they share.
reduced_sequences = recursive_submatching(seqs, len(seqs))

In [None]:
# Display the reduced sequences as lists of node types.
detokenize_seqs(token_node_type_dict, reduced_sequences)

In [None]:
# Show the decoded reduced sequences that have more than two ops and contain
# neither the "input" nor the "output" node type.
[
    ops
    for ops in detokenize_seqs(token_node_type_dict, reduced_sequences)
    if len(ops) > 2 and "input" not in ops and "output" not in ops
]

# tf2onnx

In [None]:
# Load the three tf2onnx sequence-match result files. The names
# mismatched_seq / mismatched_correct / mismatched_test are rebound here, so
# the PyTorch results loaded earlier are no longer reachable under them.
with open('./onnx_parsing_results/tf2onnx_mismatch_seq_match_results.json', 'r') as f:
    mismatched_seq = json.load(f)
with open('./onnx_parsing_results/tf2onnx_correct_mismatch_seq_match_results.json', 'r') as f:
    mismatched_correct = json.load(f)
with open('./onnx_parsing_results/tf2onnx_test_mismatch_seq_match_results.json', 'r') as f:
    mismatched_test = json.load(f)

In [None]:
# Report how many node types each result file's tokenizer contains.
print("Mismatched Ops:", len(mismatched_seq['node_type_token_dict'].keys()))
print("Mismatched-Correct Ops:", len(mismatched_correct['node_type_token_dict'].keys()))
print("Mismatched-Test Ops:", len(mismatched_test['node_type_token_dict'].keys()))


## Mismatched Sequences

In [None]:
# Unique stripped sequences (tokenized and decoded) plus the raw sequence
# count for the mismatched results.
(
    unique_mismatched_sequences,
    unique_mismatched_sequences_decoded,
    total_mismatched_sequences,
) = get_unq_sequences(mismatched_seq)

In [None]:
# Reduce the mismatched sequences to shared sub-sequences and show how many remain.
len(recursive_submatching(unique_mismatched_sequences, len(unique_mismatched_sequences)))

## Correct-Mismatched Sequences

In [None]:
# Unique stripped sequences (tokenized and decoded) plus the raw sequence
# count for the correct-mismatch results.
(
    unique_corr_mismatched_sequences,
    unique_corr_mismatched_sequences_decoded,
    total_corr_mismatched_sequences,
) = get_unq_sequences(mismatched_correct)

In [None]:
# Reduce the correct-mismatch sequences to shared sub-sequences and show how many remain.
len(recursive_submatching(unique_corr_mismatched_sequences, len(unique_corr_mismatched_sequences)))

### Create New Tokenizers and Calculate Non-Overlapping

In [None]:
# Build one tokenizer over the union of both vocabularies so sequences from
# the two result files are encoded with the same tokens.
node_type_token_dict, token_node_type_dict = create_tokenizers(
    mismatched_correct["node_type_token_dict"], mismatched_seq["node_type_token_dict"]
)

In [None]:
# Re-encode both decoded collections with the shared tokenizer; the sets drop duplicates.
mismatched_sequences = set(tokenize_seqs(node_type_token_dict, unique_mismatched_sequences_decoded))
mismatched_correct_sequences = set(tokenize_seqs(node_type_token_dict, unique_corr_mismatched_sequences_decoded))

#### Non-Overlapping Sequences

In [None]:
# Sequences that occur in the mismatched set but not in the correct-mismatch set.
seqs = mismatched_sequences - mismatched_correct_sequences
len(seqs)

In [None]:
# Collapse the non-overlapping sequences into the sub-sequences they share.
reduced_sequences = recursive_submatching(seqs, len(seqs))

In [None]:
# Decode the reduced sequences to lists of node types; the result is kept in
# `x`, which the next cell writes to disk.
x = detokenize_seqs(token_node_type_dict, reduced_sequences)

In [None]:
# NOTE(review): torch is imported here only for torch.save; consider moving
# the import to the notebook's top import cell and giving `x` a descriptive name.
import torch

# Persist the decoded reduced sequences and the tokenizer pair currently bound
# to node_type_token_dict / token_node_type_dict.
torch.save(x, './tf_reduced_seq.pt')
torch.save([node_type_token_dict, token_node_type_dict], './tf_tokenizer.pt')

## Mismatched-Test Sequences

In [None]:
# Unique stripped sequences (tokenized and decoded) plus the raw sequence
# count for the test-mismatch results.
(
    unique_test_mismatched_sequences,
    unique_test_mismatched_sequences_decoded,
    total_test_mismatched_sequences,
) = get_unq_sequences(mismatched_test)

In [None]:
# Reduce the test-mismatch sequences to shared sub-sequences and show how many remain.
len(recursive_submatching(unique_test_mismatched_sequences, len(unique_test_mismatched_sequences)))

### Create New Tokenizers and Calculate Non-Overlapping

In [None]:
# Build a tokenizer over the union of the test-mismatch and mismatched
# vocabularies. This rebinds node_type_token_dict / token_node_type_dict,
# replacing the tokenizer pair created for the correct-mismatch comparison.
node_type_token_dict, token_node_type_dict = create_tokenizers(
    mismatched_test["node_type_token_dict"], mismatched_seq["node_type_token_dict"]
)

In [None]:
# Re-encode both decoded collections with the current tokenizer; the sets drop duplicates.
mismatched_sequences = set(tokenize_seqs(node_type_token_dict, unique_mismatched_sequences_decoded))
mismatched_test_sequences = set(tokenize_seqs(node_type_token_dict, unique_test_mismatched_sequences_decoded))

#### Non-Overlapping Sequences

In [None]:
# Number of distinct re-encoded mismatched sequences.
len(mismatched_sequences)

In [None]:
# Number of distinct re-encoded test-mismatch sequences.
len(mismatched_test_sequences)

In [None]:
# Number of mismatched sequences that do not occur in the test-mismatch set.
len(mismatched_sequences - mismatched_test_sequences)

In [None]:
# Sequences that occur in the mismatched set but not in the test-mismatch set.
seqs =  mismatched_sequences - mismatched_test_sequences
len(seqs)

In [None]:
# Collapse the non-overlapping sequences into the sub-sequences they share.
reduced_sequences = recursive_submatching(seqs, len(seqs))

In [None]:
# Show the decoded reduced sequences that have more than two ops and contain
# neither the "input" nor the "output" node type.
[
    ops
    for ops in detokenize_seqs(token_node_type_dict, reduced_sequences)
    if len(ops) > 2 and "input" not in ops and "output" not in ops
]