In [1]:
# Words in sentence order. The display helpers also use this list to decide
# the order in which set members are printed.
print_order = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
# One singleton set per position: the starting "token state" for the first layer.
sequence = [{word} for word in print_order]
sequence

[{'the'},
 {'quick'},
 {'brown'},
 {'fox'},
 {'jumps'},
 {'over'},
 {'the'},
 {'lazy'},
 {'dog'}]

In [2]:
window = 3  # Sliding-window size, counting the token itself: each token attends to itself plus the (window - 1) tokens to its left

def compute_attention_matrix(tokens: list[set[str]], window_size: int) -> list[list]:
    """Build a causal, sliding-window "attention" table over token sets.

    Cell ``[i][j]`` holds a new set equal to ``tokens[i] | tokens[j]`` when
    ``j <= i`` and ``i - j < window_size``. Every other cell is ``None``.

    Args:
        tokens: One set of strings per sequence position. Not modified.
        window_size: How many positions each row can see, counting the row's
            own position. Must be an integer; a value <= 0 leaves every cell
            ``None``.

    Returns:
        A ``len(tokens)`` x ``len(tokens)`` list of lists whose entries are
        either a freshly built set or ``None``.
    """
    length = len(tokens)
    matrix = [[None] * length for _ in range(length)]

    for current_idx, current_set in enumerate(tokens):
        # Visit only the positions inside the window instead of scanning every
        # earlier index and skipping the ones that are too far back.
        first_visible = max(0, current_idx - window_size + 1)
        for past_idx in range(first_visible, current_idx + 1):
            # set().union(...) always yields a new set, so cells never alias
            # the input sets.
            matrix[current_idx][past_idx] = set().union(current_set, tokens[past_idx])

    return matrix

def apply_attention(attention_matrix: list[list[set]], value_seq: list[set[str]]) -> list[set[str]]:
    """Merge attended values into one result set per position.

    For every non-``None`` cell ``attention_matrix[i][j]``, result ``i``
    absorbs both ``value_seq[j]`` and the cell's own contents. Positions with
    no non-``None`` cell in their row end up as an empty set.

    Args:
        attention_matrix: Rows of cells, each either a set or ``None``.
        value_seq: One value set per position; indexed by column.

    Returns:
        A new list with ``len(value_seq)`` sets; the inputs are not modified.
    """
    merged = [set() for _ in range(len(value_seq))]

    for row_idx, scores in enumerate(attention_matrix):
        for col_idx, score in enumerate(scores):
            if score is None:
                continue  # masked cell: contributes nothing
            merged[row_idx].update(value_seq[col_idx], score)

    return merged

def show_attention_matrix(matrix: list[list[set[str]]]):
    """Print the attention table, one row per line with tab-terminated cells.

    Empty cells are printed as the text ``None``. Every other cell is printed
    as a list whose members are ordered by their first position in the
    module-level ``print_order`` list; a member that is absent from
    ``print_order`` makes ``list.index`` raise ``ValueError``.
    """
    for cells in matrix:
        for entry in cells:
            if entry is not None:
                rendered = sorted(entry, key=print_order.index)
            else:
                rendered = "None"
            print(rendered, end="\t")
        print()  # finish the row

def show_token_sets(data: list[set[str]]):
    """Print each set on its own line as ``<index>: <ordered members>``.

    Members are ordered by their first position in the module-level
    ``print_order`` list so the output is stable across runs.
    """
    for idx, token_set in enumerate(data):
        ordered = sorted(token_set, key=print_order.index)
        print(f"{idx}: {ordered}")

def transformer_block(tokens: list[set[str]], layer_id: int, window_size: "int | None" = None) -> list[set[str]]:
    """Run one toy "layer": build the attention table, apply it, and print each stage.

    Prints the input sets, the attention table, and the resulting sets, each
    under a ``Layer <layer_id>`` heading.

    Args:
        tokens: One set of strings per sequence position.
        layer_id: Label used only in the printed headings.
        window_size: Sliding-window size passed to ``compute_attention_matrix``.
            When omitted (``None``), the module-level ``window`` is read at
            call time, which keeps existing two-argument calls unchanged.

    Returns:
        The list of sets produced by ``apply_attention`` for this layer.
    """
    if window_size is None:
        # Resolve the fallback at call time so later changes to `window` are honoured.
        window_size = window

    print(f"\nLayer {layer_id} — Input:")
    show_token_sets(tokens)

    attention = compute_attention_matrix(tokens, window_size)

    print(f"\nLayer {layer_id} — Attention Scores:")
    show_attention_matrix(attention)

    # The same sets serve as both the attention inputs and the values being mixed.
    updated = apply_attention(attention, tokens)

    print(f"\nLayer {layer_id} — Output:")
    show_token_sets(updated)

    return updated

In [3]:
# Layer 1: run the single-token sets in `sequence` through one block.
output_layer_1 = transformer_block(sequence, 1)


Layer 1 — Input:
0: ['the']
1: ['quick']
2: ['brown']
3: ['fox']
4: ['jumps']
5: ['over']
6: ['the']
7: ['lazy']
8: ['dog']

Layer 1 — Attention Scores:
['the']	None	None	None	None	None	None	None	None	
['the', 'quick']	['quick']	None	None	None	None	None	None	None	
['the', 'brown']	['quick', 'brown']	['brown']	None	None	None	None	None	None	
None	['quick', 'fox']	['brown', 'fox']	['fox']	None	None	None	None	None	
None	None	['brown', 'jumps']	['fox', 'jumps']	['jumps']	None	None	None	None	
None	None	None	['fox', 'over']	['jumps', 'over']	['over']	None	None	None	
None	None	None	None	['the', 'jumps']	['the', 'over']	['the']	None	None	
None	None	None	None	None	['over', 'lazy']	['the', 'lazy']	['lazy']	None	
None	None	None	None	None	None	['the', 'dog']	['lazy', 'dog']	['dog']	

Layer 1 — Output:
0: ['the']
1: ['the', 'quick']
2: ['the', 'quick', 'brown']
3: ['quick', 'brown', 'fox']
4: ['brown', 'fox', 'jumps']
5: ['fox', 'jumps', 'over']
6: ['the', 'jumps', 'over']
7: ['the', 'over', 'lazy']
8: ['the', 'lazy', 'dog']

In [4]:
# Layer 2: feed layer 1's output back in as the input of the next block.
output_layer_2 = transformer_block(output_layer_1, 2)


Layer 2 — Input:
0: ['the']
1: ['the', 'quick']
2: ['the', 'quick', 'brown']
3: ['quick', 'brown', 'fox']
4: ['brown', 'fox', 'jumps']
5: ['fox', 'jumps', 'over']
6: ['the', 'jumps', 'over']
7: ['the', 'over', 'lazy']
8: ['the', 'lazy', 'dog']

Layer 2 — Attention Scores:
['the']	None	None	None	None	None	None	None	None	
['the', 'quick']	['the', 'quick']	None	None	None	None	None	None	None	
['the', 'quick', 'brown']	['the', 'quick', 'brown']	['the', 'quick', 'brown']	None	None	None	None	None	None	
None	['the', 'quick', 'brown', 'fox']	['the', 'quick', 'brown', 'fox']	['quick', 'brown', 'fox']	None	None	None	None	None	
None	None	['the', 'quick', 'brown', 'fox', 'jumps']	['quick', 'brown', 'fox', 'jumps']	['brown', 'fox', 'jumps']	None	None	None	None	
None	None	None	['quick', 'brown', 'fox', 'jumps', 'over']	['brown', 'fox', 'jumps', 'over']	['fox', 'jumps', 'over']	None	None	None	
None	None	None	None	['the', 'brown', 'fox', 'jumps', 'over']	['the', 'fox', 'jumps', 'over']	['the', 'jumps'

In [5]:
# Layer 3: stack another block on top of layer 2's output.
output_layer_3 = transformer_block(output_layer_2, 3)


Layer 3 — Input:
0: ['the']
1: ['the', 'quick']
2: ['the', 'quick', 'brown']
3: ['the', 'quick', 'brown', 'fox']
4: ['the', 'quick', 'brown', 'fox', 'jumps']
5: ['quick', 'brown', 'fox', 'jumps', 'over']
6: ['the', 'brown', 'fox', 'jumps', 'over']
7: ['the', 'fox', 'jumps', 'over', 'lazy']
8: ['the', 'jumps', 'over', 'lazy', 'dog']

Layer 3 — Attention Scores:
['the']	None	None	None	None	None	None	None	None	
['the', 'quick']	['the', 'quick']	None	None	None	None	None	None	None	
['the', 'quick', 'brown']	['the', 'quick', 'brown']	['the', 'quick', 'brown']	None	None	None	None	None	None	
None	['the', 'quick', 'brown', 'fox']	['the', 'quick', 'brown', 'fox']	['the', 'quick', 'brown', 'fox']	None	None	None	None	None	
None	None	['the', 'quick', 'brown', 'fox', 'jumps']	['the', 'quick', 'brown', 'fox', 'jumps']	['the', 'quick', 'brown', 'fox', 'jumps']	None	None	None	None	
None	None	None	['the', 'quick', 'brown', 'fox', 'jumps', 'over']	['the', 'quick', 'brown', 'fox', 'jumps', 'over']	['quic

In [6]:
# Layer 4: stack another block on top of layer 3's output.
output_layer_4 = transformer_block(output_layer_3, 4)


Layer 4 — Input:
0: ['the']
1: ['the', 'quick']
2: ['the', 'quick', 'brown']
3: ['the', 'quick', 'brown', 'fox']
4: ['the', 'quick', 'brown', 'fox', 'jumps']
5: ['the', 'quick', 'brown', 'fox', 'jumps', 'over']
6: ['the', 'quick', 'brown', 'fox', 'jumps', 'over']
7: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy']
8: ['the', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog']

Layer 4 — Attention Scores:
['the']	None	None	None	None	None	None	None	None	
['the', 'quick']	['the', 'quick']	None	None	None	None	None	None	None	
['the', 'quick', 'brown']	['the', 'quick', 'brown']	['the', 'quick', 'brown']	None	None	None	None	None	None	
None	['the', 'quick', 'brown', 'fox']	['the', 'quick', 'brown', 'fox']	['the', 'quick', 'brown', 'fox']	None	None	None	None	None	
None	None	['the', 'quick', 'brown', 'fox', 'jumps']	['the', 'quick', 'brown', 'fox', 'jumps']	['the', 'quick', 'brown', 'fox', 'jumps']	None	None	None	None	
None	None	None	['the', 'quick', 'brown', 'fox', 'jumps', 'over']	['the'

In [7]:
# Layer 5: stack another block on top of layer 4's output.
output_layer_5 = transformer_block(output_layer_4, 5)


Layer 5 — Input:
0: ['the']
1: ['the', 'quick']
2: ['the', 'quick', 'brown']
3: ['the', 'quick', 'brown', 'fox']
4: ['the', 'quick', 'brown', 'fox', 'jumps']
5: ['the', 'quick', 'brown', 'fox', 'jumps', 'over']
6: ['the', 'quick', 'brown', 'fox', 'jumps', 'over']
7: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy']
8: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog']

Layer 5 — Attention Scores:
['the']	None	None	None	None	None	None	None	None	
['the', 'quick']	['the', 'quick']	None	None	None	None	None	None	None	
['the', 'quick', 'brown']	['the', 'quick', 'brown']	['the', 'quick', 'brown']	None	None	None	None	None	None	
None	['the', 'quick', 'brown', 'fox']	['the', 'quick', 'brown', 'fox']	['the', 'quick', 'brown', 'fox']	None	None	None	None	None	
None	None	['the', 'quick', 'brown', 'fox', 'jumps']	['the', 'quick', 'brown', 'fox', 'jumps']	['the', 'quick', 'brown', 'fox', 'jumps']	None	None	None	None	
None	None	None	['the', 'quick', 'brown', 'fox', 'jumps', 'over

In [8]:
# Layer 6: the input printed below matches the input printed for layer 5,
# i.e. layer 5 left every set unchanged.
output_layer_6 = transformer_block(output_layer_5, 6)


Layer 6 — Input:
0: ['the']
1: ['the', 'quick']
2: ['the', 'quick', 'brown']
3: ['the', 'quick', 'brown', 'fox']
4: ['the', 'quick', 'brown', 'fox', 'jumps']
5: ['the', 'quick', 'brown', 'fox', 'jumps', 'over']
6: ['the', 'quick', 'brown', 'fox', 'jumps', 'over']
7: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy']
8: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog']

Layer 6 — Attention Scores:
['the']	None	None	None	None	None	None	None	None	
['the', 'quick']	['the', 'quick']	None	None	None	None	None	None	None	
['the', 'quick', 'brown']	['the', 'quick', 'brown']	['the', 'quick', 'brown']	None	None	None	None	None	None	
None	['the', 'quick', 'brown', 'fox']	['the', 'quick', 'brown', 'fox']	['the', 'quick', 'brown', 'fox']	None	None	None	None	None	
None	None	['the', 'quick', 'brown', 'fox', 'jumps']	['the', 'quick', 'brown', 'fox', 'jumps']	['the', 'quick', 'brown', 'fox', 'jumps']	None	None	None	None	
None	None	None	['the', 'quick', 'brown', 'fox', 'jumps', 'over