In [20]:
import random 
import math

# Number of token positions and width of the residual stream for the toy example.
context_length = 10
model_dim = 64

# Seed the global RNG so that Restart & Run All reproduces the same numbers.
random.seed(0)

# One row per token position, one uniform [0, 1) draw per residual dimension.
logits = [[random.random() for _ in range(model_dim)] for _ in range(context_length)]


In [22]:


def calculate_attention(logits):
    """Add every visible token's residual vector onto each destination token.

    For destination position ``d`` the result row is the input row ``d`` plus
    the element-wise sum of input rows ``0..d`` (inclusive), i.e. uniform,
    unnormalised causal attention.

    Args:
        logits: list of equal-length numeric rows, one per token position.

    Returns:
        A new list of rows with the same shape as ``logits``. The input is
        left untouched so the function can be re-run on the same data.
    """
    if not logits:
        return []

    context_length, model_dim = len(logits), len(logits[0])

    # Copy row by row: list.copy() is shallow, so its rows would be the very
    # same objects we accumulate into and later sources would read
    # already-updated values.
    output = [list(row) for row in logits]

    for destination_token in range(context_length):
        for source_token in range(destination_token + 1):
            for dim in range(model_dim):
                output[destination_token][dim] += logits[source_token][dim]

    return output

# Run the uniform-attention pass and report the result as "<tokens> <dims>".
attention_output = calculate_attention(logits)
print(f"{len(attention_output)} {len(attention_output[0])}")


10 64


In [14]:


def calculate_attention_version_1(logits):
    """Version 1: uniform causal attention over the residual stream.

    For destination position ``d`` the result row is the input row ``d`` plus
    the element-wise sum of input rows ``0..d`` (inclusive).

    Args:
        logits: list of equal-length numeric rows, one per token position.

    Returns:
        A new list of rows with the same shape as ``logits``; the input is
        not modified.
    """
    if not logits:
        return []

    context_length, model_dim = len(logits), len(logits[0])

    # Copy row by row: list.copy() is shallow, so its rows would alias the
    # rows being accumulated into and corrupt the values read for later
    # source tokens.
    output = [list(row) for row in logits]

    for destination_token in range(context_length):
        for source_token in range(destination_token + 1):
            for dim in range(model_dim):
                output[destination_token][dim] += logits[source_token][dim]

    return output

# Exercise the function defined in this cell and report the output shape.
attention_output = calculate_attention_version_1(logits)
print(len(attention_output), len(attention_output[0]))


In [26]:
# Hand-written causal weight table: row i is the destination token, column j
# the source token. Future positions (j > i) get 0; visible positions get
# i - j + 1, so the destination token itself has weight 1 and the weight grows
# by one for every step further back in the context.
attention_weights = [
    [(i - j + 1) if j <= i else 0 for j in range(context_length)]
    for i in range(context_length)
]
for attention in attention_weights:
    print(attention)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[3, 2, 1, 0, 0, 0, 0, 0, 0, 0]
[4, 3, 2, 1, 0, 0, 0, 0, 0, 0]
[5, 4, 3, 2, 1, 0, 0, 0, 0, 0]
[6, 5, 4, 3, 2, 1, 0, 0, 0, 0]
[7, 6, 5, 4, 3, 2, 1, 0, 0, 0]
[8, 7, 6, 5, 4, 3, 2, 1, 0, 0]
[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]


In [30]:


def calculate_attention_version_2(logits, weights=None):
    """Version 2: causal attention with a fixed, hand-written weight table.

    For destination position ``d`` the result row is the input row ``d`` plus
    ``sum(weights[d][s] * logits[s] for s in 0..d)``.

    Args:
        logits: list of equal-length numeric rows, one per token position.
        weights: optional table indexed as ``weights[destination][source]``.
            Defaults to the notebook-level ``attention_weights``.

    Returns:
        A new list of rows with the same shape as ``logits``; the input is
        not modified.
    """
    if weights is None:
        weights = attention_weights
    if not logits:
        return []

    context_length, model_dim = len(logits), len(logits[0])

    # Copy row by row: list.copy() is shallow, so its rows would alias the
    # rows being accumulated into and later sources would read updated values.
    output = [list(row) for row in logits]

    for destination_token in range(context_length):
        for source_token in range(destination_token + 1):
            attention_value = weights[destination_token][source_token]
            for dim in range(model_dim):
                output[destination_token][dim] += logits[source_token][dim] * attention_value

    return output

# Run the fixed-weight attention pass and report the result as "<tokens> <dims>".
attention_output = calculate_attention_version_2(logits)
print(f"{len(attention_output)} {len(attention_output[0])}")


10 64


In [46]:
# Width of the single attention head: a quarter of the residual-stream width.
attention_head_dimension = model_dim // 4


def _random_matrix(row_count, column_count):
    """Return a row_count x column_count nested list of random.random() draws."""
    return [[random.random() for _ in range(column_count)] for _ in range(row_count)]


# Query, key and value maps have shape model_dim x attention_head_dimension.
query_matrix = _random_matrix(model_dim, attention_head_dimension)
key_matrix = _random_matrix(model_dim, attention_head_dimension)
value_matrix = _random_matrix(model_dim, attention_head_dimension)
# The output projection maps back: attention_head_dimension x model_dim.
projection_matrix = _random_matrix(attention_head_dimension, model_dim)

def get_query(input_array):
    """Multiply one token vector by ``query_matrix`` and return the result row.

    ``input_array`` is wrapped as a one-row matrix, so it needs as many
    entries as ``query_matrix`` has rows; the returned list has one entry per
    column of ``query_matrix``.
    """
    return simple_matmul([input_array], query_matrix)[0]

def get_key(input_array):
    """Multiply one token vector by ``key_matrix`` and return the result row.

    ``input_array`` is wrapped as a one-row matrix, so it needs as many
    entries as ``key_matrix`` has rows; the returned list has one entry per
    column of ``key_matrix``.
    """
    return simple_matmul([input_array], key_matrix)[0]


def get_value(input_array):
    """Multiply one token vector by ``value_matrix`` and return the result row.

    ``input_array`` is wrapped as a one-row matrix, so it needs as many
    entries as ``value_matrix`` has rows; the returned list has one entry per
    column of ``value_matrix``.
    """
    return simple_matmul([input_array], value_matrix)[0]
     

def get_projection(attention_value):
    """Multiply one head-sized vector by ``projection_matrix``.

    ``attention_value`` is wrapped as a one-row matrix, so it needs as many
    entries as ``projection_matrix`` has rows; the returned list has one entry
    per column of ``projection_matrix``.
    """
    as_row_matrix = [attention_value]
    projected = simple_matmul(as_row_matrix, projection_matrix)
    return projected[0]


def simple_matmul(X,Y):
    """Multiply two matrices stored as nested lists (rows of numbers).

    Args:
        X: matrix with ``a`` rows and ``b`` columns.
        Y: matrix with ``c`` rows and ``d`` columns; ``b`` must equal ``c``.

    Returns:
        A new ``a`` x ``d`` nested list where entry ``[i][k]`` is
        ``sum(X[i][j] * Y[j][k] for j in range(b))``.

    Raises:
        Exception: if the inner dimensions do not match.
    """
    a,b = len(X), len(X[0])
    c,d = len(Y), len(Y[0])

    if b!=c:
        raise Exception(f"Can't multiply matrices of sizes ({a},{b}) and ({c},{d})")

    output_mat = [[0 for _ in range(d)] for _ in range(a)]

    for i in range(a):
        for j in range(b):
            x_ij = X[i][j]
            for k in range(d):
                # A matrix product accumulates products, not sums, of the
                # paired entries.
                output_mat[i][k] += x_ij * Y[j][k]

    return output_mat

def vector_dot_product(X,Y):
    """Return the dot product of two equal-length numeric vectors.

    Raises:
        Exception: if the two vectors differ in length.
    """
    a,b = len(X), len(Y)
    if a!=b:
        raise Exception(f"Vectors are of different size {a},{b}")

    accumulator = 0
    for left, right in zip(X, Y):
        accumulator += left * right
    return accumulator


In [54]:
def calculate_attention_version_3(logits):
    """Version 3: single-head causal softmax attention added to the residual.

    For each destination token a query is compared (dot product) with the
    keys of tokens ``0..destination`` inclusive. The scores are turned into
    softmax weights, and the weighted sum of the projected value vectors is
    added onto the destination's residual row.

    Args:
        logits: list of equal-length numeric rows, one per token position.
            Each row must be compatible with ``get_query``, ``get_key`` and
            ``get_value``.

    Returns:
        A new list of rows with the same shape as ``logits``; the input is
        not modified.
    """
    if not logits:
        return []

    context_length, model_dim = len(logits), len(logits[0])

    # Keys and projected values depend only on the source token, so compute
    # them once instead of once per (destination, source) pair.
    key_vectors = [get_key(row) for row in logits]
    value_vectors = [get_projection(get_value(row)) for row in logits]

    # Copy row by row: list.copy() is shallow and would alias the rows that
    # are being updated with the rows that are still being read.
    output = [list(row) for row in logits]

    for destination_token in range(context_length):
        query_vector = get_query(logits[destination_token])

        scores = [
            vector_dot_product(query_vector, key_vectors[source_token])
            for source_token in range(destination_token + 1)
        ]

        # Subtracting the maximum score leaves the softmax unchanged but
        # keeps exp() from overflowing on large scores.
        max_score = max(scores)
        exponentiated_scores = [math.exp(score - max_score) for score in scores]
        partition_function_value = sum(exponentiated_scores)

        for source_token, exponentiated in enumerate(exponentiated_scores):
            # Normalise the weight only; the residual row itself must not be
            # divided by the partition function.
            attention_value = exponentiated / partition_function_value
            value_vector = value_vectors[source_token]
            for dim in range(model_dim):
                output[destination_token][dim] += attention_value * value_vector[dim]

    return output

# Run the query/key/value attention pass and report the result as "<tokens> <dims>".
attention_output = calculate_attention_version_3(logits)
print(f"{len(attention_output)} {len(attention_output[0])}")

10 64


In [49]:
import math

In [50]:
# Display Euler's number, the base used when exponentiating attention scores.
math.e

2.718281828459045

In [51]:
# Demonstration: a plain list has no in-place division, so every element has
# to be divided individually. The error is caught and printed so that
# Restart & Run All can continue past this cell.
g = [1,2]

try:
    g/=2
except TypeError as error:
    print(f"TypeError: {error}")

TypeError: unsupported operand type(s) for /=: 'list' and 'int'