In [1]:
import tensorflow as tf
import numpy as np
import time
import h5py

2024-01-11 13:00:24.564225: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-11 13:00:24.668583: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-11 13:00:24.668617: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-11 13:00:24.683533: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-11 13:00:24.720523: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Shared, seeded generator so the random operands and head picks are repeatable.
rng = np.random.default_rng(seed=1)


In [3]:
# HDF5 file the benchmark operands are read from.
weights_path = './vicuna_weight.h5'

# Operand collections, all filled in one pass over the file:
#   weights / w_input      - selected tensors and same-shaped random partners
#   attn_weights / aw_input - "attn" tensors and random vectors of length shape[1]
#   q_weights / k_weights   - the "q_proj" / "k_proj" subsets of the "attn" tensors
weights, w_input = [], []
attn_weights, aw_input = [], []
q_weights, k_weights = [], []

with h5py.File(weights_path, 'r') as weight_file:
    for layer_name in weight_file:
        # Drop singleton axes and normalise every tensor to float32.
        w = np.squeeze(np.array(weight_file[layer_name])).astype(np.float32)

        is_model_weight = (
            "model" in layer_name
            and "embed_tokens" not in layer_name
            and "layernorm" not in layer_name
        )
        if is_model_weight:
            weights.append(w)
            w_input.append(rng.random(w.shape, dtype=np.float32))

        if "attn" not in layer_name:
            continue

        attn_weights.append(w)
        # Indexing shape[1] means every "attn" tensor must have at least two axes.
        aw_input.append(rng.random(w.shape[1], dtype=np.float32))
        for marker, bucket in (("q_proj", q_weights), ("k_proj", k_weights)):
            if marker in layer_name:
                bucket.append(w)


In [4]:
def timer(input1, input2, f, runner, runs=10):
    """Call a benchmark runner repeatedly and print the mean and spread of its results.

    Args:
        input1: Passed through unchanged as the runner's first argument.
        input2: Passed through unchanged as the runner's second argument.
        f: Passed through unchanged as the runner's third argument.
        runner: Callable invoked as ``runner(input1, input2, f)``. Each return
            value is treated as one sample and printed with an "ms" suffix.
            The last six characters of ``runner.__name__`` are stripped to
            build the printed label, so names are expected to end in "runner".
        runs: How many times ``runner`` is invoked. Defaults to 10, the value
            that used to be hard-coded.

    Raises:
        ValueError: If ``runs`` is smaller than 1 (there would be no samples
            to average).
    """
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}")

    times = np.array([runner(input1, input2, f) for _ in range(runs)])
    print(f"{runner.__name__[:-6]}tensorflow_with_load")
    print(f"{np.average(times)}ms +/- {np.std(times)}ms")

In [5]:
def elewise_mul_tf(input1, input2, hidden_dim):
    """Element-wise product of the first `hidden_dim` entries of both inputs."""
    lhs = input1[:hidden_dim]
    rhs = input2[:hidden_dim]
    return tf.multiply(lhs, rhs)

def elewise_mul_runner(inputs1, inputs2, f=None):
    """Accumulate the time spent multiplying paired arrays element-wise.

    Each pair is flattened and turned into float32 tensors under a `/CPU:0`
    device scope. The measured interval covers the `tf.identity` copies and
    the multiply issued under a `/GPU:0` scope, plus the `tf.identity` of the
    result under `/CPU:0`.

    Args:
        inputs1: Indexable collection of objects exposing `.flatten()`.
        inputs2: Collection indexed in step with `inputs1`.
        f: Not used by this runner.

    Returns:
        Sum of the per-pair elapsed times in milliseconds.
    """
    elapsed_ms = 0
    for idx, left in enumerate(inputs1):
        flat_left = left.flatten()
        flat_right = inputs2[idx].flatten()
        length = len(flat_left)

        with tf.device('/CPU:0'):
            left_t = tf.convert_to_tensor(flat_left, np.float32)
            right_t = tf.convert_to_tensor(flat_right, np.float32)
            dim_t = tf.constant(length, dtype=tf.int32)

        with tf.device('/GPU:0'):
            tic = time.perf_counter()
            left_t = tf.identity(left_t)
            right_t = tf.identity(right_t)
            dim_t = tf.identity(dim_t)
            product = elewise_mul_tf(left_t, right_t, dim_t)

        with tf.device('/CPU:0'):
            product = tf.identity(product)
            toc = time.perf_counter()

        # Release the operand tensors before the next pair is converted.
        del right_t
        del left_t
        elapsed_ms += (toc - tic) * 1000
    return elapsed_ms

In [6]:
def matmul_tf(weight, input):
    """Matrix-vector product of `weight` and `input` via `tf.linalg.matvec`."""
    product = tf.linalg.matvec(weight, input)
    return product

def matmul_runner(weights, inputs, f=None):
    """Accumulate the time spent on matrix-vector products of paired operands.

    For every position in `inputs`, the matching entry of `weights` and the
    input are converted to float32 tensors under `/CPU:0`. The measured
    interval covers the `tf.identity` copies and `matmul_tf` under a `/GPU:0`
    scope, plus the `tf.identity` of the result under `/CPU:0`.

    Args:
        weights: Indexable collection of matrices.
        inputs: Collection of vectors; its length sets the number of iterations.
        f: Not used by this runner.

    Returns:
        Sum of the per-pair elapsed times in milliseconds.
    """
    elapsed_ms = 0
    for idx, vector in enumerate(inputs):
        matrix = weights[idx]

        with tf.device('/CPU:0'):
            mat_t = tf.convert_to_tensor(matrix, np.float32)
            vec_t = tf.convert_to_tensor(vector, np.float32)

        with tf.device('/GPU:0'):
            tic = time.perf_counter()
            mat_t = tf.identity(mat_t)
            vec_t = tf.identity(vec_t)
            product = matmul_tf(mat_t, vec_t)

        with tf.device('/CPU:0'):
            product = tf.identity(product)
            toc = time.perf_counter()

        del vec_t
        del mat_t
        elapsed_ms += (toc - tic) * 1000
    return elapsed_ms

In [7]:
def multiquery_attention_part1_tf(token_position, head, head_size, key_cache_layer, q):
    """Scaled dot products between one head's query slice and the cached keys.

    Uses the first `token_position` rows of `key_cache_layer`, restricted to
    the `head_size` columns belonging to `head`, multiplies them with the
    matching slice of `q`, and divides by sqrt(head_size).
    """
    col_start = head * head_size
    col_stop = col_start + head_size
    head_keys = key_cache_layer[:token_position][:, col_start:col_stop]
    head_query = q[col_start:col_stop]
    scores = tf.linalg.matvec(head_keys, head_query)
    scale = tf.sqrt(tf.cast(head_size * 1, tf.float32))
    return scores / scale

def multiquery_attention_part1_runner(k_matrixes, q_matrixes, f=None):
    """Accumulate the time spent computing attention scores for random heads.

    For each pair, `k_matrixes[i]` is used as the key cache and the flattened
    `q_matrixes[i]` as the query. A head index is drawn from the module-level
    `rng`. The measured interval covers the `tf.identity` copies and
    `multiquery_attention_part1_tf` under a `/GPU:0` scope, plus the
    `tf.identity` of the result under `/CPU:0`.

    Args:
        k_matrixes: Indexable collection of 2-D arrays (rows x columns).
        q_matrixes: Collection indexed in step with `k_matrixes`; entries must
            expose `.flatten()`.
        f: Not used by this runner.

    Returns:
        Sum of the per-pair elapsed times in milliseconds.
    """
    # Head count assumed by the benchmark; presumably the model's number of
    # attention heads - verify against the checkpoint in use.
    num_head = 32

    total_time = 0
    for i in range(len(k_matrixes)):
        k_matrix = k_matrixes[i]
        q_matrix = q_matrixes[i]
        token_position = k_matrix.shape[0] - 1

        head = int(rng.integers(low=0, high=num_head))
        # The kernel slices each head out of the COLUMN axis, so the per-head
        # width has to be derived from the column count. Using the row count
        # only gives the same answer when the matrix is square.
        head_size = k_matrix.shape[1] // num_head

        q_matrix = q_matrix.flatten()
        with tf.device('/CPU:0'):
            key_cache_layer = tf.convert_to_tensor(k_matrix, np.float32)
            q = tf.convert_to_tensor(q_matrix, np.float32)
        with tf.device('/GPU:0'):
            start_time = time.perf_counter()
            key_cache_layer = tf.identity(key_cache_layer)
            q = tf.identity(q)
            res = multiquery_attention_part1_tf(token_position, head, head_size, key_cache_layer, q)
        with tf.device('/CPU:0'):
            res = tf.identity(res)
            end_time = time.perf_counter()
        del key_cache_layer
        del q
        total_time += (end_time - start_time) * 1000
    return total_time

In [8]:
def multiquery_attention_part2_tf(token_position, head, head_size, key_cache_layer, attention):
    """Combine one head's cached rows using the given attention values.

    Takes rows 0..token_position (inclusive) of `key_cache_layer`, restricted
    to the `head_size` columns of `head`, transposes that slice, and
    multiplies it with the first `token_position + 1` entries of `attention`.
    """
    col_start = head * head_size
    col_stop = col_start + head_size
    head_rows = key_cache_layer[:token_position+1][:, col_start:col_stop]
    return tf.linalg.matvec(tf.transpose(head_rows), attention[:token_position+1])

def multiquery_attention_part2_runner(k_matrixes, q_matrixes, f=None):
    """Accumulate the time spent on the attention-weighted combination step.

    For each pair, the attention vector is prepared outside the timed region:
    `multiquery_attention_part1_tf` is evaluated under `/CPU:0` and a single
    0.0 is appended, so its length equals `token_position + 1`. The measured
    interval covers the `tf.identity` copies and
    `multiquery_attention_part2_tf` under a `/GPU:0` scope, plus the
    `tf.identity` of the result under `/CPU:0`.

    Args:
        k_matrixes: Indexable collection of 2-D arrays (rows x columns).
        q_matrixes: Collection indexed in step with `k_matrixes`; entries must
            expose `.flatten()`.
        f: Not used by this runner.

    Returns:
        Sum of the per-pair elapsed times in milliseconds.
    """
    # Head count assumed by the benchmark; presumably the model's number of
    # attention heads - verify against the checkpoint in use.
    num_head = 32

    total_time = 0
    for i in range(len(k_matrixes)):
        k_matrix = k_matrixes[i]
        q_matrix = q_matrixes[i]
        token_position = k_matrix.shape[0] - 1

        head = int(rng.integers(low=0, high=num_head))
        # Heads are sliced out of the COLUMN axis by both kernels, so the
        # per-head width must come from the column count. Using the row count
        # only gives the same answer when the matrix is square.
        head_size = k_matrix.shape[1] // num_head

        q_matrix = q_matrix.flatten()
        with tf.device('/CPU:0'):
            key_cache_layer = tf.convert_to_tensor(k_matrix, np.float32)
            q = tf.convert_to_tensor(q_matrix, np.float32)
            attention = multiquery_attention_part1_tf(token_position, head, head_size, key_cache_layer, q)
            # Pad with one zero so there is an entry for row `token_position`.
            attention = tf.concat((attention, tf.constant([0.0])), axis=0)
        with tf.device('/GPU:0'):
            start_time = time.perf_counter()
            key_cache_layer = tf.identity(key_cache_layer)
            attention = tf.identity(attention)
            res = multiquery_attention_part2_tf(token_position, head, head_size, key_cache_layer, attention)
        with tf.device('/CPU:0'):
            res = tf.identity(res)
            end_time = time.perf_counter()
        del key_cache_layer
        del attention
        total_time += (end_time - start_time) * 1000
    return total_time

In [9]:
def rmsnorm_part1_tf(input, weight):
    """Sum of squares of `input`; `weight` is accepted but not read."""
    squares = tf.multiply(input, input)
    return tf.reduce_sum(squares)

def rmsnorm_part1_runner(weights, inputs, f=None):
    """Accumulate the time spent on the sum-of-squares step over paired operands.

    Each input and its matching weight are flattened and converted to float32
    tensors under `/CPU:0`. The measured interval covers the `tf.identity`
    copies and `rmsnorm_part1_tf` under a `/GPU:0` scope, plus the
    `tf.identity` of the result under `/CPU:0`.

    Args:
        weights: Indexable collection of objects exposing `.flatten()`.
        inputs: Collection of objects exposing `.flatten()`; its length sets
            the number of iterations.
        f: Not used by this runner.

    Returns:
        Sum of the per-pair elapsed times in milliseconds.
    """
    elapsed_ms = 0
    for idx, sample in enumerate(inputs):
        flat_input = sample.flatten()
        flat_weight = weights[idx].flatten()

        with tf.device('/CPU:0'):
            weight_t = tf.convert_to_tensor(flat_weight, np.float32)
            input_t = tf.convert_to_tensor(flat_input, np.float32)

        with tf.device('/GPU:0'):
            tic = time.perf_counter()
            weight_t = tf.identity(weight_t)
            input_t = tf.identity(input_t)
            result = rmsnorm_part1_tf(input_t, weight_t)

        with tf.device('/CPU:0'):
            result = tf.identity(result)
            toc = time.perf_counter()

        del weight_t
        del input_t
        elapsed_ms += (toc - tic) * 1000
    return elapsed_ms

In [10]:
def rmsnorm_part2_tf(input, weight, ss):
    """Scale `input * weight` by 1 / sqrt(ss / size(input) + 1).

    `ss` is the precomputed sum of squares; the element count of `input` is
    obtained as a float32 via `tf.size`.
    """
    mean_square = ss / tf.size(input, tf.float32)
    inv_norm = 1 / tf.sqrt(tf.cast(mean_square + 1, tf.float32))
    weighted = tf.multiply(input, weight)
    return tf.multiply(inv_norm, weighted)

def rmsnorm_part2_runner(weights, inputs, f=None):
    """Accumulate the time spent on the normalise-and-scale step over paired operands.

    The sum of squares of each flattened input is computed with NumPy outside
    the timed region. The measured interval covers the `tf.identity` copies
    and `rmsnorm_part2_tf` under a `/GPU:0` scope, plus the `tf.identity` of
    the result under `/CPU:0`.

    Args:
        weights: Indexable collection of objects exposing `.flatten()`.
        inputs: Collection of objects exposing `.flatten()`; its length sets
            the number of iterations.
        f: Not used by this runner.

    Returns:
        Sum of the per-pair elapsed times in milliseconds.
    """
    elapsed_ms = 0
    for idx, sample in enumerate(inputs):
        flat_input = sample.flatten()
        flat_weight = weights[idx].flatten()
        sum_squares = np.sum(flat_input * flat_input)

        with tf.device('/CPU:0'):
            weight_t = tf.convert_to_tensor(flat_weight, np.float32)
            input_t = tf.convert_to_tensor(flat_input, np.float32)
            ss_t = tf.convert_to_tensor(sum_squares, np.float32)

        with tf.device('/GPU:0'):
            tic = time.perf_counter()
            weight_t = tf.identity(weight_t)
            input_t = tf.identity(input_t)
            ss_t = tf.identity(ss_t)
            result = rmsnorm_part2_tf(input_t, weight_t, ss_t)

        with tf.device('/CPU:0'):
            result = tf.identity(result)
            toc = time.perf_counter()

        del weight_t
        del input_t
        elapsed_ms += (toc - tic) * 1000
    return elapsed_ms

In [11]:
def silu_tf(input, hidden_dim):
    """Compute x * 1 / (1 + exp(-x)) over the first `hidden_dim` entries of `input`."""
    # The input is sliced twice on purpose so the op sequence stays the same.
    gate = tf.divide(1, (tf.exp(0 - input[:hidden_dim]) + 1))
    values = input[:hidden_dim]
    return tf.multiply(gate, values)

def silu_runner(inputs, _, f=None):
    """Accumulate the time spent applying `silu_tf` to each flattened input.

    The measured interval covers the `tf.identity` copies and `silu_tf` under
    a `/GPU:0` scope, plus the `tf.identity` of the result under `/CPU:0`.

    Args:
        inputs: Collection of objects exposing `.flatten()`.
        _: Ignored placeholder for the second positional argument.
        f: Not used by this runner.

    Returns:
        Sum of the per-input elapsed times in milliseconds.
    """
    elapsed_ms = 0
    for idx in range(len(inputs)):
        flat = inputs[idx].flatten()
        length = len(flat)

        with tf.device('/CPU:0'):
            input_t = tf.convert_to_tensor(flat, np.float32)
            dim_t = tf.constant(length, dtype=tf.int32)

        with tf.device('/GPU:0'):
            tic = time.perf_counter()
            input_t = tf.identity(input_t)
            dim_t = tf.identity(dim_t)
            result = silu_tf(input_t, dim_t)

        with tf.device('/CPU:0'):
            result = tf.identity(result)
            toc = time.perf_counter()

        del input_t
        elapsed_ms += (toc - tic) * 1000
    return elapsed_ms

In [12]:
def softmax_part1_tf(input, max_pos):
    """Largest value among the first `max_pos` entries of `input`."""
    window = input[:max_pos]
    return tf.reduce_max(window)

def softmax_part1_runner(inputs, _, f=None):
    """Accumulate the time spent on the max-reduction step for each flattened input.

    The measured interval covers the `tf.identity` copies and
    `softmax_part1_tf` under a `/GPU:0` scope, plus the `tf.identity` of the
    result under `/CPU:0`.

    Args:
        inputs: Collection of objects exposing `.flatten()`.
        _: Ignored placeholder for the second positional argument.
        f: Not used by this runner.

    Returns:
        Sum of the per-input elapsed times in milliseconds.
    """
    elapsed_ms = 0
    for idx in range(len(inputs)):
        flat = inputs[idx].flatten()
        length = len(flat)

        with tf.device('/CPU:0'):
            input_t = tf.convert_to_tensor(flat, np.float32)
            pos_t = tf.constant(length, dtype=tf.int32)

        with tf.device('/GPU:0'):
            tic = time.perf_counter()
            input_t = tf.identity(input_t)
            pos_t = tf.identity(pos_t)
            result = softmax_part1_tf(input_t, pos_t)

        with tf.device('/CPU:0'):
            result = tf.identity(result)
            toc = time.perf_counter()

        del input_t
        elapsed_ms += (toc - tic) * 1000
    return elapsed_ms

In [13]:
def softmax_part2_tf(input, max_pos, max_val):
    """Exponentiate the first `max_pos` entries of `input` after subtracting `max_val`."""
    shifted = input[:max_pos] - max_val
    return tf.exp(tf.cast(shifted, tf.float32))

def softmax_part2_runner(inputs, _, f=None):
    """Accumulate the time spent on the shift-and-exponentiate step for each input.

    The maximum of each flattened input is computed with NumPy outside the
    timed region. The measured interval covers the `tf.identity` copies and
    `softmax_part2_tf` under a `/GPU:0` scope, plus the `tf.identity` of the
    result under `/CPU:0`.

    Args:
        inputs: Collection of objects exposing `.flatten()`.
        _: Ignored placeholder for the second positional argument.
        f: Not used by this runner.

    Returns:
        Sum of the per-input elapsed times in milliseconds.
    """
    elapsed_ms = 0
    for idx in range(len(inputs)):
        flat = inputs[idx].flatten()
        length = len(flat)

        with tf.device('/CPU:0'):
            input_t = tf.convert_to_tensor(flat, np.float32)
            pos_t = tf.constant(length, dtype=tf.int32)
            peak_t = tf.convert_to_tensor(np.max(flat[:length]), np.float32)

        with tf.device('/GPU:0'):
            tic = time.perf_counter()
            input_t = tf.identity(input_t)
            pos_t = tf.identity(pos_t)
            peak_t = tf.identity(peak_t)
            result = softmax_part2_tf(input_t, pos_t, peak_t)

        with tf.device('/CPU:0'):
            result = tf.identity(result)
            toc = time.perf_counter()

        del input_t
        elapsed_ms += (toc - tic) * 1000
    return elapsed_ms

In [14]:
def softmax_part3_tf(output, max_pos):
    """Sum of the first `max_pos` entries of `output`."""
    window = output[:max_pos]
    return tf.reduce_sum(window)

def softmax_part3_runner(inputs, _, f=None):
    """Accumulate the time spent on the sum-reduction step for each input.

    The shifted exponentials of each flattened input are computed with NumPy
    outside the timed region. The measured interval covers the `tf.identity`
    copies and `softmax_part3_tf` under a `/GPU:0` scope, plus the
    `tf.identity` of the result under `/CPU:0`.

    Args:
        inputs: Collection of objects exposing `.flatten()`.
        _: Ignored placeholder for the second positional argument.
        f: Not used by this runner.

    Returns:
        Sum of the per-input elapsed times in milliseconds.
    """
    elapsed_ms = 0
    for idx in range(len(inputs)):
        flat = inputs[idx].flatten()
        length = len(flat)
        exponentials = np.exp(flat[:length] - np.max(flat[:length]))

        with tf.device('/CPU:0'):
            exp_t = tf.convert_to_tensor(exponentials, np.float32)
            pos_t = tf.constant(length, dtype=tf.int32)

        with tf.device('/GPU:0'):
            tic = time.perf_counter()
            exp_t = tf.identity(exp_t)
            pos_t = tf.identity(pos_t)
            result = softmax_part3_tf(exp_t, pos_t)

        with tf.device('/CPU:0'):
            result = tf.identity(result)
            toc = time.perf_counter()

        del exp_t
        elapsed_ms += (toc - tic) * 1000
    return elapsed_ms

In [15]:
def softmax_part4_tf(unnormalized_output, max_pos, sum):
    """Divide the first `max_pos` entries of `unnormalized_output` by `sum`."""
    window = unnormalized_output[:max_pos]
    return window / sum

def softmax_part4_runner(inputs, _, f=None):
    """Accumulate the time spent on the normalisation step for each input.

    The shifted exponentials and their total are computed with NumPy outside
    the timed region. The measured interval covers the `tf.identity` copies
    and `softmax_part4_tf` under a `/GPU:0` scope, plus the `tf.identity` of
    the result under `/CPU:0`.

    Args:
        inputs: Collection of objects exposing `.flatten()`.
        _: Ignored placeholder for the second positional argument.
        f: Not used by this runner.

    Returns:
        Sum of the per-input elapsed times in milliseconds.
    """
    elapsed_ms = 0
    for idx in range(len(inputs)):
        flat = inputs[idx].flatten()
        length = len(flat)
        exponentials = np.exp(flat[:length] - np.max(flat[:length]))
        denominator = np.sum(exponentials[:length])

        with tf.device('/CPU:0'):
            exp_t = tf.convert_to_tensor(exponentials, np.float32)
            pos_t = tf.constant(length, dtype=tf.int32)
            total_t = tf.convert_to_tensor(denominator, np.float32)

        with tf.device('/GPU:0'):
            tic = time.perf_counter()
            exp_t = tf.identity(exp_t)
            pos_t = tf.identity(pos_t)
            total_t = tf.identity(total_t)
            result = softmax_part4_tf(exp_t, pos_t, total_t)

        with tf.device('/CPU:0'):
            result = tf.identity(result)
            toc = time.perf_counter()

        del exp_t
        elapsed_ms += (toc - tic) * 1000
    return elapsed_ms

In [16]:
# Element-wise multiply: each collected weight against its same-shaped random partner.
timer(input1=weights, input2=w_input, f=None, runner=elewise_mul_runner)

2024-01-11 13:00:32.444742: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2024-01-11 13:00:32.445134: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: colinc
2024-01-11 13:00:32.445136: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: colinc
2024-01-11 13:00:32.445176: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 535.146.2
2024-01-11 13:00:32.445184: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 535.129.3
2024-01-11 13:00:32.445186: E external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:244] kernel version 535.129.3 does not match DSO version 535.146.2 -- cannot find working devices in this configuration


elewise_mul_tensorflow_with_load
284.40932482481ms +/- 9.906515523455361ms


In [17]:
# Matrix-vector product: each "attn" tensor against its random vector of length shape[1].
timer(input1=attn_weights, input2=aw_input, f=None, runner=matmul_runner)

matmul_tensorflow_with_load
80.03217247314751ms +/- 10.781710929610968ms


In [18]:
# Attention scores: "k_proj" tensors act as key caches, "q_proj" tensors as queries.
timer(input1=k_weights, input2=q_weights, f=None, runner=multiquery_attention_part1_runner)

multiquery_attention_part1_tensorflow_with_load
21.66841602884233ms +/- 2.439472383029676ms


In [19]:
# Attention-weighted combination, using the same "k_proj" / "q_proj" operands.
timer(input1=k_weights, input2=q_weights, f=None, runner=multiquery_attention_part2_runner)

multiquery_attention_part2_tensorflow_with_load
7.8336704056710005ms +/- 2.134037043607374ms


In [20]:
# RMSNorm step 1 (sum of squares) over the random inputs paired with the collected weights.
timer(input1=weights, input2=w_input, f=None, runner=rmsnorm_part1_runner)

rmsnorm_part1_tensorflow_with_load
450.7356496527791ms +/- 2.241457025084523ms


In [21]:
# RMSNorm step 2 (normalise and scale) over the same weight / random-input pairs.
timer(input1=weights, input2=w_input, f=None, runner=rmsnorm_part2_runner)

rmsnorm_part2_tensorflow_with_load
453.855360718444ms +/- 4.165485436942907ms


In [22]:
# SiLU over every collected weight; this runner ignores its second argument.
timer(input1=weights, input2=None, f=None, runner=silu_runner)

silu_tensorflow_with_load
820.5589587334543ms +/- 1.379051373853ms


In [23]:
# Softmax step 1 (max reduction) over the "attn" tensors; second argument is ignored.
timer(input1=attn_weights, input2=None, f=None, runner=softmax_part1_runner)

softmax_part1_tensorflow_with_load
45.10223506949842ms +/- 7.360041188113004ms


In [24]:
# Softmax step 2 (shift and exponentiate) over the "attn" tensors; second argument is ignored.
timer(input1=attn_weights, input2=None, f=None, runner=softmax_part2_runner)

softmax_part2_tensorflow_with_load
128.38186309672892ms +/- 2.414270580669231ms


In [25]:
# Softmax step 3 (sum reduction) over the "attn" tensors; second argument is ignored.
timer(input1=attn_weights, input2=None, f=None, runner=softmax_part3_runner)

softmax_part3_tensorflow_with_load
40.287359757348895ms +/- 0.34765948613478004ms


In [26]:
# Softmax step 4 (normalisation) over the "attn" tensors; second argument is ignored.
timer(input1=attn_weights, input2=None, f=None, runner=softmax_part4_runner)

softmax_part4_tensorflow_with_load
97.43978586047888ms +/- 0.9419789740914143ms


: 