In [1]:
import tensorflow as tf
import numpy as np
import time
import h5py

2024-01-12 18:46:13.377137: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-12 18:46:13.483837: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-12 18:46:13.483868: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-12 18:46:13.498870: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-12 18:46:13.537414: I tensorflow/core/platform/cpu_feature_guar

In [2]:
rng = np.random.default_rng(1)

In [3]:
weights_path = './vicuna_weight.h5'

weights = []
w_input = []
attn_weights = []
aw_input = []
q_weights = []
k_weights = []

with h5py.File(weights_path, 'r') as weight_file:
    for layer_name in weight_file:
        w = np.squeeze(np.array(weight_file[layer_name])).astype(np.float32)
        if "model" in layer_name and "embed_tokens" not in layer_name and "layernorm" not in layer_name:
            weights.append(w)
            w_input.append(rng.random(w.shape, dtype = np.float32))
        if "attn" in layer_name:
            attn_weights.append(w)
            aw_input.append(rng.random(w.shape[1], dtype = np.float32))
            if "q_proj" in layer_name:
                q_weights.append(w)
            if "k_proj" in layer_name:
                k_weights.append(w)

In [4]:
def timer(input1, input2, f, runner):
    runs = 10
    times = []
    for _ in range(runs):
        times.append(runner(input1, input2, f))
    times = np.array(times)
    print(f"{runner.__name__[:-6]}tensorflow")
    print(f"{np.average(times)}ms +/- {np.std(times)}ms")

In [5]:
def transformer_part4_tf(input1, input2, hidden_dim):
    return (input1[:hidden_dim]) * (input2[:hidden_dim])

def transformer_part4_runner(inputs1, inputs2, f=None):
    total_time = 0
    for i in range(len(inputs1)):
        input1 = inputs1[i].flatten()
        input2 = inputs2[i].flatten()
        hd = len(input1)
        with tf.device('/GPU:0'):
            inp1 = tf.convert_to_tensor(input1, np.float32) 
            inp2 = tf.convert_to_tensor(input2, np.float32) 
            hidden_dim = tf.constant(hd, dtype=tf.int32)
            
            start_time = time.perf_counter()
            transformer_part4_tf(inp1, inp2, hidden_dim)
            end_time = time.perf_counter()
            del inp2
            del inp1
        total_time += (end_time - start_time) * 1000
    return total_time

In [6]:
def matmul_tf(weight, input):
    return tf.linalg.matvec(weight, input)

def matmul_runner(weights, inputs, f=None):
    total_time = 0
    for i in range(len(inputs)):
        weight = weights[i]
        input = inputs[i]
        with tf.device('/GPU:0'):
            w = tf.convert_to_tensor(weight, np.float32) 
            inp = tf.convert_to_tensor(input, np.float32) 
        
            start_time = time.perf_counter()
            matmul_tf(w, inp)
            end_time = time.perf_counter()
            del inp
            del w
        total_time += (end_time - start_time) * 1000
    return total_time

In [8]:
def transformer_part1_tf(token_position, head, head_size, key_cache_layer, q):
    return (tf.linalg.matvec(key_cache_layer[:token_position][:, (head) * (head_size):(head) * (head_size) + head_size], q[(head) * (head_size):(head) * (head_size) + head_size])) / (tf.sqrt(tf.cast((head_size) * (1), tf.float32)))

def transformer_part1_runner(k_matrixes, q_matrixes, f=None):
    total_time = 0
    for i in range(len(k_matrixes)):
        k_matrix = k_matrixes[i]
        q_matrix = q_matrixes[i]
        token_position = k_matrix.shape[0] - 1

        num_head = 32
        head = int(rng.integers(low=0, high=num_head))
        head_size = k_matrix.shape[0] // num_head

        q_matrix = q_matrix.flatten()
        with tf.device('/GPU:0'):
            key_cache_layer = tf.convert_to_tensor(k_matrix, np.float32) 
            q = tf.convert_to_tensor(q_matrix, np.float32) 
            
            start_time = time.perf_counter()
            transformer_part1_tf(token_position, head, head_size, key_cache_layer, q)
            end_time = time.perf_counter()
            del key_cache_layer
            del q
        total_time += (end_time - start_time) * 1000
    return total_time

In [22]:
def transformer_part2_tf(token_position, head, head_size, key_cache_layer, attention):
    return tf.linalg.matvec(tf.transpose(key_cache_layer[:(token_position) + (1)][:, (head) * (head_size):(head) * (head_size) + head_size]), attention[:(token_position) + (1)])

def transformer_part2_runner(k_matrixes, q_matrixes, f=None):
    total_time = 0
    for i in range(len(k_matrixes)):
        k_matrix = k_matrixes[i]
        q_matrix = q_matrixes[i]
        token_position = k_matrix.shape[0] - 1

        num_head = 32
        head = int(rng.integers(low=0, high=num_head))
        head_size = k_matrix.shape[0] // num_head
        
        q_matrix = q_matrix.flatten()
        with tf.device('/GPU:0'):
            key_cache_layer = tf.convert_to_tensor(k_matrix, np.float32) 
            q = tf.convert_to_tensor(q_matrix, np.float32) 

            attention = transformer_part1_tf(token_position, head, head_size, key_cache_layer, q)
            attention = tf.concat((attention, tf.constant([0.0])), axis=0)
        
            start_time = time.perf_counter()
            transformer_part2_tf(token_position, head, head_size, key_cache_layer, attention)
            end_time = time.perf_counter()
            del key_cache_layer
            del attention
        total_time += (end_time - start_time) * 1000
    return total_time

In [5]:
def rmsnorm_part1_tf(input, weight):
    return tf.reduce_sum((input) * (input))

def rmsnorm_part1_runner(weights, inputs, f=None):
    total_time = 0
    for i in range(len(inputs)):
        input = inputs[i].flatten()
        weight = weights[i].flatten()
        with tf.device('/GPU:0'):
            w = tf.convert_to_tensor(weight, np.float32) 
            inp = tf.convert_to_tensor(input, np.float32) 
            start_time = time.perf_counter()
            rmsnorm_part1_tf(inp, w)
            end_time = time.perf_counter()
            del w
            del inp
        total_time += (end_time - start_time) * 1000
    return total_time

In [6]:
def rmsnorm_part2_tf(input, weight, ss):
    return ((1) / (tf.sqrt(tf.cast(((ss) / (tf.size(input, tf.float32))) + (1), tf.float32)))) * ((input) * (weight))

def rmsnorm_part2_runner(weights, inputs, f=None):
    total_time = 0
    for i in range(len(inputs)):
        input = inputs[i].flatten()
        weight = weights[i].flatten()
        ssum = np.sum(input * input)

        with tf.device('/GPU:0'):
            w = tf.convert_to_tensor(weight, np.float32) 
            inp = tf.convert_to_tensor(input, np.float32) 
            ss = tf.convert_to_tensor(ssum, np.float32)

            start_time = time.perf_counter()
            rmsnorm_part2_tf(inp, w, ss)
            end_time = time.perf_counter()
            del w
            del inp
        total_time += (end_time - start_time) * 1000
    return total_time

In [11]:
def transformer_part3_tf(input, hidden_dim):
    return (input[:hidden_dim]) * ((1) / ((1) + (tf.exp((0) - (input[:hidden_dim])))))

def transformer_part3_runner(inputs, _, f=None):
    total_time = 0
    for i in range(len(inputs)):
        input = inputs[i].flatten()
        hd = len(input)
        with tf.device('/GPU:0'):
            inp = tf.convert_to_tensor(input, np.float32)
            hidden_dim = tf.constant(hd, dtype=tf.int32)
            start_time = time.perf_counter()
            transformer_part3_tf(inp, hidden_dim)
            end_time = time.perf_counter()
            del inp
        total_time += (end_time - start_time) * 1000
    return total_time

In [12]:
def softmax_part1_tf(input, max_pos):
    return tf.reduce_max(input[:max_pos])

def softmax_part1_runner(inputs, _, f=None):
    total_time = 0
    for i in range(len(inputs)):
        input = inputs[i].flatten()
        mp = len(input)
        with tf.device('/GPU:0'):
            inp = tf.convert_to_tensor(input, np.float32)
            max_pos = tf.constant(mp, dtype=tf.int32)
            start_time = time.perf_counter()
            softmax_part1_tf(inp, max_pos)
            end_time = time.perf_counter()
            del inp
        total_time += (end_time - start_time) * 1000
    return total_time

In [13]:
def softmax_part2_tf(input, max_pos, max_val):
    return tf.exp((input[:max_pos]) - (max_val))

def softmax_part2_runner(inputs, _, f=None):
    total_time = 0
    for i in range(len(inputs)):
        input = inputs[i].flatten()
        mp = len(input)
        with tf.device('/GPU:0'):
            inp = tf.convert_to_tensor(input, np.float32)
            max_pos = tf.constant(mp, dtype=tf.int32)
            max_val = tf.convert_to_tensor(np.max(input[:mp]), np.float32)

            start_time = time.perf_counter()
            softmax_part2_tf(inp, max_pos, max_val)
            end_time = time.perf_counter()
            del inp
        total_time += (end_time - start_time) * 1000
    return total_time

In [14]:
def softmax_part3_tf(output, max_pos):
    return tf.reduce_sum(output[:max_pos])

def softmax_part3_runner(inputs, _, f=None):
    total_time = 0
    for i in range(len(inputs)):
        input = inputs[i].flatten()
        mp = len(input)
        output = np.exp(input[:mp]-np.max(input[:mp]))
        with tf.device('/GPU:0'):
            outp = tf.convert_to_tensor(output, np.float32)
            max_pos = tf.constant(mp, dtype=tf.int32)
            
            start_time = time.perf_counter()
            softmax_part3_tf(outp, max_pos)
            end_time = time.perf_counter()
            del outp
        total_time += (end_time - start_time) * 1000
    return total_time

In [15]:
def softmax_part4_tf(unnormalized_output, max_pos, sum):
    return (unnormalized_output[:max_pos]) / (sum)

def softmax_part4_runner(inputs, _, f=None):
    total_time = 0
    for i in range(len(inputs)):
        input = inputs[i].flatten()
        mp = len(input)
        output = np.exp(input[:mp]-np.max(input[:mp]))
        s = np.sum(output[:mp])
        with tf.device('/GPU:0'):
            outp = tf.convert_to_tensor(output, np.float32)
            max_pos = tf.constant(mp, dtype=tf.int32)
            sum = tf.convert_to_tensor(s, np.float32)
        
            start_time = time.perf_counter()
            softmax_part4_tf(outp, max_pos, sum)
            end_time = time.perf_counter()
            del outp
        total_time += (end_time - start_time) * 1000
    return total_time

In [16]:
timer(weights, w_input, None, transformer_part4_runner)

2024-01-11 14:01:37.078693: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2024-01-11 14:01:37.079599: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: colinc
2024-01-11 14:01:37.079602: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: colinc
2024-01-11 14:01:37.079769: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 535.146.2
2024-01-11 14:01:37.079777: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 535.129.3
2024-01-11 14:01:37.079779: E external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:244] kernel version 535.129.3 does not match DSO version 535.146.2 -- cannot find working devices in this configuration


elewise_mul_tensorflow
263.5096057318151ms +/- 3.5164297249010175ms


In [17]:
timer(attn_weights, aw_input, None, matmul_runner)

matmul_tensorflow
39.25398522987962ms +/- 4.089616600534196ms


In [9]:
timer(k_weights, q_weights, None, transformer_part1_runner)

multiquery_attention_part1_tensorflow
14.341256799997382ms +/- 19.045698310160926ms


In [23]:
timer(k_weights, q_weights, None, transformer_part2_runner)

multiquery_attention_part2_tensorflow
6.627444829791784ms +/- 2.584835190491138ms


In [None]:
timer(weights, w_input, None, rmsnorm_part1_runner)

rmsnorm_part1_tensorflow
187.19183229841292ms +/- 2.2388239847541787ms


In [7]:
timer(weights, w_input,None, rmsnorm_part2_runner)

2024-01-12 18:48:16.311962: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-12 18:48:16.378035: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-12 18:48:16.378132: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

rmsnorm_part2_tensorflow
46.83127490001198ms +/- 51.92319959088294ms


In [None]:
timer(weights, None, None, transformer_part3_runner)

silu_tensorflow
24.09263262525201ms +/- 0.9104524488343121ms


In [None]:
timer(attn_weights, None, None, softmax_part1_runner)

softmax_part1_tensorflow
9.261212404817343ms +/- 1.3751016449060167ms


In [None]:
timer(attn_weights, None, None, softmax_part2_runner)

softmax_part2_tensorflow
7.931538671255112ms +/- 1.1648636374644534ms


In [None]:
timer(attn_weights, None, None, softmax_part3_runner)

softmax_part3_tensorflow
8.878669375553727ms +/- 0.16834484359706178ms


In [None]:
timer(attn_weights, None, None, softmax_part4_runner)

softmax_part4_tensorflow
9.228620165959ms +/- 0.46544980425183907ms
