warm up

In [1]:
import torch
import torch.nn.functional as F
import time

torch.manual_seed(42)

batch_size = 2
num_heads = 32
seq_len = 128
head_dim = 128 # hidden_dim = num_head * head_dim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

start_time = time.time()

for i in range(10):
    scale = head_dim ** -0.5
    scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

    attention_weights = F.softmax(scores, dim=-1)

    output = torch.matmul(attention_weights, V)

end_time = time.time()

print("Output shape:", output.shape, "Output is on", output.device)

average_time = (end_time - start_time)/10

print("Execution time: {:.6f} seconds".format(average_time))

del Q, K, V, scores, attention_weights, output
torch.cuda.empty_cache()


Output shape: torch.Size([2, 32, 128, 128]) Output is on cuda:0
Execution time: 0.008056 seconds


Baseline_final

In [None]:
import torch
import torch.nn.functional as F
import time

batch_size = 2
num_heads = 32
seq_len_set = [512,1024,2048,4096,8192] # 512,1024,2048,4096,
head_dim = 128 # hidden_dim = num_head * head_dim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
    # batch_size = int(16384/seq_len)
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

    
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    
    start_event.record()

    for i in range(10):
        scale = head_dim ** -0.5
        scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

        attention_weights = F.softmax(scores, dim=-1)

        output = torch.matmul(attention_weights, V)

    
    
    end_event.record()

    torch.cuda.synchronize()

    elapsed_time = start_event.elapsed_time(end_event)

    print("Output shape:", output.shape)

    result = output.sum()

    print("Reduced result:", result.item())

    print("Execution time: {:.6f} ms".format(elapsed_time/10),"\n")

    del Q, K, V, scores, attention_weights, output, result
    torch.cuda.empty_cache()




DMR_final

In [None]:
import torch
import torch.nn.functional as F
import time

batch_size = 2
num_heads = 32
seq_len_set = [512,1024,2048,4096,8192]
head_dim = 128 # hidden_dim = num_head * head_dim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
    # batch_size = int(16384/seq_len)
        
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
 
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

          
    start_event.record()

    # for i in range(10):
    
    scale = head_dim ** -0.5
    scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

      
    attention_weights = F.softmax(scores, dim=-1)
    attention_weights_dmr = F.softmax(scores, dim=-1)

      
    output = torch.matmul(attention_weights, V)

      
    end_event.record()

      
    torch.cuda.synchronize()

      
    elapsed_time = start_event.elapsed_time(end_event) 
     
    print("Output shape:", output.shape)

      
    result = output - attention_weights_dmr.sum()
    result = result.sum()
   
      
    print("Reduced result:", result.item())

      
    print("Execution time: {:.6f} ms".format(elapsed_time),"\n")

      
    del Q, K, V, scores, attention_weights, output, result, attention_weights_dmr
    torch.cuda.empty_cache()




DMR_with check

In [None]:
import torch
import torch.nn.functional as F
import time


  
batch_size = 2
num_heads = 32
seq_len_set = [512,1024,2048,4096,8192]
head_dim = 128 # hidden_dim = num_head * head_dim


  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
     
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

     
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

         
    start_event.record()

    # for i in range(10):
     
     
    scale = head_dim ** -0.5
    scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

      
    attention_weights = F.softmax(scores, dim=-1)
    attention_weights_dmr = F.softmax(attention_weights, dim=-1)
    
    sum_1 = attention_weights.sum()
    sum_2 = attention_weights_dmr.sum()

    if torch.equal(sum_1, sum_2):
        sum_1 = torch.tensor(0.0, device=device)
    else:
        sum_2 = torch.tensor(0.0, device=device)
    

      
    output = torch.matmul(attention_weights_dmr, V)

      
    end_event.record()

      
    torch.cuda.synchronize()

      
    elapsed_time = start_event.elapsed_time(end_event) 
     
    print("Output shape:", output.shape)

      
    result = output.sum()

      
    print(f"Sum_1: {sum_1.item()}, Sum_2: {sum_2.item()}")

      
    print("Execution time: {:.6f} ms".format(elapsed_time),"\n")

      
    del Q, K, V, scores, attention_weights, output, result, attention_weights_dmr, sum_1, sum_2
    torch.cuda.empty_cache()




ABFT_GEMM_1

In [None]:
import torch
import torch.nn.functional as F
import time


  
batch_size = 2
num_heads = 32
seq_len_set = [512,1024,2048,4096,8192]
head_dim = 128 # hidden_dim = num_head * head_dim


  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
    
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

     
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

      
    start_event.record()

    for i in range(10):
         
         
        scale = head_dim ** -0.5
        scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

        Q_sum_row = Q.sum(dim=2, keepdim=True)
        row_checksum = torch.matmul(Q_sum_row, K.transpose(-1, -2)) * scale

        sum_1 = scores.sum()
        sum_2 = row_checksum.sum()
        if torch.equal(sum_1, sum_2):
            sum_1 = torch.tensor(0.0, device=device)
        else:
            sum_2 = torch.tensor(0.0, device=device)


          
        attention_weights = F.softmax(scores, dim=-1)

          
        output = torch.matmul(attention_weights, V)

    
      
    end_event.record()

      
    torch.cuda.synchronize()

      
    elapsed_time = start_event.elapsed_time(end_event)

     
    print("Output shape:", output.shape)

      
    result = output.sum()

      
    print(f"Sum_1: {sum_1.item()}, Sum_2: {sum_2.item()}", "Reduced result:", result.item())

      
    print("Execution time: {:.6f} ms".format(elapsed_time/10),"\n")

      
    del Q, K, V, scores, attention_weights, output, result, Q_sum_row, row_checksum, sum_1, sum_2
    torch.cuda.empty_cache()




ABFT_GEMM1_elementcheck

In [None]:
import torch
import torch.nn.functional as F
import time


  
batch_size = 2
num_heads = 32
seq_len_set = [512,1024,2048,4096,8192]
head_dim = 128 # hidden_dim = num_head * head_dim


  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
     
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

    sum_1 = torch.tensor(0.0, device=device)
    sum_2 = torch.tensor(0.0, device=device)
    
     
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

      
    start_event.record()

    for i in range(10):
         
         
        scale = head_dim ** -0.5
        scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

        Q_sum_row = Q.sum(dim=2, keepdim=True)
        row_checksum = torch.matmul(Q_sum_row, K.transpose(-1, -2)) * scale

        rowsum = scores.sum(dim=2, keepdim=True)
        
        if torch.equal(row_checksum, rowsum):
            sum_1 = torch.tensor(1.0, device=device)
        else:
            sum_2 = torch.tensor(1.0, device=device)


          
        attention_weights = F.softmax(scores, dim=-1)

          
        output = torch.matmul(attention_weights, V)

    
      
    end_event.record()

      
    torch.cuda.synchronize()

      
    elapsed_time = start_event.elapsed_time(end_event)

     
    print("row_checksum shape:", row_checksum.shape, "rowsum shape:", rowsum.shape)

      
    result = output.sum()

      
    print(f"Sum_1: {sum_1.item()}, Sum_2: {sum_2.item()}")

      
    print("Execution time: {:.6f} ms".format(elapsed_time/10),"\n")

      
    del Q, K, V, scores, attention_weights, output, result, Q_sum_row, row_checksum, sum_1, sum_2, rowsum
    torch.cuda.empty_cache()




ABFT_GEMM_2

In [None]:
import torch
import torch.nn.functional as F
import time


  
batch_size = 2
num_heads = 32
seq_len_set = [512,1024,2048,4096,8192]
head_dim = 128 # hidden_dim = num_head * head_dim


  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
      
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

     
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

      
    start_event.record()

    for i in range(10):
         
         
        scale = head_dim ** -0.5
        scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

          
        attention_weights = F.softmax(scores, dim=-1)

          
        output = torch.matmul(attention_weights, V)

        V_sum_col = V.sum(dim=3, keepdim=True)
        col_checksum = torch.matmul(attention_weights, V_sum_col)

        sum_1 = output.sum()
        sum_2 = col_checksum.sum()
        if torch.equal(sum_1, sum_2):
            sum_1 = torch.tensor(0.0, device=device)
        else:
            sum_2 = torch.tensor(0.0, device=device)

    
      
    end_event.record()

      
    torch.cuda.synchronize()

      
    elapsed_time = start_event.elapsed_time(end_event)

     
    print("Output shape:", output.shape)

      
    result = output.sum()

      
    print(f"Sum_1: {sum_1.item()}, Sum_2: {sum_2.item()}", "Reduced result:", result.item())

      
    print("Execution time: {:.6f} ms".format(elapsed_time/10),"\n")

      
    del Q, K, V, scores, attention_weights, output, result, V_sum_col, col_checksum, sum_1, sum_2
    torch.cuda.empty_cache()




ABFT_optimize

In [None]:
import torch
import torch.nn.functional as F
import time


  
batch_size = 2
num_heads = 32
seq_len_set = [512,1024,2048,4096,8192]
head_dim = 128 # hidden_dim = num_head * head_dim


  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
      
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    
     
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

      
    start_event.record()

    for i in range(10):
         
         
        flag = torch.tensor(1.0, device=device)
    
        scale = head_dim ** -0.5
        scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

        Q_sum_row = Q.sum(dim=2, keepdim=True)
        row_checksum = torch.matmul(Q_sum_row, K.transpose(-1, -2)) * scale

        rowsum = scores.sum(dim=2, keepdim=True)
        if torch.equal(row_checksum, rowsum):
            flag = torch.tensor(1.0, device=device)

          
        attention_weights = F.softmax(scores, dim=-1)

          
        output = torch.matmul(attention_weights, V)

    
      
    end_event.record()

      
    torch.cuda.synchronize()

      
    elapsed_time = start_event.elapsed_time(end_event)

     
    print("Output shape:", output.shape)

      
    result = output.sum()

      
    print("Reduced result:", result.item(), "flag: ", flag)

      
    print("Execution time: {:.6f} ms".format(elapsed_time/10),"\n")

      
    del Q, K, V, scores, attention_weights, output, result
    torch.cuda.empty_cache()




DMR_optimize

In [None]:
import torch
import torch.nn.functional as F
import time


  
batch_size = 2
num_heads = 32
seq_len_set = [512,1024,2048,4096]
head_dim = 128 # hidden_dim = num_head * head_dim


  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
      
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

      
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

      
    start_event.record()

    for i in range(10):
         
         
        scale = head_dim ** -0.5
        scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

          
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights_dmr = F.softmax(attention_weights, dim=-1)

          
        output = torch.matmul(attention_weights, V)

    
      
    end_event.record()

      
    torch.cuda.synchronize()

      
    elapsed_time = start_event.elapsed_time(end_event)

     
    print("Output shape:", output.shape)

      
    result = output.sum() + attention_weights_dmr.sum()

      
    print("Reduced result:", result.item())

      
    print("Execution time: {:.6f} ms".format(elapsed_time/10),"\n")

      
    del Q, K, V, scores, attention_weights, output, result, attention_weights_dmr
    torch.cuda.empty_cache()




change the loop order

In [None]:
import torch
import torch.nn.functional as F
import time


  
batch_size = 2
num_heads = 32
seq_len_set = [512,1024,2048,4096,8192] # 512,1024,2048,4096,
head_dim = 128 # hidden_dim = num_head * head_dim

time_list = {512: 0,
             1024: 0,
             2048: 0,
             4096: 0,
             8192: 0}

  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i in range(10):
    for seq_len in seq_len_set:
        # batch_size = int(16384/seq_len)
          
        Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
        K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
        V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

         
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

          
        start_event.record()

         
         
        scale = head_dim ** -0.5
        scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

          
        attention_weights = F.softmax(scores, dim=-1)

          
        output = torch.matmul(attention_weights, V)

        
          
        end_event.record()

          
        torch.cuda.synchronize()

          
        elapsed_time = start_event.elapsed_time(end_event)

        time_list[seq_len] += elapsed_time

         
        print("Output shape:", output.shape)

          
        result = output.sum()

          
        print("Reduced result:", result.item())

          
        print("Execution time: {:.6f} ms".format(elapsed_time),"\n")

          
        del Q, K, V, scores, attention_weights, output, result
        torch.cuda.empty_cache()


for key, value in time_list.items():
    final_time = value/10
    print(f"seq_len={key}; exe_time={final_time:.6f} ms")



**attention with all ft technique**

In [180]:
import torch
import torch.nn.functional as F
import time


  
# batch_size = 2
num_heads = 32
seq_len_set = [512] # 512,1024,2048,4096,
head_dim = 128 # hidden_dim = num_head * head_dim


  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
    batch_size = int(16384/seq_len)
      
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

     
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

      
    start_event.record()

    for i in range(10):
         
         
        scale = head_dim ** -0.5
        scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

          
        attention_weights = F.softmax(scores, dim=-1)

          
        output = torch.matmul(attention_weights, V)

    
      
    end_event.record()

      
    torch.cuda.synchronize()

      
    elapsed_time = start_event.elapsed_time(end_event)

     
    print("Output shape:", output.shape)

      
    result = output.sum()

      
    print("Reduced result:", result.item())

      
    print("Execution time: {:.6f} ms".format(elapsed_time/10),"\n")

      
    del Q, K, V, scores, attention_weights, output, result
    torch.cuda.empty_cache()




Output shape: torch.Size([32, 32, 512, 128])
Reduced result: -3020.0
Execution time: 6.730676 ms 



In [174]:
import torch
import torch.nn.functional as F
import time


  
# batch_size = 2
num_heads = 32
seq_len_set = [512] # 512,1024,2048,4096,
head_dim = 128 # hidden_dim = num_head * head_dim


  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for seq_len in seq_len_set:
    batch_size = int(16384/seq_len)
      
    Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
    V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

     
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

      
    start_event.record()

    for i in range(10):
         
         
        scale = head_dim ** -0.5
        K_t = K.transpose(-1, -2)
        K_check = K_t.sum(dim=-1, keepdim=True)

        scores = torch.matmul(Q, K_t) * scale
        scores_check = torch.matmul(Q, K_check) * scale

          
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = F.softmax(attention_weights, dim=-1)

          
        V_check = V.sum(dim=-1, keepdim=True)
        output = torch.matmul(attention_weights, V)
        out_check = torch.matmul(attention_weights, V_check)

    
      
    end_event.record()

      
    torch.cuda.synchronize()

      
    elapsed_time = start_event.elapsed_time(end_event)

     
    print("Output shape:", output.shape)

      
    result = output.sum() + scores_check.sum() + out_check.sum()

      
    print("Reduced result:", result.item())

      
    print("Execution time: {:.6f} ms".format(elapsed_time/10),"\n")

      
    del Q, K, V, scores, attention_weights, output, result, V_check, scores_check, K_check, out_check
    torch.cuda.empty_cache()




Output shape: torch.Size([32, 32, 512, 128])
Reduced result: 21408.0
Execution time: 4.550621 ms 



**10 times average reaults**

In [46]:
import torch
import torch.nn.functional as F
import time

def baseline(test_len):
      
    # batch_size = 2
    num_heads = 16
    seq_len_set = [test_len] # 512,1024,2048,4096,
    head_dim = 64 # hidden_dim = num_head * head_dim


      
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for seq_len in seq_len_set:
        batch_size = int(16384/seq_len)
          
        Q = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
        K = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()
        V = (torch.rand(batch_size, num_heads, seq_len, head_dim, device=device) * 2 - 1).half()

         
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

          
        start_event.record()

        for i in range(1):
             
             
            scale = head_dim ** -0.5
            scores = torch.matmul(Q, K.transpose(-1, -2)) * scale

              
            attention_weights = F.softmax(scores, dim=-1)

              
            output = torch.matmul(attention_weights, V)

        
          
        end_event.record()

          
        torch.cuda.synchronize()

          
        elapsed_time = start_event.elapsed_time(end_event)

         
        print("Output shape:", output.shape)

          
        result = output.sum()

          
        print("Reduced result:", result.item())

          
        print("Execution time: {:.6f} ms".format(elapsed_time),"\n")

          
        del Q, K, V, scores, attention_weights, output, result
        torch.cuda.empty_cache()

        return elapsed_time


total_time = 0
for i in range(10):
    total_time += baseline(512)
print("Seq_len: 512; Average execution time: {:.6f} ms".format(total_time/10),"\n")

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: -3602.0
Execution time: 16.570047 ms 

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: -768.0
Execution time: 2.262400 ms 

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: 576.5
Execution time: 2.264928 ms 

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: 627.5
Execution time: 2.264064 ms 

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: 273.0
Execution time: 2.260288 ms 

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: -3122.0
Execution time: 2.263616 ms 

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: 4720.0
Execution time: 2.258400 ms 

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: -6408.0
Execution time: 2.264832 ms 

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: -3050.0
Execution time: 2.260544 ms 

Output shape: torch.Size([32, 16, 512, 64])
Reduced result: -2768.0
Execution time: 2.261216 ms 

Seq_len: 512; Average execu