In [1]:
import torch
import torch.nn as nn
import copy
from pyhocon import ConfigFactory
from models.resnet import resnet50im

In [2]:
conf = ConfigFactory.parse_file("./resnet50_imagenet.hocon")
getattr(conf, 'data_path')

In [4]:
t = resnet50im(pretrained=False)

In [7]:
f = getattr(conf, 'date', None)
print(f)

None


In [2]:
# This Matrix Adder (M, N) -> M
def MatrixAdder(tensor, AdderType="FP16"):
    epsilon = 1e-10
    if not (len(tensor.shape) == 2 or len(tensor.shape) == 3) :
        AssertionError(f"It only supported 2d Matrix, this tensor shape {tensor.shape}")

    if AdderType=="FP16":
        mantissa = 10
    elif AdderType=="BF16":
        mantissa = 7
    elif AdderType=="FP32":
        mantissa = 22
    else:
        AssertionError("This Adder only supported FP16|BF16|FP32")
    
    temp_tensor =tensor.clone()
    zero_mask_counter = []
    for i in range(tensor.shape[1] -1):
        prev = temp_tensor[:, i]
        prec = temp_tensor[:, i+1]
        log_prev = torch.log2(torch.abs(prev)+epsilon)
        log_prec = torch.log2(torch.abs(prec)+epsilon)
        zero_mask = torch.abs(log_prec-log_prev) > mantissa
        max_log_tensor = prec.clone()
        max_log_tensor[log_prec<log_prev] = prev[log_prec<log_prev] # 두 벡터 중 log2 의 value가 큰 값을 가지고 있는 vector 생성
        output = prec+prev # 두 벡터를 더함
        output[zero_mask] = max_log_tensor[zero_mask] # zero_mask에 해당하는 부분은 log2 value가 큰 값만 저장
        temp_tensor[:, i+1] = output
        zero_mask_counter.append(zero_mask.sum())
    
    
    return temp_tensor[:, -1], zero_mask_counter


In [59]:
# This TensorChannelAdder (N, C, H, W) -> (1, C, 1, 1)
# It is similar result tensor.sum(axis=0).sum(axis=2).sum(axis=3)
# We change tensor.transpose(0, 1).reshape(C, chunk, N*H*W//chunk).sum(axis=2).sum(axis=1)
def TensorMeanSim(tensor, chunk=1024,AdderType="FP16"):
    epsilon = 1e-10
    if not len(tensor.shape) ==4 :
        AssertionError(f"It only supported 4d Matrix, but this tensor shape {tensor.shape}")

    if AdderType=="FP16":
        mantissa = 10
    elif AdderType=="BF16":
        mantissa = 7
    elif AdderType=="FP32":
        mantissa = 22
    elif AdderType=="test":
        mantissa= 100
    else:
        AssertionError("This Adder only supported FP16|BF16|FP32")
    
    temp_tensor =tensor.clone()
    zero_mask_counter = []
    
    n, c, h, w = tensor.shape

    
    if not n*h*w % chunk == 0: 
        AssertionError(f"The n*h*w should always be divisible chunk but result {n*h*w % chunk}")
    #change (c, n*h*w//chunk, chunk) 
    chunk_tensor = temp_tensor.transpose(1, 0).reshape(c, chunk, n*h*w//chunk)
    
    # first chunk based Adder (last dim size is equal to n*h*w divided by chunk, so last dim adder is always chunk adder)
    # (C, chunk, 0) + (C, chunk, 1) = C*chunk adder
    # accumulated that result of adder is final values (chunk_tensor[:, :, -1])
    for i in range(chunk_tensor.shape[-1] -1):
        prev = chunk_tensor[:, :, i]
        prec = chunk_tensor[:, :, i+1]
        log_prev = torch.log2(torch.abs(prev)+epsilon)
        log_prec = torch.log2(torch.abs(prec)+epsilon)
        zero_mask = torch.abs(log_prec-log_prev) > mantissa
        max_log_tensor = prec.clone()
        max_log_tensor[log_prec<log_prev] = prev[log_prec<log_prev] # 두 벡터 중 log2 의 value가 큰 값을 가지고 있는 vector 생성
        output = prec+prev # 두 벡터를 더함
        output[zero_mask] = max_log_tensor[zero_mask] # zero_mask에 해당하는 부분은 log2 value가 큰 값만 저장
        chunk_tensor[:,:, i+1] = output
        zero_mask_counter.append(zero_mask.sum())
    
    sum_tensor = chunk_tensor[:, :, -1] # C, chunk_size
    print(f"chunk based sum result : {sum(zero_mask_counter)}/{c * chunk * (chunk_tensor.shape[-1]-1)} = {sum(zero_mask_counter) / (c * chunk * (chunk_tensor.shape[-1]-1)) * 100}%")
    
    for j in range(chunk_tensor.shape[1]-1):
        prev = sum_tensor[:, j]
        prec = sum_tensor[:, j+1]        
        log_prev = torch.log2(torch.abs(prev)+epsilon)
        log_prec = torch.log2(torch.abs(prec)+epsilon)
        zero_mask = torch.abs(log_prec-log_prev) > mantissa
        max_log_tensor = prec.clone()
        max_log_tensor[log_prec<log_prev] = prev[log_prec<log_prev] # 두 벡터 중 log2 의 value가 큰 값을 가지고 있는 vector 생성
        output = prec+prev # 두 벡터를 더함
        output[zero_mask] = max_log_tensor[zero_mask] # zero_mask에 해당하는 부분은 log2 value가 큰 값만 저장
        sum_tensor[:, j+1] = output
        zero_mask_counter.append(zero_mask.sum())
        
    # chunk_tensor[:, :, -1] is same to chunk_tensor.sum(dim=-2), and then finally  

    print(f"final sum result : {sum(zero_mask_counter)}/{c*(chunk-1)+c * chunk * (chunk_tensor.shape[-1]-1)} =\
        {sum(zero_mask_counter)/(c*(chunk-1)+ c * chunk * (chunk_tensor.shape[-1]-1)) * 100}%")
    
    return sum_tensor[:, -1]/(n*h*w), sum(zero_mask_counter), (n*h*w-1)*c


In [None]:
# This TensorChannelAdder (N, C, H, W) -> (1, C, 1, 1)
# It is similar result tensor.sum(axis=0).sum(axis=2).sum(axis=3)
# We change tensor.transpose(0, 1).reshape(C, chunk, N*H*W//chunk).sum(axis=2).sum(axis=1)
def BatchNormMeanSim(tensor, chunk=1024,AdderType="FP16"):
    epsilon = 1e-10
    if not len(tensor.shape) ==4 :
        AssertionError(f"It only supported 4d Matrix, but this tensor shape {tensor.shape}")

    if AdderType=="FP16":
        mantissa = 10
    elif AdderType=="BF16":
        mantissa = 7
    elif AdderType=="FP32":
        mantissa = 22
    elif AdderType=="test":
        mantissa= 100
    else:
        AssertionError("This Adder only supported FP16|BF16|FP32")
    
    temp_tensor =tensor.clone()
    zero_mask_counter = []
    
    n, c, h, w = tensor.shape

    
    if not n*h*w % chunk == 0: 
        AssertionError(f"The n*h*w should always be divisible chunk but result {n*h*w % chunk}")
    #change (c, n*h*w//chunk, chunk) 
    chunk_tensor = temp_tensor.transpose(1, 0).reshape(c, chunk, n*h*w//chunk)
    
    # first chunk based Adder (last dim size is equal to n*h*w divided by chunk, so last dim adder is always chunk adder)
    # (C, chunk, 0) + (C, chunk, 1) = C*chunk adder
    # accumulated that result of adder is final values (chunk_tensor[:, :, -1])
    for i in range(chunk_tensor.shape[-1] -1):
        prev = chunk_tensor[:, :, i]
        prec = chunk_tensor[:, :, i+1]
        log_prev = torch.log2(torch.abs(prev)+epsilon)
        log_prec = torch.log2(torch.abs(prec)+epsilon)
        zero_mask = torch.abs(log_prec-log_prev) > mantissa
        max_log_tensor = prec.clone()
        max_log_tensor[log_prec<log_prev] = prev[log_prec<log_prev] # 두 벡터 중 log2 의 value가 큰 값을 가지고 있는 vector 생성
        output = prec+prev # 두 벡터를 더함
        output[zero_mask] = max_log_tensor[zero_mask] # zero_mask에 해당하는 부분은 log2 value가 큰 값만 저장
        chunk_tensor[:,:, i+1] = output
        zero_mask_counter.append(zero_mask.sum())
    
    sum_tensor = chunk_tensor[:, :, -1] # C, chunk_size
    print(f"chunk based sum result : {sum(zero_mask_counter)}/{c * chunk * (chunk_tensor.shape[-1]-1)} = {sum(zero_mask_counter) / (c * chunk * (chunk_tensor.shape[-1]-1)) * 100}%")
    
    for j in range(chunk_tensor.shape[1]-1):
        prev = sum_tensor[:, j]
        prec = sum_tensor[:, j+1]        
        log_prev = torch.log2(torch.abs(prev)+epsilon)
        log_prec = torch.log2(torch.abs(prec)+epsilon)
        zero_mask = torch.abs(log_prec-log_prev) > mantissa
        max_log_tensor = prec.clone()
        max_log_tensor[log_prec<log_prev] = prev[log_prec<log_prev] # 두 벡터 중 log2 의 value가 큰 값을 가지고 있는 vector 생성
        output = prec+prev # 두 벡터를 더함
        output[zero_mask] = max_log_tensor[zero_mask] # zero_mask에 해당하는 부분은 log2 value가 큰 값만 저장
        sum_tensor[:, j+1] = output
        zero_mask_counter.append(zero_mask.sum())
        
    # chunk_tensor[:, :, -1] is same to chunk_tensor.sum(dim=-2), and then finally  

    print(f"final sum result : {sum(zero_mask_counter)}/{c*(chunk-1)+c * chunk * (chunk_tensor.shape[-1]-1)} =\
        {sum(zero_mask_counter)/(c*(chunk-1)+ c * chunk * (chunk_tensor.shape[-1]-1)) * 100}%")
    
    return sum_tensor[:, -1]/(n*h*w), sum(zero_mask_counter), (n*h*w-1)*c

In [63]:
a = torch.randn(32,48,64,128)
a = a.type(torch.bfloat16)

m, z, t = TensorAdderSim(a, AdderType="BF16")

chunk based sum result : 661010/12533760 = 5.273836612701416%
final sum result : 665735/12582864 =        5.290806770324707%


In [64]:
a.transpose(1,0).reshape(a.shape[1], -1).sum(dim=1)

tensor([-732.0000,  720.0000,  118.5000,  -76.0000,  860.0000,  832.0000,
         142.0000, -260.0000,  205.0000, -648.0000,  -36.5000, -976.0000,
        -624.0000, -386.0000, -139.0000,  304.0000, -308.0000, -211.0000,
        -167.0000, -426.0000,  195.0000, -131.0000,    6.8125,  -95.5000,
         268.0000, -376.0000, -808.0000, 1280.0000,    7.6562,  528.0000,
         540.0000,  604.0000,  179.0000, -127.5000,   81.5000,  198.0000,
         768.0000, -364.0000, -176.0000,  828.0000,  532.0000, -308.0000,
        -588.0000, -342.0000, -139.0000, -336.0000,  402.0000,   41.2500],
       dtype=torch.bfloat16)

In [65]:
a.sum(dim=3).sum(dim=2).sum(dim=0)

tensor([-732.0000,  720.0000,  119.0000,  -76.0000,  864.0000,  832.0000,
         145.0000, -260.0000,  204.0000, -648.0000,  -37.0000, -976.0000,
        -620.0000, -384.0000, -138.0000,  306.0000, -308.0000, -211.0000,
        -166.0000, -426.0000,  193.0000, -128.0000,    6.6250,  -96.5000,
         268.0000, -376.0000, -812.0000, 1280.0000,    5.9375,  528.0000,
         536.0000,  604.0000,  178.0000, -128.0000,   83.0000,  199.0000,
         768.0000, -362.0000, -177.0000,  828.0000,  536.0000, -308.0000,
        -584.0000, -342.0000, -138.0000, -338.0000,  404.0000,   40.2500],
       dtype=torch.bfloat16)

In [None]:
# This TensorChannelAdder (N, C, H, W) -> (1, C, 1, 1)
# It is similar result tensor.sum(axis=0).sum(axis=2).sum(axis=3)
# We change tensor.transpose(0, 1).reshape(C, chunk, N*H*W//chunk).sum(axis=2).sum(axis=1)
def BatchNormStdSim(tensor, mean_tensor, chunk=1024, AdderType="FP16"):
    epsilon = 1e-10
    if not len(tensor.shape) ==4 :
        AssertionError(f"It only supported 4d Matrix, but this tensor shape {tensor.shape}")

    if AdderType=="FP16":
        mantissa = 10
    elif AdderType=="BF16":
        mantissa = 7
    elif AdderType=="FP32":
        mantissa = 22
    elif AdderType=="test":
        mantissa= 100
    else:
        AssertionError("This Adder only supported FP16|BF16|FP32")
    
    temp_tensor =tensor.clone()
    zero_mask_counter = []
    
    n, c, h, w = tensor.shape

    
    if not n*h*w % chunk == 0: 
        AssertionError(f"The n*h*w should always be divisible chunk but result {n*h*w % chunk}")
    #change (c, n*h*w//chunk, chunk) 

    if mean_tensor.dim() == 1:
        #change 4d tensor
        mean_tensor = mean_tensor.reshape(1, -1, 1, 1)
    elif mean_tensor.dim() !=4:
        AssertionError("mean_tensor input only 1d or 4d tensor")
    
    if not mean_tensor.shape[1] == c:
        AssertionError("mean_tensor and tensor is required same shape")
    # first computing (X-mean)**2

    mean_tensor = torch.zeros_like(temp_tensor) + mean_tensor # broadcasting and same shape result tensor
    log_temp_tensor = torch.log2(torch.abs(temp_tensor) + epsilon)
    log_temp_mean = torch.log2(torch.abs(mean_tensor) + epsilon)
    zero_mask = torch.abs(log_temp_tensor - log_temp_mean) > mantissa
    output = temp_tensor - mean_tensor # X - mean(X)
    max_log_tensor = temp_tensor.clone()
    max_log_tensor[log_temp_tensor<log_temp_mean] = mean_tensor[log_temp_tensor<log_temp_mean] # get log2 max_value
    output[zero_mask]=max_log_tensor[zero_mask]
    var = output**2 # (X - mean(X))^2

    chunk_tensor = var.transpose(1, 0).reshape(c, chunk, n*h*w//chunk)

    
    # second chunk based Adder (last dim size is equal to n*h*w divided by chunk, so last dim adder is always chunk adder)
    # (C, chunk, 0) + (C, chunk, 1) = C*chunk adder
    # accumulated that result of adder is final values (chunk_tensor[:, :, -1])
    for i in range(chunk_tensor.shape[-1] -1):
        prev = chunk_tensor[:, :, i]
        prec = chunk_tensor[:, :, i+1]
        log_prev = torch.log2(torch.abs(prev)+epsilon)
        log_prec = torch.log2(torch.abs(prec)+epsilon)
        zero_mask = torch.abs(log_prec-log_prev) > mantissa
        max_log_tensor = prec.clone()
        max_log_tensor[log_prec<log_prev] = prev[log_prec<log_prev] # 두 벡터 중 log2 의 value가 큰 값을 가지고 있는 vector 생성
        output = prec+prev # 두 벡터를 더함
        output[zero_mask] = max_log_tensor[zero_mask] # zero_mask에 해당하는 부분은 log2 value가 큰 값만 저장
        chunk_tensor[:,:, i+1] = output
        zero_mask_counter.append(zero_mask.sum())
    
    sum_tensor = chunk_tensor[:, :, -1] # C, chunk_size
    print(f"chunk based sum result : {sum(zero_mask_counter)}/{c * chunk * (chunk_tensor.shape[-1]-1)} = {sum(zero_mask_counter) / (c * chunk * (chunk_tensor.shape[-1]-1)) * 100}%")
    
    for j in range(chunk_tensor.shape[1]-1):
        prev = sum_tensor[:, j]
        prec = sum_tensor[:, j+1]        
        log_prev = torch.log2(torch.abs(prev)+epsilon)
        log_prec = torch.log2(torch.abs(prec)+epsilon)
        zero_mask = torch.abs(log_prec-log_prev) > mantissa
        max_log_tensor = prec.clone()
        max_log_tensor[log_prec<log_prev] = prev[log_prec<log_prev] # 두 벡터 중 log2 의 value가 큰 값을 가지고 있는 vector 생성
        output = prec+prev # 두 벡터를 더함
        output[zero_mask] = max_log_tensor[zero_mask] # zero_mask에 해당하는 부분은 log2 value가 큰 값만 저장
        sum_tensor[:, j+1] = output
        zero_mask_counter.append(zero_mask.sum())
        
    # chunk_tensor[:, :, -1] is same to chunk_tensor.sum(dim=-2), and then finally  

    print(f"final sum result : {sum(zero_mask_counter)}/{c*(chunk-1)+c * chunk * (chunk_tensor.shape[-1]-1)} =\
        {sum(zero_mask_counter)/(c*(chunk-1)+ c * chunk * (chunk_tensor.shape[-1]-1)) * 100}%")
    
    return torch.sqrt(sum_tensor[:, -1]/(n*h*w)), sum(zero_mask_counter), (n*h*w-1)*c

In [3]:
c = torch.tensor([1,2,3])
c.dim()

1

In [10]:
a = torch.arange(60).reshape(2,10,3)
b = torch.arange(10).reshape(10) *3

print(a)
print(b)
print(a-b.repeat())

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8],
         [ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17],
         [18, 19, 20],
         [21, 22, 23],
         [24, 25, 26],
         [27, 28, 29]],

        [[30, 31, 32],
         [33, 34, 35],
         [36, 37, 38],
         [39, 40, 41],
         [42, 43, 44],
         [45, 46, 47],
         [48, 49, 50],
         [51, 52, 53],
         [54, 55, 56],
         [57, 58, 59]]])
tensor([ 0,  3,  6,  9, 12, 15, 18, 21, 24, 27])


RuntimeError: The size of tensor a (3) must match the size of tensor b (10) at non-singleton dimension 2

In [35]:
g=torch.zeros(a.size())
g+=b.unsqueeze(0).unsqueeze(2)
a-g

tensor([[[ 0.,  1.,  2.],
         [ 0.,  1.,  2.],
         [ 0.,  1.,  2.],
         [ 0.,  1.,  2.],
         [ 0.,  1.,  2.],
         [ 0.,  1.,  2.],
         [ 0.,  1.,  2.],
         [ 0.,  1.,  2.],
         [ 0.,  1.,  2.],
         [ 0.,  1.,  2.]],

        [[30., 31., 32.],
         [30., 31., 32.],
         [30., 31., 32.],
         [30., 31., 32.],
         [30., 31., 32.],
         [30., 31., 32.],
         [30., 31., 32.],
         [30., 31., 32.],
         [30., 31., 32.],
         [30., 31., 32.]]])

In [36]:
g

tensor([[[ 0.,  0.,  0.],
         [ 3.,  3.,  3.],
         [ 6.,  6.,  6.],
         [ 9.,  9.,  9.],
         [12., 12., 12.],
         [15., 15., 15.],
         [18., 18., 18.],
         [21., 21., 21.],
         [24., 24., 24.],
         [27., 27., 27.]],

        [[ 0.,  0.,  0.],
         [ 3.,  3.,  3.],
         [ 6.,  6.,  6.],
         [ 9.,  9.,  9.],
         [12., 12., 12.],
         [15., 15., 15.],
         [18., 18., 18.],
         [21., 21., 21.],
         [24., 24., 24.],
         [27., 27., 27.]]])

tensor([[[0],
         [1],
         [2],
         [3],
         [4],
         [5],
         [6],
         [7],
         [8],
         [9]]])
