In [377]:
import pandas as pd
from IPython.display import display, HTML
from glob import glob
import base64

def create_download_link( df, title = "Download CSV file", filename = "data.csv"):
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)



In [371]:
def compute_bytes(d):
    tot = 2.0* d['Floating Point Operations(Half Precision)'] + \
          4.0* d['Floating Point Operations(Single Precision)'] + \
          8.0* d['Floating Point Operations(Double Precision)'] + \
          4.0* d['Integer Instructions']
    tot = tot * d['Invocations']
    return tot

def req_nonlocal_bytes(d):
    tot = (d['Requested Global Load Throughput(bytes/sec)'] + d['Requested Global Store Throughput(bytes/sec)']+ d['Shared Memory Load Throughput(bytes/sec)'] + d['Shared Memory Store Throughput(bytes/sec)']) * (d['Avg. Duration(ns)'] * d['Invocations'])
    #tot = (d['Requested Global Load Throughput(bytes/sec)']) * (d['Avg. Duration(ns)'] * d['Invocations'])
    tot = tot *  1.0E-9 # nanoseconds to seconds
    return tot

def trans_nonlocal_bytes(d):
    tot = (d['Global Load Throughput(bytes/sec)'] + d['Global Store Throughput(bytes/sec)']+ d['Shared Memory Load Throughput(bytes/sec)'] + d['Shared Memory Store Throughput(bytes/sec)']) * d['Avg. Duration(ns)'] * d['Invocations']
    tot = tot* 1.0E-9 # nanoseconds to seconds
    return tot

def req_global_bytes(d):
    tot = (d['Requested Global Load Throughput(bytes/sec)'] + d['Requested Global Store Throughput(bytes/sec)']) * (d['Avg. Duration(ns)'] * d['Invocations'])
    #tot = (d['Requested Global Load Throughput(bytes/sec)']) * (d['Avg. Duration(ns)'] * d['Invocations'])
    tot = tot *  1.0E-9 # nanoseconds to seconds
    return tot

def trans_global_bytes(d):
    tot = (d['Global Load Throughput(bytes/sec)'] + d['Global Store Throughput(bytes/sec)']) * d['Avg. Duration(ns)'] * d['Invocations']
    tot = tot* 1.0E-9 # nanoseconds to seconds
    return tot

def l1_miss_bytes(d):
    tot = (d['Unified Cache Transactions'])* \
           (1.0- (d['Unified Cache Hit Rate(%)']/100.0)) * \
           128.0 *\
           d['Invocations']
    return tot

def l1_access_bytes(d):
    tot = (d['Unified Cache Transactions'])* \
           128.0 *\
           d['Invocations']
    return tot

def dram_bytes(d):
    return (d['Device Memory Read Transactions'] + d['Device Memory Write Transactions']) * 128.0 * d['Invocations']

def unified_hits(d):
    return d['Unified Cache Transactions'] * (d['Unified Cache Hit Rate(%)'] / 100.0) * d['Invocations']

def unified_misses(d):
    return d['Unified Cache Transactions'] * (1.0 - (d['Unified Cache Hit Rate(%)'] / 100.0)) * d['Invocations']
                                              
    

In [372]:
# Ingest Data

data = {}
metrics_filename = 'metrics.csv'
for i in glob('*/'+metrics_filename):
    import re

    # Load data to dataframe, convert to float
    d = pd.read_csv(i, index_col='Name', skipinitialspace=True)
    for col in d.columns:
        d[col] = d[col].astype(float)
    
    # remove the 'memset' pseudo kernel
    if 'memset (0)' in d.index :
        d.drop(['memset (0)'], inplace=True)
    
    # get the name of the dataset from the path
    dset_name = re.sub('/{}$'.format(metrics_filename), '', i)
    data[dset_name] = d 
    



In [373]:
# Print datasets and columns
dsets = pd.DataFrame(sorted(data.keys()))

cols = None
for n, d in data.items():
    c = set(d.columns)
    
    if cols is None:
        cols = set() | c
    else:
        cols = cols | c
    if len(c ^ cols) > 0:
        print("Dataset {} is missing cols {} and has extra cols {}".format(n, cols-c, c-cols))
        print()
columns = pd.DataFrame([sorted(list(cols))])
pd.set_option('display.max_colwidth', -1)
display(HTML(dsets.to_html()))
print((columns.transpose()))


Dataset deeplab_coco_2n_2b_forward is missing cols {'L2 Cache Hit Rate(%)', 'Total number of local load requests from Multiprocessor', 'Total number of local store requests from Multiprocessor', 'Total number of global load requests from Multiprocessor'} and has extra cols set()

Dataset deeplab_coco_2n_2b_backward is missing cols {'L2 Cache Hit Rate(%)', 'Total number of local load requests from Multiprocessor', 'Total number of local store requests from Multiprocessor', 'Total number of global load requests from Multiprocessor'} and has extra cols set()



Unnamed: 0,0
0,deeplab_coco_2n_2b_backward
1,deeplab_coco_2n_2b_forward
2,gat_sparse_backward_10n_1b
3,gat_sparse_forward_10n_1b
4,gcn_backward_10n_1b
5,gcn_forward_10n_1b
6,graphsage_backward_10n_1b
7,graphsage_forward_10n_1b
8,mpnnv1_backward_10n_1b
9,mpnnv1_forward_10n_1b


                                                           0
0   Avg. Duration(ns)                                       
1   Avg. Dynamic Shared Memory                              
2   Device Memory Read Transactions                         
3   Device Memory Write Transactions                        
4   FP Instructions(Double)                                 
5   FP Instructions(Single)                                 
6   Floating Point Operations(Double Precision)             
7   Floating Point Operations(Half Precision)               
8   Floating Point Operations(Single Precision)             
9   Global Hit Rate in unified l1/tex(%)                    
10  Global Load Throughput(bytes/sec)                       
11  Global Memory Load Efficiency(%)                        
12  Global Memory Store Efficiency(%)                       
13  Global Store Throughput(bytes/sec)                      
14  HP Instructions(Half)                                   
15  Integer Instructions

In [379]:
def gen_stats(d, n, agg=True):
    req_bytes = req_nonlocal_bytes(d)
    trans_bytes = trans_nonlocal_bytes(d)
    req_gl_bytes = req_global_bytes(d)
    trans_gl_bytes = trans_global_bytes(d)
    l1m_bytes = l1_miss_bytes(d)
    l1_bytes = l1_access_bytes(d)
    dram_bytes_m = dram_bytes(d)
    flops = compute_bytes(d)
    u_hits = unified_hits(d)
    u_misses = unified_misses(d)
    
    if agg:
        req_bytes = req_bytes.sum()
        trans_bytes = trans_bytes.sum()
        req_gl_bytes = req_gl_bytes.sum()
        trans_gl_bytes = trans_gl_bytes.sum()
        l1m_bytes = l1m_bytes.sum()
        l1_bytes = l1_bytes.sum()
        dram_bytes_m = dram_bytes_m.sum()
        flops = flops.sum()
        u_hits = u_hits.sum()
        u_misses = u_misses.sum()

    stats = {}
    stats['c_to_m'] = (flops)/(l1_bytes)
    stats['c_to_l1m'] = (flops)/(l1m_bytes)
    stats['c_to_dram'] = (flops)/(dram_bytes_m)
    stats['nonlocal_load_efficiency'] = (req_bytes)/(trans_bytes)
    stats['global_load_efficiency'] = (req_gl_bytes)/(trans_gl_bytes)
    stats['unified_hit_rate'] = (u_hits)/(u_hits + u_misses)
    
    s = pd.Series(stats)
    s.name = n
    return s

res = pd.concat([gen_stats(d,n) for n,d in sorted(data.items())], axis=1).transpose()

#print(res.to_csv())
display(create_download_link(res))
display(HTML(res.to_html()))



Unnamed: 0,c_to_m,c_to_l1m,c_to_dram,nonlocal_load_efficiency,global_load_efficiency,unified_hit_rate
deeplab_coco_2n_2b_backward,13.830716,32.466585,32.749419,0.826497,0.43215,0.574002
deeplab_coco_2n_2b_forward,4.884669,10.780311,38.828292,0.725249,0.591893,0.54689
gat_sparse_backward_10n_1b,8.692258,15.216129,5.419515,0.909049,0.600083,0.428747
gat_sparse_forward_10n_1b,7.991358,11.208288,3.509798,0.71744,0.511085,0.287013
gcn_backward_10n_1b,4.114401,6.662628,4.204908,0.953458,0.745915,0.382466
gcn_forward_10n_1b,4.230199,7.500044,4.236785,0.945587,0.734459,0.435977
graphsage_backward_10n_1b,13.129392,30.573864,30.437148,0.780647,0.39504,0.570568
graphsage_forward_10n_1b,23.067216,70.316996,45.949826,0.734367,0.276127,0.671954
mpnnv1_backward_10n_1b,11.044919,27.911602,20.642375,0.76664,0.354949,0.604289
mpnnv1_forward_10n_1b,26.490655,48.24634,29.951004,0.970529,0.827548,0.450929


In [380]:
res = pd.concat([gen_stats(d,n) for n,d in sorted(data.items()) if 'deeplab' in n or 'mpnnv1'], axis=1).transpose()

display(create_download_link(res))
display(HTML(res.to_html()))


Unnamed: 0,c_to_m,c_to_l1m,c_to_dram,nonlocal_load_efficiency,global_load_efficiency,unified_hit_rate
deeplab_coco_2n_2b_backward,13.830716,32.466585,32.749419,0.826497,0.43215,0.574002
deeplab_coco_2n_2b_forward,4.884669,10.780311,38.828292,0.725249,0.591893,0.54689
gat_sparse_backward_10n_1b,8.692258,15.216129,5.419515,0.909049,0.600083,0.428747
gat_sparse_forward_10n_1b,7.991358,11.208288,3.509798,0.71744,0.511085,0.287013
gcn_backward_10n_1b,4.114401,6.662628,4.204908,0.953458,0.745915,0.382466
gcn_forward_10n_1b,4.230199,7.500044,4.236785,0.945587,0.734459,0.435977
graphsage_backward_10n_1b,13.129392,30.573864,30.437148,0.780647,0.39504,0.570568
graphsage_forward_10n_1b,23.067216,70.316996,45.949826,0.734367,0.276127,0.671954
mpnnv1_backward_10n_1b,11.044919,27.911602,20.642375,0.76664,0.354949,0.604289
mpnnv1_forward_10n_1b,26.490655,48.24634,29.951004,0.970529,0.827548,0.450929


In [260]:
from IPython.display import display


from IPython.display import display, HTML
for n, d in data.items():
    s = gen_stats(d, n, agg=False)
    #print(n)
    #display(HTML(s.to_html()))
    
csv = ''
for n, d in data.items():
    s = gen_stats(d, n, agg=False)
    csv += "{}\n".format(n)
    csv += s.to_csv()
    csv += "\n"

print(csv)
display(create_download_link(res))


mpnnv1_forward_10n_1b
Name,c_to_m,nonlocal_load_efficiency,global_load_efficiency,l1_hit_rate
"void kernelPointwiseApply3<TensorMulOp<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<TensorMulOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",0.6439077814829091,0.998828394210889,0.7952575251131117,0.49418999999999996
"void kernelPointwiseApply2<TensorSigmoidOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorSigmoidOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",24.265113699389904,0.986863711001642,0.7952575251131117,0.49333000000000005
"void kernelPointwiseApply2<CopyOp<float, unsigned char>, float, unsigned char, unsigned int, int=1, int=1>(OffsetInfo<unsigned char, float, unsigned char>, OffsetInfo<CopyOp<float, unsigned char>, float, unsigned int>, float, float)",0.0,0.9970729111229377,0.7952575251131117,0.57706
"void kernelPointwiseApply2<Tens

In [261]:

from IPython.display import display, HTML
for n, d in data.items():
    s = gen_stats(d, n, agg=False)
    print(n)
    display(HTML(d.to_html()))



mpnnv1_forward_10n_1b


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),L2 Cache Hit Rate(%),HP Instructions(Half),Unified Cache Transactions,Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Total number of global load requests from Multiprocessor,Total number of local load requests from Multiprocessor,Total number of local store requests from Multiprocessor,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
"void kernelPointwiseApply3<TensorMulOp<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<TensorMulOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",23.0,1132.0,11.0,0.0,0.0,98.852,0.0,0.0,661.0,0.0,2641.0,661.0,0.0,0.0,0.0,0.0,0.0,98.852,0.0,661.0,0.0,14493000000.0,7246000000.0,14510000000.0,330.0,7255000000.0,49.419,0.0,2641.0,0.0,69506.0,0.0,0.0
"void kernelPointwiseApply2<TensorSigmoidOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorSigmoidOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",10.0,1152.0,14.0,0.0,0.0,98.684,0.0,0.0,38.0,0.0,4200.0,38.0,0.0,0.0,0.0,0.0,0.0,98.684,0.0,38.0,0.0,601000000.0,601000000.0,609000000.0,38.0,609000000.0,49.333,0.0,5100.0,0.0,7748.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<float, unsigned char>, float, unsigned char, unsigned int, int=1, int=1>(OffsetInfo<unsigned char, float, unsigned char>, OffsetInfo<CopyOp<float, unsigned char>, float, unsigned int>, float, float)",13.0,1255.0,10.0,0.0,0.0,98.98,0.0,0.0,555.0,0.0,0.0,555.0,0.0,0.0,0.0,0.0,0.0,95.165,0.0,139.0,0.0,3747000000.0,14991000000.0,3758000000.0,555.0,15001000000.0,57.706,0.0,0.0,0.0,85224.0,0.0,0.0
"void kernelPointwiseApply2<TensorGTValueOp<float, unsigned char>, unsigned char, float, unsigned int, int=1, int=2>(OffsetInfo<unsigned char, unsigned char, float>, OffsetInfo<TensorGTValueOp<float, unsigned char>, unsigned char, unsigned int>, unsigned char, float)",13.0,1272.0,8.0,0.0,0.0,95.165,69.613,0.0,555.0,0.0,0.0,555.0,0.0,0.0,0.0,0.0,0.0,33.749,0.0,145.0,0.0,655000000.0,3556000000.0,3712000000.0,139.0,3566000000.0,73.902,0.0,4442.0,0.0,142974.0,0.0,0.0
"void kernelReduceContigDim<float, unsigned int, float, thrust::identity<float>, ReduceAdd<float>, thrust::identity<float>, int=1, int=1>(TensorInfo<float, unsigned int>, TensorInfo<float, unsigned int>, unsigned int, unsigned int, float, float, thrust::identity<float>, float)",13.0,1915.0,15.0,0.0,324.0,12.5,33.523,0.0,307.0,0.0,5769.0,307.0,0.0,0.0,72649000000.0,0.0,6713000000.0,74.769,0.0,370.0,0.0,3223000000.0,138000000.0,4525000000.0,76.0,1111000000.0,43.025,27.039,7615.0,0.0,122538.0,134.0,1269.0
"void CatArrayBatchedCopy<float, unsigned int, int=2>(float*, CatArrInputTensor<float, unsigned int>*, OutputTensorSizeStride<unsigned int, unsigned int=4>, int, unsigned int)",10.0,2038.0,16.0,0.0,0.0,64.243,73.682,0.0,7154.0,0.0,0.0,6238.0,0.0,0.0,0.0,0.0,0.0,33.377,0.0,1997.0,0.0,6788000000.0,4648000000.0,20321000000.0,710.0,7235000000.0,66.109,0.0,3650.0,0.0,716414.0,0.0,0.0
"void kernelReduceNoncontigDim_shared<float, unsigned int, float, thrust::identity<float>, ReduceAdd<float>, thrust::identity<float>, int=1, int=1>(TensorInfo<float, unsigned int>, TensorInfo<float, unsigned int>, unsigned int, unsigned int, unsigned int, float, float, thrust::identity<float>, float, float volatile *, int*)",10.0,2057.0,23.0,2064.0,0.0,75.0,23.077,0.0,50.0,0.0,780.0,50.0,0.0,0.0,1353000000.0,0.0,1353000000.0,75.0,0.0,50.0,0.0,409000000.0,15000000.0,545000000.0,2.0,21000000.0,33.333,98.992,780.0,0.0,28968.0,31.0,31.0
"void CatArrayBatchedCopy<float, unsigned int, int=3>(float*, CatArrInputTensor<float, unsigned int>*, OutputTensorSizeStride<unsigned int, unsigned int=4>, int, unsigned int)",1.0,3104.0,20.0,0.0,0.0,64.424,56.546,0.0,16279.0,0.0,0.0,11713.0,0.0,0.0,0.0,0.0,0.0,58.594,0.0,4640.0,0.0,22307000000.0,18717000000.0,38071000000.0,3541.0,29054000000.0,58.171,0.0,36500.0,0.0,1988828.0,0.0,0.0
"void kernelReduceNoncontigDim<float, unsigned int, float, thrust::identity<float>, ReduceAdd<float>, thrust::identity<float>, int=1, int=2>(TensorInfo<float, unsigned int>, TensorInfo<float, unsigned int>, unsigned int, unsigned int, unsigned int, float, float, thrust::identity<float>, float)",3.0,3658.0,32.0,0.0,0.0,99.967,17.042,0.0,57050.0,0.0,456250.0,57050.0,0.0,0.0,0.0,0.0,0.0,82.391,0.0,69220.0,0.0,126579000000.0,5062000000.0,153631000000.0,2282.0,5064000000.0,48.703,0.0,474500.0,0.0,2957974.0,0.0,0.0
"void kernelPointwiseApply2<TensorMaxValueOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorMaxValueOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",69.0,3869.0,8.0,0.0,0.0,100.0,0.0,0.0,17855.0,0.0,0.0,17855.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,17855.0,0.0,81501000000.0,81501000000.0,81501000000.0,17855.0,81501000000.0,50.0,0.0,142840.0,0.0,2858087.0,0.0,0.0


gcn_inference


Unnamed: 0_level_0,Total Duration (ms),Kernel Time/GPU Time,FOPs(HP)/Bytes Read+Write,FOPs(SP)/Bytes Read+Write,FOPs(DP)/Bytes Read+Write,Global Store Req Throughput/Global Store Actual Throughput,Global Load Req Throughput/Global Load Actual Throughput,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),Unified Cache Transactions,HP Instructions(Half),Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Global Hit Rate in unified l1/tex(%),Local Hit Rate(%),L2 Transactions (Texture Reads),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Load Throughput(bytes/sec),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions,Floating Point Operations(Half Precision),Shared Memory Load Throughput(bytes/sec),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
"void kernelTransformReduceInnermostDimIndex<long, long, MaxValuePair<long, long>>(long*, long*, long*, unsigned int, unsigned int, thrust::pair<long, long>, long)",7886.49588,0.45018,0.0,0.0,0.0,,0.888785,20.0,394226570.0,16.0,8704.0,0.0,50.0,11723404.0,0.0,0.0,0.0,25.0,0.0,13188828.0,951000000.0,0.0,2.0,1070000000.0,0.0,25.0,68.636,0.0,0.0,468946000.0,80.0,30.0,0.0,0.0,0.0,88.889
"void kernelTransformReduceInnermostDimIndex<float, long, MaxValuePair<float, long>>(float*, long*, float*, unsigned int, unsigned int, thrust::pair<float, long>, float)",7884.5314,0.450068,0.0,0.0,0.0,0.374981,0.66129,10.0,143001.0,18.0,6528.0,0.0,37.5,1048344.0,0.0,0.0,0.0,39.806,0.0,1354110.0,200000000000.0,14661000000.0,174724.0,303000000000.0,39098000000.0,41.177,29.063,19569090.0,0.0,91212020.0,1397848.0,2096688.0,0.0,1880000000000.0,1250000000000.0,66.129
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__merge_sort::BlockSortAgent<thrust::device_ptr<long>, thrust::device_ptr<long>, long, ThrustLTOp<long>, thrust::detail::integral_constant<bool, bool=1>, thrust::detail::integral_constant<bool, bool=0>>, bool, thrust::device_ptr<long>, thrust::device_ptr<long>, long, long*, long*, ThrustLTOp<long>>(thrust::device_ptr<long>, thrust::device_ptr<long>, long, long, ThrustLTOp<long>, bool, bool=1)",504.64624,0.028806,0.0,0.0,0.0,1.0,0.994184,20.0,6193701.0,88.0,0.0,16912.0,100.0,12456204.0,0.0,0.0,0.0,2.273,0.0,11815002.0,60687000000.0,60568000000.0,11723402.0,61042000000.0,60568000000.0,1.154,25.105,0.0,0.0,4155477000.0,102605940.0,196581671.0,0.0,4060000000000.0,2120000000000.0,99.419
sgemm_32x32x32_NN,388.55268,0.02218,0.0,0.004116,0.0,0.802817,0.214527,20.0,1004137.0,153.0,16912.0,0.0,87.273,14051290.0,0.0,0.0,5204575000.0,82.336,0.0,12979354.0,443000000000.0,26448000000.0,1033782.0,2070000000000.0,32944000000.0,76.37,144.834,2688262000.0,0.0,334576500.0,7863480.0,13979520.0,0.0,1780000000000.0,1000000000000.0,21.917
"void convert_CooToCsr_kernel<int=0>(int const *, int, int, int*)",189.10506,0.010795,0.0,0.0,0.0,0.139147,0.820514,20.0,288362.0,12.0,0.0,512.0,13.917,2953931.0,0.0,0.0,0.0,48.684,0.0,3600101.0,328000000000.0,3231000000.0,209253.0,400000000000.0,23220000000.0,47.402,99.479,61440.0,0.0,429127300.0,738482.0,1476964.0,0.0,656000000000.0,328000000000.0,82.051
"void kernelTransformReduceInnermostDimIndex<long, long, MinValuePair<long, long>>(long*, long*, long*, unsigned int, unsigned int, thrust::pair<long, long>, long)",123.87402,0.007071,0.0,0.0,0.0,,0.888681,20.0,394324794.0,16.0,8704.0,0.0,50.0,11723404.0,0.0,0.0,0.0,25.0,0.0,13188828.0,950000000.0,0.0,2.0,1069000000.0,0.0,25.0,68.636,0.0,0.0,468946000.0,80.0,30.0,0.0,0.0,0.0,88.889
"void csrMmt_hyb_core<float, int=4, int=4>(float, float, float const *, float const *, float const *, int const *, int const *, float const *, float*, int, int, int, int, int, int, bool)",109.50896,0.006251,0.0,0.008455,0.0,0.551991,0.517272,10.0,7735415.0,32.0,3464.0,0.0,55.196,53124017.0,0.0,0.0,769168700.0,49.007,0.0,55065434.0,218000000000.0,1927000000.0,844138.0,422000000000.0,3491000000.0,48.75,72.463,394019900.0,0.0,3334726000.0,2808860.0,34084358.0,0.0,564000000000.0,46478000000.0,51.727
"void csrMmt_hyb_core<float, int=3, int=5>(float, float, float const *, float const *, float const *, int const *, int const *, float const *, float*, int, int, int, int, int, int, bool)",77.35415,0.004416,0.0,0.024991,0.0,0.835539,0.290259,10.0,18910506.0,32.0,3496.0,0.0,83.719,77750922.0,0.0,0.0,768237800.0,38.754,0.0,87408038.0,64574000000.0,442000000.0,313053.0,222000000000.0,529000000.0,38.797,58.628,393089000.0,0.0,4274879000.0,6884572.0,42220677.0,0.0,286000000000.0,46599000000.0,29.026
_ZN2at6native18elementwise_kernelILi512ELi1EZNS0_16gpu_unary_kernelIZNS0_17gpu_binary_kernelIZNS0_15div_kernel_implIlEEvRNS_14TensorIteratorEEUlllE_EEvS6_RKT_EUllE0_EEvS6_SA_EUliE_EEviT1_,51.8122,0.002958,0.0,0.0,0.0,1.0,1.0,40.0,881208.0,23.0,0.0,0.0,100.0,5861702.0,0.0,0.0,0.0,0.0,0.0,5861701.0,213000000000.0,213000000000.0,5861701.0,213000000000.0,213000000000.0,0.0,0.0,23446800.0,0.0,2527760000.0,0.0,0.0,0.0,0.0,0.0,100.0
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)",35.40876,0.002021,0.0,0.0,0.0,0.499977,,20.0,2655.0,8.0,0.0,0.0,49.998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110000000000.0,18327.0,0.0,221000000000.0,0.0,0.0,0.0,0.0,806592.0,0.0,0.0,0.0,0.0,0.0,0.0


gat_sparse_backward_10n_1b


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),L2 Cache Hit Rate(%),HP Instructions(Half),Unified Cache Transactions,Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Total number of global load requests from Multiprocessor,Total number of local load requests from Multiprocessor,Total number of local store requests from Multiprocessor,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
"void kernelPointwiseApply1<TensorFillOp<long>, long, unsigned int, int=1>(OffsetInfo<TensorFillOp<long>, long, unsigned int>, long, long)",450.0,907.0,10.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10412000000.0,0.0,664.0,10420000000.0,0.0,0.0,0.0,0.0,38824.0,0.0,0.0
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::use_default, thrust::use_default)",180.0,989.0,10.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44863000000.0,0.0,3316.0,44863000000.0,0.0,0.0,0.0,0.0,113520.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<int, long>, int, long, unsigned int, int=1, int=1>(OffsetInfo<long, int, long>, OffsetInfo<CopyOp<int, long>, int, unsigned int>, int, int)",180.0,1055.0,8.0,0.0,0.0,100.0,0.0,0.0,1658.0,0.0,0.0,1658.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,3316.0,0.0,17457000000.0,17457000000.0,34914000000.0,1658.0,17457000000.0,25.0,0.0,0.0,0.0,265472.0,0.0,0.0
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)",90.0,1181.0,8.0,0.0,0.0,46.875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,132000000.0,0.0,20.0,283000000.0,0.0,0.0,0.0,0.0,1160.0,0.0,0.0
"void kernelPointwiseApply2<Tensor_neg_Float_Op, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, unsigned int, int=1>, OffsetInfo<float, unsigned int, int=1>, unsigned int, Tensor_neg_Float_Op)",180.0,1184.0,8.0,0.0,0.0,99.999,0.0,0.0,2164.0,0.0,17313.0,2164.0,0.0,0.0,0.0,0.0,0.0,99.999,0.0,2164.0,0.0,23880000000.0,23880000000.0,23881000000.0,2164.0,23881000000.0,49.999,0.0,17313.0,0.0,347104.0,0.0,0.0
"void kernelPointwiseApply2<TensorCAddOp<long>, long, long, unsigned int, int=1, int=1>(OffsetInfo<TensorCAddOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, long, long)",270.0,1241.0,20.0,0.0,0.0,100.0,33.333,0.0,6632.0,0.0,0.0,3316.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,6632.0,0.0,57664000000.0,28831000000.0,57664000000.0,3316.0,28831000000.0,0.0,0.0,0.0,0.0,490960.0,0.0,0.0
"void kernelPointwiseApply1<TensorDivConstantOp<long>, long, unsigned int, int=1>(OffsetInfo<TensorDivConstantOp<long>, long, unsigned int>, long, long)",180.0,1314.0,24.0,0.0,0.0,100.0,50.0,0.0,3316.0,0.0,0.0,1658.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,3316.0,0.0,32950000000.0,32950000000.0,32950000000.0,3316.0,32950000000.0,0.0,0.0,13264.0,0.0,517493.0,0.0,0.0
"void kernelPointwiseApply2<TensorAddOp<long>, long, long, unsigned int, int=1, int=1>(OffsetInfo<TensorAddOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, long, long)",90.0,1348.0,11.0,0.0,0.0,100.0,33.333,0.0,6632.0,0.0,0.0,3316.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,6632.0,0.0,58221000000.0,29110000000.0,58221000000.0,3316.0,29110000000.0,0.0,0.0,0.0,0.0,292000.0,0.0,0.0
"void kernelPointwiseApply2<TensorAddOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorAddOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",180.0,1407.0,10.0,0.0,0.0,99.306,33.333,0.0,1282.0,0.0,5127.0,1282.0,0.0,0.0,0.0,0.0,0.0,99.306,0.0,1282.0,0.0,15490000000.0,7745000000.0,15491000000.0,641.0,7745000000.0,50.0,0.0,5127.0,0.0,104119.0,0.0,0.0
"void kernelPointwiseApply3<TensorAddOp<long>, long, long, long, unsigned int, int=1, int=1, int=1>(OffsetInfo<TensorAddOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, OffsetInfo<long, long, int=1>, long, long)",180.0,1469.0,10.0,0.0,0.0,100.0,0.0,0.0,6632.0,0.0,0.0,3316.0,0.0,0.0,0.0,0.0,0.0,66.667,0.0,9948.0,0.0,49982000000.0,24991000000.0,74974000000.0,3316.0,24991000000.0,0.0,0.0,0.0,0.0,371584.0,0.0,0.0


graphsage_training


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),Unified Cache Transactions,HP Instructions(Half),Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Load Throughput(bytes/sec),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
"void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=-2>(TensorInfo<TensorFillOp<float>, float>, float, float)",10.0,1888.0,10.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000000.0,1.0,0.0,16000000.0,0.0,0.0,0.0,0.0,2054.0,0.0,0.0,0.0,0.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=-2, int=2>(TensorInfo<float, float>, TensorInfo<CopyOp<float, float>, float>, float, float)",10.0,2273.0,10.0,0.0,0.0,100.0,384.0,0.0,0.0,0.0,68.359,486.0,0.0,5404000000.0,5404000000.0,384.0,21620000000.0,5404000000.0,62.188,0.0,3072.0,0.0,127998.0,0.0,0.0,0.0,0.0,0.0,25.0
"void kernelPointwiseApply2<TensorAddOp<float>, float, float, unsigned int, int=-2, int=-2>(TensorInfo<TensorAddOp<float>, float>, TensorInfo<float, float>, float, float)",30.0,2788.0,10.0,0.0,0.0,100.0,6730.0,0.0,0.0,26922.0,50.0,6730.0,0.0,77234000000.0,38616000000.0,3365.0,77234000000.0,38616000000.0,50.0,0.0,26922.0,0.0,377088.0,0.0,0.0,0.0,0.0,0.0,100.0
"void kernelPointwiseApply2<TensorCAddOp<float>, float, float, unsigned int, int=-2, int=-2>(TensorInfo<TensorCAddOp<float>, float>, TensorInfo<float, float>, float, float)",30.0,2813.0,10.0,0.0,0.0,100.0,6730.0,0.0,0.0,53845.0,50.0,6730.0,0.0,76539000000.0,38269000000.0,3365.0,76539000000.0,38269000000.0,50.0,0.0,26922.0,0.0,377088.0,0.0,0.0,0.0,0.0,0.0,100.0
"void cunn_SoftMaxBackward<int=2, float, float, LogSoftMaxBackwardEpilogue>(float*, float*, float*, int)",10.0,4797.0,23.0,0.0,128.0,30.0,3072.0,0.0,0.0,67584.0,0.0,3840.0,0.0,7684000000.0,2561000000.0,1280.0,25615000000.0,8538000000.0,0.0,14.732,145408.0,0.0,2070528.0,3072.0,18432.0,491827000000.0,0.0,81970000000.0,30.0
"void kernelPointwiseApply2<ThresholdUpdateOutput<float>, float, float, unsigned int, int=-2, int=-2>(TensorInfo<ThresholdUpdateOutput<float>, float>, TensorInfo<float, float>, float, float)",20.0,5639.0,8.0,0.0,0.0,100.0,29205.0,0.0,0.0,0.0,50.0,29205.0,0.0,165716000000.0,165716000000.0,29205.0,165716000000.0,165716000000.0,50.0,0.0,233644.0,0.0,3271462.0,0.0,0.0,0.0,0.0,0.0,100.0
sgemm_32x32x32_TT,10.0,5807.0,153.0,16912.0,0.0,100.0,8192.0,0.0,0.0,10665980.0,79.043,2996.0,0.0,16926000000.0,90274000000.0,16384.0,78989000000.0,90274000000.0,58.826,130.189,6340608.0,0.0,2375680.0,43008.0,65536.0,1444393000000.0,0.0,947883000000.0,21.429
"void gemmSN_NN_kernel<float, float, float, int=128, int=2, int=4, int=8, int=3, int=4>(cublasGemmSmallNParams<float, float, float>, float const *, float const *, float, float, int)",10.0,6463.0,51.0,7680.0,0.0,100.0,17152.0,0.0,0.0,801792.0,50.0,17152.0,0.0,84911000000.0,1900000000.0,384.0,169824000000.0,1900000000.0,50.0,118.286,411648.0,0.0,625664.0,576.0,10624.0,210378000000.0,0.0,11405000000.0,50.0
"void cunn_SoftMaxForward<int=2, float, float, LogSoftMaxForwardEpilogue>(float*, float*, int)",10.0,6994.0,19.0,0.0,128.0,30.0,3072.0,0.0,0.0,886784.0,0.0,3840.0,0.0,5269000000.0,1756000000.0,1280.0,17567000000.0,5855000000.0,0.0,14.732,676864.0,0.0,1702912.0,6144.0,36864.0,674614000000.0,0.0,112435000000.0,30.0
"void kernelPointwiseApply3<ThresholdUpdateGradInput<float>, float, float, float, unsigned int, int=-2, int=-2, int=-2>(TensorInfo<ThresholdUpdateGradInput<float>, float>, TensorInfo<float, float>, TensorInfo<float, float>, float, float)",20.0,8147.0,12.0,0.0,0.0,100.0,56943.0,0.0,0.0,0.0,47.729,56946.0,0.0,173714000000.0,114704000000.0,29205.0,223657000000.0,114704000000.0,48.532,0.0,233644.0,0.0,3738752.0,0.0,0.0,0.0,0.0,0.0,78.334


gat_dense_forward_10n_1b


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),L2 Cache Hit Rate(%),Unified Cache Transactions,HP Instructions(Half),Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Total number of global load requests from Multiprocessor,Total number of local load requests from Multiprocessor,Total number of local store requests from Multiprocessor,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%),Global Hit Rate in unified l1/tex(%),Local Hit Rate(%),L2 Transactions (Texture Reads),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
"void kernelPointwiseApply2<ELUupdateOutput_functor<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<ELUupdateOutput_functor<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",90.0,1856.0,8.0,0.0,0.0,99.998,0.0,2670.0,0.0,0.0,133939.0,2670.0,0.0,0.0,0.0,0.0,0.0,99.998,0.0,0.0,2670.0,27406000000.0,27406000000.0,27407000000.0,2670.0,27407000000.0,49.999,0.0,205335.0,0.0,428736.0,0.0,0.0
"void at::native::_GLOBAL__N__54_tmpxft_0000343e_00000000_10_SoftMax_compute_70_cpp1_ii_826a4626::cunn_SoftMaxForward<int=2, float, float, at::native::_GLOBAL__N__54_tmpxft_0000343e_00000000_10_SoftMax_compute_70_cpp1_ii_826a4626::LogSoftMaxForwardEpilogue>(float*, float, int)",10.0,14681.0,25.0,0.0,128.0,50.0,82.862,16248.0,0.0,0.0,2464280.0,16248.0,0.0,0.0,777250000000.0,0.0,129541000000.0,50.0,0.0,0.0,14217.0,14168000000.0,4722000000.0,28336000000.0,4739.0,9445000000.0,33.338,10.119,1974132.0,0.0,4497988.0,16248.0,97488.0
sgemm_32x32x32_NN,100.0,88681.0,153.0,16912.0,0.0,90.0,14.771,726338.0,0.0,0.0,250516352.0,726338.0,0.0,0.0,714147000000.0,0.0,366257000000.0,22.827,78.644,0.0,564061.0,206610000000.0,891000000.0,913754000000.0,3114.0,1045000000.0,80.006,68.231,125927296.0,0.0,9108736.0,266288.0,518976.0
"void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)",90.0,141988.0,8.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,207674000000.0,0.0,916658.0,207674000000.0,50.0,0.0,0.0,0.0,102666100.0,0.0,0.0
sgemm_32x32x32_NN_vec,80.0,170156.0,153.0,16912.0,0.0,100.0,14.399,1381080.0,0.0,0.0,475009920.0,345270.0,0.0,0.0,718500000000.0,0.0,363321000000.0,90.88,14.658,0.0,1076571.0,214548000000.0,506000000.0,236079000000.0,2708.0,506000000.0,21.922,67.042,238174080.0,0.0,16896640.0,485520.0,960160.0
"void kernelPointwiseApply2<TensorGTValueOp<float, unsigned char>, unsigned char, float, unsigned int, int=1, int=1>(OffsetInfo<unsigned char, unsigned char, float>, OffsetInfo<TensorGTValueOp<float, unsigned char>, unsigned char, unsigned int>, unsigned char, float)",90.0,186063.0,8.0,0.0,0.0,100.0,15.0,916658.0,0.0,0.0,0.0,916658.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,916658.0,159590000000.0,39897000000.0,159590000000.0,229165.0,39897000000.0,58.333,0.0,7333264.0,0.0,139332500.0,0.0,0.0
"void kernelPointwiseApply1<TensorDivConstantOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorDivConstantOp<float>, float, unsigned int>, float, float)",110.0,257840.0,8.0,0.0,0.0,100.0,50.0,796059.0,0.0,0.0,6368477.0,796059.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,796059.0,98911000000.0,98911000000.0,98911000000.0,796059.0,98911000000.0,50.0,0.0,6368477.0,0.0,89159290.0,0.0,0.0
"void kernelPointwiseApply2<TensorMulConstantOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorMulConstantOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",90.0,297778.0,8.0,0.0,0.0,100.0,0.0,916658.0,0.0,0.0,7333264.0,916658.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,916658.0,98839000000.0,98839000000.0,98839000000.0,916658.0,98839000000.0,50.0,0.0,7333264.0,0.0,146665700.0,0.0,0.0
"void kernelPointwiseApply2<LeakyReLUUpdateOutput<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<LeakyReLUUpdateOutput<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",90.0,301041.0,8.0,0.0,0.0,100.0,0.0,916658.0,0.0,0.0,4048178.0,916658.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,916658.0,98863000000.0,98863000000.0,98863000000.0,916658.0,98863000000.0,50.0,0.0,11381442.0,0.0,146665700.0,0.0,0.0
_ZN2at4cuda21kernelPointwiseApply4IZN84_GLOBAL__N__60_tmpxft_00003553_00000000_10_TensorCompare_compute_70_cpp1_ii_865d2fc810where_cudaIfEEvRNS_6TensorERKS4_S7_S7_EUlRfRKhRKfSC_E_fhffjLi1ELi1ELi1ELi1EEEvNS0_6detail10TensorInfoIT0_T4_EENSF_IT1_SH_EENSF_IT2_SH_EENSF_IT3_SH_EESH_T_,90.0,333500.0,9.0,0.0,0.0,100.0,0.0,1833316.0,0.0,0.0,0.0,1833316.0,0.0,0.0,0.0,0.0,0.0,98.925,0.0,0.0,1158272.0,109206000000.0,87365000000.0,110393000000.0,916658.0,87365000000.0,54.854,0.0,0.0,0.0,241998200.0,0.0,0.0


gat_sparse_forward_10n_1b


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),L2 Cache Hit Rate(%),HP Instructions(Half),Unified Cache Transactions,Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Total number of global load requests from Multiprocessor,Total number of local load requests from Multiprocessor,Total number of local store requests from Multiprocessor,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
"void kernelPointwiseApply1<TensorFillOp<long>, long, unsigned int, int=1>(OffsetInfo<TensorFillOp<long>, long, unsigned int>, long, long)",540.0,903.0,10.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16134000000.0,0.0,1106.0,16140000000.0,0.0,0.0,0.0,0.0,63334.0,0.0,0.0
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<thrust::counting_iterator<long, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::device_ptr<long>, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<long>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::use_default, thrust::use_default)",360.0,905.0,10.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42221000000.0,0.0,3316.0,42221000000.0,0.0,0.0,0.0,0.0,113520.0,0.0,0.0
"void kernelPointwiseApply1<TensorFillOp<unsigned char>, unsigned char, unsigned int, int=1>(OffsetInfo<TensorFillOp<unsigned char>, unsigned char, unsigned int>, unsigned char, unsigned char)",360.0,944.0,9.0,0.0,0.0,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,12000000.0,0.0,0.0,0.0,0.0,2059.0,0.0,0.0
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::device_ptr<long>, LinspaceOp<long, long>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::device_ptr<long>, LinspaceOp<long, long>, long>, long>(thrust::device_ptr<long>, long)",180.0,993.0,19.0,0.0,0.0,98.611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24000000.0,0.0,2.0,25000000.0,0.0,0.0,0.0,0.0,3718.0,0.0,0.0
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__unique_by_key::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)",180.0,1006.0,8.0,0.0,0.0,46.875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125000000.0,0.0,20.0,268000000.0,0.0,0.0,0.0,0.0,1160.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<int, long>, int, long, unsigned int, int=1, int=1>(OffsetInfo<long, int, long>, OffsetInfo<CopyOp<int, long>, int, unsigned int>, int, int)",360.0,1031.0,8.0,0.0,0.0,100.0,0.0,0.0,1658.0,0.0,0.0,1658.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,3316.0,0.0,16436000000.0,16436000000.0,32874000000.0,1658.0,16436000000.0,25.0,0.0,0.0,0.0,265472.0,0.0,0.0
"void kernelPointwiseApply2<LeakyReLUUpdateOutput<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<LeakyReLUUpdateOutput<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",90.0,1112.0,8.0,0.0,0.0,100.0,0.0,0.0,1658.0,0.0,6616.0,1658.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,1658.0,0.0,17969000000.0,17969000000.0,17969000000.0,1658.0,17969000000.0,50.0,0.0,19880.0,0.0,265472.0,0.0,0.0
"void kernelPointwiseApply2<TensorMulConstantOp<long>, long, long, unsigned int, int=1, int=1>(OffsetInfo<TensorMulConstantOp<long>, long, unsigned int>, OffsetInfo<long, long, int=1>, long, long)",180.0,1124.0,14.0,0.0,0.0,100.0,0.0,0.0,3316.0,0.0,0.0,1658.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,3316.0,0.0,32300000000.0,32300000000.0,32300000000.0,3316.0,32300000000.0,0.0,0.0,0.0,0.0,451168.0,0.0,0.0
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, unsigned long, int*>(bool=1, thrust::cuda_cub::cub::ScanTileState<int, bool=1>, int*)",90.0,1136.0,8.0,0.0,0.0,49.986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21718000000.0,0.0,3590.0,43448000000.0,0.0,0.0,0.0,0.0,157740.0,0.0,0.0
"void kernelPointwiseApply2<Tensor_neg_Float_Op, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, unsigned int, int=1>, OffsetInfo<float, unsigned int, int=1>, unsigned int, Tensor_neg_Float_Op)",90.0,1174.0,8.0,0.0,0.0,100.0,0.0,0.0,1658.0,0.0,13264.0,1658.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,1658.0,0.0,18080000000.0,18080000000.0,18080000000.0,1658.0,18080000000.0,50.0,0.0,13264.0,0.0,265472.0,0.0,0.0


mpnnv2_backward_10n_1b


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),L2 Cache Hit Rate(%),HP Instructions(Half),Unified Cache Transactions,Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Total number of global load requests from Multiprocessor,Total number of local load requests from Multiprocessor,Total number of local store requests from Multiprocessor,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
"void kernelPointwiseApply1<TensorAddConstantOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorAddConstantOp<float>, float, unsigned int>, float, float)",56.0,1005.0,8.0,0.0,0.0,97.396,50.0,0.0,1116.0,0.0,8934.0,1116.0,0.0,0.0,0.0,0.0,0.0,97.396,0.0,1116.0,0.0,16454000000.0,16454000000.0,16454000000.0,1116.0,16454000000.0,49.048,0.0,8934.0,0.0,126024.0,0.0,0.0
"void kernelPointwiseApply3<TensorAddCMulOp<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<TensorAddCMulOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",56.0,1108.0,12.0,0.0,0.0,97.396,50.0,0.0,3350.0,0.0,26802.0,3350.0,0.0,0.0,0.0,0.0,0.0,97.396,0.0,3350.0,0.0,41454000000.0,13817000000.0,41457000000.0,1116.0,13818000000.0,49.048,0.0,17868.0,0.0,233235.0,0.0,0.0
"void kernelPointwiseApply2<Tensor_sqrt_Float_Op, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, unsigned int, int=1>, OffsetInfo<float, unsigned int, int=1>, unsigned int, Tensor_sqrt_Float_Op)",56.0,1177.0,10.0,0.0,0.0,97.396,0.0,0.0,1116.0,0.0,52730.0,1116.0,0.0,0.0,0.0,0.0,0.0,97.396,0.0,1116.0,0.0,16176000000.0,16176000000.0,16177000000.0,1116.0,16177000000.0,49.048,0.0,45299.0,0.0,198959.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=2, int=1>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)",112.0,1192.0,10.0,0.0,0.0,54.31,26.709,0.0,20.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,98.454,0.0,20.0,0.0,329000000.0,329000000.0,333000000.0,36.0,616000000.0,30.503,0.0,0.0,0.0,6682.0,0.0,0.0
"void kernelPointwiseApply1<TensorMulConstantOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorMulConstantOp<float>, float, unsigned int>, float, float)",112.0,1254.0,8.0,0.0,0.0,97.396,50.0,0.0,1116.0,0.0,8934.0,1116.0,0.0,0.0,0.0,0.0,0.0,97.396,0.0,1116.0,0.0,19893000000.0,19893000000.0,19894000000.0,1116.0,19894000000.0,49.048,0.0,8934.0,0.0,126024.0,0.0,0.0
"void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)",1.0,1280.0,8.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000000.0,0.0,1.0,21000000.0,0.0,0.0,0.0,0.0,2058.0,0.0,0.0
"void kernelPointwiseApply3<TensorAddOp<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<TensorAddOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",1146.0,1313.0,11.0,0.0,0.0,97.681,0.0,0.0,2129.0,0.0,8516.0,2129.0,0.0,0.0,0.0,0.0,0.0,97.681,0.0,2129.0,0.0,27651000000.0,13825000000.0,27652000000.0,1064.0,13826000000.0,49.296,0.0,8516.0,0.0,222421.0,0.0,0.0
"void kernelPointwiseApply2<TensorCAddOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorCAddOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",56.0,1351.0,10.0,0.0,0.0,97.396,33.333,0.0,2233.0,0.0,17868.0,2233.0,0.0,0.0,0.0,0.0,0.0,97.396,0.0,2233.0,0.0,31560000000.0,15780000000.0,31562000000.0,1116.0,15780000000.0,49.048,0.0,8934.0,0.0,179630.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=1, int=2>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)",69.0,1371.0,8.0,0.0,0.0,98.718,2.476,0.0,21.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,70.009,4.386,27.0,0.0,350000000.0,350000000.0,490000000.0,21.0,353000000.0,43.204,0.0,0.0,0.0,6881.0,0.0,0.0
"void kernelPointwiseApply2<TensorAddOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorAddOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",56.0,1397.0,10.0,0.0,0.0,97.396,33.333,0.0,2233.0,0.0,8934.0,2233.0,0.0,0.0,0.0,0.0,0.0,97.396,0.0,2233.0,0.0,31011000000.0,15505000000.0,31013000000.0,1116.0,15506000000.0,49.048,0.0,8934.0,0.0,179630.0,0.0,0.0


graphsage_inference


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),Unified Cache Transactions,HP Instructions(Half),Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Load Throughput(bytes/sec),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
"void kernelPointwiseApply2<ThresholdUpdateOutput<float>, float, float, unsigned int, int=-2, int=-2>(TensorInfo<ThresholdUpdateOutput<float>, float>, TensorInfo<float, float>, float, float)",20.0,4011.0,8.0,0.0,0.0,100.0,15199.0,0.0,0.0,0.0,50.0,15199.0,0.0,121258000000.0,121258000000.0,15199.0,121258000000.0,121258000000.0,50.0,0.0,121593.0,0.0,1702745.0,0.0,0.0,0.0,0.0,0.0,100.0
"void gemmSN_NN_kernel<float, float, float, int=128, int=2, int=4, int=8, int=3, int=4>(cublasGemmSmallNParams<float, float, float>, float const *, float const *, float, float, int)",10.0,6105.0,51.0,7680.0,0.0,91.912,8448.0,0.0,0.0,391572.0,45.496,9120.0,0.0,43940000000.0,982000000.0,204.0,87882000000.0,1068000000.0,45.685,115.594,201072.0,0.0,307708.0,288.0,5312.0,111362000000.0,0.0,6037000000.0,50.0
sgemm_32x32x32_NT_vec,23.0,17331.0,153.0,16912.0,0.0,89.766,433323.0,0.0,0.0,113610000.0,13.354,407869.0,0.0,786114000000.0,26328000000.0,16144.0,930398000000.0,29807000000.0,17.369,152.844,57692160.0,0.0,5031134.0,139642.0,263891.0,1948947000000.0,0.0,1031315000000.0,91.137
sgemm_32x32x32_NT,7.0,30004.0,153.0,16912.0,0.0,100.0,711291.0,0.0,0.0,185688100.0,79.922,711799.0,0.0,755437000000.0,8531000000.0,8000.0,3780869000000.0,8531000000.0,79.787,163.269,93347840.0,0.0,6791168.0,197632.0,385024.0,1642525000000.0,0.0,843104000000.0,19.981
"void kernelReduceContigDim<thrust::identity<float>, ReduceAdd<float, float>, ReduceAdd<float, float>, float, float, unsigned int, int=-2, int=1>(TensorInfo<float, ReduceAdd<float, float>>, TensorInfo<float, ReduceAdd<float, float>>, ReduceAdd<float, float>, ReduceAdd<float, float>, ReduceAdd<float, float>, float, thrust::identity<float>, float)",20.0,59172.0,13.0,0.0,512.0,12.5,609820.0,0.0,0.0,4993856.0,48.332,702285.0,0.0,329423000000.0,63000000.0,949.0,379789000000.0,513000000.0,48.256,37.37,5115450.0,0.0,28843179.0,4749.0,18049.0,39042000000.0,0.0,10274000000.0,87.49
"void kernelPointwiseApply3<TensorDivOp<float>, float, float, float, unsigned int, int=-2, int=-2, int=2>(TensorInfo<TensorDivOp<float>, float>, TensorInfo<float, float>, TensorInfo<float, float>, float, float)",20.0,104267.0,14.0,0.0,0.0,100.0,1218304.0,0.0,0.0,90372.0,58.329,761492.0,0.0,192821000000.0,186949000000.0,609151.0,233702000000.0,186949000000.0,54.998,0.0,34136590.0,0.0,268193915.0,0.0,0.0,0.0,0.0,0.0,82.514
"void indexSelectLargeIndex<float, unsigned int, int=2, int=2, int=-2, bool=1>(TensorInfo<float, unsigned int>, TensorInfo<float, unsigned int>, TensorInfo<long, unsigned int>, int, int, unsigned int, unsigned int, long)",10.0,173137.0,32.0,0.0,0.0,100.0,6058227.0,0.0,0.0,0.0,53.924,2817970.0,76.604,79573000000.0,74646000000.0,818601.0,103183000000.0,74646000000.0,70.971,0.0,9693150.0,0.0,333118272.0,0.0,0.0,0.0,0.0,0.0,77.119
maxwell_sgemm_128x64_raggedMn_nn_splitK,10.0,840207.0,120.0,12544.0,0.0,88.588,14587211.0,0.0,0.0,9348724000.0,46.741,16463280.0,0.0,550824000000.0,43324000000.0,1284103.0,1140162000000.0,48905000000.0,46.492,179.716,4679194000.0,0.0,243704608.0,4004140.0,18646425.0,2840659000000.0,0.0,610003000000.0,48.411


mpnnv2_forward_n10_1b


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),L2 Cache Hit Rate(%),HP Instructions(Half),Unified Cache Transactions,Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Total number of global load requests from Multiprocessor,Total number of local load requests from Multiprocessor,Total number of local store requests from Multiprocessor,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
"void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=2, int=1>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)",207.0,998.0,10.0,0.0,0.0,46.197,31.368,0.0,138.0,0.0,0.0,138.0,0.0,0.0,0.0,0.0,0.0,98.044,0.0,137.0,0.0,2301000000.0,2301000000.0,2307000000.0,318.0,5372000000.0,29.659,0.0,0.0,0.0,37752.0,0.0,0.0
"void kernelPointwiseApply2<TensorMaxValueOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorMaxValueOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",417.0,1203.0,8.0,0.0,0.0,100.0,0.0,0.0,2329.0,0.0,0.0,2329.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,2329.0,0.0,29898000000.0,29898000000.0,29898000000.0,2329.0,29898000000.0,50.0,0.0,18638.0,0.0,373461.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=2, int=2>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)",138.0,1206.0,10.0,0.0,0.0,41.01,32.259,0.0,75.0,0.0,0.0,75.0,0.0,0.0,0.0,0.0,0.0,71.209,6.267,90.0,0.0,1237000000.0,1237000000.0,1610000000.0,235.0,3952000000.0,27.378,0.0,0.0,0.0,29156.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=1, int=2>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)",556.0,1208.0,8.0,0.0,0.0,99.812,14.552,0.0,1788.0,0.0,0.0,1788.0,0.0,0.0,0.0,0.0,0.0,94.287,73.204,329.0,0.0,30200000000.0,30503000000.0,30227000000.0,1788.0,30504000000.0,68.355,0.0,0.0,0.0,472940.0,0.0,0.0
"void kernelPointwiseApply3<TensorMulOp<float>, float, float, float, unsigned int, int=1, int=-1, int=1>(OffsetInfo<TensorMulOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=-1>, float, float)",69.0,2020.0,27.0,0.0,0.0,99.768,10.182,0.0,613.0,0.0,2450.0,613.0,0.0,0.0,0.0,0.0,0.0,81.719,0.0,416.0,0.0,3639000000.0,3259000000.0,4446000000.0,306.0,3264000000.0,53.972,0.0,7350.0,0.0,284497.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=2, int=-1>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)",69.0,2088.0,28.0,0.0,0.0,43.482,50.266,0.0,377.0,0.0,0.0,377.0,0.0,0.0,0.0,0.0,0.0,69.556,69.065,76.0,0.0,2151000000.0,4222000000.0,2932000000.0,849.0,9529000000.0,47.304,0.0,6020.0,0.0,369847.0,0.0,0.0
"void kernelReduceNoncontigDim_shared<float, unsigned int, float, thrust::identity<float>, ReduceAdd<float>, thrust::identity<float>, int=1, int=2>(TensorInfo<float, unsigned int>, TensorInfo<float, unsigned int>, unsigned int, unsigned int, unsigned int, float, float, thrust::identity<float>, float, float volatile *, int*)",70.0,2291.0,25.0,2064.0,0.0,95.621,44.793,0.0,332.0,0.0,4419.0,332.0,0.0,0.0,4828000000.0,0.0,4828000000.0,53.954,0.0,545.0,0.0,2974000000.0,141000000.0,5269000000.0,15.0,145000000.0,39.279,99.815,6491.0,0.0,202749.0,125.0,125.0
sgemm_32x32x32_NT,138.0,3222.0,153.0,16912.0,0.0,100.0,48.31,0.0,6160.0,0.0,1682773.0,6160.0,0.0,0.0,232684000000.0,0.0,150051000000.0,29.797,69.889,5745.0,0.0,37028000000.0,11511000000.0,123310000000.0,1760.0,11511000000.0,73.271,75.871,967338.0,0.0,310613.0,5717.0,8874.0
sgemm_32x32x32_NT_vec,418.0,4374.0,153.0,16912.0,0.0,89.342,39.148,0.0,22802.0,0.0,6198021.0,7063.0,0.0,0.0,417394000000.0,0.0,240306000000.0,99.17,0.409,22779.0,0.0,131505000000.0,10848000000.0,131637000000.0,1828.0,10980000000.0,10.031,71.416,3234380.0,0.0,419658.0,10431.0,18169.0


mpnnv1_backward_10n_1b


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),L2 Cache Hit Rate(%),HP Instructions(Half),Unified Cache Transactions,Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Total number of global load requests from Multiprocessor,Total number of local load requests from Multiprocessor,Total number of local store requests from Multiprocessor,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
"void kernelPointwiseApply2<CopyOp<float, float>, float, float, unsigned int, int=1, int=2>(OffsetInfo<float, float, float>, OffsetInfo<CopyOp<float, float>, float, unsigned int>, float, float)",10.0,1116.0,8.0,0.0,0.0,83.936,26.743,0.0,192.0,0.0,0.0,192.0,0.0,0.0,0.0,0.0,0.0,76.042,8.333,231.0,0.0,2987000000.0,2987000000.0,3928000000.0,230.0,3578000000.0,50.3,0.0,0.0,0.0,50601.0,0.0,0.0
"void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)",1.0,1216.0,8.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000000.0,0.0,1.0,18000000.0,0.0,0.0,0.0,0.0,2058.0,0.0,0.0
"void kernelPointwiseApply3<sigmoid_updateGradInput_functor<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<sigmoid_updateGradInput_functor<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",10.0,1228.0,12.0,0.0,0.0,98.438,0.0,0.0,64.0,0.0,756.0,64.0,0.0,0.0,0.0,0.0,0.0,98.438,0.0,64.0,0.0,1046000000.0,523000000.0,1062000000.0,32.0,530000000.0,49.206,0.0,756.0,0.0,7592.0,0.0,0.0
"void kernelPointwiseApply3<TensorMulOp<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<TensorMulOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",23.0,1267.0,11.0,0.0,0.0,98.636,0.0,0.0,555.0,0.0,2218.0,555.0,0.0,0.0,0.0,0.0,0.0,98.636,0.0,555.0,0.0,11153000000.0,5576000000.0,11169000000.0,277.0,5584000000.0,49.308,0.0,2218.0,0.0,58606.0,0.0,0.0
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::binary_transform_f<thrust::device_ptr<float>, thrust::device_ptr<float>, thrust::device_ptr<float>, thrust::cuda_cub::__transform::no_stencil_tag, mse_updateGradInput_functor<float, float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::binary_transform_f<thrust::device_ptr<float>, thrust::device_ptr<float>, thrust::device_ptr<float>, thrust::cuda_cub::__transform::no_stencil_tag, mse_updateGradInput_functor<float, float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::device_ptr<float>, thrust::device_ptr<float>)",1.0,1312.0,11.0,0.0,0.0,100.0,0.0,0.0,32.0,0.0,240.0,32.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,30.0,0.0,491000000.0,245000000.0,491000000.0,15.0,245000000.0,50.0,0.0,240.0,0.0,5120.0,0.0,0.0
"void kernelPointwiseApply3<TensorMulOp<float>, float, float, float, unsigned int, int=1, int=2, int=1>(OffsetInfo<TensorMulOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=2>, float, float)",10.0,1315.0,11.0,0.0,0.0,98.438,17.5,0.0,64.0,0.0,252.0,64.0,0.0,0.0,0.0,0.0,0.0,90.625,0.0,48.0,0.0,676000000.0,489000000.0,746000000.0,32.0,497000000.0,55.307,0.0,252.0,0.0,10868.0,0.0,0.0
"void kernelPointwiseApply3<TensorAddOp<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<TensorAddOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",189.0,1638.0,11.0,0.0,0.0,97.565,0.0,0.0,4450.0,0.0,17799.0,4450.0,0.0,0.0,0.0,0.0,0.0,97.565,0.0,4450.0,0.0,55614000000.0,27807000000.0,55618000000.0,2225.0,27809000000.0,48.391,0.0,17799.0,0.0,463639.0,0.0,0.0
"void scal_kernel<float, float, int=1, bool=1, int=6, int=5, int=5, int=3>(cublasTransposeParams<float>, float const *, float*, float const *)",3.0,1664.0,18.0,0.0,0.0,41.667,86.285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1155000000.0,0.0,192.0,2772000000.0,25.0,0.0,0.0,0.0,79872.0,0.0,0.0
"void kernelPointwiseApply1<TensorAddConstantOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorAddConstantOp<float>, float, unsigned int>, float, float)",28.0,1928.0,8.0,0.0,0.0,98.048,50.0,0.0,4240.0,0.0,33920.0,4240.0,0.0,0.0,0.0,0.0,0.0,98.048,0.0,4240.0,0.0,60265000000.0,60265000000.0,60266000000.0,4240.0,60266000000.0,48.742,0.0,33920.0,0.0,475905.0,0.0,0.0
"void kernelPointwiseApply1<TensorMulConstantOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorMulConstantOp<float>, float, unsigned int>, float, float)",56.0,2092.0,8.0,0.0,0.0,98.048,50.0,0.0,4240.0,0.0,33920.0,4240.0,0.0,0.0,0.0,0.0,0.0,98.048,0.0,4240.0,0.0,56072000000.0,56072000000.0,56073000000.0,4240.0,56073000000.0,48.742,0.0,33920.0,0.0,475905.0,0.0,0.0


multiscale_backward_10n_10b


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),L2 Cache Hit Rate(%),HP Instructions(Half),Unified Cache Transactions,Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Total number of global load requests from Multiprocessor,Total number of local load requests from Multiprocessor,Total number of local store requests from Multiprocessor,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
"void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)",250.0,849.0,8.0,0.0,0.0,99.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2276000000.0,0.0,112.0,2278000000.0,49.666,0.0,896.0,0.0,14935.0,0.0,0.0
"void kernelPointwiseApply2<TensorAddOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorAddOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",740.0,1009.0,10.0,0.0,0.0,81.28,33.333,0.0,79.0,0.0,312.0,79.0,0.0,0.0,0.0,0.0,0.0,81.28,0.0,79.0,0.0,1519000000.0,759000000.0,1534000000.0,39.0,766000000.0,40.563,0.0,312.0,0.0,7656.0,0.0,0.0
cudnn::maxwell::gemm::computeBOffsetsKernel(cudnn::maxwell::gemm::ComputeBOffsetsParams),480.0,1044.0,12.0,0.0,0.0,43.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28000000.0,0.0,3.0,64000000.0,11.181,0.0,0.0,0.0,1337.0,0.0,0.0
"void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)",10.0,1161.0,8.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000000.0,0.0,1.0,20000000.0,0.0,0.0,0.0,0.0,2058.0,0.0,0.0
"void kernelPointwiseApply3<TensorAddOp<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<TensorAddOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",590.0,1293.0,11.0,0.0,0.0,99.622,0.0,0.0,3391.0,0.0,13561.0,3391.0,0.0,0.0,0.0,0.0,0.0,99.622,0.0,3390.0,0.0,46242000000.0,23121000000.0,46249000000.0,1695.0,23124000000.0,49.809,0.0,13561.0,0.0,353841.0,0.0,0.0
"void kernelPointwiseApply2<CopyOp<long, long>, long, long, unsigned int, int=1, int=1>(OffsetInfo<long, long, long>, OffsetInfo<CopyOp<long, long>, long, unsigned int>, long, long)",10.0,1315.0,8.0,0.0,0.0,95.068,0.0,0.0,98.0,0.0,0.0,49.0,0.0,0.0,0.0,0.0,0.0,48.939,0.0,194.0,0.0,1582000000.0,1582000000.0,3170000000.0,97.0,1589000000.0,0.0,0.0,0.0,0.0,9286.0,0.0,0.0
cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams),230.0,1317.0,16.0,0.0,0.0,67.15,61.143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1344000000.0,0.0,85.0,1573000000.0,44.581,0.0,0.0,0.0,28972.0,0.0,0.0
"void kernelPointwiseApply3<ThresholdUpdateGradInput<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<ThresholdUpdateGradInput<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",120.0,1342.0,10.0,0.0,0.0,98.887,0.0,0.0,952.0,0.0,0.0,952.0,0.0,0.0,0.0,0.0,0.0,76.862,0.0,951.0,0.0,11328000000.0,7500000000.0,14711000000.0,485.0,7506000000.0,48.469,0.0,3884.0,0.0,90843.0,0.0,0.0
cudnn::maxwell::gemm::computeWgradOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams),250.0,1883.0,14.0,0.0,0.0,9.375,97.273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2577000000.0,0.0,1553.0,27496000000.0,0.0,0.0,0.0,0.0,225660.0,0.0,0.0
"void calc_bias_diff<int=2, float, float, int=128, int=0>(cudnnTensorStruct, float const *, cudnnTensorStruct, float*, float, float, int)",250.0,1943.0,12.0,512.0,0.0,12.5,28.398,0.0,478.0,0.0,5019.0,478.0,0.0,0.0,2947000000.0,0.0,4420000000.0,81.473,0.0,564.0,0.0,4357000000.0,10000000.0,5211000000.0,9.0,91000000.0,42.625,59.688,5029.0,0.0,45822.0,116.0,77.0


gcn_training


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),HP Instructions(Half),Unified Cache Transactions,Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Global Hit Rate in unified l1/tex(%),Local Hit Rate(%),L2 Transactions (Texture Reads),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions,Floating Point Operations(Half Precision),Shared Memory Load Throughput(bytes/sec),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
"void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)",10.0,1672.0,8.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000000.0,0.0,1.0,18000000.0,0.0,0.0,0.0,0.0,2058.0,0.0,0.0,0.0,0.0,0.0,0.0
"void kernelPointwiseApply1<TensorFillOp<long>, long, unsigned int, int=1>(OffsetInfo<TensorFillOp<long>, long, unsigned int>, long, long)",20.0,1685.0,10.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4000000.0,0.0,1.0,18000000.0,0.0,0.0,0.0,0.0,2058.0,0.0,0.0,0.0,0.0,0.0,0.0
"void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::device_ptr<long>, LinspaceOp<long, long>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::device_ptr<long>, LinspaceOp<long, long>, long>, long>(thrust::device_ptr<long>, long)",10.0,1699.0,19.0,0.0,0.0,93.182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,192000000.0,0.0,11.0,206000000.0,0.0,0.0,0.0,0.0,4281.0,0.0,0.0,0.0,0.0,0.0,0.0
"void scal_kernel<float, float, int=1, bool=1, int=6, int=5, int=5, int=3>(cublasTransposeParams<float>, float const *, float*, float const *)",10.0,1882.0,18.0,0.0,0.0,85.417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2787000000.0,0.0,192.0,3264000000.0,45.455,0.0,0.0,0.0,19968.0,0.0,0.0,0.0,0.0,0.0,0.0
_ZN2at6native18elementwise_kernelILi512ELi1EZNS0_16gpu_unary_kernelIZNS0_17gpu_binary_kernelIZNS0_15add_kernel_implIfEEvRNS_14TensorIteratorEN3c106ScalarEEUlffE_EEvS6_RKT_EUlfE0_EEvS6_SC_EUliE_EEviT1_,40.0,2049.0,8.0,0.0,0.0,96.354,0.0,323.0,0.0,5172.0,48.864,0.0,323.0,5047000000.0,5047000000.0,5051000000.0,323.0,5051000000.0,48.864,0.0,2586.0,0.0,36923.0,0.0,0.0,0.0,0.0,0.0,96.354
_ZN2at6native18elementwise_kernelILi512ELi1EZNS0_16gpu_unary_kernelIZNS0_17gpu_binary_kernelIZNS0_15mul_kernel_implIfEEvRNS_14TensorIteratorEEUlffE_EEvS6_RKT_EUlfE0_EEvS6_SA_EUliE_EEviT1_,80.0,2052.0,8.0,0.0,0.0,96.354,0.0,323.0,0.0,2586.0,48.864,0.0,323.0,5038000000.0,5038000000.0,5042000000.0,323.0,5042000000.0,48.864,0.0,2586.0,0.0,36923.0,0.0,0.0,0.0,0.0,0.0,96.354
"void kernelPointwiseApply3<TensorAddCMulOp<float>, float, float, float, unsigned int, int=1, int=1, int=1>(OffsetInfo<TensorAddCMulOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, OffsetInfo<float, float, int=1>, float, float)",40.0,2080.0,12.0,0.0,0.0,96.354,0.0,970.0,0.0,7758.0,48.864,0.0,970.0,14918000000.0,4972000000.0,14928000000.0,323.0,4975000000.0,48.864,0.0,5172.0,0.0,68673.0,0.0,0.0,0.0,0.0,0.0,96.354
_ZN2at6native18elementwise_kernelILi512ELi1EZNS0_17gpu_binary_kernelIZNS0_15add_kernel_implIfEEvRNS_14TensorIteratorEN3c106ScalarEEUlffE_EEvS5_RKT_EUliE_EEviT1_,120.0,2085.0,8.0,0.0,0.0,96.354,0.0,647.0,0.0,5172.0,48.864,0.0,647.0,9922000000.0,4961000000.0,9929000000.0,323.0,4964000000.0,48.864,0.0,2586.0,0.0,52440.0,0.0,0.0,0.0,0.0,0.0,96.354
"void kernelPointwiseApply2<Tensor_sqrt_Float_Op, float, float, unsigned int, int=1, int=1>(OffsetInfo<float, unsigned int, int=1>, OffsetInfo<float, unsigned int, int=1>, unsigned int, Tensor_sqrt_Float_Op)",40.0,2097.0,10.0,0.0,0.0,96.354,0.0,323.0,0.0,18103.0,48.864,0.0,323.0,4930000000.0,4930000000.0,4934000000.0,323.0,4934000000.0,48.864,0.0,15517.0,0.0,58328.0,0.0,0.0,0.0,0.0,0.0,96.354
_ZN2at6native18elementwise_kernelILi128ELi4EZNS0_16gpu_unary_kernelIZNS0_17div_constant_implIdEEvRNS_14TensorIteratorET_EUldE_EEvS5_RKS6_EUliE0_EEviT1_,20.0,2264.0,12.0,0.0,0.0,25.0,0.0,2.0,1.0,0.0,0.0,0.0,1.0,3000000.0,3000000.0,13000000.0,1.0,13000000.0,0.0,0.0,0.0,1.0,646.0,0.0,0.0,0.0,0.0,0.0,25.0


multiscale_forward_10n_10b


Unnamed: 0_level_0,Invocations,Avg. Duration(ns),Registers/Thread,Static Shared Memory,Avg. Dynamic Shared Memory,Global Memory Store Efficiency(%),L2 Cache Hit Rate(%),HP Instructions(Half),Unified Cache Transactions,Floating Point Operations(Double Precision),Floating Point Operations(Single Precision),Total number of global load requests from Multiprocessor,Total number of local load requests from Multiprocessor,Total number of local store requests from Multiprocessor,Shared Memory Load Throughput(bytes/sec),Floating Point Operations(Half Precision),Shared Memory Store Throughput(bytes/sec),Global Memory Load Efficiency(%),Global Hit Rate in unified l1/tex(%),L2 Transactions (Texture Reads),Local Hit Rate(%),Requested Global Load Throughput(bytes/sec),Requested Global Store Throughput(bytes/sec),Global Load Throughput(bytes/sec),L2 Transactions (Texture Writes),Global Store Throughput(bytes/sec),Unified Cache Hit Rate(%),Shared Memory Efficiency(%),FP Instructions(Single),FP Instructions(Double),Integer Instructions,Shared Store Transactions,Shared Load Transactions
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
"void kernelPointwiseApply2<TensorMaxValueOp<float>, float, float, unsigned int, int=1, int=1>(OffsetInfo<TensorMaxValueOp<float>, float, unsigned int>, OffsetInfo<float, float, int=1>, float, float)",120.0,998.0,8.0,0.0,0.0,98.887,0.0,0.0,486.0,0.0,0.0,486.0,0.0,0.0,0.0,0.0,0.0,98.887,0.0,485.0,0.0,8150000000.0,8150000000.0,8157000000.0,485.0,8157000000.0,49.654,0.0,3884.0,0.0,78937.0,0.0,0.0
"void kernelPointwiseApply1<TensorAddConstantOp<long>, long, unsigned int, int=1>(OffsetInfo<TensorAddConstantOp<long>, long, unsigned int>, long, long)",120.0,1172.0,8.0,0.0,0.0,25.0,50.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,1.0,0.0,4000000.0,4000000.0,17000000.0,1.0,17000000.0,0.0,0.0,0.0,0.0,2060.0,0.0,0.0
cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams),250.0,1244.0,16.0,0.0,0.0,67.15,61.302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1330000000.0,0.0,85.0,1553000000.0,44.581,0.0,0.0,0.0,28972.0,0.0,0.0
"void add_tensor_kernel_v3<int=2, float, float, int=128, int=1, int=1, int=4, int=2>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, float, float)",250.0,1822.0,28.0,0.0,0.0,87.283,65.169,0.0,719.0,0.0,9459.0,719.0,0.0,0.0,0.0,0.0,0.0,61.609,0.0,539.0,0.0,5066000000.0,4990000000.0,5696000000.0,481.0,5086000000.0,57.848,0.0,8225.0,0.0,103312.0,0.0,0.0
"void gemv2N_kernel_val<float, float, float, int=128, int=32, int=4, int=4, int=1>(float, float, cublasGemv2Params_v2<float, float, float>)",25.0,2266.0,32.0,2560.0,0.0,46.972,65.419,0.0,1885.0,0.0,26357.0,1885.0,0.0,0.0,70991000000.0,0.0,27837000000.0,42.017,0.0,4065.0,0.0,14877000000.0,72000000.0,38821000000.0,16.0,149000000.0,17.717,22.289,16460.0,0.0,205225.0,734.0,1870.0
"void CatArrayBatchedCopy<float, unsigned int, int=4>(float*, CatArrInputTensor<float, unsigned int>*, OutputTensorSizeStride<unsigned int, unsigned int=4>, int, unsigned int)",120.0,2379.0,24.0,0.0,0.0,91.404,74.517,0.0,9698.0,0.0,0.0,7767.0,0.0,0.0,0.0,0.0,0.0,31.433,0.0,2671.0,0.0,12675000000.0,9996000000.0,26456000000.0,1065.0,10965000000.0,69.73,0.0,23139.0,0.0,1412678.0,0.0,0.0
"void cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>(cudnnTensorStruct, float const *, cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float const , float, float, float*, float const *, float const *, float const *, float, float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool)",120.0,4152.0,36.0,144.0,3072.0,57.345,55.77,0.0,3584.0,0.0,227288.0,3584.0,0.0,0.0,30879000000.0,0.0,26044000000.0,36.948,56.181,1193.0,0.0,8476000000.0,7981000000.0,13937000000.0,1241.0,9980000000.0,77.731,26.111,221992.0,0.0,1108156.0,930.0,1028.0
"void CatArrayBatchedCopy<float, unsigned int, int=3>(float*, CatArrInputTensor<float, unsigned int>*, OutputTensorSizeStride<unsigned int, unsigned int=4>, int, unsigned int)",130.0,4156.0,20.0,0.0,0.0,93.334,71.154,0.0,30054.0,0.0,0.0,23358.0,0.0,0.0,0.0,0.0,0.0,35.707,0.0,9219.0,0.0,32721000000.0,27947000000.0,55680000000.0,4826.0,29965000000.0,68.348,0.0,53538.0,0.0,3956948.0,0.0,0.0
sgemm_32x32x32_NN_vec,180.0,8007.0,153.0,16912.0,0.0,92.114,23.686,0.0,46818.0,0.0,14977194.0,11704.0,0.0,0.0,460952000000.0,0.0,239565000000.0,96.28,2.884,47906.0,0.0,164684000000.0,2530000000.0,185344000000.0,816.0,2845000000.0,25.016,71.926,7562069.0,0.0,621482.0,17002.0,32694.0
maxwell_scudnn_128x32_relu_interior_nn,250.0,12399.0,82.0,8192.0,0.0,81.82,14.563,0.0,6206.0,0.0,2838265.0,6206.0,0.0,0.0,80838000000.0,0.0,23661000000.0,60.302,40.181,5276.0,0.0,11157000000.0,1042000000.0,14513000000.0,559.0,1236000000.0,70.305,51.473,1449916.0,0.0,383445.0,2690.0,9232.0


KeyError: 'gat_dense_forward_10n_1b/metrics.csv'