In [1]:
import glob
import pandas as pd
import numpy as np
percent_to_float = lambda x: float(x.split('%')[-2].split()[-1])

In [2]:
def get_kernels(device):
    profile_filelist = glob.glob('./profile_{}/*.txt'.format(device))
    

    kernals = set()
    max_percent = {}
    for profile_file in profile_filelist:
        with open(profile_file, 'r') as f:
            lines = f.readlines()
            for line in lines[3:-4]:
                kernal_name = line.split()[0]
                percent = percent_to_float(line)
                if percent > 2:
                    kernals.add(kernal_name)
                    if kernal_name not in max_percent.keys():
                        max_percent[kernal_name] = percent
                    else:
                        max_percent[kernal_name] = max(max_percent[kernal_name], percent)

    return kernals, max_percent


In [3]:
ks_1080, ks_max_1080 = get_kernels('1080')
ks_2080, ks_max_2080 = get_kernels('2080')
ks_titan, ks_max_titan = get_kernels('titan')
ks_titanp, ks_max_titanp = get_kernels('titanp')


In [15]:
ks_1080

{'Memcpy',
 'at::native::(anonymous',
 'aten::_conv_depthwise2d',
 'aten::_fake_quantize_per_tensor_affine_cachemask_ten...',
 'aten::_fused_moving_avg_obs_fq_helper',
 'aten::_log_softmax',
 'aten::_log_softmax_backward_data',
 'aten::_softmax',
 'aten::_softmax_backward_data',
 'aten::add',
 'aten::add_',
 'aten::addcdiv_',
 'aten::addcmul_',
 'aten::addmm',
 'aten::aminmax',
 'aten::bmm',
 'aten::cat',
 'aten::clamp',
 'aten::clamp_min',
 'aten::clamp_min_',
 'aten::convolution_backward',
 'aten::copy_',
 'aten::cudnn_batch_norm',
 'aten::cudnn_batch_norm_backward',
 'aten::cudnn_convolution',
 'aten::cudnn_convolution_transpose',
 'aten::div',
 'aten::elu_backward',
 'aten::embedding_dense_backward',
 'aten::exp',
 'aten::exp_',
 'aten::fake_quantize_per_channel_affine_cachemask',
 'aten::fill_',
 'aten::gelu_backward',
 'aten::hardswish_backward',
 'aten::hardtanh_backward',
 'aten::leaky_relu_',
 'aten::leaky_relu_backward',
 'aten::masked_fill_',
 'aten::max_pool2d_with_indices'

In [4]:
len(ks_1080)

115

In [14]:
data = []
profile_filelist = glob.glob('./profile_{}/*.txt'.format('1080'))
for profile_file in sorted(profile_filelist):
    row = {}
    profile_file_prefix = profile_file.split('/')[-1].split('.')[-2]
    
    batch_size = int(profile_file_prefix.split('_')[-1])
    model_name = profile_file_prefix.replace('_' + str(batch_size), '')
    # row['model_name'] = model_name
    # row['batch_size'] = batch_size
    row['name'] = profile_file_prefix
    with open(profile_file, 'r') as f:
        
        lines = f.readlines()
        for line in lines[3:-4]:
            kernal_name = line.split()[0]
            if kernal_name in ks_1080:
                percent = percent_to_float(line)
                row[kernal_name] = percent
    for kernal_name in ks_1080:
        if kernal_name not in row.keys():
            row[kernal_name] = 0.0
    
    print(row)
    data.append(row)

df = pd.DataFrame(data)
df.to_csv('profile_1080.csv', index=False)

{'name': 'LearningToPaint_1', 'aten::cudnn_convolution': 74.68, 'void': 0.09, 'maxwell_scudnn_winograd_128x128_ldg1_ldg4_mobile_rel...': 20.18, 'aten::addmm': 12.06, 'sgemm_32x32x32_NT_vec': 10.99, 'aten::cudnn_batch_norm': 2.67, 'aten::copy_': 2.53, 'aten::add_': 1.67, 'aten::clamp_min': 1.59, 'aten::mul': 1.0, 'Memcpy': 0.05, 'aten::add': 0.78, 'aten::sub': 0.59, 'aten::cat': 0.53, 'aten::div': 0.42, 'sgemm_32x32x32_NT': 0.37, 'aten::fill_': 0.0, 'maxwell_sgemm_32x128_nn': 0.0, 'sgemm_128x128x8_NN': 0.0, 'maxwell_fp16_scudnn_fp16_128x64_stridedB_splitK_larg...': 0.0, 'aten::native_batch_norm_backward': 0.0, 'maxwell_fp16_scudnn_fp16_128x64_relu_medium_nn_v0': 0.0, 'maxwell_fp16_scudnn_fp16_128x128_relu_small_nn_v0': 0.0, 'at::native::(anonymous': 0.0, 'aten::fake_quantize_per_channel_affine_cachemask': 0.0, 'maxwell_gcgemm_64x64_nt': 0.0, 'aten::_log_softmax_backward_data': 0.0, 'sgemm_128x128x8_NT_vec': 0.0, 'maxwell_sgemm_128x64_nn': 0.0, 'aten::clamp': 0.0, 'maxwell_scudnn_128x32_

In [5]:
inter_kernels = ks_1080 & ks_2080 & ks_titan & ks_titanp
union_kernels = ks_1080 | ks_2080 | ks_titan | ks_titanp

In [6]:
max_kernels = {}
for kernal in union_kernels:
    max_percent = -1
    if kernal in ks_titanp:
        max_percent = max(max_percent, ks_max_titanp[kernal])
    if kernal in ks_titan:
        max_percent = max(max_percent, ks_max_titan[kernal])
    if kernal in ks_1080:
        max_percent = max(max_percent, ks_max_1080[kernal])
    if kernal in ks_2080:
        max_percent = max(max_percent, ks_max_2080[kernal])
    if max_percent != -1:
        max_kernels[kernal] = max_percent


{'volta_sgemm_64x64_nn': 17.19,
 'aten::cudnn_batch_norm_backward': 19.73,
 'aten::sqrt': 6.98,
 'at::native::(anonymous': 4.42,
 'aten::threshold_backward': 9.61,
 'volta_sgemm_128x32_nn': 7.0,
 'maxwell_gcgemm_32x32_tn': 5.78,
 'torchvision::_roi_align_backward': 13.22,
 'volta_scudnn_128x32_relu_medium_nn_v1': 5.75,
 'maxwell_scudnn_128x64_stridedB_splitK_large_nn_v0': 12.97,
 'maxwell_scudnn_128x64_relu_large_nn_v1': 4.7,
 'aten::gelu_backward': 2.95,
 'aten::exp': 3.04,
 'maxwell_sgemm_64x64_nn': 5.52,
 'maxwell_fp16_scudnn_fp16_128x64_stridedB_splitK_medi...': 6.75,
 'aten::native_layer_norm': 5.15,
 'turing_fp16_s1688cudnn_fp16_128x128_ldg8_dgrad_relu_...': 5.64,
 'volta_scudnn_128x128_relu_small_nn_v1': 10.84,
 'aten::add': 13.82,
 'aten::hardswish_': 2.06,
 'aten::exp_': 2.02,
 'aten::copy_': 50.61,
 'volta_scudnn_128x64_relu_xregs_large_nn_v1': 22.42,
 'volta_sgemm_32x128_nt': 4.75,
 'aten::max_pool2d_with_indices': 3.49,
 'aten::neg': 3.59,
 'aten::fill_': 10.51,
 'volta_scu

In [7]:
no_aten_kernal = []
for k in union_kernels:
    if 'aten' not in k:
        no_aten_kernal.append(k)

In [8]:
max_kernels['turing_fp16_s1688cudnn_fp16_256x64_sliced1x2_ldg8_re...']

6.45

In [9]:
maxwell_kernels = set()
volta_kernels = set()
turing_kernels = set()
for k in no_aten_kernal:
    if 'maxwell' in k:
        maxwell_kernels.add(k)
    elif 'volta' in k:
        volta_kernels.add(k)
    elif 'turing' in k:
        turing_kernels.add(k)


In [10]:
turing_kernels

{'turing_fp16_s1688cudnn_fp16_128x128_ldg8_dgrad_relu_...',
 'turing_fp16_s1688cudnn_fp16_256x128_ldg8_dgrad_relu_...',
 'turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_ex...',
 'turing_fp16_s1688cudnn_fp16_256x64_sliced1x2_ldg8_re...',
 'turing_scudnn_128x128_stridedB_splitK_xregs_large_nn...',
 'turing_scudnn_128x32_stridedB_splitK_small_nn_v1',
 'turing_scudnn_128x32_stridedB_splitK_xregs_large_nn_...',
 'turing_scudnn_128x64_stridedB_splitK_xregs_large_nn_...'}

In [12]:
volta_kernels

{'volta_cgemm_32x64_tn',
 'volta_cgemm_64x32_tn',
 'volta_gcgemm_32x32_nt',
 'volta_gcgemm_32x32_tn',
 'volta_gcgemm_64x32_tn',
 'volta_gcgemm_64x64_tn',
 'volta_scudnn_128x128_3dconv_fprop_medium_nn_v1',
 'volta_scudnn_128x128_relu_medium_nn_v1',
 'volta_scudnn_128x128_relu_small_nn_v1',
 'volta_scudnn_128x128_stridedB_medium_nn_v1',
 'volta_scudnn_128x128_stridedB_small_nn_v1',
 'volta_scudnn_128x32_relu_interior_nn_v1',
 'volta_scudnn_128x32_relu_medium_nn_v1',
 'volta_scudnn_128x32_relu_small_nn_v1',
 'volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_interior...',
 'volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_n...',
 'volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nh...',
 'volta_scudnn_128x64_relu_interior_nn_v1',
 'volta_scudnn_128x64_relu_small_nn_v1',
 'volta_scudnn_128x64_relu_xregs_large_nn_v1',
 'volta_scudnn_128x64_sliced1x2_ldg4_relu_exp_medium_n...',
 'volta_scudnn_128x64_sliced1x2_ldg4_relu_exp_small_nh...',
 'volta_scudnn_128x64_stridedB_interior_nn_v1',
 'volta_sc

In [13]:
maxwell_kernels

{'maxwell_fp16_scudnn_fp16_128x128_relu_medium_nn_v0',
 'maxwell_fp16_scudnn_fp16_128x128_relu_small_nn_v0',
 'maxwell_fp16_scudnn_fp16_128x32_relu_small_nn_v0',
 'maxwell_fp16_scudnn_fp16_128x32_stridedB_splitK_larg...',
 'maxwell_fp16_scudnn_fp16_128x64_relu_medium_nn_v0',
 'maxwell_fp16_scudnn_fp16_128x64_stridedB_small_nn_v0...',
 'maxwell_fp16_scudnn_fp16_128x64_stridedB_splitK_larg...',
 'maxwell_fp16_scudnn_fp16_128x64_stridedB_splitK_medi...',
 'maxwell_gcgemm_32x32_tn',
 'maxwell_gcgemm_64x64_nt',
 'maxwell_gcgemm_64x64_tn',
 'maxwell_scudnn_128x128_3dconv_fprop_small_nn_v0',
 'maxwell_scudnn_128x128_relu_medium_nn_v1',
 'maxwell_scudnn_128x128_stridedB_small_nn_v0',
 'maxwell_scudnn_128x128_stridedB_splitK_medium_nn_v0',
 'maxwell_scudnn_128x32_relu_medium_nn_v1',
 'maxwell_scudnn_128x32_relu_small_nn_v1',
 'maxwell_scudnn_128x32_stridedB_small_nn_v0',
 'maxwell_scudnn_128x32_stridedB_splitK_large_nn_v0',
 'maxwell_scudnn_128x32_stridedB_splitK_medium_nn_v0',
 'maxwell_scudnn

In [14]:
common_kernels = set()

for k in maxwell_kernels:
    if k.replace("maxwell", "volta").replace('v0', 'v1') in volta_kernels:
        common_kernels.add(k.replace("maxwell_",""))

maxwell_common_kernels = set(['maxwell_' + k  for k in common_kernels])
volta_common_kernels = set([('volta_' + k).replace('v0', 'v1')  for k in common_kernels])


In [15]:
common_kernels

{'gcgemm_32x32_tn',
 'gcgemm_64x64_tn',
 'scudnn_128x128_relu_medium_nn_v1',
 'scudnn_128x128_stridedB_small_nn_v0',
 'scudnn_128x32_relu_medium_nn_v1',
 'scudnn_128x32_relu_small_nn_v1',
 'scudnn_128x64_relu_interior_nn_v1',
 'scudnn_128x64_relu_small_nn_v1',
 'scudnn_128x64_stridedB_small_nn_v0',
 'sgemm_128x128_nn',
 'sgemm_128x128_nt',
 'sgemm_128x128_tn',
 'sgemm_128x32_nn',
 'sgemm_128x32_nt',
 'sgemm_128x64_nn',
 'sgemm_128x64_nt',
 'sgemm_128x64_tn',
 'sgemm_32x128_nt',
 'sgemm_64x64_nn',
 'sgemm_64x64_nt'}

In [16]:
maxwell_feature_kernals = maxwell_kernels - maxwell_common_kernels
volta_feature_kernals = volta_kernels - volta_common_kernels

In [17]:
maxwell_feature_kernals

{'maxwell_fp16_scudnn_fp16_128x128_relu_medium_nn_v0',
 'maxwell_fp16_scudnn_fp16_128x128_relu_small_nn_v0',
 'maxwell_fp16_scudnn_fp16_128x32_relu_small_nn_v0',
 'maxwell_fp16_scudnn_fp16_128x32_stridedB_splitK_larg...',
 'maxwell_fp16_scudnn_fp16_128x64_relu_medium_nn_v0',
 'maxwell_fp16_scudnn_fp16_128x64_stridedB_small_nn_v0...',
 'maxwell_fp16_scudnn_fp16_128x64_stridedB_splitK_larg...',
 'maxwell_fp16_scudnn_fp16_128x64_stridedB_splitK_medi...',
 'maxwell_gcgemm_64x64_nt',
 'maxwell_scudnn_128x128_3dconv_fprop_small_nn_v0',
 'maxwell_scudnn_128x128_stridedB_splitK_medium_nn_v0',
 'maxwell_scudnn_128x32_stridedB_small_nn_v0',
 'maxwell_scudnn_128x32_stridedB_splitK_large_nn_v0',
 'maxwell_scudnn_128x32_stridedB_splitK_medium_nn_v0',
 'maxwell_scudnn_128x64_relu_large_nn_v1',
 'maxwell_scudnn_128x64_stridedB_splitK_interior_nn_v0...',
 'maxwell_scudnn_128x64_stridedB_splitK_large_nn_v0',
 'maxwell_scudnn_winograd_128x128_ldg1_ldg4_mobile_rel...',
 'maxwell_scudnn_winograd_128x128_l

In [18]:
volta_feature_kernals

{'volta_cgemm_32x64_tn',
 'volta_cgemm_64x32_tn',
 'volta_gcgemm_32x32_nt',
 'volta_gcgemm_64x32_tn',
 'volta_scudnn_128x128_3dconv_fprop_medium_nn_v1',
 'volta_scudnn_128x128_relu_small_nn_v1',
 'volta_scudnn_128x128_stridedB_medium_nn_v1',
 'volta_scudnn_128x32_relu_interior_nn_v1',
 'volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_interior...',
 'volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_medium_n...',
 'volta_scudnn_128x32_sliced1x4_ldg4_relu_exp_small_nh...',
 'volta_scudnn_128x64_relu_xregs_large_nn_v1',
 'volta_scudnn_128x64_sliced1x2_ldg4_relu_exp_medium_n...',
 'volta_scudnn_128x64_sliced1x2_ldg4_relu_exp_small_nh...',
 'volta_scudnn_128x64_stridedB_interior_nn_v1',
 'volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148...',
 'volta_sgemm_128x32_sliced1x4_nn',
 'volta_sgemm_128x32_sliced1x4_tn',
 'volta_sgemm_32x32_sliced1x4_tn',
 'volta_sgemm_64x32_sliced1x4_nn',
 'volta_sgemm_64x64_tn'}