In [105]:
import os
import numpy as np
import pandas as pd

In [106]:
# Input CSV reports (file names suggest profiler exports for the Q4 and Q8
# runs -- TODO confirm provenance).
q4_data_file = "ncu_report_full_q4_2layers_utilization.csv"
q8_data_file = "ncu_report_full_q8_2layers_utilization.csv"

# Operation classes for which a "thread_inst_executed ... pred_on" counter is kept.
_SASS_OP_CLASSES = ("fp16", "fp32", "fp64", "integer")

# Metric base names to extract. The CSV headers append a " [unit]" suffix to
# these names, so they are matched as prefixes-with-space rather than exactly.
interested_metrics = [
    # Author's note: tensor_precision_fu_utilization
    "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active",
    # One counter per op class (author's note for fp16: inst_fp_16).
    *(
        f"smsp__sass_thread_inst_executed_op_{op_class}_pred_on.sum"
        for op_class in _SASS_OP_CLASSES
    ),
]



In [107]:
def is_interested(col_name, metrics=None):
    """Return True if ``col_name`` contains one of the tracked metric names.

    A column matches when a metric name appears in it immediately followed by
    a space (e.g. ``"<metric> [inst]"``). Requiring the trailing space keeps a
    metric name from matching a longer name that merely begins with it.

    Parameters
    ----------
    col_name : str
        Column header to test.
    metrics : iterable of str, optional
        Metric base names to look for. When omitted, the notebook-level
        ``interested_metrics`` list is used (looked up at call time), which
        is the original behaviour.

    Returns
    -------
    bool
        True if any metric name followed by a space occurs in ``col_name``.
    """
    if metrics is None:
        metrics = interested_metrics
    return any(name + " " in col_name for name in metrics)

In [108]:
# Load the Q4 report; thousands="," makes pandas treat commas inside numbers
# as digit-group separators so those columns parse as numeric.
q4_data = pd.read_csv(filepath_or_buffer=q4_data_file, thousands=",")
cols = q4_data.columns

In [109]:
# Pick the descriptive (non-metric) columns by position in the CSV header.
# NOTE(review): positional selection silently shifts if the export layout changes.
selected_cols = cols.take([0, 3, 4, 6, 8])


In [110]:
# Show the chosen header names as a plain list.
selected_cols.tolist()

['ID',
 'Estimated Speedup',
 'Runtime Improvement""(0)',
 'Function Name',
 'Demangled Name']

In [111]:
# Keep only the headers that name a tracked metric, then report how many matched.
appending_cols = list(filter(is_interested, cols))
len(appending_cols)

5

In [112]:
# Show the matched metric headers (metric name followed by its " [unit]" suffix).
appending_cols

['sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%]',
 'smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst]',
 'smsp__sass_thread_inst_executed_op_fp32_pred_on.sum [inst]',
 'smsp__sass_thread_inst_executed_op_fp64_pred_on.sum [inst]',
 'smsp__sass_thread_inst_executed_op_integer_pred_on.sum [inst]']

In [113]:
# Descriptive columns first, followed by the matched metric columns.
wanted_columns = [*selected_cols, *appending_cols]
q4_stat = q4_data.loc[:, wanted_columns]
q4_stat

Unnamed: 0,ID,Estimated Speedup,"Runtime Improvement""""(0)",Function Name,Demangled Name,sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%],smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp32_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp64_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_integer_pred_on.sum [inst]
0,0,0,0,rms_norm_f32,"void rms_norm_f32<(int)1024>(const float *, fl...",0,0,21504,0,34880
1,1,0,0,k_bin_bcast,"void k_bin_bcast<&op_mul, float, float, float>...",0,0,14336,0,245760
2,2,0,0,dequantize_mul_mat_vec,"void dequantize_mul_mat_vec<(int)1, (int)1, &c...",0,32768,33800,0,37896
3,3,0,0,soft_max_f32,"void soft_max_f32<(bool)1, (int)0, (int)0>(con...",0,0,408,0,824
4,4,0,0,k_argsort_f32_i32,void k_argsort_f32_i32<(ggml_sort_order)1>(con...,0,0,24,0,672
...,...,...,...,...,...,...,...,...,...,...
295,295,0,0,k_bin_bcast,"void k_bin_bcast<&op_mul, float, float, float>...",0,0,50176,0,860160
296,296,0,0,quantize_q8_1,"quantize_q8_1(const float *, void *, int, int)",0,0,215488,0,302848
297,297,0,0,mul_mat_vec_q,"void mul_mat_vec_q<(int)32, (int)8, block_q8_0...",0,14680064,15208448,0,37883904
298,298,0,0,k_bin_bcast,"void k_bin_bcast<&op_mul, float, float, float>...",0,0,14336,0,249855


In [114]:
# Print the column dtypes and non-null counts of the selected frame.
q4_stat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   ID                                                                  300 non-null    int64  
 1   Estimated Speedup                                                   300 non-null    float64
 2   Runtime Improvement""(0)                                            300 non-null    float64
 3   Function Name                                                       300 non-null    object 
 4   Demangled Name                                                      300 non-null    object 
 5   sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%]  300 non-null    float64
 6   smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst]          300 non-null    int64  
 7   smsp__sass_thread

In [115]:
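# Explicit numeric conversion is left disabled: read_csv(..., thousands=',')
# already yields numeric dtypes for these columns (see the dtypes check below).
# for col in q4_stat.columns[5:]:
#     q4_stat[col] = pd.to_numeric(q4_stat[col])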

In [116]:
# Dtypes of the grouping key column and of every metric column after it.
q4_stat.dtypes.iloc[4:]

Demangled Name                                                         object
sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%]    float64
smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst]              int64
smsp__sass_thread_inst_executed_op_fp32_pred_on.sum [inst]              int64
smsp__sass_thread_inst_executed_op_fp64_pred_on.sum [inst]              int64
smsp__sass_thread_inst_executed_op_integer_pred_on.sum [inst]           int64
dtype: object

In [117]:
# Inspect the fp16 instruction-count column on its own.
fp16_inst_col = "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst]"
q4_stat.loc[:, fp16_inst_col]

0             0
1             0
2         32768
3             0
4             0
         ...   
295           0
296           0
297    14680064
298           0
299           0
Name: smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst], Length: 300, dtype: int64

In [118]:
# Render floats without decimals in displayed frames (display-only setting).
pd.set_option("display.float_format", "{:.0f}".format)


In [119]:
# Keep the kernel name plus the metric columns, then average each metric
# over all rows sharing the same demangled kernel name.
kernel_metrics = q4_stat.iloc[:, 4:]
res = kernel_metrics.groupby(by="Demangled Name", sort=True).mean()
res

Unnamed: 0_level_0,sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%],smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp32_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp64_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_integer_pred_on.sum [inst]
Demangled Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"k_compute_batched_ptrs(const __half *, const __half *, char *, const void **, void **, long, long, long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, long, long)",0,0,64,0,2062
"k_sum_rows_f32(const float *, float *, int)",0,0,131,0,73
"quantize_q8_1(const float *, void *, int, int)",0,0,94551,0,132882
"silu_f32(const float *, float *, int)",0,0,71680,0,57344
turing_h1688gemm_256x64_ldg8_stages_32x1_tn,29,1839104,0,0,1775616
"void convert_unary<__half, float>(const void *, T2 *, int)",0,3072,0,0,12288
"void convert_unary<float, __half>(const void *, T2 *, int)",0,0,0,0,12288
"void cpy_f32_f16<&cpy_1_f32_f16>(const char *, char *, int, int, int, int, int, int, int, int, int, int, int)",0,0,4096,0,91141
"void cpy_f32_f16<&cpy_1_f32_f32>(const char *, char *, int, int, int, int, int, int, int, int, int, int, int)",0,0,16384,0,356414
"void dequantize_mul_mat_vec<(int)1, (int)1, &convert_f16>(const void *, const float *, float *, int, int)",0,32768,33800,0,37896


In [120]:
# Persist the per-kernel means; the group key ("Demangled Name") is the
# frame's index and is written as the first CSV column.
res.to_csv("./q4_utilization.csv")