In [1]:
import os
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:


# Input CSV reports to compare, one per quantization variant (q4 vs. q8).
# Both are relative paths, so they are resolved against the kernel's working directory.
q4_data_file = "ncu_report_full_q4_2layers_utilization.csv"
q8_data_file = "ncu_report_full_q8_2layers_utilization.csv"

# Metric name stems to extract from the CSV header. The header columns carry a
# trailing " [unit]" suffix (e.g. " [%]", " [inst]"), which is_interested()
# accounts for by matching on the stem followed by a space.
interested_metrics = [
    "sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active", # tensor_precision_fu_utilization
    "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum",  # inst_fp_16
    "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum",
    "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum",
    "smsp__sass_thread_inst_executed_op_integer_pred_on.sum",

]




In [3]:
def is_interested(col_name):
    """Tell whether a column header refers to one of the tracked metrics.

    A header qualifies when it contains any entry of the module-level
    ``interested_metrics`` list immediately followed by a single space.
    The trailing space keeps a stem from matching a longer metric name
    that merely starts with it.

    Args:
        col_name: Column header text to inspect.

    Returns:
        True if at least one tracked metric matches, otherwise False.
    """
    return any(metric + " " in col_name for metric in interested_metrics)

In [4]:
def get_aggregated_stat(source_data_path):
    """Read a profiler CSV report and average the tracked metrics per 'Demangled Name'.

    Progress messages, the frame summary and the dtypes of the aggregated
    columns are printed to stdout as a side effect.

    Args:
        source_data_path: Path (or file-like object) handed to ``pd.read_csv``.
            Numbers may use ',' as a thousands separator.

    Returns:
        A DataFrame indexed by 'Demangled Name' (sorted) holding the mean of
        every column accepted by ``is_interested``.

    Raises:
        IndexError: if the report has fewer than nine columns.
        KeyError: if no 'Demangled Name' column ends up among the aggregated
            columns (see the positional note below).
    """
    print("=== Reading data:", source_data_path)
    source_data = pd.read_csv(source_data_path, thousands=',')
    cols = source_data.columns
    # Descriptive columns are picked by position, so this depends on the report's
    # column layout. In the notebook's recorded runs they resolve to ID,
    # Estimated Speedup, Runtime Improvement, Function Name and Demangled Name.
    selected_cols = cols[[0,3,4,6,8]]
    appending_cols = [col for col in cols if is_interested(col)]
    print("Interested metrics in data file:",len(appending_cols))
    source_stat = source_data.loc[:,list(selected_cols)+list(appending_cols)]
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() used to append a stray "None" line to the output.
    source_stat.info()
    # Skip the first four descriptive columns: what remains is the grouping key
    # (fifth selected column) followed by the metric columns.
    metrics_by_kernel = source_stat.iloc[:,4:]
    print("Data types:", metrics_by_kernel.dtypes)
    res = metrics_by_kernel.groupby(['Demangled Name'], sort=True).mean()
    print("=== End of Processing:", source_data_path)
    return res


In [12]:
# Mean of each tracked metric per 'Demangled Name' for the q4 report.
q4_agg = get_aggregated_stat(q4_data_file)


=== Reading data: ncu_report_full_q4_2layers_utilization.csv
Interested metrics in data file: 5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   ID                                                                  300 non-null    int64  
 1   Estimated Speedup                                                   300 non-null    float64
 2   Runtime Improvement""(0)                                            300 non-null    float64
 3   Function Name                                                       300 non-null    object 
 4   Demangled Name                                                      300 non-null    object 
 5   sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%]  300 non-null    float64
 6   smsp__sass_thread_

In [13]:
# Mean of each tracked metric per 'Demangled Name' for the q8 report.
q8_agg = get_aggregated_stat(q8_data_file)

=== Reading data: ncu_report_full_q8_2layers_utilization.csv
Interested metrics in data file: 5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   ID                                                                  300 non-null    int64  
 1   Estimated Speedup                                                   300 non-null    float64
 2   Runtime Improvement""(0)                                            300 non-null    float64
 3   Function Name                                                       300 non-null    object 
 4   Demangled Name                                                      300 non-null    object 
 5   sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%]  300 non-null    float64
 6   smsp__sass_thread_

In [27]:
# Render floats in DataFrame output with thousands separators and two decimals.
pd.set_option("display.float_format", "{:,.2f}".format)


In [16]:
# Sanity check: index range, column names, non-null counts and dtypes of the q4 aggregate.
q4_agg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21 entries, k_compute_batched_ptrs(const __half *, const __half *, char *, const void **, void **, long, long, long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, long, long) to void soft_max_f32<(bool)1, (int)64, (int)64>(const float *, const float *, float *, int, int, float)
Data columns (total 5 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%]  21 non-null     float64
 1   smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst]          21 non-null     float64
 2   smsp__sass_thread_inst_executed_op_fp32_pred_on.sum [inst]          21 non-null     float64
 3   smsp__sass_thread_inst_executed_op_fp64_pred_on.sum [inst]          21 non-null     float64
 4   smsp__sass_th

In [17]:
# Averaged fp16 instruction-count column of the q4 aggregate, one value per 'Demangled Name'.
q4_agg["smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst]"]

Demangled Name
k_compute_batched_ptrs(const __half *, const __half *, char *, const void **, void **, long, long, long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, long, long)                0
k_sum_rows_f32(const float *, float *, int)                                                                                                                                                                                   0
quantize_q8_1(const float *, void *, int, int)                                                                                                                                                                                0
silu_f32(const float *, float *, int)                                                                                                                                                                                         0
turing_h1688gemm_256x64_ldg8_stages_32x1_tn                                              

In [28]:
# q4 aggregate ordered by the tensor-pipe active percentage, highest first.
q4_agg.sort_values(by=['sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%]'], ascending=False)

Unnamed: 0_level_0,sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%],smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp32_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp64_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_integer_pred_on.sum [inst]
Demangled Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
turing_h1688gemm_256x64_ldg8_stages_32x1_tn,29.32,1839104.0,0.0,0.0,1775616.0
"void mul_mat_vec_q<(int)32, (int)4, block_q4_0, (int)2, &vec_dot_q4_0_q8_1>(const void *, const void *, float *, int, int)",0.06,9300413.22,10517993.74,0.0,56186078.61
"void mul_mat_vec_q<(int)32, (int)8, block_q8_0, (int)2, &vec_dot_q8_0_q8_1>(const void *, const void *, float *, int, int)",0.02,1048576.0,1180672.0,0.0,3245056.0
"k_compute_batched_ptrs(const __half *, const __half *, char *, const void **, void **, long, long, long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, long, long)",0.0,0.0,64.0,0.0,2062.0
"void k_bin_bcast<&op_add, float, float, float>(const T2 *, const T3 *, T4 *, int, int, int, int, int, int, int, int, int, int, int, int, int, int)",0.0,0.0,14336.0,0.0,245760.0
"void soft_max_f32<(bool)1, (int)0, (int)0>(const float *, const float *, float *, int, int, float)",0.0,0.0,408.0,0.0,824.0
"void rope<float, (bool)1>(const T1 *, T1 *, int, const int *, float, int, float, float, float, rope_corr_dims)",0.0,0.0,23040.0,0.0,39680.0
"void rms_norm_f32<(int)1024>(const float *, float *, int, float)",0.0,0.0,21504.0,0.0,34880.0
"void k_get_rows_float<float, float>(const T1 *, const int *, T2 *, long, long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long)",0.0,0.0,512.0,0.0,9824.0
"void k_bin_bcast<&op_mul, float, float, float>(const T2 *, const T3 *, T4 *, int, int, int, int, int, int, int, int, int, int, int, int, int, int)",0.0,0.0,25959.78,0.0,446685.0


In [29]:
# q8 aggregate ordered by the tensor-pipe active percentage, highest first.
q8_agg.sort_values(by=['sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%]'], ascending=False)

Unnamed: 0_level_0,sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%],smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp32_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp64_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_integer_pred_on.sum [inst]
Demangled Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
turing_h1688gemm_256x64_ldg8_stages_32x1_tn,29.11,1839104.0,0.0,0.0,1775616.0
"void mul_mat_vec_q<(int)32, (int)8, block_q8_0, (int)2, &vec_dot_q8_0_q8_1>(const void *, const void *, float *, int, int)",0.02,10373412.57,11397156.57,0.0,30485686.86
"k_compute_batched_ptrs(const __half *, const __half *, char *, const void **, void **, long, long, long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, long, long)",0.0,0.0,64.0,0.0,2062.0
"void k_bin_bcast<&op_add, float, float, float>(const T2 *, const T3 *, T4 *, int, int, int, int, int, int, int, int, int, int, int, int, int, int)",0.0,0.0,14336.0,0.0,245760.0
"void soft_max_f32<(bool)1, (int)0, (int)0>(const float *, const float *, float *, int, int, float)",0.0,0.0,408.0,0.0,824.0
"void rope<float, (bool)1>(const T1 *, T1 *, int, const int *, float, int, float, float, float, rope_corr_dims)",0.0,0.0,23040.0,0.0,39680.0
"void rms_norm_f32<(int)1024>(const float *, float *, int, float)",0.0,0.0,21504.0,0.0,34880.0
"void k_get_rows_float<float, float>(const T1 *, const int *, T2 *, long, long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long)",0.0,0.0,512.0,0.0,9824.0
"void k_bin_bcast<&op_mul, float, float, float>(const T2 *, const T3 *, T4 *, int, int, int, int, int, int, int, int, int, int, int, int, int, int)",0.0,0.0,25959.78,0.0,446574.32
"void k_bin_bcast<&op_div, float, float, float>(const T2 *, const T3 *, T4 *, int, int, int, int, int, int, int, int, int, int, int, int, int, int)",0.0,0.0,9.0,0.0,121.0


In [19]:
# Element-wise q8 - q4, aligned on index and columns. fill_value=0.0 stands in
# for a value missing from one frame, so a row present in only one aggregate
# shows up with its full (signed) magnitude instead of NaN.
q8_agg.sub(q4_agg, fill_value=0.0)

Unnamed: 0_level_0,sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%],smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp32_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp64_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_integer_pred_on.sum [inst]
Demangled Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"k_compute_batched_ptrs(const __half *, const __half *, char *, const void **, void **, long, long, long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, long, long)",0,0,0,0,0
"k_sum_rows_f32(const float *, float *, int)",0,0,0,0,0
"quantize_q8_1(const float *, void *, int, int)",0,0,-600,0,-843
"silu_f32(const float *, float *, int)",0,0,0,0,0
turing_h1688gemm_256x64_ldg8_stages_32x1_tn,0,0,0,0,0
"void convert_unary<__half, float>(const void *, T2 *, int)",0,0,0,0,0
"void convert_unary<float, __half>(const void *, T2 *, int)",0,0,0,0,0
"void cpy_f32_f16<&cpy_1_f32_f16>(const char *, char *, int, int, int, int, int, int, int, int, int, int, int)",0,0,0,0,0
"void cpy_f32_f16<&cpy_1_f32_f32>(const char *, char *, int, int, int, int, int, int, int, int, int, int, int)",0,0,0,0,0
"void dequantize_mul_mat_vec<(int)1, (int)1, &convert_f16>(const void *, const float *, float *, int, int)",0,0,0,0,0


In [22]:
# q8 - q4 per 'Demangled Name'; a value missing from one aggregate is treated as 0.0.
diff = q8_agg.subtract(q4_agg, fill_value=0.0)
# Keep only rows where at least one metric differs. A kept row can still render
# as all zeros when the active display float format rounds a small difference away.
diff[diff.ne(0).any(axis="columns")]

Unnamed: 0_level_0,sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active [%],smsp__sass_thread_inst_executed_op_fp16_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp32_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_fp64_pred_on.sum [inst],smsp__sass_thread_inst_executed_op_integer_pred_on.sum [inst]
Demangled Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"quantize_q8_1(const float *, void *, int, int)",0,0,-600,0,-843
turing_h1688gemm_256x64_ldg8_stages_32x1_tn,0,0,0,0,0
"void k_bin_bcast<&op_mul, float, float, float>(const T2 *, const T3 *, T4 *, int, int, int, int, int, int, int, int, int, int, int, int, int, int)",0,0,0,0,-111
"void mul_mat_vec_q<(int)32, (int)4, block_q4_0, (int)2, &vec_dot_q4_0_q8_1>(const void *, const void *, float *, int, int)",0,-9300413,-10517994,0,-56186079
"void mul_mat_vec_q<(int)32, (int)8, block_q8_0, (int)2, &vec_dot_q8_0_q8_1>(const void *, const void *, float *, int, int)",0,9324837,10216485,0,27240631
