In [56]:
import pandas as pd
from sklearn.metrics import f1_score, recall_score, precision_score

## Accuracy

### Eval Categorical

In [5]:
df = pd.read_csv("eval_files/categorical.csv")

In [6]:
# df.head(5)
df.keys()

Index(['Unnamed: 0', 'skill', 'reader', 'adapter', 'timestamp', 'answer_base',
       'logits_answer_base', 'answer_quantized_model',
       'logits_answer_quantized_model', 'answer_onnx_model',
       'logits_answer_onnx_model', 'answer_onnx_opt_model',
       'logits_answer_onnx_opt_model', 'answer_quant_onnx_model',
       'logits_answer_quant_onnx_model', 'answer_quant_onnx_opt_model',
       'logits_answer_quant_onnx_opt_model', 'data_id', 'dataset', 'question',
       'context', 'answer_dataset'],
      dtype='object')

In [7]:
cat_df = df[df["skill"] == "categorical"]

In [27]:
def get_mean_diff_of_logits(datafr, model_name):
    l = datafr[model_name].str.replace("([\[\]])", "", regex=True)

    float_list = []
    for v in l:
        s = v.split()
        float_list.append([float(s[0]), float(s[1])])

    r_list = []
    for fe in float_list:
        r = fe[0] - fe[1]
        r_list.append(r)
        
    mean_diff = sum(r_list) / len(r_list)
    return abs(mean_diff) # return only pos. value




#### Compare to gold label

In [28]:
gold_label_column_name = "answer_dataset"
model_name_answer_column_list = ["answer_base", "answer_quantized_model", "answer_onnx_model", "answer_onnx_opt_model", "answer_quant_onnx_model", "answer_quant_onnx_opt_model"]
model_name_logits_column_list = ["logits_answer_base", "logits_answer_quantized_model", "logits_answer_onnx_model", "logits_answer_onnx_opt_model", "logits_answer_quant_onnx_model", "logits_answer_quant_onnx_opt_model"]

In [61]:
r_df = pd.DataFrame(columns=[
    "adapter", "reader", "column_name", 
    "accuracy", "total_true", "total_false", "total_amount",
    "precision", "recall", "f1"
])

for adapter in cat_df["adapter"].unique():
    adapter_cat_df = cat_df[cat_df["adapter"] == adapter]
    for reader in adapter_cat_df["reader"].unique():
        reader_adapter_cat_df = adapter_cat_df[adapter_cat_df["reader"] == reader].reset_index()
        print(f"Loading: {reader} {adapter}")

        total_amount = len(reader_adapter_cat_df)
        for column_name in model_name_answer_column_list:
            t = reader_adapter_cat_df.index[reader_adapter_cat_df[gold_label_column_name] == reader_adapter_cat_df[column_name]]
            f = reader_adapter_cat_df.index[reader_adapter_cat_df[gold_label_column_name] != reader_adapter_cat_df[column_name]]

            total_true = len(t)
            total_false = len(f)

            y_pred = reader_adapter_cat_df[column_name].to_list()
            y_true = reader_adapter_cat_df[gold_label_column_name].to_list()

            hits = (total_true)/total_amount

            # micro
            # macro 
            # weighted
            precision = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)
            f1 = f1_score(y_true, y_pred)


            print(f"Hits for {reader} {adapter} {column_name} is: {hits}. T: {total_true}. F: {total_false}.")

            r_df.loc[len(r_df)] = [
                adapter, reader, column_name, 
                hits, total_true, total_false, total_amount,
                precision, recall, f1
                ]

Loading: bert-base-uncased boolq
Hits for bert-base-uncased boolq answer_base is: 0.7339449541284404. T: 2400. F: 870.
Hits for bert-base-uncased boolq answer_quantized_model is: 0.7363914373088685. T: 2408. F: 862.
Hits for bert-base-uncased boolq answer_onnx_model is: 0.6174311926605505. T: 2019. F: 1251.
Hits for bert-base-uncased boolq answer_onnx_opt_model is: 0.6174311926605505. T: 2019. F: 1251.
Hits for bert-base-uncased boolq answer_quant_onnx_model is: 0.5376146788990825. T: 1758. F: 1512.
Hits for bert-base-uncased boolq answer_quant_onnx_opt_model is: 0.5376146788990825. T: 1758. F: 1512.
Loading: roberta-base boolq
Hits for roberta-base boolq answer_base is: 0.789908256880734. T: 2583. F: 687.
Hits for roberta-base boolq answer_quantized_model is: 0.791743119266055. T: 2589. F: 681.
Hits for roberta-base boolq answer_onnx_model is: 0.7660550458715596. T: 2505. F: 765.
Hits for roberta-base boolq answer_onnx_opt_model is: 0.7660550458715596. T: 2505. F: 765.
Hits for robert

In [62]:
r_df

Unnamed: 0,adapter,reader,column_name,accuracy,total_true,total_false,total_amount,precision,recall,f1
0,boolq,bert-base-uncased,answer_base,0.733945,2400,870,3270,0.761348,0.833251,0.795679
1,boolq,bert-base-uncased,answer_quantized_model,0.736391,2408,862,3270,0.762203,0.837186,0.797937
2,boolq,bert-base-uncased,answer_onnx_model,0.617431,2019,1251,3270,0.692611,0.691589,0.692099
3,boolq,bert-base-uncased,answer_onnx_opt_model,0.617431,2019,1251,3270,0.692611,0.691589,0.692099
4,boolq,bert-base-uncased,answer_quant_onnx_model,0.537615,1758,1512,3270,0.747388,0.387113,0.510045
5,boolq,bert-base-uncased,answer_quant_onnx_opt_model,0.537615,1758,1512,3270,0.747388,0.387113,0.510045
6,boolq,roberta-base,answer_base,0.789908,2583,687,3270,0.824181,0.841613,0.832806
7,boolq,roberta-base,answer_quantized_model,0.791743,2589,681,3270,0.824376,0.845057,0.834588
8,boolq,roberta-base,answer_onnx_model,0.766055,2505,765,3270,0.771637,0.885883,0.824823
9,boolq,roberta-base,answer_onnx_opt_model,0.766055,2505,765,3270,0.771637,0.885883,0.824823


In [30]:
# Calc diff of logits (of binary) if pred is wrong.

In [40]:
def get_mean_diff_of_logits_2(datafr, model_name):
    l = datafr[model_name].str.replace("([\[\]'])", "", regex=True)

    float_list = []
    for v in l:
        s = v.split(", ")
        float_list.append([float(s[0]), float(s[1])])

    r_list = []
    for fe in float_list:
        r = fe[0] - fe[1]
        r_list.append(r)
        
    mean_diff = sum(r_list) / len(r_list)
    return abs(mean_diff) # return only pos. value

In [41]:
# Testing

t = df.index[df["answer_base"] == df["answer_onnx_model"]]
f = df.index[df["answer_base"] != df["answer_onnx_model"]]

false_onnx = df.iloc[f]
true_onnx = df.iloc[t]

#get result for false results for onnx model
model_name_logits = model_name_logits_column_list[2]
mean_diff_false_results = get_mean_diff_of_logits_2(false_onnx, model_name_logits)
print(mean_diff_false_results)
mean_diff_true_results = get_mean_diff_of_logits_2(true_onnx, model_name_logits)
print(mean_diff_true_results)
mean_diff_all_results = get_mean_diff_of_logits_2(df, model_name_logits)
print(mean_diff_all_results)

0.16006458114519867
1.3025332073218743
1.0647809442903062


In [42]:
for adapter in cat_df["adapter"].unique():
    adapter_cat_df = cat_df[cat_df["adapter"] == adapter]
    for reader in adapter_cat_df["reader"].unique():
        reader_adapter_cat_df = adapter_cat_df[adapter_cat_df["reader"] == reader].reset_index()
        print(f"Loading: {reader} {adapter}")
        
        for column_name in model_name_answer_column_list:
            t = reader_adapter_cat_df.index[reader_adapter_cat_df[gold_label_column_name] == reader_adapter_cat_df[column_name]]
            f = reader_adapter_cat_df.index[reader_adapter_cat_df[gold_label_column_name] != reader_adapter_cat_df[column_name]]

            column_name_logits = model_name_logits_column_list[model_name_answer_column_list.index(column_name)]
            false_pred_df = reader_adapter_cat_df.iloc[f]
            true_pred_df = reader_adapter_cat_df.iloc[t]

            mean_diff_false_results = get_mean_diff_of_logits_2(false_pred_df, column_name_logits)
            mean_diff_true_results = get_mean_diff_of_logits_2(true_pred_df, column_name_logits)
            print(f"{reader} {adapter}. Mean diff logits wrong answer: {column_name_logits} {mean_diff_false_results}")
            print(f"{reader} {adapter}. Mean diff logits right answer: {column_name_logits} {mean_diff_true_results}")

            print("______\n")

Loading: bert-base-uncased boolq
bert-base-uncased boolq. Mean diff logits wrong answer: logits_answer_base 0.5527182899770107
bert-base-uncased boolq. Mean diff logits right answer: logits_answer_base 1.2831902399541695
______

bert-base-uncased boolq. Mean diff logits wrong answer: logits_answer_quantized_model 0.5779924959446868
bert-base-uncased boolq. Mean diff logits right answer: logits_answer_quantized_model 1.2983542962500025
______

bert-base-uncased boolq. Mean diff logits wrong answer: logits_answer_onnx_model 0.10977075498867121
bert-base-uncased boolq. Mean diff logits right answer: logits_answer_onnx_model 0.7066019500930119
______

bert-base-uncased boolq. Mean diff logits wrong answer: logits_answer_onnx_opt_model 0.10977075498867121
bert-base-uncased boolq. Mean diff logits right answer: logits_answer_onnx_opt_model 0.7066019500930119
______

bert-base-uncased boolq. Mean diff logits wrong answer: logits_answer_quant_onnx_model 0.5516529688425929
bert-base-uncased boo

#### Compare to Base pred

In [63]:
gold_label_column_name = "answer_base"
model_name_answer_column_list = ["answer_quantized_model", "answer_onnx_model", "answer_onnx_opt_model", "answer_quant_onnx_model", "answer_quant_onnx_opt_model"]
model_name_logits_column_list = ["logits_answer_quantized_model", "logits_answer_onnx_model", "logits_answer_onnx_opt_model", "logits_answer_quant_onnx_model", "logits_answer_quant_onnx_opt_model"]

In [68]:
r_df = pd.DataFrame(columns=[
    "adapter", "reader", "column_name", 
    "accuracy", "total_true", "total_false", "total_amount",
    "precision", "recall", "f1"
])
for adapter in cat_df["adapter"].unique():
    adapter_cat_df = cat_df[cat_df["adapter"] == adapter]
    for reader in adapter_cat_df["reader"].unique():
        reader_adapter_cat_df = adapter_cat_df[adapter_cat_df["reader"] == reader].reset_index()
        # print(f"Loading: {reader} {adapter}")

        total_amount = len(reader_adapter_cat_df)
        for column_name in model_name_answer_column_list:
            t = reader_adapter_cat_df.index[reader_adapter_cat_df[gold_label_column_name] == reader_adapter_cat_df[column_name]]
            f = reader_adapter_cat_df.index[reader_adapter_cat_df[gold_label_column_name] != reader_adapter_cat_df[column_name]]
            
            total_true = len(t)
            total_false = len(f)

            y_pred = reader_adapter_cat_df[column_name].to_list()
            y_true = reader_adapter_cat_df[gold_label_column_name].to_list()

            precision = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)
            f1 = f1_score(y_true, y_pred)

            accuracy = (total_true)/total_amount
            r_df.loc[len(r_df)] = [
                adapter, reader, column_name, 
                accuracy, total_true, total_false, total_amount,
                precision, recall, f1
            ]
            # print(f"Accuracy for {reader} {adapter} {column_name} is: {accuracy}. T: {total_true}. F: {total_false}.")

In [69]:
# r_df.head()
r_df

Unnamed: 0,adapter,reader,column_name,accuracy,total_true,total_false,total_amount,precision,recall,f1
0,boolq,bert-base-uncased,answer_quantized_model,0.991437,3242,28,3270,0.991939,0.995506,0.993719
1,boolq,bert-base-uncased,answer_onnx_model,0.709174,2319,951,3270,0.813793,0.742472,0.776498
2,boolq,bert-base-uncased,answer_onnx_opt_model,0.709174,2319,951,3270,0.813793,0.742472,0.776498
3,boolq,bert-base-uncased,answer_quant_onnx_model,0.556575,1820,1450,3270,0.867996,0.410787,0.557657
4,boolq,bert-base-uncased,answer_quant_onnx_opt_model,0.556575,1820,1450,3270,0.867996,0.410787,0.557657
5,boolq,roberta-base,answer_quantized_model,0.987768,3230,40,3270,0.988484,0.992293,0.990385
6,boolq,roberta-base,answer_onnx_model,0.874618,2860,410,3270,0.856898,0.963391,0.907029
7,boolq,roberta-base,answer_onnx_opt_model,0.874618,2860,410,3270,0.856898,0.963391,0.907029
8,boolq,roberta-base,answer_quant_onnx_model,0.848624,2775,495,3270,0.835669,0.947977,0.888287
9,boolq,roberta-base,answer_quant_onnx_opt_model,0.848624,2775,495,3270,0.835669,0.947977,0.888287


In [77]:
# for adapter in cat_df["adapter"].unique():
#     adapter_cat_df = cat_df[cat_df["adapter"] == adapter]
#     for reader in adapter_cat_df["reader"].unique():
#         reader_adapter_cat_df = adapter_cat_df[adapter_cat_df["reader"] == reader].reset_index()
#         print(f"Loading: {reader} {adapter}")
        
#         for column_name in model_name_answer_column_list:
#             t = reader_adapter_cat_df.index[reader_adapter_cat_df[gold_label_column_name] == reader_adapter_cat_df[column_name]]
#             f = reader_adapter_cat_df.index[reader_adapter_cat_df[gold_label_column_name] != reader_adapter_cat_df[column_name]]

#             column_name_logits = model_name_logits_column_list[model_name_answer_column_list.index(column_name)]
#             false_pred_df = reader_adapter_cat_df.iloc[f]
#             true_pred_df = reader_adapter_cat_df.iloc[t]

#             mean_diff_false_results = get_mean_diff_of_logits(false_pred_df, column_name_logits)
#             mean_diff_true_results = get_mean_diff_of_logits(true_pred_df, column_name_logits)
            
            
#             print(f"{reader} {adapter}. Mean diff logits wrong answer: {column_name_logits} {mean_diff_false_results}")
#             print(f"{reader} {adapter}. Mean diff logits right answer: {column_name_logits} {mean_diff_true_results}")

#             print("______\n")

### Eval Mcq

In [71]:
df = pd.read_csv("eval_files/multiple_choice.csv")
mcq_df = df[df["skill"] == "multiple-choice"]

In [72]:
mcq_df.keys()

Index(['Unnamed: 0', 'skill', 'reader', 'adapter', 'timestamp', 'answer_base',
       'logits_answer_base', 'answer_quantized_model',
       'logits_answer_quantized_model', 'answer_onnx_model',
       'logits_answer_onnx_model', 'answer_onnx_opt_model',
       'logits_answer_onnx_opt_model', 'answer_quant_onnx_model',
       'logits_answer_quant_onnx_model', 'answer_quant_onnx_opt_model',
       'logits_answer_quant_onnx_opt_model', 'data_id', 'dataset', 'question',
       'context', 'answer_dataset'],
      dtype='object')

In [73]:
# mcq_df

In [74]:
gold_label_column_name = "answer_dataset"
model_name_answer_column_list = ["answer_base", "answer_quantized_model", "answer_onnx_model", "answer_onnx_opt_model", "answer_quant_onnx_model", "answer_quant_onnx_opt_model"]
model_name_logits_column_list = ["logits_answer_base", "logits_answer_quantized_model", "logits_answer_onnx_model", "logits_answer_onnx_opt_model", "logits_answer_quant_onnx_model", "logits_answer_quant_onnx_opt_model"]

In [78]:
r_df = pd.DataFrame(columns=[
    "adapter", "reader", "column_name", 
    "accuracy", "total_true", "total_false", "total_amount",
    "precision", "recall", "f1"
])

for adapter in mcq_df["adapter"].unique():
    adapter_mcq_df = mcq_df[mcq_df["adapter"] == adapter]
    for reader in adapter_mcq_df["reader"].unique():
        # print(f"Loading: {reader} {adapter}")
        reader_adapter_mcq_df = adapter_mcq_df[adapter_mcq_df["reader"] == reader].reset_index()

        total_amount = len(reader_adapter_mcq_df)
        for column_name in model_name_answer_column_list:
            t = reader_adapter_mcq_df.index[reader_adapter_mcq_df["answer_dataset"] == reader_adapter_mcq_df[column_name]]
            f = reader_adapter_mcq_df.index[reader_adapter_mcq_df["answer_dataset"] != reader_adapter_mcq_df[column_name]]

            total_true = len(t)
            total_false = len(f)

            y_pred = reader_adapter_cat_df[column_name].to_list()
            y_true = reader_adapter_cat_df[gold_label_column_name].to_list()

            precision = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)
            f1 = f1_score(y_true, y_pred)

            accuracy = (total_true)/total_amount
            r_df.loc[len(r_df)] = [
                adapter, reader, column_name, 
                accuracy, total_true, total_false, total_amount,
                precision, recall, f1
            ]
            
            # print(f"Accuracy for {reader} {adapter} {column_name} is: {accuracy}. T: {total_true}. F: {total_false}.")


In [80]:
r_df.head()

Unnamed: 0,adapter,reader,column_name,accuracy,total_true,total_false,total_amount,precision,recall,f1
0,quartz,roberta-base,answer_base,0.783854,301,83,384,0.824181,0.841613,0.832806
1,quartz,roberta-base,answer_quantized_model,0.778646,299,85,384,0.824376,0.845057,0.834588
2,quartz,roberta-base,answer_onnx_model,0.671875,258,126,384,0.771637,0.885883,0.824823
3,quartz,roberta-base,answer_onnx_opt_model,0.671875,258,126,384,0.771637,0.885883,0.824823
4,quartz,roberta-base,answer_quant_onnx_model,0.65625,252,132,384,0.76051,0.880964,0.816317


### Compare to base 

In [81]:
mcq_df.keys()

Index(['Unnamed: 0', 'skill', 'reader', 'adapter', 'timestamp', 'answer_base',
       'logits_answer_base', 'answer_quantized_model',
       'logits_answer_quantized_model', 'answer_onnx_model',
       'logits_answer_onnx_model', 'answer_onnx_opt_model',
       'logits_answer_onnx_opt_model', 'answer_quant_onnx_model',
       'logits_answer_quant_onnx_model', 'answer_quant_onnx_opt_model',
       'logits_answer_quant_onnx_opt_model', 'data_id', 'dataset', 'question',
       'context', 'answer_dataset'],
      dtype='object')

In [82]:
gold_label_column_name = "answer_base"
model_name_answer_column_list = ["answer_quantized_model", "answer_onnx_model", "answer_onnx_opt_model", "answer_quant_onnx_model", "answer_quant_onnx_opt_model"]
model_name_logits_column_list = ["logits_answer_quantized_model", "logits_answer_onnx_model", "logits_answer_onnx_opt_model", "logits_answer_quant_onnx_model", "logits_answer_quant_onnx_opt_model"]

In [87]:
r_df = pd.DataFrame(columns=[
    "adapter", "reader", "column_name", 
    "accuracy", "total_true", "total_false", "total_amount",
    "precision", "recall", "f1"
])
for adapter in mcq_df["adapter"].unique():
    adapter_mcq_df = mcq_df[mcq_df["adapter"] == adapter]
    for reader in adapter_mcq_df["reader"].unique():
        reader_adapter_mcq_df = adapter_mcq_df[adapter_mcq_df["reader"] == reader].reset_index()
        # print(f"Loading: {reader} {adapter}")

        total_amount = len(reader_adapter_mcq_df)
        for column_name in model_name_answer_column_list:
            t = reader_adapter_mcq_df.index[reader_adapter_mcq_df[gold_label_column_name] == reader_adapter_mcq_df[column_name]]
            f = reader_adapter_mcq_df.index[reader_adapter_mcq_df[gold_label_column_name] != reader_adapter_mcq_df[column_name]]
            
            total_true = len(t)
            total_false = len(f)

            y_pred = reader_adapter_mcq_df[column_name].to_list()
            y_true = reader_adapter_mcq_df[gold_label_column_name].to_list()

            precision = precision_score(y_true, y_pred, average="macro")
            recall = recall_score(y_true, y_pred, average="macro")
            f1 = f1_score(y_true, y_pred, average="macro")

            accuracy = (total_true)/total_amount
            r_df.loc[len(r_df)] = [
                adapter, reader, column_name, 
                accuracy, total_true, total_false, total_amount,
                precision, recall, f1
            ]
            # print(f"Accuracy for {reader} {adapter} {column_name} is: {accuracy}. T: {total_true}. F: {total_false}.")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [88]:
r_df

Unnamed: 0,adapter,reader,column_name,accuracy,total_true,total_false,total_amount,precision,recall,f1
0,quartz,roberta-base,answer_quantized_model,0.958333,368,16,384,0.927064,0.934821,0.925619
1,quartz,roberta-base,answer_onnx_model,0.721354,277,107,384,0.589933,0.662096,0.602677
2,quartz,roberta-base,answer_onnx_opt_model,0.721354,277,107,384,0.589933,0.662096,0.602677
3,quartz,roberta-base,answer_quant_onnx_model,0.726562,279,105,384,0.63649,0.670749,0.632701
4,quartz,roberta-base,answer_quant_onnx_opt_model,0.726562,279,105,384,0.63649,0.670749,0.632701
5,quartz,bert-base-uncased,answer_quantized_model,0.755208,290,94,384,0.618211,0.555243,0.569021
6,quartz,bert-base-uncased,answer_onnx_model,0.505208,194,190,384,0.367831,0.380743,0.362298
7,quartz,bert-base-uncased,answer_onnx_opt_model,0.505208,194,190,384,0.367831,0.380743,0.362298
8,quartz,bert-base-uncased,answer_quant_onnx_model,0.513021,197,187,384,0.42283,0.355577,0.370302
9,quartz,bert-base-uncased,answer_quant_onnx_opt_model,0.513021,197,187,384,0.42283,0.355577,0.370302


In [43]:
reader_adapter_mcq_df.keys()

Index(['index', 'skill', 'reader', 'adapter', 'timestamp', 'answer_base',
       'logits_answer_base', 'answer_quantized_model',
       'logits_answer_quantized_model', 'answer_onnx_model',
       'logits_answer_onnx_model', 'answer_onnx_opt_model',
       'logits_answer_onnx_opt_model', 'answer_quant_onnx_model',
       'logits_answer_quant_onnx_model', 'answer_quant_onnx_opt_model',
       'logits_answer_quant_onnx_opt_model', 'data_id', 'dataset', 'question',
       'context', 'choices', 'answer_dataset'],
      dtype='object')

In [96]:
test_ind = 0

l = mcq_df["choices"].str.replace("([\[\]])", "", regex=True)
choices = l.iloc[test_ind].split("', '")
choices[0] = choices[0][1:]
choices[-1] = choices[-1][:-1]

choice = mcq_df.iloc[test_ind]["choices"]

golden_answer = mcq_df.iloc[test_ind]["answer_dataset"]
base_answer = mcq_df.iloc[test_ind]["answer_base"]
onnx_answer = mcq_df.iloc[test_ind]["answer_onnx_model"]

In [97]:
index_of_golden_answer = choices.index(golden_answer)
index_of_base_answer = choices.index(base_answer)
index_of_onnx_answer = choices.index(onnx_answer)

In [99]:
l2 = mcq_df["logits_answer_base"].str.replace("([\[\]])", "", regex=True)
l3 = mcq_df["logits_answer_onnx_model"].str.replace("([\[\]])", "", regex=True)

In [110]:
re_base = [float(e) for e in l2.iloc[test_ind].split()]
re_onnx = [float(e) for e in l3.iloc[test_ind].split()]


In [119]:
logits_base_answer = re_base[index_of_base_answer]
logits_onnx_answer = re_onnx[index_of_onnx_answer]

# print(logits_base_answer)
# print(logits_onnx_answer)

diff = abs(logits_base_answer - logits_onnx_answer)

print(diff)

1.22903556


In [117]:
model_name = model_name_logits_column_list[2]
print(model_name)

logits_answer_onnx_model


In [36]:
l = reader_adapter_mcq_df[model_name].str.replace("([\[\]])", "", regex=True)

In [40]:
float_list = []
for v in l:
    s = v.split()
    float_list.append([float(s[r]) for r in range(len(s))])

In [None]:
#TODO find right answer

In [None]:
r_list = []
for fe in float_list:
    r = fe[0] - fe[1]
    r_list.append(r)

In [None]:
def get_mean_diff_of_logits(datafr, model_name):
    l = datafr[model_name].str.replace("([\[\]])", "", regex=True)

    float_list = []
    for v in l:
        s = v.split()
        float_list.append([float(s[0]), float(s[1])])

    r_list = []
    for fe in float_list:
        r = fe[0] - fe[1]
        r_list.append(r)
        
    mean_diff = sum(r_list) / len(r_list)
    return abs(mean_diff) # return only pos. value

## Archive Inference Time

In [169]:
df = pd.read_csv("inference_time_categorical.csv")
# df = pd.read_csv("inference_time_extractive.csv")
# df = pd.read_csv("")

In [171]:
df.head()

Unnamed: 0.1,Unnamed: 0,adapter,reader,model_name,mean_time,median_time,min_time,max_time,mean_time_per_token,median_time_per_token,min_time_per_token,max_time_per_token,runs,time_unique_values,seq_length,context,question,data_id
0,0,drop,bert-base-uncased,Base,129.51169,116.29796,100.482702,190.842867,0.824915,0.740751,100.482702,190.842867,5,"[116.29796028137208, 190.842866897583, 108.004...",157,Hoping to rebound from their loss to the Patr...,Who scored the first touchdown of the game?,f37e81fa-ef7b-4583-b671-762fc433faa9
1,1,drop,bert-base-uncased,Base,118.036652,101.804018,88.285923,202.951193,0.751826,0.648433,88.285923,202.951193,5,"[106.0810089111328, 202.95119285583496, 88.285...",157,Hoping to rebound from their loss to the Patr...,How many field goals did Kris Brown kick?,ac6ba235-3024-4f63-a6ab-730a14def4cb
2,2,drop,bert-base-uncased,Base,127.039289,119.737864,104.492188,150.061131,0.809167,0.762662,104.492188,150.061131,5,"[117.91324615478516, 150.06113052368164, 104.4...",157,Hoping to rebound from their loss to the Patr...,Which team won the game?,2c7c93f6-69ed-47cc-a5af-94a00c185a26
3,3,drop,bert-base-uncased,Base,119.86289,109.89213,100.628376,161.371231,0.763458,0.69995,100.628376,161.371231,5,"[119.31896209716795, 161.37123107910156, 109.8...",157,Hoping to rebound from their loss to the Patr...,How many field goals did both teams kick in th...,7dfd2b64-f39e-4bb4-aeb0-1900adda6018
4,4,drop,bert-base-uncased,Base,111.07955,109.916925,101.395845,126.97196,0.707513,0.700108,101.395845,126.97196,5,"[126.97196006774902, 113.73090744018556, 103.3...",157,Hoping to rebound from their loss to the Patr...,How many more yards was Kris Browns's first fi...,121a8f57-7752-4373-a9ba-748b2c577cd2


In [170]:
df["time_per_token"] = df["time once (ms)"]/df["seq_length"]

KeyError: 'time once (ms)'

In [140]:
print(len(df))
# df.head()
df.tail()


24000


Unnamed: 0,reader,adapter,model_name,time once (ms),average_time 50 times (ms),seq_length,context,question,data_id,time_per_token
23995,roberta-base,squad_v2,ONNX-OPT Quantized,43.093204,,96,"In 1066, Duke William II of Normandy conquered...",Where did Harold II die?,56de16ca4396321400ee25c5,0.448888
23996,roberta-base,squad_v2,ONNX-OPT Quantized,38.706064,,96,"In 1066, Duke William II of Normandy conquered...",Who killed Harold II?,56de16ca4396321400ee25c6,0.403188
23997,roberta-base,squad_v2,ONNX-OPT Quantized,37.344933,,96,"In 1066, Duke William II of Normandy conquered...",When was the Battle of Hastings?,56de16ca4396321400ee25c7,0.38901
23998,roberta-base,squad_v2,ONNX-OPT Quantized,40.225029,,96,"In 1066, Duke William II of Normandy conquered...",Who was the ruling class ahead of the Normans?,56de16ca4396321400ee25c8,0.419011
23999,roberta-base,squad_v2,ONNX-OPT Quantized,37.925959,,96,"In 1066, Duke William II of Normandy conquered...",When did King Harold II conquer England?,5ad3f4b1604f3c001a3ff951,0.395062


In [141]:
data_id_list = df["data_id"].unique().tolist()
adapter_list = df["adapter"].unique().tolist()
reader_list = df["reader"].unique().tolist()
model_name_list = df["model_name"].unique().tolist()

In [165]:
df_fin = pd.DataFrame(columns=["adapter", "reader", "model_name", "mean_time","median_time", "min_time", "max_time", "mean_time_per_token", "median_time_per_token", "min_time_per_token", "max_time_per_token", "runs", "time_unique_values", "seq_length", "context", "question", "data_id"])
df_overall = pd.DataFrame(columns=["adapter", "reader", "model_name", "mean_time","median_time", "min_time", "max_time", "mean_time_per_token", "median_time_per_token", "min_time_per_token", "max_time_per_token", "runs", "av_seq_length"])

In [167]:
for adapter in adapter_list:
    # print(f"Doing {adapter}")
    df_adapter = df[df["adapter"] == adapter]
    
    for reader in reader_list:
        # print(f"Doing {reader}")
        df_reader = df_adapter[df_adapter["reader"] == reader]
        
        for model_name in model_name_list:
            # print(f"Doing {model_name}")
            df_model = df_reader[df_reader["model_name"]== model_name]

            for data_id in data_id_list:
                df_data_id = df_model[df_model["data_id"] == data_id]
                if df_data_id.empty:
                    continue
                # print(f"Doing {data_id}")
                
                time_unique_values = df_data_id["time once (ms)"].tolist()
                runs = len(time_unique_values)

                mean_time = df_data_id["time once (ms)"].mean()
                median_time = df_data_id["time once (ms)"].median()
                min_time = df_data_id["time once (ms)"].min()
                max_time = df_data_id["time once (ms)"].max()
                
                mean_time_per_token = df_data_id["time_per_token"].mean()
                median_time_per_token = df_data_id["time_per_token"].median()
                min_time_per_token = df_data_id["time once (ms)"].min()
                max_time_per_token = df_data_id["time once (ms)"].max()

                # add question, context. 
                seq_length = df_data_id["seq_length"].unique()[0]
                context = df_data_id["context"].unique()[0]
                question = df_data_id["question"].unique()[0]


                df_fin.loc[len(df_fin)] = [adapter, reader, model_name, mean_time,median_time, min_time, max_time, mean_time_per_token, median_time_per_token, min_time_per_token, max_time_per_token, runs, time_unique_values, seq_length, context, question, data_id]

            

            runs = len(df_model["time once (ms)"].tolist())

            overall_mean_time = df_model["time once (ms)"].mean()
            overall_median_time = df_model["time once (ms)"].median()
            overall_min_time = df_model["time once (ms)"].min()
            overall_max_time = df_model["time once (ms)"].max()

            overall_mean_time_per_token = df_model["time_per_token"].mean()
            overall_median_time_per_token = df_model["time_per_token"].median()
            overall_min_time_per_token = df_model["time once (ms)"].min()
            overall_max_time_per_token = df_model["time once (ms)"].max()
            av_seq_length = df_model["seq_length"].sum()/len(df_model["seq_length"])
            
            df_overall.loc[len(df_fin)] = [adapter, reader, model_name, mean_time, median_time, min_time, max_time, mean_time_per_token, median_time_per_token, min_time_per_token, max_time_per_token, runs, av_seq_length]

In [168]:
df_overall.to_csv(f"inference_time_{skill}_overall.csv")
df_overall.to_excel(f"inference_time_{skill}_overall.xlsx")

In [158]:
df_fin.to_csv(f"inference_time_{skill}_overall.csv")
df_fin.to_excel(f"inference_time_{skill}_overall.xlsx") # drop duplicates - currently manually

### archive

In [43]:
for data_id in df["data_id"]:
    print(len(df))

    rows = df.loc[df["data_id"] == data_id]

    for model_name in df['model_name'].unique():
        
        model = rows[rows["model_name"] == model_name]
        
        try: 
            runs = len(model)

            run_min = model["time once (ms)"].min()
            run_max = model["time once (ms)"].max()
            run_median = model["time once (ms)"].median()

            seq_length = model["seq_length"].values[0]

            min_time_per_token = run_min/seq_length
            max_time_per_token = run_max/seq_length
            median_time_per_token = run_median/seq_length
            
            context = model["context"].values[0]
            question = model["question"].values[0]
        

            df_fin.loc[len(df_fin)] = [model_name, "", run_min, run_max, run_median, runs, seq_length, min_time_per_token, max_time_per_token, median_time_per_token, context, question, data_id]
            
        except Exception as e:
            print("error")
            print(e)
            pass

        break
    break

    
    
    df = df.drop(df.loc[df["data_id"] == data_id].index) #reduce search space


4200


In [44]:
model

Unnamed: 0,model_name,time once (ms),average_time 50 times (ms),seq_length,context,question,choices,data_id,data_set_name,time_per_token
0,Base,258.271933,,92,Do i need to go for a legal divorce ? I wanted...,Why is this person asking about divorce ?,['If he gets married in the church he wo nt ha...,0,cosmos_qa,2.807304
600,Base,299.651861,,92,Do i need to go for a legal divorce ? I wanted...,Why is this person asking about divorce ?,['If he gets married in the church he wo nt ha...,0,cosmos_qa,3.257085
1200,Base,1240.674257,,310,Candy watched the bearded man drive his silver...,How long was Candy trying to seduce Larry?,"['about 10 minutes', 'about 2 hours', 'not eno...",0,quail,4.002175
1800,Base,1662.490129,,310,Candy watched the bearded man drive his silver...,How long was Candy trying to seduce Larry?,"['about 10 minutes', 'about 2 hours', 'not eno...",0,quail,5.362871
2400,Base,62.294006,,19,"When particles of matter are closer together, ...",If Jim moves some particles of matter farther ...,"['decrease', 'increase']",0,quartz,3.278632
3000,Base,1608.65593,,339,"I am a psychologist. I first met Timothy, a qu...",What did the writer think of Timothy after lea...,"['Timothy was very hardworking.', 'Timothy was...",0,race,4.745298
3600,Base,1163.430929,,339,"I am a psychologist. I first met Timothy, a qu...",What did the writer think of Timothy after lea...,"['Timothy was very hardworking.', 'Timothy was...",0,race,3.43195


In [8]:
df_fin.to_csv("inference_time_categorical.csv")
df_fin.to_excel("inference_time_categorical.xlsx") # drop duplicates - currently manually

In [97]:
print(len(df_fin))

274708


In [100]:
df_droped = df_fin.drop_duplicates()

In [101]:
df_droped.to_csv("analyse4.csv")
df_droped.to_excel("analyse4.xlsx")