In [1]:
%%javascript
// Make every output area answer "do not scroll", so long cell outputs are
// shown in full instead of being collapsed into a scroll box.
// The `IPython` global is supplied by the notebook front end; front ends
// that do not define it (e.g. JupyterLab) would otherwise raise a
// ReferenceError here, so the override is skipped when it is absent.
if (typeof IPython !== "undefined" && IPython.OutputArea) {
    IPython.OutputArea.prototype._should_scroll = function(lines) {
        return false;
    };
}


<IPython.core.display.Javascript object>

In [2]:
import copy
import os

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams

In [3]:
%matplotlib inline

In [4]:
# ANSI escape sequences used to style text written to the cell output.
# 'END' resets the styling switched on by any of the other entries.
TEXT_COLOUR = dict(
    PURPLE='\033[95m',
    CYAN='\033[96m',
    DARKCYAN='\033[36m',
    BLUE='\033[94m',
    GREEN='\033[92m',
    YELLOW='\033[93m',
    RED='\033[91m',
    BOLD='\033[1m',
    UNDERLINE='\033[4m',
    END='\033[0m',
)


def _print_styled(style, msgs):
    """Print the `style` escape code, then `msgs`, then the reset code.

    Each part goes through its own ``print`` call, so the two escape codes
    end up on separate lines around the (space-separated) messages.
    """
    for parts in ((TEXT_COLOUR[style],), msgs, (TEXT_COLOUR['END'],)):
        print(*parts)


def print_bold(*msgs):
    """Print `msgs` between the BOLD and END escape codes."""
    _print_styled('BOLD', msgs)


def print_green(*msgs):
    """Print `msgs` between the GREEN and END escape codes."""
    _print_styled('GREEN', msgs)


def print_error(*msgs):
    """Print `msgs` between the RED and END escape codes."""
    _print_styled('RED', msgs)


def _wrap(style, msg):
    """Return the string `msg` enclosed by the `style` and END escape codes."""
    return TEXT_COLOUR[style] + msg + TEXT_COLOUR['END']


def wrap_green(msg):
    """Return `msg` (a string) wrapped in the GREEN/END escape codes."""
    return _wrap('GREEN', msg)


def wrap_red(msg):
    """Return `msg` (a string) wrapped in the RED/END escape codes."""
    return _wrap('RED', msg)


def up_down_str(val):
    """Return ``str(val)``, green when `val` is positive, red when negative.

    A value that is neither above nor below zero is returned unstyled.
    """
    text = str(val)
    if val > 0:
        return wrap_green(text)
    if val < 0:
        return wrap_red(text)
    return text

In [5]:
# Experiment name: used as a directory component when reading result files
# and in the plot title / saved image path.
exp='roberta-base'
# Layer count: the notebook looks at layer indices num_layers/2 .. num_layers-1
# when building result-file prefixes and x-axis labels.
num_layers = 12

In [6]:
# Tasks processed by the notebook, in the order they are loaded and reported.
tasks = ["CoLA","SST-2","MRPC","STS-B","QQP","MNLI", "MNLI-MM", "QNLI","RTE"]

# Metric name(s) per task; each name is part of the result file names that are
# read for the task (e.g. "base-<metric>.txt").
metrics = {
    "CoLA":["mcc"],
    "MNLI":["acc"],
    "MNLI-MM":["acc"],
    "MRPC":["f1"],
    "QNLI":["acc"],
    "QQP":["f1"],
    "RTE":["acc"],
    "SST-2":["acc"],
    "STS-B":["spearmanr"],
    "WNLI":["acc"] # temp: WNLI is not listed in `tasks`, so this entry is not iterated over
}

# Per-task reference scores; every entry is currently the placeholder 0.0.
# NOTE(review): draw_graph treats a `paper` value of 0.0 as "nothing to draw";
# presumably these values are what gets passed as `paper` - confirm at the call site.
reported_in_paper = {
    "CoLA":0.00,
    "MNLI":0.00,
    "MNLI-MM":0.0,
    "MRPC":0.00,
    "QNLI":0.00,
    "QQP":0.00,
    "RTE":0.00,
    "SST-2":0.00,
    "STS-B":0.00,
    "WNLI":0.00
}

In [7]:

def get_average_val(lines, keep_ratio=0.9):
    """Average the best values found in `lines`, ignoring clear outliers.

    Each non-blank line must contain at least two whitespace-separated
    fields; the second one is parsed as a float. Values equal to zero are
    discarded. Of the remaining values, the largest is always kept and every
    other value is kept only if it is strictly greater than
    ``keep_ratio * largest``. The mean of the kept values is returned.

    Args:
        lines: iterable of text lines.
        keep_ratio: fraction of the largest value that another value must
            exceed to be included in the mean (default 0.9).

    Returns:
        The mean of the kept values, or 0 when no non-zero value was found.

    Raises:
        IndexError: a non-blank line has fewer than two fields.
        ValueError: the second field of a line is not a valid float.
    """
    values = []
    for line in lines:
        fields = line.split()
        if not fields:
            # Blank / whitespace-only lines carry no value; skip them rather
            # than failing on the missing second field.
            continue
        val = float(fields[1])
        if val != 0:
            values.append(val)

    if not values:
        return 0

    values.sort(reverse=True)
    best = values[0]
    cutoff = keep_ratio * best
    # The best value is kept unconditionally; the rest must clear the cut-off.
    candidates = [best] + [val for val in values[1:] if val > cutoff]
    return np.mean(candidates)


In [8]:
# results[task] maps:
#   'base-<metric>' -> averaged baseline score
#   '<metric>'      -> list of averaged scores, ordered from the longest layer
#                      prefix (e.g. 11_10_9_8_7_6) down to a single layer (11),
#                      with the "no_layer" score last.
results = {}


def _load_average(path):
    """Read the result file at `path` and return get_average_val of its lines.

    The file is opened in a `with` block so the handle is closed right away.
    """
    with open(path, "r") as f:
        return get_average_val(f.read().splitlines())


for task in tasks:
    task_results = {}
    task_metrics = metrics[task]
    for metric in task_metrics:
        task_dir = f"../../mt_dnn_exp_results/{exp}/{task}"

        # base metrics
        base_path = f"{task_dir}/base-{metric}.txt"
        print(base_path)
        task_results[f'base-{metric}'] = _load_average(base_path)

        # no layer metrics
        fine_tuning_metrics = [_load_average(f"{task_dir}/no_layer-{metric}.txt")]

        # fine-tuned metrics: file prefixes grow as "11", "11_10", "11_10_9", ...
        log_file_prefix = ''
        for i in reversed(range(num_layers // 2, num_layers)):
            log_file_prefix += str(i)
            fine_tuning_metrics.append(
                _load_average(f"{task_dir}/{log_file_prefix}-{metric}.txt")
            )
            log_file_prefix += '_'

        # Stored reversed: longest prefix first, "no_layer" last.
        task_results[f'{metric}'] = list(reversed(fine_tuning_metrics))

    results[task] = task_results

../../mt_dnn_exp_results/roberta-base/CoLA/base-mcc.txt
../../mt_dnn_exp_results/roberta-base/SST-2/base-acc.txt
../../mt_dnn_exp_results/roberta-base/MRPC/base-f1.txt
../../mt_dnn_exp_results/roberta-base/STS-B/base-spearmanr.txt
../../mt_dnn_exp_results/roberta-base/QQP/base-f1.txt
../../mt_dnn_exp_results/roberta-base/MNLI/base-acc.txt
../../mt_dnn_exp_results/roberta-base/MNLI-MM/base-acc.txt
../../mt_dnn_exp_results/roberta-base/QNLI/base-acc.txt
../../mt_dnn_exp_results/roberta-base/RTE/base-acc.txt


In [9]:
# X-axis labels: one per layer index in the upper half of the model, followed
# by "none".
x_axis = [str(layer) for layer in range(int(num_layers/2), num_layers)]
x_axis += ["none"]

In [10]:
def draw_graph(task, y_label, paper, base, reported):
    """Plot `reported` against the global `x_axis` labels and save the figure.

    A dashed green line marks `base`. When `paper` is not 0.0 a dashed red
    line marks it as well and it is taken into account for the y-limits.
    The figure is written as PNG to ``images/{exp}/{task}`` (`exp` is a
    global) and then shown.

    Args:
        task: task name, used in the title and the output file name.
        y_label: label for the y axis.
        paper: reference score to draw in red; 0.0 means "do not draw".
        base: baseline score to draw in green.
        reported: sequence of scores, one per entry of `x_axis`.
    """
    plt.figure(figsize=(10,6))
    plt.plot(x_axis, reported)

    plt.xlabel("layers")
    plt.ylabel(y_label)

    has_paper = paper != 0.0
    # Horizontal reference values that must stay inside the visible y-range.
    references = [base, paper] if has_paper else [base]

    # Pad the y-range by 20% of the spread of the plotted scores.
    margin = (max(reported) - min(reported)) * 0.2
    top = max(max(reported), *references) + margin
    bottom = min(min(reported), *references) - margin
    plt.ylim(bottom, top)

    plt.axhline(y=base, linestyle='--', c='green')
    if has_paper:
        plt.axhline(y=paper, linestyle='--', c='red')

    plt.title(f'{exp}-{task} ({round(base,4)})')
    # savefig does not create missing directories; make sure the target exists.
    os.makedirs(f'images/{exp}', exist_ok=True)
    plt.savefig(f'images/{exp}/{task}', format='png', bbox_inches='tight')
    plt.show()

In [11]:
# Per-task summary in percent: b = baseline, h = first entry of the score list
# (longest fine-tuned layer prefix), n = last entry (the "no_layer" score).
for task in tasks:
    task_results, task_metrics = results[task], metrics[task]
    for metric in task_metrics:
        reported = task_results[metric]
        base = task_results[f'base-{metric}']
        as_percent = [round(score * 100, 2) for score in (base, reported[0], reported[-1])]
        print_bold(task, metric, ': b -', as_percent[0], 'h -', as_percent[1], 'n -', as_percent[2])

[1m
CoLA mcc : b - 59.85 h - 58.51 n - 0
[0m
[1m
SST-2 acc : b - 94.63 h - 93.9 n - 80.28
[0m
[1m
MRPC f1 : b - 92.79 h - 90.49 n - 81.22
[0m
[1m
STS-B spearmanr : b - 90.76 h - 88.8 n - 20.0
[0m
[1m
QQP f1 : b - 88.83 h - 87.69 n - 62.51
[0m
[1m
MNLI acc : b - 87.41 h - 85.68 n - 52.6
[0m
[1m
MNLI-MM acc : b - 86.99 h - 85.93 n - 53.02
[0m
[1m
QNLI acc : b - 92.75 h - 91.59 n - 65.74
[0m
[1m
RTE acc : b - 78.16 h - 74.44 n - 57.4
[0m


In [12]:
# For every task, find the first index in the reversed score list (index 0 =
# "no_layer" score, higher index = longer fine-tuned layer prefix) whose score
# exceeds 90% / 95% of the task's baseline.
layer_90 = []
layer_95 = []

threshold_90 = 0.9
threshold_95 = 0.95

# Put the axis labels in the same order as the reversed score lists ("none"
# first). x_axis is built with "none" as its last entry, so it is flipped only
# while it is still in that order; an unconditional reverse() would undo
# itself every time this cell is re-run.
if x_axis and x_axis[0] != "none":
    x_axis.reverse()


def _first_index_above(ratios, threshold):
    """Return the first index whose ratio is > `threshold`, or None if none is."""
    return next((ind for ind, ratio in enumerate(ratios) if ratio > threshold), None)


for task in tasks:
    task_results = results[task]
    task_metrics = metrics[task]
    for metric in task_metrics:
        base = task_results[f'base-{metric}']
        # Reversed copy; the stored list itself is left untouched.
        reported = task_results[metric][::-1]
        ratios = [val / base for val in reported]

        for label, threshold, layers in (
            ('90%', threshold_90, layer_90),
            ('95%', threshold_95, layer_95),
        ):
            ind = _first_index_above(ratios, threshold)
            if ind is None:
                # Threshold never reached: record the last index for the
                # aggregate below and mark the task with "-".
                print(task, f"Fails to achieve {label} threshold", reported[-1]/base)
                layers.append(len(reported)-1)
                results[task][label] = "-"
            else:
                layers.append(ind)
                results[task][label] = ind


print(x_axis)

# The largest per-task index is the smallest index that satisfies every task.
print(layer_90)
min_layer_ind_90 = max(layer_90)
print("layer_90 ", min_layer_ind_90, 'layer:', x_axis[min_layer_ind_90], round((1-(min_layer_ind_90/num_layers)) * 100, 2), '%')

print(layer_95)
min_layer_ind_95 = max(layer_95)
print("layer_95 ", min_layer_ind_95, 'layer:', x_axis[min_layer_ind_95], round((1-(min_layer_ind_95/num_layers)) * 100, 2), '%')


# Relative performance (percent of baseline) of the first two entries of each
# task's reversed score list, plus the scores at the two indices chosen above.
firsts = []
seconds = []

for task in tasks:
    task_results = results[task]
    task_metrics = metrics[task]
    for metric in task_metrics:
        base = task_results[f'base-{metric}']
        # Reversed copy: index 0 is the "no_layer" score.
        reported = task_results[metric][::-1]

        # CoLA is left out of the averages.
        # NOTE(review): presumably because its "no_layer" score is 0 - confirm.
        if task != "CoLA":
            first, second = (round(100*reported[k]/base, 2) for k in (0, 1))
            firsts.append(first)
            seconds.append(second)

        print_bold(task, base)
        for label, layer_ind in (('\t90', min_layer_ind_90), ('\t95', min_layer_ind_95)):
            print(label, reported[layer_ind], round(reported[layer_ind]/base * 100, 2))

print_bold(len(firsts), np.mean(firsts), np.mean(seconds), round(np.mean(seconds) - np.mean(firsts),2))

['none', '11', '10', '9', '8', '7', '6']
[3, 1, 1, 1, 1, 2, 2, 1, 5]
layer_90  5 layer: 7 58.33 %
[5, 1, 2, 3, 2, 3, 2, 2, 6]
layer_95  6 layer: 6 50.0 %
[1m
CoLA 0.5985355628900488
[0m
	90 0.590627175609528 98.68
	95 0.5851166494633901 97.76
[1m
SST-2 0.9463302752293578
[0m
	90 0.9357798165137614 98.89
	95 0.9389908256880733 99.22
[1m
MRPC 0.9279326703664351
[0m
	90 0.8973524816586721 96.7
	95 0.9049256630801625 97.52
[1m
STS-B 0.9075547888517266
[0m
	90 0.8761809981890807 96.54
	95 0.8879587041713008 97.84
[1m
QQP 0.888284504739094
[0m
	90 0.874658248225866 98.47
	95 0.8769435099769318 98.72
[1m
MNLI 0.8740906775343861
[0m
	90 0.8493530310748854 97.17
	95 0.85683138053999 98.03
[1m
MNLI-MM 0.8698535394629779
[0m
	90 0.8517493897477625 97.92
	95 0.8592961757526444 98.79
[1m
QNLI 0.9275123558484349
[0m
	90 0.9132710964671427 98.46
	95 0.9159436207212156 98.75
[1m
RTE 0.7815884476534296
[0m
	90 0.7155234657039711 91.55
	95 0.744404332129964 95.24
[1m
8 66.2775 90.4 24

In [13]:
# Spot check for STS-B: show the last two entries of its score list
# (last = "no_layer" score, second to last = shortest fine-tuned prefix).
for task in ("STS-B",):
    task_results, task_metrics = results[task], metrics[task]
    for metric in task_metrics:
        scores = task_results[metric]
        print(scores[-1], scores[-2], sep="\n")

0.19995395674286554
0.8402730207714812


In [14]:
# LaTeX label of each task's metric, shown in parentheses after the task name
# in the generated table rows. Covers exactly the tasks listed in `tasks`.
latex_metrics = {
    "CoLA":"MCC",
    "MNLI":"Acc.",
    "MNLI-MM":"Acc.",
    "MRPC":"F$_1$",
    "QNLI":"Acc.",
    "QQP":"F$_1$",
    "RTE":"Acc.",
    "SST-2":"Acc.",
    "STS-B":"$\\rho$"
}

In [15]:
# Print a LaTeX table: one row per task with its baseline and every entry of
# its score list (in percent), followed by a row with the mean relative
# performance per column.
# NOTE: the header hard-codes seven score columns (6-11 ... None), which
# matches the score lists built for num_layers = 12.
print("\\begin{center}\n\t\\scalebox{0.88}{\n\t\t\\begin{tabular}{rc|ccccccc} \n\t\t\\toprule[1pt] \n\t\t\\multirow{2}{*}{Task (metric)} & \\multirow{2}{*}{Baseline} & \\multicolumn{7}{c}{Fine-tuned layers} \\\\ \n\t\t\\cline{3-9} \n\t\t& & 6-11 & 7-11 & 8-11 & 9-11 & 10-11 & 11-11 & None \\\\ \n\t\t\t\\midrule")

# avg_performance[k] collects, over all tasks, the score of column k expressed
# as a percentage of the task's baseline.
avg_performance = []

for task in tasks:
    m = metrics[task][0]
    base_key = f"base-{m}"

    # MNLI-MM is the only task whose printed name differs from its key.
    display_name = "MNLI-mm" if task == "MNLI-MM" else task
    row = f"\t\t\t{display_name} ({latex_metrics[task]}) & "

    row += "{:0.2f}".format(round(results[task][base_key] * 100, 2))

    for ind, val in enumerate(results[task][m]):
        row += " & {:0.2f}".format(round(val * 100,2))

        # First task seen creates the per-column buckets.
        if len(avg_performance) == ind:
            avg_performance.append([])

        percent = (val / results[task][base_key]) * 100
        avg_performance[ind].append(percent)

    row += " \\\\"
    print(row)

print("\t\t\t\\midrule\\midrule")

# "\\%" yields the same backslash-percent text as the former "\%", without
# relying on an invalid escape sequence.
row = "\t\t\tRel. perf. (\\%) & 100.00"

for perf in avg_performance:
    row += " & {:0.2f}".format(round(np.mean(perf) ,2))

row += " \\\\"

print(row)

print("\t\t\\end{tabular}\n\t}\n\t\\caption{MTDNN-RoBERTa-base on GLUE}\n\t\\label{table:finetune-all}\n\\end{center}")

\begin{center}
	\scalebox{0.88}{
		\begin{tabular}{rc|ccccccc} 
		\toprule[1pt] 
		\multirow{2}{*}{Task (metric)} & \multirow{2}{*}{Baseline} & \multicolumn{7}{c}{Fine-tuned layers} \\ 
		\cline{3-9} 
		& & 6-11 & 7-11 & 8-11 & 9-11 & 10-11 & 11-11 & None \\ 
			\midrule
			CoLA (MCC) & 59.85 & 58.51 & 59.06 & 55.63 & 54.76 & 53.26 & 50.51 & 0.00 \\
			SST-2 (Acc.) & 94.63 & 93.90 & 93.58 & 93.53 & 93.46 & 93.05 & 92.06 & 80.28 \\
			MRPC (F$_1$) & 92.79 & 90.49 & 89.74 & 88.71 & 88.49 & 88.29 & 85.08 & 81.22 \\
			STS-B ($\rho$) & 90.76 & 88.80 & 87.62 & 87.21 & 86.99 & 85.67 & 84.03 & 20.00 \\
			QQP (F$_1$) & 88.83 & 87.69 & 87.47 & 87.47 & 87.08 & 85.52 & 83.12 & 62.51 \\
			MNLI (Acc.) & 87.41 & 85.68 & 84.94 & 84.92 & 84.68 & 82.45 & 77.09 & 52.60 \\
			MNLI-mm (Acc.) & 86.99 & 85.93 & 85.17 & 85.60 & 85.11 & 83.24 & 77.58 & 53.02 \\
			QNLI (Acc.) & 92.75 & 91.59 & 91.33 & 90.95 & 90.77 & 89.15 & 84.97 & 65.74 \\
			RTE (Acc.) & 78.16 & 74.44 & 71.55 & 68.38 & 66.86 & 65.70 & 61