# Plot Passage Performance

In [25]:
import pandas as pd
import json
from glob import glob

DIR = './'

# Runs to evaluate: the four fixed neural re-ranker runs plus one run directory
# per BM25 / QLD parameter setting found on disk.
# NOTE: the glob patterns are resolved against the current working directory,
# not against DIR; the two only line up because DIR is './'. Every glob hit
# keeps its trailing '/', so those runs get 'Retrieval Model' labels such as
# 'bm25/k1=0.8-b=0.4/' in the table below.
approaches = (
    ['passage-retrieval-t5', 'passage-retrieval-bert', 'sentence-retrieval-t5', 'sentence-retrieval-bert']
    + glob('sentence-retrieval-bm25/*/')
    + glob('passage-retrieval-bm25/*/')
    + glob('sentence-retrieval-qld/*/')
    + glob('passage-retrieval-qld/*/')
)

# Per clickbait type: the minimum per-example score (inclusive) that is counted
# as "correct" for each metric.
thresholds = {
    'phrase': {
        'bertscore_score': 0.3,
        'meteor_score': 0.1,
        'bleu4_score': 0.1,
    }, 'passage': {
        'bertscore_score': 0.5,
        'meteor_score': 0.3,
        'bleu4_score': 0.05,
    }
}

# One record per (approach, clickbait type); converted to a DataFrame once at the end.
rows = []

for approach in approaches:
    # '<unit>-retrieval-<model>' -> [unit, model]
    name_parts = approach.split('-retrieval-')

    for clickbait_type in ['phrase', 'passage']:
        # Context manager so every score file is closed right after parsing
        # instead of leaking one open handle per run.
        with open(DIR + approach + '/eval-' + clickbait_type + '/all_scores.json') as score_file:
            evaluation = json.load(score_file)

        limits = thresholds[clickbait_type]
        per_example = list(evaluation['single_scores'].values())

        rows.append({
            'Retrieval Model': name_parts[1],
            'Retrieval Unit': name_parts[0],
            'Type of Clickbait': clickbait_type,
            # Aggregate scores as stored in the file.
            'Bleu': evaluation['scores']['bleu4_lemma'],
            'Meteor': evaluation['scores']['meteor_score'],
            'BertScore': evaluation['scores']['bertscore_score'],
            # Number of examples whose individual score reaches the threshold.
            'correctBleu': sum(ex['bleu4_score'] >= limits['bleu4_score'] for ex in per_example),
            'correctMeteor': sum(ex['meteor_score'] >= limits['meteor_score'] for ex in per_example),
            'correctBertScore': sum(ex['bertscore_score'] >= limits['bertscore_score'] for ex in per_example),
        })

df = pd.DataFrame(rows)
df.sort_values('Meteor', ascending=False)

Unnamed: 0,Retrieval Model,Retrieval Unit,Type of Clickbait,Bleu,Meteor,BertScore,correctBleu,correctMeteor,correctBertScore
7,bert,sentence,passage,0.104317,0.223731,0.365839,74,75,46
5,t5,sentence,passage,0.105794,0.220221,0.367044,74,74,46
11,bm25/k1=0.8-b=0.4/,sentence,passage,0.079079,0.201949,0.347100,53,61,42
21,bm25/k1=1.0-b=0.4/,sentence,passage,0.076110,0.199053,0.345570,51,59,41
19,bm25/k1=0.9-b=0.3/,sentence,passage,0.076099,0.198704,0.346124,51,59,41
...,...,...,...,...,...,...,...,...,...
68,qld/mu=1150.0/,passage,phrase,0.030630,0.035003,0.191875,39,68,8
34,bm25/k1=1.0-b=0.3/,passage,phrase,0.029809,0.034986,0.191816,35,66,7
78,qld/mu=1200.0/,passage,phrase,0.030630,0.034970,0.191685,39,68,8
42,bm25/k1=0.8-b=0.3/,passage,phrase,0.029870,0.034962,0.191832,35,66,7


In [26]:
# Phrase-type rows only, highest Meteor first.
df.loc[df['Type of Clickbait'].eq('phrase')].sort_values(by='Meteor', ascending=False)

Unnamed: 0,Retrieval Model,Retrieval Unit,Type of Clickbait,Bleu,Meteor,BertScore,correctBleu,correctMeteor,correctBertScore
4,t5,sentence,phrase,0.04952,0.064755,0.209806,82,115,16
6,bert,sentence,phrase,0.042093,0.06109,0.206653,72,103,11
8,bm25/k1=1.0-b=0.5/,sentence,phrase,0.034047,0.050623,0.199421,55,83,12
12,bm25/k1=0.9-b=0.5/,sentence,phrase,0.034047,0.050531,0.19944,55,83,12
22,bm25/k1=0.8-b=0.5/,sentence,phrase,0.033126,0.050003,0.198728,54,82,11
14,bm25/k1=0.9-b=0.4/,sentence,phrase,0.03316,0.049323,0.198778,54,82,11
20,bm25/k1=1.0-b=0.4/,sentence,phrase,0.033023,0.049176,0.198576,54,82,11
10,bm25/k1=0.8-b=0.4/,sentence,phrase,0.033047,0.049108,0.198524,54,81,11
16,bm25/k1=1.0-b=0.3/,sentence,phrase,0.03263,0.048526,0.198145,53,80,11
18,bm25/k1=0.9-b=0.3/,sentence,phrase,0.03263,0.048526,0.198147,53,80,11


In [28]:
# Phrase-type rows ranked by Meteor, best first (repeats the earlier phrase-only view).
df.loc[lambda frame: frame['Type of Clickbait'] == 'phrase'].sort_values(by='Meteor', ascending=False)

Unnamed: 0,Retrieval Model,Retrieval Unit,Type of Clickbait,Bleu,Meteor,BertScore,correctBleu,correctMeteor,correctBertScore
4,t5,sentence,phrase,0.04952,0.064755,0.209806,82,115,16
6,bert,sentence,phrase,0.042093,0.06109,0.206653,72,103,11
8,bm25/k1=1.0-b=0.5/,sentence,phrase,0.034047,0.050623,0.199421,55,83,12
12,bm25/k1=0.9-b=0.5/,sentence,phrase,0.034047,0.050531,0.19944,55,83,12
22,bm25/k1=0.8-b=0.5/,sentence,phrase,0.033126,0.050003,0.198728,54,82,11
14,bm25/k1=0.9-b=0.4/,sentence,phrase,0.03316,0.049323,0.198778,54,82,11
20,bm25/k1=1.0-b=0.4/,sentence,phrase,0.033023,0.049176,0.198576,54,82,11
10,bm25/k1=0.8-b=0.4/,sentence,phrase,0.033047,0.049108,0.198524,54,81,11
16,bm25/k1=1.0-b=0.3/,sentence,phrase,0.03263,0.048526,0.198145,53,80,11
18,bm25/k1=0.9-b=0.3/,sentence,phrase,0.03263,0.048526,0.198147,53,80,11


In [9]:
from glob import glob
# Show the QLD run directories that the pattern matches, relative to the
# current working directory; the trailing '/' restricts matches to directories.
glob(pathname='sentence-retrieval-qld/*/')

['sentence-retrieval-qld/mu=800.0/',
 'sentence-retrieval-qld/mu=900.0/',
 'sentence-retrieval-qld/mu=850.0/',
 'sentence-retrieval-qld/mu=1150.0/',
 'sentence-retrieval-qld/mu=950.0/',
 'sentence-retrieval-qld/mu=1100.0/',
 'sentence-retrieval-qld/mu=1000.0/',
 'sentence-retrieval-qld/mu=1050.0/',
 'sentence-retrieval-qld/mu=1200.0/']

# Plot Performance Pilot Experiments

In [29]:
import pandas as pd
import json

DIR = './test-200/'

# Sentence-level retrieval runs of the pilot experiments, each a directory below DIR.
approaches = ['sentence-retrieval-bert', 'sentence-retrieval-t5', 'sentence-retrieval-bm25',
              'sentence-retrieval-bm25-rm3', 'sentence-retrieval-bm25-ax', 'sentence-retrieval-bm25-prf',
              'sentence-retrieval-qld',
              'sentence-retrieval-qld+rm3', 'sentence-retrieval-ax', 'sentence-retrieval-prf']

# Per clickbait type: the minimum per-example score (inclusive) that is counted
# as "correct" for each metric.
# Stricter alternative kept for reference (not active):
#   phrase:  bertscore 0.8, meteor 0.7, bleu4 0.5
#   passage: bertscore 0.6, meteor 0.7, bleu4 0.3
thresholds = {
    'phrase': {
        'bertscore_score': 0.3,
        'meteor_score': 0.1,
        'bleu4_score': 0.1,
    }, 'passage': {
        'bertscore_score': 0.5,
        'meteor_score': 0.3,
        'bleu4_score': 0.05,
    }
}

# One record per (approach, clickbait type); converted to a DataFrame once at the end.
rows = []

for approach in approaches:
    # '<unit>-retrieval-<model>' -> [unit, model]
    name_parts = approach.split('-retrieval-')

    for clickbait_type in ['phrase', 'passage']:
        # Context manager so every score file is closed right after parsing
        # instead of leaking one open handle per run.
        with open(DIR + approach + '/eval-' + clickbait_type + '/all_scores.json') as score_file:
            evaluation = json.load(score_file)

        limits = thresholds[clickbait_type]
        per_example = list(evaluation['single_scores'].values())

        rows.append({
            'Retrieval Model': name_parts[1],
            'Retrieval Unit': name_parts[0],
            'Type of Clickbait': clickbait_type,
            # Aggregate scores as stored in the file.
            'Bleu': evaluation['scores']['bleu4_lemma'],
            'Meteor': evaluation['scores']['meteor_score'],
            'BertScore': evaluation['scores']['bertscore_score'],
            # Number of examples whose individual score reaches the threshold.
            'correctBleu': sum(ex['bleu4_score'] >= limits['bleu4_score'] for ex in per_example),
            'correctMeteor': sum(ex['meteor_score'] >= limits['meteor_score'] for ex in per_example),
            'correctBertScore': sum(ex['bertscore_score'] >= limits['bertscore_score'] for ex in per_example),
        })

df = pd.DataFrame(rows)
df.sort_values('Meteor', ascending=False)

Unnamed: 0,Retrieval Model,Retrieval Unit,Type of Clickbait,Bleu,Meteor,BertScore,correctBleu,correctMeteor,correctBertScore
1,bert,sentence,passage,0.145508,0.268646,0.381046,29,25,15
3,t5,sentence,passage,0.142781,0.267001,0.389429,29,26,17
5,bm25,sentence,passage,0.114937,0.226425,0.367956,22,21,12
9,bm25-ax,sentence,passage,0.11271,0.224623,0.365187,21,19,12
11,bm25-prf,sentence,passage,0.096853,0.21102,0.354476,18,16,11
7,bm25-rm3,sentence,passage,0.100687,0.210335,0.355587,19,17,12
17,ax,sentence,passage,0.106843,0.188479,0.366859,24,17,11
15,qld+rm3,sentence,passage,0.116689,0.18547,0.371335,27,19,12
13,qld,sentence,passage,0.109475,0.178064,0.366973,25,18,11
19,prf,sentence,passage,0.108628,0.175287,0.364695,23,17,11


In [30]:
# Passage-type rows only, highest Meteor first.
df.loc[df['Type of Clickbait'].eq('passage')].sort_values(by='Meteor', ascending=False)

Unnamed: 0,Retrieval Model,Retrieval Unit,Type of Clickbait,Bleu,Meteor,BertScore,correctBleu,correctMeteor,correctBertScore
1,bert,sentence,passage,0.145508,0.268646,0.381046,29,25,15
3,t5,sentence,passage,0.142781,0.267001,0.389429,29,26,17
5,bm25,sentence,passage,0.114937,0.226425,0.367956,22,21,12
9,bm25-ax,sentence,passage,0.11271,0.224623,0.365187,21,19,12
11,bm25-prf,sentence,passage,0.096853,0.21102,0.354476,18,16,11
7,bm25-rm3,sentence,passage,0.100687,0.210335,0.355587,19,17,12
17,ax,sentence,passage,0.106843,0.188479,0.366859,24,17,11
15,qld+rm3,sentence,passage,0.116689,0.18547,0.371335,27,19,12
13,qld,sentence,passage,0.109475,0.178064,0.366973,25,18,11
19,prf,sentence,passage,0.108628,0.175287,0.364695,23,17,11


In [31]:
import pandas as pd
import json

DIR = './end-to-end-evaluation/'

# End-to-end runs, each a directory below DIR.
approaches = ['bert-with-roberta-predictor', 'roberta-with-roberta-predictor', 'debert-with-roberta-predictor']

# Per clickbait type: the minimum per-example score (inclusive) that is counted
# as "correct" for each metric.
thresholds = {
    'phrase': {
        'bertscore_score': 0.8,
        'meteor_score': 0.7,
        'bleu4_score': 0.5,
    }, 'passage': {
        'bertscore_score': 0.6,
        'meteor_score': 0.7,
        'bleu4_score': 0.3,
    }
}

# One record per (approach, clickbait type); converted to a DataFrame once at the end.
rows = []

for approach in approaches:
    for clickbait_type in ['phrase', 'passage']:
        # Context manager so every score file is closed right after parsing
        # instead of leaking one open handle per run.
        with open(DIR + approach + '/eval-' + clickbait_type + '/all_scores.json') as score_file:
            evaluation = json.load(score_file)

        limits = thresholds[clickbait_type]
        per_example = list(evaluation['single_scores'].values())

        rows.append({
            'Retrieval Model': approach,
            # None of these run names contains '-retrieval-', so the former
            # approach.split('-retrieval-')[0] always returned the full name.
            # The column is kept with that same value so the table layout is unchanged.
            'Retrieval Unit': approach,
            'Type of Clickbait': clickbait_type,
            # Aggregate scores as stored in the file.
            'Bleu': evaluation['scores']['bleu4_lemma'],
            'Meteor': evaluation['scores']['meteor_score'],
            'BertScore': evaluation['scores']['bertscore_score'],
            # Number of examples whose individual score reaches the threshold.
            'correctBleu': sum(ex['bleu4_score'] >= limits['bleu4_score'] for ex in per_example),
            'correctMeteor': sum(ex['meteor_score'] >= limits['meteor_score'] for ex in per_example),
            'correctBertScore': sum(ex['bertscore_score'] >= limits['bertscore_score'] for ex in per_example),
        })

df = pd.DataFrame(rows)
df.sort_values('Meteor', ascending=False)

Unnamed: 0,Retrieval Model,Retrieval Unit,Type of Clickbait,Bleu,Meteor,BertScore,correctBleu,correctMeteor,correctBertScore
5,debert-with-roberta-predictor,debert-with-roberta-predictor,passage,0.282032,0.450424,0.482231,143,133,156
2,roberta-with-roberta-predictor,roberta-with-roberta-predictor,phrase,0.587995,0.434071,0.688078,247,229,205
3,roberta-with-roberta-predictor,roberta-with-roberta-predictor,passage,0.258028,0.432196,0.466824,127,127,156
4,debert-with-roberta-predictor,debert-with-roberta-predictor,phrase,0.609705,0.421388,0.696292,249,244,222
0,bert-with-roberta-predictor,bert-with-roberta-predictor,phrase,0.529633,0.364308,0.657784,222,218,195
1,bert-with-roberta-predictor,bert-with-roberta-predictor,passage,0.180952,0.335924,0.413483,89,85,99


In [23]:
# Group the rows of each model together by sorting on the model name, descending.
df.sort_values(by='Retrieval Model', ascending=False)

Unnamed: 0,Retrieval Model,Retrieval Unit,Type of Clickbait,Bleu,Meteor,BertScore,correctBleu,correctMeteor,correctBertScore
2,roberta-with-roberta-predictor,roberta-with-roberta-predictor,phrase,0.587995,0.434071,0.688078,247,229,205
3,roberta-with-roberta-predictor,roberta-with-roberta-predictor,passage,0.258028,0.432196,0.466824,127,127,156
4,debert-with-roberta-predictor,debert-with-roberta-predictor,phrase,0.609705,0.421388,0.696292,249,244,222
5,debert-with-roberta-predictor,debert-with-roberta-predictor,passage,0.282032,0.450424,0.482231,143,133,156
0,bert-with-roberta-predictor,bert-with-roberta-predictor,phrase,0.529633,0.364308,0.657784,222,218,195
1,bert-with-roberta-predictor,bert-with-roberta-predictor,passage,0.180952,0.335924,0.413483,89,85,99
