# Error analysis notebook (test data)

## Import packages

In [6]:
import pandas as pd
import math

import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from elasticsearch import Elasticsearch

import warnings
import matplotlib
warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
warnings.filterwarnings("ignore")

from enum import IntEnum

import json


## Set filepaths

In [7]:
ranking = '../evaluation/resources/2020/jsonruns/p_controller_train_nle_meta_9_1_train.json'
eval_result_hlevel = '../evaluation/resources/2020/eval_results/p_controller_train_nle_meta_9_1_train_hlevel.tsv'
eval_result_level = '../evaluation/resources/2020/eval_results/p_controller_train_nle_meta_9_1_train_level.tsv'
gt = '../pre_processing/resources/training/2020/TREC-Fair-Ranking-training-sample.json'
est_rel_meta_p = '../reranking/resources/relevances/Training_rel_scores_model_A.csv'
annotations = '../pre_pre_processing/resources/merged-annotations.json'
mapping = '../reranking/resources/mappings/training_doc_to_author.json'

## Prepare dataframes

In [8]:
# ranking df
rdf = pd.read_json(ranking, lines=True)

# ground truth df
gtdf = pd.read_json(gt, lines=True).explode('documents')
gtdf['doc_id'] = gtdf.documents.apply(lambda row: row.get('doc_id'))
gtdf['doc_rel'] = gtdf.documents.apply(lambda row: row.get('relevance'))

# paired down relevances df
reldf = gtdf[['qid','query','doc_id','doc_rel']]

# eval result dfs
ehldf = pd.read_csv(eval_result_hlevel, sep='\t', names=['key', 'qid', 'value'])
ehldf = ehldf.pivot(index='qid', columns='key', values='value')

eldf = pd.read_csv(eval_result_level, sep='\t', names=['key', 'qid', 'value'])
eldf = eldf.pivot(index='qid', columns='key', values='value')


# est rel df
est_rel_meta = pd.read_csv(est_rel_meta_p)
est_rel_meta.head()

# annotations
adf = pd.read_json(annotations,lines=True)
adf = adf.explode('authors')
adf[['name','wiki','country','type','valid','auth_id', 'h_index','level','region']] = adf.authors.apply(pd.Series).rename({'id':'auth_id'},axis=1)

# doc to author and reverse mapping
with open(mapping) as fp:
    doc_to_author = json.load(fp)
alldocs = rdf.explode('ranking')['ranking'].drop_duplicates().to_list()
for doc in alldocs:
    if not doc in doc_to_author:
        doc_to_author[doc] = []
    
author_to_doc = {}
for doc, aulist in doc_to_author.items():
    for au in aulist:
        if not au in author_to_doc:
            author_to_doc[au] = []
        author_to_doc[au] = author_to_doc[au] + [doc]

In [9]:
rdf.head(2)

Unnamed: 0,q_num,qid,ranking
0,0.0,5438,"[b2fdee22aa02477292b858fbafcb418932732bce, 993..."
1,0.1,5438,"[70f3a58b0fc6916c2e6616bfbae5758c00408894, ec1..."


In [10]:
gtdf.head(2)

Unnamed: 0,qid,query,frequency,documents,doc_id,doc_rel
0,5438,cloud computing,5.7e-05,{'doc_id': '3e19046c665867bbe557685da60738a407...,3e19046c665867bbe557685da60738a40738010a,0
0,5438,cloud computing,5.7e-05,{'doc_id': '7ef08f1fa127af817cdfd9d3bd00bdf60e...,7ef08f1fa127af817cdfd9d3bd00bdf60e32143b,0


In [11]:
reldf.head(2)

Unnamed: 0,qid,query,doc_id,doc_rel
0,5438,cloud computing,3e19046c665867bbe557685da60738a40738010a,0
0,5438,cloud computing,7ef08f1fa127af817cdfd9d3bd00bdf60e32143b,0


In [12]:
ehldf.head(2)

key,difference,disparity,relevance
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
69,0.146308,2.820826,2.254359
258,0.549351,1.043557,0.679082


In [13]:
eldf.head(2)

key,difference,disparity,relevance
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
69,0.211649,3.322678,2.491326
258,0.479589,2.230424,1.349051


In [14]:
adf.head(2)

Unnamed: 0,id,missing_authors,authors,name,wiki,country,type,valid,auth_id,h_index,level,region
0,f3b7db81ee8642cf2deab7a9834e07b8df54185b,False,"{'name': 'David M. Szymanski', 'wiki': 'Texas ...",David M. Szymanski,Texas A&M University,US,academic,True,50215565.0,9.0,Advanced,Americas
0,f3b7db81ee8642cf2deab7a9834e07b8df54185b,False,"{'name': 'David H. Henard', 'wiki': 'North Car...",David H. Henard,North Carolina State University,US,academic,True,115002169.0,8.0,Advanced,Americas


We order the queries from lowest to highest DGEE according to each grouping. Then we plot the "difficulty rank" of each query according to the "hlevel" grouping against its difficulty according to the "level" grouping. This allows us to see if the DGEE for both groupings is generally in accordance or not.

In [15]:
edf_m = pd.merge(ehldf.sort_values(by='difference').reset_index().reset_index(),eldf.sort_values(by='difference').reset_index().reset_index(),on='qid')
edf_m = edf_m.rename({'index_x':'difficulty hlevel', 'index_y':'difficulty econ level'},axis=1)
edf_m.head()

key,difficulty hlevel,qid,difference_x,disparity_x,relevance_x,difficulty econ level,difference_y,disparity_y,relevance_y
0,0,132049,0.002713,1.421254,1.403715,0,0.002713,1.421254,1.403715
1,1,71096,0.029287,1.975807,1.862498,1,0.026328,1.97282,1.862484
2,2,21984,0.032334,1.367385,1.271935,6,0.071535,1.515766,1.32663
3,3,15395,0.04051,2.053024,1.890567,2,0.026948,2.035232,1.888323
4,4,52728,0.057478,1.04736,0.839632,13,0.09893,1.011948,0.728643


We see a clear upward trend in the plot above. This tells us that in general, the DGEE according to the two groupings is similar. We would not expect great differences in the analysis results for different groupings.

In [16]:
edf = edf_m.copy()
edf['difference'] = edf.apply(lambda row: (row.difference_x + row.difference_y)/2, axis=1)
edf['disparity'] = edf.apply(lambda row: (row.disparity_x + row.disparity_y)/2, axis=1)
edf['relevance'] = edf.apply(lambda row: (row.relevance_x + row.relevance_y)/2, axis=1)

edf = edf.sort_values(by='difference')[['qid','difference', 'disparity', 'relevance']]
edf.head()

key,qid,difference,disparity,relevance
0,132049,0.002713,1.421254,1.403715
1,71096,0.027807,1.974314,1.862491
3,15395,0.033729,2.044128,1.889445
2,21984,0.051934,1.441575,1.299282
4,52728,0.078204,1.029654,0.784137


In [17]:
edf.head()

key,qid,difference,disparity,relevance
0,132049,0.002713,1.421254,1.403715
1,71096,0.027807,1.974314,1.862491
3,15395,0.033729,2.044128,1.889445
2,21984,0.051934,1.441575,1.299282
4,52728,0.078204,1.029654,0.784137


In [27]:
temp = reldf[reldf.qid == 132049]
temp
temp['doc_auth'] = temp.doc_id.apply(lambda row: doc_to_author[row])
temp = temp.explode('doc_auth')
temp
temp = temp.drop_duplicates()
temp

Unnamed: 0,qid,query,doc_id,doc_rel,doc_auth
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,144564464
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,2337512
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,6192208
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,1917912
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,6695007
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,6394406
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,48190736
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,9942536
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,49756543
111,132049,combidex,d16d4ab632d18d70c0ffd9d4d7493444281275e5,1,4267637


In [48]:
def avg_number_of_authors_per_doc(qid):
    numauths = 0
    docs = reldf[reldf.qid == qid].doc_id.to_list()
    for did in docs:
        numauths += len(doc_to_author[did])
    print(qid, '\t,', numauths / len(docs))

In [49]:
for qid in edf.head(10).qid.to_list() + edf.tail(10).qid.to_list():
    avg_number_of_authors_per_doc(qid)

132049 	, 8.35
71096 	, 3.088235294117647
15395 	, 2.6153846153846154
21984 	, 3.95
52728 	, 4.894736842105263
54538 	, 2.966666666666667
35778 	, 1.3
77011 	, 4.305555555555555
27887 	, 1.5666666666666667
96265 	, 1.8235294117647058
82759 	, 3.7
5762 	, 1.896551724137931
24848 	, 1.6
76875 	, 2.727272727272727
30631 	, 4.5
11856 	, 3.7
31412 	, 3.9473684210526314
111469 	, 3.2777777777777777
47984 	, 5.238095238095238
33337 	, 69.76470588235294


In [65]:
for qid in edf.qid.to_list()[:1]:
    authors = []
    for did in reldf[reldf.qid == qid].doc_id.to_list():
        print(qid, "\t, ",did,", ", sorted(doc_to_author[did]))
        authors = authors + doc_to_author[did]
        # for au in doc_to_author[did]:
        #      print(au, '\t,', author_to_doc[au])
        # print()
       
    print()
authors = list(set(authors))

132049 	,  d16d4ab632d18d70c0ffd9d4d7493444281275e5 ,  ['144564464', '1917912', '2337512', '48190736', '6192208', '6394406', '6695007']
132049 	,  6c6fc8896569067cf30b0f2349933c7462809912 ,  ['2451275', '31840891', '4267637', '4368293', '48789670', '49756543', '52204052', '6132439', '9942536']
132049 	,  e13918f61745d29a2e84101ebfc46e9487d38fca ,  ['11772386', '143810480', '144017776', '3147434', '33884244', '3442873', '35846943', '40475181', '49730336', '5114265']
132049 	,  6da29e3f7e9ab2cbedccb71201ae62ed52f949a4 ,  ['144658747', '2835612', '32563017', '33638840', '3484009', '3484217', '3925018', '4310743', '4462043', '46747531', '4705220', '47787321', '49689033']
132049 	,  167abf8617dc3a30990addfcf6fb3161ff222f21 ,  ['144564464', '145970494', '152895878', '4529044', '46850944', '49596855', '6695007']
132049 	,  c1be35f92224ac905e2e1e8f3ca86cfe88316f09 ,  ['143789235', '144607591', '144671922', '144806171', '145807992', '2101923', '31682694', '3802268', '4595739', '47384085', '4814

In [77]:
def num_auths_with_more_than_one_doc(qid):
    documents = reldf[reldf.qid == qid].doc_id.to_list()
    
    authors = []
    for did in documents:
         authors = authors + doc_to_author[did]
            
    authors = list(set(authors))
    
    nauthors = 0
    for author in authors:
        if len(set(author_to_doc[author]).intersection(set(documents))) > 1:
            nauthors += 1
    print(qid, "\t,", nauthors)

In [78]:
for qid in edf.head(10).qid.to_list():
    num_auths_with_more_than_one_doc(qid)

132049 	, 7
71096 	, 9
15395 	, 2
21984 	, 1
52728 	, 3
54538 	, 9
35778 	, 1
77011 	, 10
27887 	, 1
96265 	, 0


In [79]:
for qid in edf.tail(10).qid.to_list():
    num_auths_with_more_than_one_doc(qid)

82759 	, 3
5762 	, 1
24848 	, 0
76875 	, 1
30631 	, 0
11856 	, 7
31412 	, 3
111469 	, 0
47984 	, 8
33337 	, 1


In [88]:
def avg_num_of_docs_written_by_authors_who_have_written_more_than_one_doc(qid):
    documents = reldf[reldf.qid == qid].doc_id.to_list()
    
    authors = []
    for did in documents:
         authors = authors + doc_to_author[did]
            
    authors = list(set(authors))
    
    nauthors = 0
    numdo = 0
    for author in authors:
        numbdo = len(set(author_to_doc[author]).intersection(set(documents)))
        
        if  numbdo > 1:
            nauthors += 1
            numdo += numbdo
    print(qid, "\t,",nauthors, "\t,", numdo / max(nauthors,1))

In [89]:
for qid in edf.head(10).qid.to_list():
    avg_num_of_docs_written_by_authors_who_have_written_more_than_one_doc(qid)

132049 	, 7 	, 2.0
71096 	, 9 	, 2.7777777777777777
15395 	, 2 	, 2.0
21984 	, 1 	, 2.0
52728 	, 3 	, 2.0
54538 	, 9 	, 2.3333333333333335
35778 	, 1 	, 2.0
77011 	, 10 	, 2.1
27887 	, 1 	, 2.0
96265 	, 0 	, 0.0


In [90]:
for qid in edf.tail(10).qid.to_list():
    avg_num_of_docs_written_by_authors_who_have_written_more_than_one_doc(qid)

82759 	, 3 	, 3.0
5762 	, 1 	, 2.0
24848 	, 0 	, 0.0
76875 	, 1 	, 3.0
30631 	, 0 	, 0.0
11856 	, 7 	, 3.5714285714285716
31412 	, 3 	, 2.0
111469 	, 0 	, 0.0
47984 	, 8 	, 2.875
33337 	, 1 	, 2.0


In [91]:
(2.0 + 2.77777777 + 2 + 2+ 2+ 2.3333333333333+2+2.1+2+0)/10

1.9211111103333303

In [92]:
(3.0 +2.0 + 0.0 +3.0+0.0+3.5714285714285716+2.0+0.0+2.875+2.0)/10

1.844642857142857

In [66]:
for qid in edf.head(10).qid.to_list():
    documents = reldf[reldf.qid == qid].doc_id.to_list()
    authors = []
    for did in documents:
         authors = authors + doc_to_author[did]
    authors = list(set(authors))
    
    for author in authors:
        print(author, "\t, ", set(author_to_doc[author]).intersection(set(documents)))
    print()


34623390 	,  {'5f91afa4ba1591dfd9d35450ac07f41dca1ce4f6'}
35846943 	,  {'e13918f61745d29a2e84101ebfc46e9487d38fca'}
4595739 	,  {'c1be35f92224ac905e2e1e8f3ca86cfe88316f09'}
1399174156 	,  {'a5b1fe9129df3f499eeeb7c30913109e448a1174'}
117813613 	,  {'0560a08ef071d54a60f2efac0bf3a6505ae36910'}
4267637 	,  {'6c6fc8896569067cf30b0f2349933c7462809912'}
47787321 	,  {'6da29e3f7e9ab2cbedccb71201ae62ed52f949a4'}
121777168 	,  {'ca614b83b9496ef4893881a713195c5389c19ec8'}
11772386 	,  {'e13918f61745d29a2e84101ebfc46e9487d38fca'}
31742254 	,  {'5f91afa4ba1591dfd9d35450ac07f41dca1ce4f6'}
49472102 	,  {'35f38d22d658330d980eb78f4774ad1a68a11e30'}
145807992 	,  {'c1be35f92224ac905e2e1e8f3ca86cfe88316f09'}
4798383 	,  {'67315d11bec6fce71080a86283a3255e294b89c5'}
33884244 	,  {'e13918f61745d29a2e84101ebfc46e9487d38fca'}
2225836 	,  {'8462f4b6590713db112d6fc665b43cfeb88bab41'}
2567399 	,  {'35f38d22d658330d980eb78f4774ad1a68a11e30'}
1381562872 	,  {'7dff8a6d32efbaa16fb7510b9c3347985de71596'}
144806171 	,

In [67]:
for qid in edf.tail(10).qid.to_list():
    documents = reldf[reldf.qid == qid].doc_id.to_list()
    authors = []
    for did in documents:
         authors = authors + doc_to_author[did]
    authors = list(set(authors))
    
    for author in authors:
        print(author, "\t, ", set(author_to_doc[author]).intersection(set(documents)))
    print()


143767989 	,  {'e86f71ca2948d17b003a5f068db1ecb2b77827f7'}
8455031 	,  {'d09bec5af4eef5038e48b26b6c14098f95997114'}
3055921 	,  {'e0fcf8e73b5ac5cd30210892658eb52798e24f20'}
47971768 	,  {'e86f71ca2948d17b003a5f068db1ecb2b77827f7'}
2451989 	,  {'bf214c0f0ca5523a1e87521badcdca99be91244e'}
46349627 	,  {'f9441005143eac86fd045b194274cf8ea6b8169d'}
1789429 	,  {'e7ef9b506b1e4614858351a6a148774e76c1efad'}
1394175660 	,  {'ef8ab2a0be51a0cd04c2c0f01adfae956a2a84af'}
34313265 	,  {'d09bec5af4eef5038e48b26b6c14098f95997114'}
1413992630 	,  {'acb670a61a292109fa827d81749432eb316dd0b6'}
1783184 	,  {'545f142a246db9e65c65ec81f619d3ef093cca64'}
144316750 	,  {'75dd14b3cc47a6fa3a14b6270f18e677dd57a45a'}
1788771 	,  {'bf214c0f0ca5523a1e87521badcdca99be91244e'}
71073694 	,  {'4f95d386824643bf8e6cb5a54f3a7be0b4e578d3'}
145373997 	,  {'cc04e4a58098d675adf18aa772620fe3e620b5f9'}
145981974 	,  {'d09bec5af4eef5038e48b26b6c14098f95997114'}
1693967 	,  {'cc04e4a58098d675adf18aa772620fe3e620b5f9'}
1742986 	,  {

In [40]:
documents = reldf[reldf.qid == 132049].doc_id.to_list()
for author in authors:
    print(author, "\t, ", set(author_to_doc[author]).intersection(set(documents)))

    

34623390 	,  {'5f91afa4ba1591dfd9d35450ac07f41dca1ce4f6'}
35846943 	,  {'e13918f61745d29a2e84101ebfc46e9487d38fca'}
4595739 	,  {'c1be35f92224ac905e2e1e8f3ca86cfe88316f09'}
1399174156 	,  {'a5b1fe9129df3f499eeeb7c30913109e448a1174'}
117813613 	,  {'0560a08ef071d54a60f2efac0bf3a6505ae36910'}
4267637 	,  {'6c6fc8896569067cf30b0f2349933c7462809912'}
47787321 	,  {'6da29e3f7e9ab2cbedccb71201ae62ed52f949a4'}
121777168 	,  {'ca614b83b9496ef4893881a713195c5389c19ec8'}
11772386 	,  {'e13918f61745d29a2e84101ebfc46e9487d38fca'}
31742254 	,  {'5f91afa4ba1591dfd9d35450ac07f41dca1ce4f6'}
49472102 	,  {'35f38d22d658330d980eb78f4774ad1a68a11e30'}
145807992 	,  {'c1be35f92224ac905e2e1e8f3ca86cfe88316f09'}
4798383 	,  {'67315d11bec6fce71080a86283a3255e294b89c5'}
33884244 	,  {'e13918f61745d29a2e84101ebfc46e9487d38fca'}
2225836 	,  {'8462f4b6590713db112d6fc665b43cfeb88bab41'}
2567399 	,  {'35f38d22d658330d980eb78f4774ad1a68a11e30'}
1381562872 	,  {'7dff8a6d32efbaa16fb7510b9c3347985de71596'}
144806171 	,