### In this notebook, we want to take a closer look at our queries

In [38]:
import pandas as pd

def _load_queries(path):
    """Read a tab-separated, header-less query file and name its columns 'id' and 'text'."""
    frame = pd.read_csv(path, sep='\t', header=None)
    frame.columns = ['id', 'text']
    return frame


def _load_qrels(path):
    """Read a tab-separated, header-less qrel file as a frame with columns 'qid', 'docid', 'rel'.

    The column at position 1 is dropped before naming; per the original author's
    note it is always 0 and therefore carries no information.
    """
    frame = pd.read_csv(path, sep='\t', header=None)
    frame = frame.drop([1], axis=1)
    frame.columns = ['qid', 'docid', 'rel']
    return frame


# Query texts for each split
train_queries = _load_queries('../nfcorpus/train.all.queries')
dev_queries = _load_queries('../nfcorpus/dev.all.queries')
test_queries = _load_queries('../nfcorpus/test.all.queries')

# Pre-computed matrices for each split.
# NOTE(review): read_pickle can execute arbitrary code; presumably these files were
# produced locally by an earlier step of this project -- only load trusted pickles.
train_matrix = pd.read_pickle('pickle/train_matrix.pkl')
dev_matrix = pd.read_pickle('pickle/dev_matrix.pkl')
test_matrix = pd.read_pickle('pickle/test_matrix.pkl')

# Given relevance scores for each split
train_rel = _load_qrels('../nfcorpus/train.3-2-1.qrel')
test_rel = _load_qrels('../nfcorpus/test.3-2-1.qrel')
dev_rel = _load_qrels('../nfcorpus/dev.3-2-1.qrel')

# Stack train, dev and test (in that order) to get statistics over all sets;
# the original row labels of each split are kept as-is.
all_queries = pd.concat([train_queries, dev_queries, test_queries])
all_matrices = pd.concat([train_matrix, dev_matrix, test_matrix])
all_rels = pd.concat([train_rel, dev_rel, test_rel])


In [39]:
# Summarise the 'id' and 'text' columns of the train queries.
# A `unique` value below `count` for 'text' means some query texts occur more than once.
train_queries.describe()

Unnamed: 0,id,text
count,2594,2594
unique,2594,2593
top,PLAIN-321,"gmo - - organic foods , pesticides , women 's ..."
freq,1,2


In [40]:
# Sum every row of the train matrix across its columns, then summarise the
# distribution of those row totals.
# NOTE(review): presumably one row per train query -- confirm against how the pickle was built.
train_matrix.sum(axis='columns').describe()

count      2594.000000
mean      13758.305320
std       30645.993434
min          54.000000
25%         344.000000
50%        1078.500000
75%       15857.250000
max      507038.000000
dtype: float64

In [41]:
# Summarise the 'id' and 'text' columns of the dev queries.
# A `unique` value below `count` for 'text' would mean duplicated query texts.
dev_queries.describe()

Unnamed: 0,id,text
count,325,325
unique,325,325
top,PLAIN-2439,is vanilla almond milk healthy ? how about alm...
freq,1,1


In [42]:
# Sum every row of the dev matrix across its columns, then summarise the
# distribution of those row totals.
# NOTE(review): presumably one row per dev query -- confirm against how the pickle was built.
dev_matrix.sum(axis='columns').describe()

count       325.000000
mean      16006.550769
std       34137.640589
min         206.000000
25%         342.000000
50%        1274.000000
75%       16764.000000
max      362245.000000
dtype: float64

In [43]:
# Summarise the 'id' and 'text' columns of the test queries.
# A `unique` value below `count` for 'text' would mean duplicated query texts.
test_queries.describe()

Unnamed: 0,id,text
count,325,325
unique,325,325
top,PLAIN-2770,"norovirus - - poultry , pesticides , organic f..."
freq,1,1


In [44]:
# Sum every row of the test matrix across its columns, then summarise the
# distribution of those row totals.
# NOTE(review): presumably one row per test query -- confirm against how the pickle was built.
test_matrix.sum(axis='columns').describe()

count       325.000000
mean      14287.843077
std       32762.538922
min         148.000000
25%         346.000000
50%        1490.000000
75%       15539.000000
max      307545.000000
dtype: float64

In [46]:
# Summarise the 'id' and 'text' columns over train, dev and test queries combined.
# A `unique` value below `count` for 'text' means some query texts occur more than once.
all_queries.describe()

Unnamed: 0,id,text
count,3244,3244
unique,3244,3243
top,PLAIN-321,"gmo - - organic foods , pesticides , women 's ..."
freq,1,2


In [47]:
# Sum every row of the combined (train + dev + test) matrix across its columns,
# then summarise the distribution of those row totals.
all_matrices.sum(axis='columns').describe()

count      3244.000000
mean      14036.597411
std       31225.989450
min          54.000000
25%         344.000000
50%        1163.000000
75%       15963.500000
max      507038.000000
dtype: float64

In [45]:
# How many distinct documents are judged relevant at least once?
# (Or: why does the raw corpus contain 5000 docs, while train/dev/test together only have 3633?)

docids_train, docids_dev, docids_test = (
    rel_frame['docid'] for rel_frame in (train_rel, dev_rel, test_rel)
)
docids = pd.concat([docids_train, docids_dev, docids_test])
# dropna=False keeps the count equal to the number of distinct values, missing ones included.
docids.nunique(dropna=False)
# When creating the dataset, the researchers from HD only kept documents that were
# relevant for at least one query.
# This means roughly 1700 documents from the raw corpus were never relevant and were therefore excluded.

3633

In [48]:
# How many relevance scores do we have in total over all sets?
# `count` is the total number of relevance judgements; the remaining rows describe the
# distribution of the 'rel' grade itself. The average number of judgements per query
# is not part of this output.
all_rels.describe()

Unnamed: 0,rel
count,169759.0
mean,1.822755
std,0.457348
min,1.0
25%,2.0
50%,2.0
75%,2.0
max,3.0


In [50]:
# Average number of relevance judgements per query over all sets:
# rows in the combined qrels divided by rows in the combined queries.
all_rels.shape[0] / all_queries.shape[0]

52.33014796547472