# Google NQ Dataset

## CORPUS DATASET

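Read the original corpus dataset from Google NQ (`../data/nq/corpus.jsonl`), keep only the documents whose title is among the first unique titles of the corpus (the cutoff is set in the filtering cell below; this note originally described it as "the first 2000 documents", but the filter works on titles, not on a document count), drop the `metadata` column, and save the result to `../data_filtered/corpus_filtered.jsonl`.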

In [None]:
import pandas as pd

In [None]:
# Load the raw NQ corpus; the file is read as JSON Lines (one record per line).
CORPUS_PATH = '../data/nq/corpus.jsonl'
df_nqcorpus = pd.read_json(CORPUS_PATH, lines=True)

In [None]:
# Preview the first rows of the raw corpus to check the columns that were loaded.
df_nqcorpus.head()

Unnamed: 0,_id,title,text,metadata
0,doc0,Minority interest,"In accounting, minority interest (or non-contr...",{}
1,doc1,Minority interest,"It is, however, possible (such as through spec...",{}
2,doc2,Minority interest,The reporting of 'minority interest' is a cons...,{}
3,doc3,Minority interest,Some investors have expressed concern that the...,{}
4,doc4,Minority interest,Minority interest is an integral part of the e...,{}


In [None]:
# Number of distinct titles in the full corpus (several rows can share one title).
df_nqcorpus["title"].nunique()

108593

In [None]:
# Keep every row whose title is among the first N_TITLES distinct titles, taken in
# order of first appearance in the corpus (Series.unique preserves that order).
# NOTE(review): the saved outputs further down report 200 distinct titles, which does
# not match the value used here -- re-run the notebook or confirm the intended cutoff.
N_TITLES = 20
first_titles = df_nqcorpus["title"].unique()[:N_TITLES]
df_nqcorpus_filtered = df_nqcorpus[df_nqcorpus["title"].isin(first_titles)]


In [None]:
# Display the filtered corpus to inspect which rows were kept.
df_nqcorpus_filtered

Unnamed: 0,_id,title,text,metadata
0,doc0,Minority interest,"In accounting, minority interest (or non-contr...",{}
1,doc1,Minority interest,"It is, however, possible (such as through spec...",{}
2,doc2,Minority interest,The reporting of 'minority interest' is a cons...,{}
3,doc3,Minority interest,Some investors have expressed concern that the...,{}
4,doc4,Minority interest,Minority interest is an integral part of the e...,{}
...,...,...,...,...
7242,doc7242,Statue of Liberty,Hundreds of replicas of the Statue of Liberty ...,{}
7243,doc7243,Statue of Liberty,"As an American icon, the Statue of Liberty has...",{}
7244,doc7244,Statue of Liberty,Depictions of the statue have been used by man...,{}
7245,doc7245,Statue of Liberty,The statue is a frequent subject in popular cu...,{}


In [None]:
# Sanity check: number of distinct titles that survived the filter.
df_nqcorpus_filtered['title'].nunique()

200

In [None]:
# Remove the metadata column before exporting the corpus.
# errors="ignore" keeps this cell re-runnable: the name is reassigned to its own
# result, so a second execution would otherwise raise KeyError because the column
# is already gone.
df_nqcorpus_filtered = df_nqcorpus_filtered.drop(columns=["metadata"], errors="ignore")

In [None]:
# Preview the filtered corpus after the column drop.
df_nqcorpus_filtered.head()

Unnamed: 0,_id,title,text
0,doc0,Minority interest,"In accounting, minority interest (or non-contr..."
1,doc1,Minority interest,"It is, however, possible (such as through spec..."
2,doc2,Minority interest,The reporting of 'minority interest' is a cons...
3,doc3,Minority interest,Some investors have expressed concern that the...
4,doc4,Minority interest,Minority interest is an integral part of the e...


In [None]:
# Export the filtered corpus as JSON Lines: one record (row) per line.
CORPUS_FILTERED_PATH = '../data_filtered/corpus_filtered.jsonl'
df_nqcorpus_filtered.to_json(CORPUS_FILTERED_PATH, orient='records', lines=True)

## QUERIES DATASET

In [None]:
# Load the NQ queries; the file is read as JSON Lines (one record per line).
QUERIES_PATH = '../data/nq/queries.jsonl'
df_nqqueries = pd.read_json(QUERIES_PATH, lines=True)

In [None]:
# Display the queries frame to inspect its columns.
df_nqqueries

Unnamed: 0,_id,text,metadata
0,test0,what is non controlling interest on balance sheet,{}
1,test1,how many episodes are in chicago fire season 4,{}
2,test2,who sings love will keep us alive by the eagles,{}
3,test3,who is the leader of the ontario pc party,{}
4,test4,nitty gritty dirt band fishin in the dark album,{}
...,...,...,...
3447,test3447,when is the met office leaving the bbc,{}
3448,test3448,where does junior want to go to find hope,{}
3449,test3449,who does eric end up with in that 70s show,{}
3450,test3450,where does the great outdoors movie take place,{}


In [None]:
# Load the test-split qrels file; it is tab-separated.
QRELS_TEST_PATH = '../data/nq/qrels/test.tsv'
df_nqtest = pd.read_csv(QRELS_TEST_PATH, sep='\t')

In [None]:
# Display the qrels frame; a query-id can appear on several rows, one per corpus-id.
df_nqtest

Unnamed: 0,query-id,corpus-id,score
0,test0,doc0,1
1,test0,doc1,1
2,test1,doc6,1
3,test2,doc10,1
4,test3,doc17,1
...,...,...,...
4196,test3449,doc117643,1
4197,test3449,doc117646,1
4198,test3450,doc117662,1
4199,test3450,doc117663,1


In [None]:
# Attach to every query the list of corpus ids that the qrels associate with it.
# Step 1: collapse the qrels to one row per "query-id", gathering its "corpus-id"
# values into a Python list (the resulting column keeps the name "corpus-id").
corpus_ids_per_query = (
    df_nqtest
    .groupby('query-id')['corpus-id']
    .apply(list)
    .reset_index()
)
# Step 2: left-join on the query id so every query row is kept, then drop the
# metadata column and the now-redundant "query-id" join key.
df_nqqueries_merge = (
    df_nqqueries
    .merge(corpus_ids_per_query, left_on='_id', right_on='query-id', how='left')
    .drop(columns=['metadata', 'query-id'])
)

In [None]:
# Display the merged frame: each query now carries its list of corpus ids.
df_nqqueries_merge

Unnamed: 0,_id,text,corpus-id
0,test0,what is non controlling interest on balance sheet,"[doc0, doc1]"
1,test1,how many episodes are in chicago fire season 4,[doc6]
2,test2,who sings love will keep us alive by the eagles,[doc10]
3,test3,who is the leader of the ontario pc party,"[doc17, doc18]"
4,test4,nitty gritty dirt band fishin in the dark album,[doc42]
...,...,...,...
3447,test3447,when is the met office leaving the bbc,[doc117531]
3448,test3448,where does junior want to go to find hope,[doc117567]
3449,test3449,who does eric end up with in that 70s show,"[doc117643, doc117646]"
3450,test3450,where does the great outdoors movie take place,"[doc117662, doc117663]"


In [None]:
# Keep only the queries whose "corpus-id" list is entirely contained in the
# filtered corpus (df_nqcorpus_filtered).
# The corpus ids are collected into a set once, so each membership test is a hash
# lookup instead of a scan of the whole "_id" column for every id of every query.
kept_doc_ids = set(df_nqcorpus_filtered['_id'])
# The queries/qrels merge is a left join, so a query without any qrels entry holds
# NaN here instead of a list; the isinstance guard drops such rows instead of
# failing with "TypeError: 'float' object is not iterable".
df_nqqueries_merge_filtered = df_nqqueries_merge[
    df_nqqueries_merge['corpus-id'].apply(
        lambda ids: isinstance(ids, list) and kept_doc_ids.issuperset(ids)
    )
]

In [None]:
# Display the queries that survived the corpus-membership filter.
df_nqqueries_merge_filtered

Unnamed: 0,_id,text,corpus-id
0,test0,what is non controlling interest on balance sheet,"[doc0, doc1]"
1,test1,how many episodes are in chicago fire season 4,[doc6]
2,test2,who sings love will keep us alive by the eagles,[doc10]
3,test3,who is the leader of the ontario pc party,"[doc17, doc18]"
4,test4,nitty gritty dirt band fishin in the dark album,[doc42]
...,...,...,...
195,test195,where are mucosal associated lymphoid tissues ...,[doc7125]
196,test196,when is the publishers clearing house sweepsta...,[doc7150]
197,test197,where does summer of the monkeys take place,[doc7155]
198,test198,did the cast of friends really go to london,"[doc7161, doc7166]"


In [None]:
# Export the filtered queries as JSON Lines: one record (row) per line.
QUERIES_FILTERED_PATH = '../data_filtered/queries_filtered.jsonl'
df_nqqueries_merge_filtered.to_json(QUERIES_FILTERED_PATH, orient='records', lines=True)