In [1]:
from IPython.display import HTML, display

def set_css(*_args, **_kwargs):
  """Inject a CSS rule that makes `<pre>` blocks wrap long lines.

  Registered below as a `pre_run_cell` callback, so the style is displayed
  again before every cell execution.

  Any positional or keyword arguments are accepted and ignored: IPython may
  pass an execution-info object to `pre_run_cell` callbacks, and a strictly
  zero-argument callback can fail where no signature adaptation is done.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Note: re-running this cell registers one more callback each time.
get_ipython().events.register('pre_run_cell', set_css)


**MS MARCO** stands for **M**icro**s**oft **Ma**chine **R**eading **Co**mprehension. It is described as ["a collection of large scale datasets for deep learning related to Search"](https://microsoft.github.io/msmarco/). One of the tasks this collection has been used for is Question Answering, which I find to be the closest thing to what a RAG retriever does, given how RAG queries are usually formulated. The dataset contains the following columns: query_id, query_type, query, passages, answers, and wellFormedAnswers. *Passages* is a list of 10 passages, each labeled 1 or 0 according to whether or not the passage was used to formulate the answer. (For most lists of passages, only one out of ten is labeled as relevant.) I chose to concatenate the "useful" passages into one document rather than just use the human-formulated answer to a query, since, again, in practice RAG is used to query a collection of documents rather than to look through a bunch of answers to find the one that corresponds to the question.


In [2]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the data file read later
# lives under this mount point.
mount_point = '/content/gdrive'
drive.mount(mount_point)

Mounted at /content/gdrive


In [3]:
import json

# Read the MS MARCO JSON file from the mounted Drive folder.
# The `with` block closes the file handle as soon as parsing is done, and the
# encoding is stated explicitly so the read does not depend on the locale.
with open('/content/gdrive/MyDrive/test_RAG/dev_v2.1.json', encoding='utf-8') as f:
  data = json.load(f)

In [7]:
# Remove the rows that do not contain an answer to the query.
# A row counts as unanswered when its answer list is empty (which would
# otherwise raise IndexError on `[0]`) or when its first answer is the
# placeholder string below.
answers = data['answers']
unanswered = [
  ind for ind, answer_list in answers.items()
  if not answer_list or answer_list[0] == 'No Answer Present.'
]
# Collect first, delete afterwards: a dict must not change size while it is
# being iterated.
for ind in unanswered:
  del answers[ind]

# Keep only the indices of the rows that do contain an answer
indices = list(answers.keys())

print(f"Number of rows in the dataset: {len(indices)}")

# Show every column of the first remaining row as a sanity check.
print('\nAn example of one the rows:')
ind = indices[0]
for key in data.keys():
  print(key + ":")
  print(data[key][ind])

Number of rows in the dataset: 55636

An example of one the rows:
answers:
['A corporation is a company or group of people authorized to act as a single entity and recognized as such in law.']
passages:
[{'is_selected': 0, 'passage_text': 'A company is incorporated in a specific nation, often within the bounds of a smaller subset of that nation, such as a state or province. The corporation is then governed by the laws of incorporation in that state. A corporation may issue stock, either private or public, or may be classified as a non-stock corporation. If stock is issued, the corporation will usually be governed by its shareholders, either directly or indirectly.', 'url': 'http://www.wisegeek.com/what-is-a-corporation.htm'}, {'is_selected': 0, 'passage_text': 'Today, there is a growing community of more than 2,100 Certified B Corps from 50 countries and over 130 industries working together toward 1 unifying goal: to redefine success in business. Join the Movement', 'url': 'https://www

In [9]:
import pandas as pd

# Build one [query, document] pair per answered row. The document is the text
# of every passage flagged `is_selected`, in the original passage order.
dataset = []

for ind in indices:
  selected_texts = [
    passage['passage_text']
    for passage in data['passages'][ind]
    if passage['is_selected'] != 0
  ]
  # Join with a space so the last word of one passage does not fuse with the
  # first word of the next. A row with no selected passage yields an empty
  # document, as before.
  document = " ".join(selected_texts)
  dataset.append([data['query'][ind], document])

df = pd.DataFrame(dataset, columns=['query', 'document'])

In [10]:
# Preview the first five query/document pairs.
df.head(n=5)

Unnamed: 0,query,document
0,. what is a corporation?,McDonald's Corporation is one of the most reco...
1,why did rachel carson write an obligation to e...,The Obligation to Endure by Rachel Carson Rach...
2,symptoms of a dying mouse,The symptoms are similar but the mouse will be...
3,average number of lightning strikes per day,Although many lightning flashes are simply clo...
4,can you burn your lawn with fertilizer,Fertilizer burn is the result of over fertiliz...


In [12]:
# Save the retrieval dataset to Drive. With the default settings the
# DataFrame index is written as an extra, unnamed first column.
output_csv_path = '/content/gdrive/MyDrive/test_RAG/MS_MARCO_retrieval.csv'
df.to_csv(output_csv_path)