In [None]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

### Install Dependencies

In [None]:
pip install pandas==1.3.5 matplotlib==3.5.2 numpy==1.19.5

### Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

### Download & Save training data file

Download the 'train_v1.1.json' file from the link below and save it locally in the same folder as this notebook.

https://microsoft.github.io/msmarco/

Then edit the JSON file so that its whole content forms a single JSON array:

        _ add an opening bracket [ at the very beginning of the content and a closing bracket ] at the very end

        _ add a comma before each '{"passages"' (except the first match)

### Load & Explore Data

In [None]:
# Read the (manually converted) training file into a DataFrame
df = pd.read_json('train_v1.1.json')

# Total number of questions
total_questions = len(df)
print(f"Total number of questions is {total_questions}")

# Show the first ten rows so the column layout can be inspected
print("Inspect data structure")
df.head(10)

In [None]:
# Flatten the nested 'passages' list of each JSON record into one row per passage

# Load the raw records; json.load parses directly from the open file handle
with open('train_v1.1.json', 'r', encoding="utf8") as f:
    data = json.load(f)

# Flatten in a single pass. The 'meta' fields (query_id, query_type, query and
# answers) are copied from the parent record onto every passage row, so no
# separate normalization without metadata is needed beforehand.
df_nested_list = pd.json_normalize(
    data,
    record_path=['passages'],
    meta=['query_id', 'query_type', 'query', 'answers']
)

df_nested_list.head(30)

In [None]:
# Distinct query types, computed once and reused for both the count and the listing
unique_query_types = df.query_type.unique()
print(f"Total number of query types is {len(unique_query_types)} - {(unique_query_types)}")

# De-duplicated query types; drop_duplicates keeps the order of first
# appearance in the data (the list is not sorted)
query_types = df["query_type"].drop_duplicates().to_list()

# Parallel lists: the type names and the number of questions of each type
query_types_names = list(query_types)
query_types_total_questions = [
    (df["query_type"] == name).sum() for name in query_types
]

for query_type, total_queries in zip(query_types_names, query_types_total_questions):
    print(f"'{query_type}' - total queries: {total_queries}")

### Plot Data

In [None]:
# Values to plot: the number of questions per query type
data = query_types_total_questions

# Single axes on a 6x3 inch figure, created with an equal aspect ratio
fig, ax = plt.subplots(subplot_kw={"aspect": "equal"}, figsize=(6, 3))

def func(pct, allvals):
    """Build a two-line wedge label: the percentage and the absolute count.

    Parameters
    ----------
    pct : float
        Percentage (0-100) represented by the wedge.
    allvals : sequence of numbers
        Every value plotted in the pie; their sum is used to convert the
        percentage back into an absolute count.

    Returns
    -------
    str
        Label of the form ``"25.0%\\n(10)"``.
    """
    total = np.sum(allvals)
    # Convert the share back to a count and round to the nearest integer
    absolute = int(np.round(pct / 100. * total))
    return f"{pct:.1f}%\n({absolute:d})"


# Draw the pie; every wedge is annotated through func with its percentage and
# absolute count, rendered in white text
label_style = {"color": "w"}
wedges, texts, autotexts = ax.pie(
    data,
    autopct=lambda pct: func(pct, data),
    textprops=label_style,
)

# Legend placed to the right of the axes, one entry per query type
ax.legend(
    wedges,
    query_types_names,
    title="Query Type",
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1),
)

ax.set_title("Training Dataset for reading comprehension and question answering - Number of questions per type")

# Make the in-wedge annotations small and bold
plt.setp(autotexts, size=8, weight="bold")

plt.show()