<a href="https://colab.research.google.com/github/parrot-qa/models/blob/main/DPR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-requisites

**Need to Have:** The dataset JSON file `parrot-qa.json` generated using the 'parrot-qa/dataset' repository.

Upload it to a `data` directory.



In [1]:
# Install packages (shell commands executed through IPython's `!` escape)

# Upgrade pip itself before installing anything else
!pip install --upgrade pip

# NOTE(review): none of the installs below pin a version, so a later re-run may resolve different releases.
!pip install datasets
!pip install nltk rouge_score

# Haystack is installed from its GitHub repository; the PyPI install is kept commented out for reference.
#!pip install farm-haystack[colab,faiss]
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]

Collecting pip
  Downloading pip-22.0.4-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 7.4 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.0.4
Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [2]:
# Make sure you have a GPU running: the retriever and the reader below are both created with use_gpu=True
!nvidia-smi

Fri Apr 29 03:33:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Step 1: Dense Passage Retrieval

We will use the DPR model introduced by Karpukhin et al. (2020, https://arxiv.org/abs/2004.04906). 

Original Code: https://fburl.com/qa-dpr

The original reference notebook is [here](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb).


In [3]:
# Constants

# Split documents into pieces, the module respects sentence boundaries.
# Passed to haystack's PreProcessor as `split_length`.
PREPROC_SPLIT_LEN_DOC = 100
# Tie-break used when several answers share the top score: 'student' keeps the
# first top-scored answer, any other value keeps the last one (the instructor
# answer is stored last).
ANSWER_PREFERENCE = 'instructor'  # 'instructor' or 'student'

# Retriever parameters
# Forwarded to DensePassageRetriever as max_seq_len_query, max_seq_len_passage and batch_size.
MAX_SEQ_LEN_QUERY = 256
MAX_SEQ_LEN_PASSAGE = 128
RETRIEVER_BATCH_SIZE = 16

# `top_k` handed to the Retriever and Reader pipeline nodes respectively.
RETRIEVER_TOP_K = 5
READER_TOP_K = 5
# Source of the exported contexts: the content of the retrieved documents
# ('retriever') or the context attached to each reader answer ('reader').
# Any other value makes the pipeline cell raise a RuntimeError.
USE_CONTEXT_FROM = 'retriever'  # 'retriever' or 'reader'

### Cleaning & Indexing

We group documents by course and index them into the DocumentStore.

In [4]:
import re
import json

from haystack.nodes import PreProcessor


def _format_title(title):
    title = ' '.join(re.findall(r'[a-z0-9.-]+', title, re.IGNORECASE))
    return title


def _get_answer(answers):
    max_val = max(answers['score'])
    if ANSWER_PREFERENCE == 'student':
        max_idx = answers['score'].index(max_val)
    else:
        # Instructor answer is stored last
        max_idx = answers['score'][::-1].index(max_val)
        max_idx = len(answers['score']) - 1 - max_idx
    return answers['text'][max_idx]


def extract_docs(dataset):
    """Build the per-course pool of preprocessed documents.

    Entries of `dataset['documents']` are grouped by their 'course' field,
    each group is run through a haystack `PreProcessor` created with
    `split_length=PREPROC_SPLIT_LEN_DOC`, and the result is returned as a
    dict mapping course -> list of processed documents.
    """
    grouped = {}
    for entry in dataset['documents']:
        bucket = grouped.setdefault(entry['course'], [])
        bucket.append({
            'content': entry['passage_text'],
            'meta': {'name': _format_title(entry['article_title'])},
        })

    splitter = PreProcessor(split_length=PREPROC_SPLIT_LEN_DOC)
    docs_db = {
        course: splitter.process(raw_docs)
        for course, raw_docs in grouped.items()
    }

    # The preprocessor output sometimes contains duplicate IDs, so overwrite
    # them with sequential ones; numbering restarts at d0 for every course.
    for processed in docs_db.values():
        for position, piece in enumerate(processed):
            piece.id = f'd{position}'

    return docs_db


def extract_qa_pairs(dataset):
    """Collect the answerable question/answer pairs of every course.

    Returns a dict mapping course -> list of {'question', 'answer'} dicts
    built from `dataset['qa_pairs']`. A pair whose 'is_answerable' equals
    False is skipped, but its course still gets a (possibly empty) list.
    """
    # One list of QA pairs per course
    qa_db = {}

    for pair in dataset['qa_pairs']:
        bucket = qa_db.setdefault(pair['course'], [])
        if pair['is_answerable'] != False:
            bucket.append({
                'question': pair['title'],
                'answer': _get_answer(pair['answers']),
            })

    return qa_db


# Load the dataset JSON described in the pre-requisites section.
# The encoding is passed explicitly so the file is decoded as UTF-8 instead of
# whatever the platform's locale default happens to be.
with open("data/parrot-qa.json", encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)

# Per-course document pools and per-course answerable QA pairs
docs_db = extract_docs(dataset)
qa_db = extract_qa_pairs(dataset)

INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/
ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.
INFO - haystack.telemetry -  Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


100%|██████████| 305/305 [00:00<00:00, 825.65docs/s] 
100%|██████████| 322/322 [00:00<00:00, 819.88docs/s]
100%|██████████| 32/32 [00:00<00:00, 1476.54docs/s]
100%|██████████| 567/567 [00:00<00:00, 660.84docs/s] 


### Document Store & Retriever

#### FAISS

FAISS is a library for efficient similarity search on a cluster of dense vectors.
The `FAISSDocumentStore` uses a SQL database (SQLite in-memory by default) under the hood
to store the document text and other meta data. The vector embeddings of the text are
indexed on a FAISS Index that later is queried for searching answers.
The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
faster search at the expense of some accuracy. Just set the `faiss_index_factory_str` argument in the constructor.
For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index

#### Retriever

**Here:** We use a `DensePassageRetriever`

**Alternatives:**

- The `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)
- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging

In [5]:
import os

from haystack.nodes import DensePassageRetriever
from haystack.document_stores import FAISSDocumentStore

In [6]:
# For each course, embed the pool of documents and create retrievers

# Maps course -> DensePassageRetriever bound to that course's document store
dpr_db = {}

for course, docs in docs_db.items():
    # Remove any database file left over from a previous run before the store is created
    db_file = f'data/faiss_document_store_{course}.db'
    if os.path.isfile(db_file):
        os.remove(db_file)
    document_store = FAISSDocumentStore(
        sql_url=f"sqlite:///{db_file}",
        faiss_index_factory_str="Flat",
    )
    # NOTE(review): duplicate_documents='fail' presumably makes the write raise on
    # duplicate document IDs -- confirm against the haystack documentation.
    document_store.write_documents(docs, duplicate_documents='fail')

    # A new retriever is constructed on every iteration, so both DPR encoders
    # are loaded again for each course (visible as repeated LOADING MODEL logs).
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        max_seq_len_query=MAX_SEQ_LEN_QUERY,
        max_seq_len_passage=MAX_SEQ_LEN_PASSAGE,
        batch_size=RETRIEVER_BATCH_SIZE,
        use_gpu=True,
        embed_title=True,
    )
    # Embed the documents written above using this retriever
    document_store.update_embeddings(retriever)

    dpr_db[course] = retriever


Writing Documents:   0%|          | 0/1688 [00:00<?, ?it/s]

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/493 [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-ctx_encoder-single-nq-base
INFO - haystack.document_stores.faiss -  Updating embeddings for 1688 docs...


Updating Embedding:   0%|          | 0/1688 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/1696 [00:00<?, ? Docs/s]

Writing Documents:   0%|          | 0/1769 [00:00<?, ?it/s]

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
IN

Updating Embedding:   0%|          | 0/1769 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/1776 [00:00<?, ? Docs/s]

Writing Documents:   0%|          | 0/82 [00:00<?, ?it/s]

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
IN

Updating Embedding:   0%|          | 0/82 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/96 [00:00<?, ? Docs/s]

Writing Documents:   0%|          | 0/3608 [00:00<?, ?it/s]

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
IN

Updating Embedding:   0%|          | 0/3608 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/3616 [00:00<?, ? Docs/s]

### Reader

Here we use a FARMReader with the *deepset/roberta-base-squad2* model (see: https://huggingface.co/deepset/roberta-base-squad2)


In [7]:
from haystack.nodes import FARMReader

# Module-level reader instance; attach_context_reader() below uses this global directly.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2 locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.infer -  Got ya 2 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0     0  
INFO - haystack.modeling.infer -  /w\   /w\ 
INFO - haystack.modeling.infer -  /'\   / \ 


### Pipeline

With a Haystack `Pipeline` you can combine your building blocks into a search pipeline.
Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).

In [8]:
from haystack.pipelines import Pipeline, ExtractiveQAPipeline


def attach_context_retriever(qa_db, dpr_db):
    """Attach retrieved passages to every QA pair, in place.

    For each course a retriever-only `Pipeline` is built around
    `dpr_db[course]`. Every question of that course is run through it with
    `top_k=RETRIEVER_TOP_K`, and the content of the returned documents is
    stored under the pair's 'contexts' key. Nothing is returned.
    """
    for course in qa_db:
        retrieval_pipe = Pipeline()
        retrieval_pipe.add_node(component=dpr_db[course], name='Retriever', inputs=['Query'])

        for pair in qa_db[course]:
            result = retrieval_pipe.run(
                query=pair['question'],
                params={"Retriever": {"top_k": RETRIEVER_TOP_K}},
            )
            retrieved = result['documents']
            pair['contexts'] = [document.content for document in retrieved]


def attach_context_reader(qa_db, dpr_db):
    """Attach reader answer contexts to every QA pair, in place.

    For each course an `ExtractiveQAPipeline` is built from `dpr_db[course]`
    and the module-level `reader`. Every question of that course is run
    through it with `RETRIEVER_TOP_K` / `READER_TOP_K`, and the `context` of
    each returned answer is stored under the pair's 'contexts' key.
    Nothing is returned.
    """
    for course in qa_db:
        qa_pipe = ExtractiveQAPipeline(retriever=dpr_db[course], reader=reader)

        for pair in qa_db[course]:
            result = qa_pipe.run(
                query=pair['question'],
                params={"Retriever": {"top_k": RETRIEVER_TOP_K}, "Reader": {"top_k": READER_TOP_K}},
            )
            found = result['answers']
            pair['contexts'] = [answer.context for answer in found]


# Attach contexts from the source selected by USE_CONTEXT_FROM; any value other
# than 'retriever' or 'reader' is rejected before any work is done.
if USE_CONTEXT_FROM not in ('retriever', 'reader'):
    raise RuntimeError('Invalid configuration for selecting context.')

if USE_CONTEXT_FROM == 'reader':
    attach_context_reader(qa_db, dpr_db)
else:
    attach_context_retriever(qa_db, dpr_db)


In [9]:
# Statistics: character counts per QA pair, averaged over all courses

all_pairs = [qa for pairs in qa_db.values() for qa in pairs]

qlengths = [len(qa['question']) for qa in all_pairs]
# A pair's context length is the total over all of its attached contexts
clengths = [sum(map(len, qa['contexts'])) for qa in all_pairs]
alengths = [len(qa['answer']) for qa in all_pairs]

avg_question = sum(qlengths) / len(qlengths)
avg_context = sum(clengths) / len(clengths)
avg_answer = sum(alengths) / len(alengths)

print('Average question length (characters):', round(avg_question))
print('Average context length (characters):', round(avg_context))
print('Average answer length (characters):', round(avg_answer))

Average question length (characters): 269
Average context length (characters): 2344
Average answer length (characters): 217


### Export

In [10]:
# Write contextualized QA pairs to JSON

# Flatten the per-course dict into a single list; every record starts with
# its course followed by the fields of the QA pair.
qa_export = [
    {'course': course, **qa}
    for course, pairs in qa_db.items()
    for qa in pairs
]

with open('data/parrot-qa-ctx.json', 'w') as export_file:
    json.dump(qa_export, export_file, indent=4)


In [11]:
import random

# Shuffle a copy rather than `qa_export` itself: random.shuffle works in place,
# so shuffling the shared list would make a second run of this cell start from
# an already shuffled order and write different splits despite the fixed seed.
qa_shuffled = list(qa_export)
random.seed(0)
random.shuffle(qa_shuffled)

# 80% train / 10% dev / remainder test
N = len(qa_shuffled)
train_len = int(0.8 * N)
dev_len = int(0.1 * N)
test_len = N - train_len - dev_len

# Output path -> slice of the shuffled records
splits = {
    'data/parrot-qa-ctx-train.json': qa_shuffled[:train_len],
    'data/parrot-qa-ctx-dev.json': qa_shuffled[train_len:train_len + dev_len],
    'data/parrot-qa-ctx-test.json': qa_shuffled[train_len + dev_len:],
}
# The test split takes whatever is left after train and dev
assert len(splits['data/parrot-qa-ctx-test.json']) == test_len

for split_path, split_items in splits.items():
    with open(split_path, 'w') as split_file:
        json.dump(split_items, split_file, indent=4)
