# Installing Modules

In [None]:
!pip install -qq farm-haystack

[K     |████████████████████████████████| 180 kB 5.3 MB/s 
[K     |████████████████████████████████| 8.4 MB 36.1 MB/s 
[K     |████████████████████████████████| 85 kB 4.0 MB/s 
[K     |████████████████████████████████| 52 kB 930 kB/s 
[K     |████████████████████████████████| 54 kB 3.1 MB/s 
[K     |████████████████████████████████| 59 kB 6.9 MB/s 
[K     |████████████████████████████████| 5.6 MB 35.7 MB/s 
[K     |████████████████████████████████| 321 kB 48.6 MB/s 
[K     |████████████████████████████████| 3.8 MB 34.5 MB/s 
[K     |████████████████████████████████| 56 kB 4.0 MB/s 
[K     |████████████████████████████████| 100 kB 9.8 MB/s 
[K     |████████████████████████████████| 79 kB 8.2 MB/s 
[K     |████████████████████████████████| 204 kB 51.9 MB/s 
[K     |████████████████████████████████| 981 kB 42.4 MB/s 
[K     |████████████████████████████████| 3.4 MB 36.2 MB/s 
[K     |████████████████████████████████| 50 kB 6.9 MB/s 
[K     |███████████████████████████████

You need to restart the runtime before proceeding!

# Preprocessing Data

In [None]:
import os

# Directory that receives the raw SQuAD download and the flattened copy.
squad_dir = '/content/data/squad'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(squad_dir, exist_ok=True)

In [None]:
# Base location of the SQuAD files; each file name below is appended to it.
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

# Files to fetch from `url` (only the v2.0 training split).
files = [
    "train-v2.0.json",
]

In [None]:
import requests

# Download every entry of `files` from `url` into `squad_dir`.
for file in files:
    # stream=True keeps the response body out of memory so the chunked write
    # below actually streams to disk; the timeout stops a stalled connection
    # from hanging the cell forever.
    with requests.get(url + file, stream=True, timeout=60) as res:
        # Fail loudly on 4xx/5xx instead of saving an error page as "JSON".
        res.raise_for_status()
        with open(os.path.join(squad_dir, file), 'wb') as f:
            # 1 MiB chunks: the previous 40-byte chunks meant over a million
            # tiny writes for a file of this size.
            for chunk in res.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

In [None]:
import json

# Parse the downloaded training file from `squad_dir` into `squad`.
raw_squad_path = os.path.join(squad_dir, 'train-v2.0.json')
with open(raw_squad_path, 'rb') as raw_file:
    squad = json.load(raw_file)

In [None]:

def _first_answer_text(qa):
    """Return the text of the first entry under 'answers', falling back to
    'plausible_answers'; None when neither key holds a non-empty list."""
    for key in ('answers', 'plausible_answers'):
        if key in qa and len(qa[key]) > 0:
            return qa[key][0]['text']
    # not expected to happen, but keep the sample with an empty answer
    return None


def _iter_samples(dataset):
    """Walk groups -> paragraphs -> qa pairs and yield one flat record per question."""
    for article in dataset['data']:
        for para in article['paragraphs']:
            # the passage is shared by every question of this paragraph
            passage = para['context']
            for qa in para['qas']:
                yield {
                    'question': qa['question'],
                    'answer': _first_answer_text(qa),
                    'context': passage,
                }


# flat list of {'question', 'answer', 'context'} dictionaries
new_squad = list(_iter_samples(squad))

In [None]:
# Write the flattened records to train.json inside `squad_dir`.
flat_squad_path = os.path.join(squad_dir, 'train.json')
with open(flat_squad_path, 'w') as out_file:
    json.dump(new_squad, out_file)

# Reader

In [None]:
from haystack.reader.farm import FARMReader

09/15/2021 07:35:21 - INFO - faiss.loader -   Loading faiss with AVX2 support.
09/15/2021 07:35:21 - INFO - faiss.loader -   Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
09/15/2021 07:35:21 - INFO - faiss.loader -   Loading faiss.
09/15/2021 07:35:21 - INFO - faiss.loader -   Successfully loaded faiss.
09/15/2021 07:35:21 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [None]:
# Reader built from the 'deepset/bert-base-cased-squad2' checkpoint, with GPU
# use requested.
reader = FARMReader(
    use_gpu=True,
    model_name_or_path="deepset/bert-base-cased-squad2",
)

09/15/2021 07:35:23 - INFO - farm.utils -   Using device: CUDA 
09/15/2021 07:35:23 - INFO - farm.utils -   Number of GPUs: 1
09/15/2021 07:35:23 - INFO - farm.utils -   Distributed Training: False
09/15/2021 07:35:23 - INFO - farm.utils -   Automatic Mixed Precision: None
09/15/2021 07:35:23 - INFO - filelock -   Lock 140044122316624 acquired on /root/.cache/huggingface/transformers/15cc42d3f2accfabf2128048289832150b75adafd7ab2b50e37a998b71c528b4.250bb3d860b46cc2180effc9227c0d2faf19405a1585c714c721dcc1a1d04727.lock


Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]

09/15/2021 07:35:23 - INFO - filelock -   Lock 140044122316624 released on /root/.cache/huggingface/transformers/15cc42d3f2accfabf2128048289832150b75adafd7ab2b50e37a998b71c528b4.250bb3d860b46cc2180effc9227c0d2faf19405a1585c714c721dcc1a1d04727.lock
09/15/2021 07:35:23 - INFO - filelock -   Lock 140044116310672 acquired on /root/.cache/huggingface/transformers/4fb91611eb90525e5d3e1dc48e72a8c8a9d19f8bb138af51f0795a020acf8bfa.eb6109ce49f42129661193e04ce291fda8ccb3407daa60a09d011e671c60a188.lock


Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

09/15/2021 07:36:10 - INFO - filelock -   Lock 140044116310672 released on /root/.cache/huggingface/transformers/4fb91611eb90525e5d3e1dc48e72a8c8a9d19f8bb138af51f0795a020acf8bfa.eb6109ce49f42129661193e04ce291fda8ccb3407daa60a09d011e671c60a188.lock
Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
09/15/2021 07:36:16 - INFO - filelock -   Lock 140044066754384 acquired on /root/.cache/huggingface/transformers/a027c4e547fd30e501f9edb04ed6401

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

09/15/2021 07:36:16 - INFO - filelock -   Lock 140044066754384 released on /root/.cache/huggingface/transformers/a027c4e547fd30e501f9edb04ed6401edefe7eede664b9528d4babaf78d1bda6.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
09/15/2021 07:36:16 - INFO - filelock -   Lock 140044066280208 acquired on /root/.cache/huggingface/transformers/5fa07cc35cf92053100af63a5b23424ba428c75dafc83de837f7c2bd5118a8b1.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

09/15/2021 07:36:17 - INFO - filelock -   Lock 140044066280208 released on /root/.cache/huggingface/transformers/5fa07cc35cf92053100af63a5b23424ba428c75dafc83de837f7c2bd5118a8b1.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
09/15/2021 07:36:17 - INFO - filelock -   Lock 140044066346512 acquired on /root/.cache/huggingface/transformers/8d45cf43007a2f83a66ff292adf10e02ae1859acd064791f6b6417f7e528b1f3.3e813ef7b46a58d7e35c90b0aacc2127a3800bba6bff791af1651012cf2fab11.lock


Downloading:   0%|          | 0.00/152 [00:00<?, ?B/s]

09/15/2021 07:36:17 - INFO - filelock -   Lock 140044066346512 released on /root/.cache/huggingface/transformers/8d45cf43007a2f83a66ff292adf10e02ae1859acd064791f6b6417f7e528b1f3.3e813ef7b46a58d7e35c90b0aacc2127a3800bba6bff791af1651012cf2fab11.lock
09/15/2021 07:36:17 - INFO - farm.utils -   Using device: CUDA 
09/15/2021 07:36:17 - INFO - farm.utils -   Number of GPUs: 1
09/15/2021 07:36:17 - INFO - farm.utils -   Distributed Training: False
09/15/2021 07:36:17 - INFO - farm.utils -   Automatic Mixed Precision: None
09/15/2021 07:36:17 - INFO - farm.infer -   Got ya 2 parallel workers to do inference ...
09/15/2021 07:36:17 - INFO - farm.infer -    0    0 
09/15/2021 07:36:17 - INFO - farm.infer -   /w\  /w\
09/15/2021 07:36:17 - INFO - farm.infer -   /'\  / \
09/15/2021 07:36:17 - INFO - farm.infer -     


# FAISS

In [None]:
import os

# Directory that holds the SQLite database and the saved FAISS index.
path = 'models/faiss'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

In [None]:
from haystack.document_store.faiss import FAISSDocumentStore

In [None]:
# FAISS-backed store using a 'Flat' index factory string; its SQL side lives
# in a SQLite database under `path`.
document_store = FAISSDocumentStore(
    sql_url=f"sqlite:///{path}/squad_train.db",
    faiss_index_factory_str="Flat",
    return_embedding=True,
)

In [None]:
import json

# Read the flattened SQuAD records back into `squad` (this rebinds the name
# that held the raw download earlier in the notebook).
# NOTE(review): this relative path only matches the file written under
# '/content/data/squad' when the working directory is /content - confirm.
with open('./data/squad/train.json') as train_file:
    squad = json.load(train_file)

In [None]:
from haystack import Document

In [None]:
# Keep each passage once. dict.fromkeys() drops duplicates while preserving
# first-seen order, so the documents are created in the same order on every
# run; the previous list(set(...)) round-trip ordered them arbitrarily.
contexts = list(dict.fromkeys(sample['context'] for sample in squad))

# Wrap every unique passage in a haystack Document.
squad_docs = [Document(text=passage) for passage in contexts]

In [None]:
# Start from an empty store. delete_all_documents() is deprecated (it prints a
# warning asking for delete_documents() instead), so call the replacement.
document_store.delete_documents()

                1. delete_all_documents() method is deprecated, please use delete_documents method
                For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/1045
                


In [None]:
# Store the passage documents built above in the document store.
document_store.write_documents(squad_docs)

In [None]:
# update_embeddings() needs a retriever, but `retriever` is otherwise first
# created in the "DPR" section further down, so on a fresh kernel this cell
# raised NameError. Build it here, with the same settings as that section, so
# the notebook runs top to bottom.
from haystack.retriever.dense import DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model='facebook/dpr-question_encoder-single-nq-base',
    passage_embedding_model='facebook/dpr-ctx_encoder-single-nq-base',
    use_gpu=True,
    embed_title=True
)

# Compute embeddings for the stored documents using the retriever.
document_store.update_embeddings(retriever=retriever)

09/15/2021 07:42:00 - INFO - haystack.document_store.faiss -   Updating embeddings for 1204 docs...
Updating Embedding:   0%|          | 0/1204 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/1216 [00:00<?, ? Docs/s]

Documents Processed: 10000 docs [00:43, 227.91 docs/s]


In [None]:
# Save the document store to squad_train.faiss under `path`.
document_store.save(f'{path}/squad_train.faiss')

# DPR

In [None]:
from haystack.retriever.dense import DensePassageRetriever

# Dense passage retriever over `document_store`, using separate
# 'facebook/dpr-*' checkpoints for queries and for passages.
retriever = DensePassageRetriever(
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    document_store=document_store,
    embed_title=True,
    use_gpu=True,
)

# Getting Answers

## Method 1

This uses the retriever alone: it returns the passage (context) most relevant to the question, without extracting an answer.

In [None]:
# Take the first passage returned for the question and display its text.
top_passage = retriever.retrieve(
    "What century did the Normans first gain their separate identity?"
)[0]
top_passage.text

'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'

## Method 2

This combines the retriever and the reader in one pipeline, so it returns both the answer and the context it was extracted from.

In [None]:
from haystack.pipeline import ExtractiveQAPipeline

In [None]:
# Combine the reader and the retriever into a single extractive QA pipeline.
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [None]:
# Run the full pipeline for one question.
# top_k_retriever=1 presumably limits the retriever to a single passage.
pr = pipeline.run(
    top_k_retriever=1,
    query="What century did the Normans first gain their separate identity?",
)

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 15.59 Batches/s]


In [None]:
# Unpack the first answer of the pipeline result and show it with its context.
best = pr['answers'][0]
answer, context = best['answer'], best['context']

for line in (f'Answer: {answer}', f'Context: {context}'):
    print(line)

Answer: 10th
Context:  and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.


# RAG

In [None]:
from haystack.generator.transformers import RAGenerator

# Generator loaded from the 'facebook/rag-token-nq' checkpoint, with GPU use
# requested; the remaining keyword arguments are generation settings.
generator = RAGenerator(
    model_name_or_path='facebook/rag-token-nq',
    embed_title=True,
    use_gpu=True,
    num_beams=2,
    min_length=2,
    max_length=200,
    top_k=1,
)

09/15/2021 08:00:20 - INFO - filelock -   Lock 140044023305040 acquired on /root/.cache/huggingface/transformers/6337b0203e20d15c98f5e500e1e673c74e71bb8617b2753a53663b9b8e6dfc1a.59948e1fef260da10a0cecb8b6862373c32f40001848a63f985ab4f9d787f3f1.lock


Downloading:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

09/15/2021 08:00:20 - INFO - filelock -   Lock 140044023305040 released on /root/.cache/huggingface/transformers/6337b0203e20d15c98f5e500e1e673c74e71bb8617b2753a53663b9b8e6dfc1a.59948e1fef260da10a0cecb8b6862373c32f40001848a63f985ab4f9d787f3f1.lock
  f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions."
09/15/2021 08:00:21 - INFO - filelock -   Lock 140044063611664 acquired on /root/.cache/huggingface/transformers/26cf899a0974235af1f84469ddd94d2ee83c803c23ecead93b511ce8a0744f5c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

09/15/2021 08:00:21 - INFO - filelock -   Lock 140044063611664 released on /root/.cache/huggingface/transformers/26cf899a0974235af1f84469ddd94d2ee83c803c23ecead93b511ce8a0744f5c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
09/15/2021 08:00:21 - INFO - filelock -   Lock 140044023306512 acquired on /root/.cache/huggingface/transformers/d4df3c917efc1bf4cde9515ac4432cddf040d3ddacfad55c85445bc985f58ceb.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

09/15/2021 08:00:21 - INFO - filelock -   Lock 140044023306512 released on /root/.cache/huggingface/transformers/d4df3c917efc1bf4cde9515ac4432cddf040d3ddacfad55c85445bc985f58ceb.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
09/15/2021 08:00:22 - INFO - filelock -   Lock 140044021479888 acquired on /root/.cache/huggingface/transformers/445caa3aaff6c34b07acdff304db0c8468640baf1139f92c14270fc50cff2eb8.76ea01b4b85ac16e2cec55c398cba7a943d89ab21dfdd973f6630a152e4b9aed.lock


Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

09/15/2021 08:00:22 - INFO - filelock -   Lock 140044021479888 released on /root/.cache/huggingface/transformers/445caa3aaff6c34b07acdff304db0c8468640baf1139f92c14270fc50cff2eb8.76ea01b4b85ac16e2cec55c398cba7a943d89ab21dfdd973f6630a152e4b9aed.lock
09/15/2021 08:00:22 - INFO - filelock -   Lock 140044010728464 acquired on /root/.cache/huggingface/transformers/786598a0d343d4afb34b4f1ee17c14b58fb129abb0b3db386587bea52ffcfb11.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05.lock


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

09/15/2021 08:00:22 - INFO - filelock -   Lock 140044010728464 released on /root/.cache/huggingface/transformers/786598a0d343d4afb34b4f1ee17c14b58fb129abb0b3db386587bea52ffcfb11.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05.lock
09/15/2021 08:00:22 - INFO - filelock -   Lock 140047009152784 acquired on /root/.cache/huggingface/transformers/f5fdd0c4b41c985f791c9a230cdf051da1cf9480d84e0f5f617667760a24f50f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

09/15/2021 08:00:23 - INFO - filelock -   Lock 140047009152784 released on /root/.cache/huggingface/transformers/f5fdd0c4b41c985f791c9a230cdf051da1cf9480d84e0f5f617667760a24f50f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
09/15/2021 08:00:23 - INFO - filelock -   Lock 140047016916624 acquired on /root/.cache/huggingface/transformers/73d054acda2f2b44f075a0d391ed392e6c53f78356bbf279581786ae35a79e4d.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0.lock


Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

09/15/2021 08:00:23 - INFO - filelock -   Lock 140047016916624 released on /root/.cache/huggingface/transformers/73d054acda2f2b44f075a0d391ed392e6c53f78356bbf279581786ae35a79e4d.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0.lock
09/15/2021 08:00:24 - INFO - filelock -   Lock 140044022884624 acquired on /root/.cache/huggingface/transformers/a2f0711fd4c199beaf3ff2626c4a723e107e79a8d065099a3d24e7aa361c0f18.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8.lock


Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

09/15/2021 08:00:24 - INFO - filelock -   Lock 140044022884624 released on /root/.cache/huggingface/transformers/a2f0711fd4c199beaf3ff2626c4a723e107e79a8d065099a3d24e7aa361c0f18.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8.lock
09/15/2021 08:00:24 - INFO - filelock -   Lock 140044066266320 acquired on /root/.cache/huggingface/transformers/6288b70ae87b5989f6801120b25a2f6ccf2e88c2f622d94805717b5b043ff71b.bceb1ea6e95a1b39d0a717f315800a39b0847e5cdeeb42615fb498775f632ee6.lock


Downloading:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

09/15/2021 08:01:34 - INFO - filelock -   Lock 140044066266320 released on /root/.cache/huggingface/transformers/6288b70ae87b5989f6801120b25a2f6ccf2e88c2f622d94805717b5b043ff71b.bceb1ea6e95a1b39d0a717f315800a39b0847e5cdeeb42615fb498775f632ee6.lock
Some weights of RagTokenForGeneration were not initialized from the model checkpoint at facebook/rag-token-nq and are newly initialized: ['rag.generator.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
question = "What century did the Normans first gain their separate identity?"

# Step 1: fetch candidate passages for the question.
retriever_results = retriever.retrieve(query=question)

# Step 2: let the generator produce an answer from those passages.
predicted_result = generator.predict(
    query=question, documents=retriever_results, top_k=1
)

# Step 3: report the first generated answer.
answers = predicted_result["answers"]
print(
    "Generated answer is '{}' for the question = '{}'".format(
        answers[0]["answer"], question
    )
)



Generated answer is ' medieval' for the question = 'What century did the Normans first gain their separate identity?'
