Check for GPU

In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

### Get text from documents

In [3]:
from haystack.nodes import PreProcessor
from haystack.utils import convert_files_to_dicts



Use haystack utils to read pdf documents and convert them to the dict convention of haystack

In [4]:
all_docs = convert_files_to_dicts(dir_path="../data/aerospace/")

INFO - haystack.utils.preprocessing -  Converting ../data/aerospace/7110.10BB_Basic_w_Chg_1_dtd_12-2-21.pdf
INFO - haystack.utils.preprocessing -  Converting ../data/aerospace/7110.65Z_ATC_Bsc_w_Chg_1_dtd_12-2-21.pdf


### Preprocess docs

Although Haystack provides a processor useful enough for most of the documents, to obtain production results a specific and dedicated preprocessing of the documents is usually needed.

Therefore, to understand documents and variability of them is mandatory to process them successfuly.

In this case, documents are structured always as follows:
    - First, a history of changes.
    - Secondly, table of contents.
    - Finally, the content.

Chapter and sections have the same naming convention, so it is easy to split them acording to subsections and save chapter and section as document metadata, which may help in the future to filter documents. 
    - Chapters always start with <Chapter>, followed by the number of the chapter and the name. They don't end with a fullstop.
    - Sections always start with <Section>, followed by the number of the section and the name. They don't end with a fullstop.
    - Subsections always start with the number of the subsection, which is: <chapter number>-<section number>-<subsection number>, followed by the name of the subsection in uppercase.

Also, headers and footers are well defined. Headers contain the filename and the date, and the footers always have section name and subsection number.

In [5]:
import re

In [6]:
docs = []
for doc in all_docs:
    name = doc['meta']['name']
    content = doc['content']
    # Remove the history of changes and the table of contents by starting at the chapter 1
    doc_init = re.search("(?<!Table of Contents\s)Chapter 1\. General", content).span()[0]
    content = content[doc_init:]
    new_content = ""
    chapter = ""
    chapter_number = ""
    section = ""
    section_number = ""
    subsection = ""
    subsection_number = ""
    for line in content.split("\n"):
        line = re.sub(" +", " ", line)  # Replace more than one space by only one
        # Check if line is a chapter declaration
        res_pat_chapter = re.findall("Chapter (\d)\. ([A-Za-z\s]*)", line)
        if res_pat_chapter and line[-1] != ".":
            chapter_tmp = res_pat_chapter[0][1]
            chapter_number_tmp = res_pat_chapter[0][0]
        # Check if line is a section declaration
        res_pat_section = re.findall("Section (\d)\. ([A-Za-z\s]*)", line) 
        if res_pat_section and line[-1] != ".":
            section_tmp = res_pat_section[0][1]
            section_number_tmp = res_pat_section[0][0]
        
        # Check is line is a subsection declaration. In case it is, store line before as a new document
        res_pat = re.findall("^\d[\-−]\d[\-−]\d\.", line)
        if (line.isupper() and res_pat) or res_pat_chapter or res_pat_section:
            if new_content:
                docs.append({
                    'content': new_content, 
                    'meta': {
                        'name': name,
                        'chapter': chapter,
                        'chapter_number': chapter_number,
                        'section': section,
                        'section_number': section_number,
                        'subsection': subsection,
                        'subsection_number': subsection_number
                    }
                })
            new_content = line if line.isupper() else ""
            subsection = line if line.isupper() else subsection
            subsection_number = res_pat[0] if res_pat else subsection_number
            chapter = chapter_tmp
            chapter_number = chapter_number_tmp
            section = section_tmp if res_pat_section else section
            section_number = section_number_tmp if res_pat_section else section_number
        # If it's not a subsection, add the line to the new document content
        else:
            # Check for headers or footers
            is_date = bool(re.findall("\d/\d/\d", line))
            is_filename = bool(re.findall("[A-Z][2] [0-9][4]\.[0-9]{,3}[A-Z]{,3}", line))
            is_section_number = bool(re.findall("\d[\-−]\d[\-−]\d", line))
            if line != section and not is_date and not is_filename and not is_section_number:
                new_content += "\n" + line

Create a generator to take a look at the documents

In [7]:
def gen_doc():
    for doc in docs:
        yield doc
    
gen = gen_doc()

In [8]:
next(gen)

{'content': '1-1-1. PURPOSE OF THIS ORDER\nThis order prescribes procedures and phraseology for\nuse by air traffic personnel providing flight services.\nFlight service specialists are required to be familiar\nwith the provisions of this order that pertain to their\noperational responsibilities and to exercise their best\njudgment if they encounter situations that are not\ncovered.',
 'meta': {'name': '7110.10BB_Basic_w_Chg_1_dtd_12-2-21.pdf',
  'chapter': 'General',
  'chapter_number': '1',
  'section': 'Introduction',
  'section_number': '1',
  'subsection': '1-1-1. PURPOSE OF THIS ORDER',
  'subsection_number': '1-1-1.'}}

### Store them into ElasticSearch

Store documents into an ElasticSearch cluster. This will help to retrieve documents with the QA pipelines.

In [9]:
from haystack.document_stores import ElasticsearchDocumentStore

In [10]:
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")



In [11]:
document_store.write_documents(docs)

### Retrieval

Haystack provides nodes to retrieve documents in different ways: based on TF-IDF, DPR, Tables, Embeddings, etc.

A complete understanding of documents and the usecase is needed to make good decissions about which one use.

In this case, documents are mostly written in natural language, they have some tables though. 
Users can make question in different ways, so to use a TF-IDF retriever would not be a good choice as it may fails depending on what words the user uses. 
The split has been done by subsections, so each document in elastic can have a lot of text, not like questions which will often be short.
Therefore, DPR is the best choice, as it will encode query and context in a different way, in order to get the best matches.

Next steps:
    - As there are some tables in the documents, to detect and parse them would be a good idea to find information within them, using the TableRetriever
    - To use a NER to save named entities and to have an ontology to save entities from it would be also of interest to filter documents or to train a TF-IDF retriever based on these entities.
    - As it is a complex task, to make users to select some features (like section, plane model, etc) would help a lot
    - As always, to finetune a model over these documents would improve language models performance.

In [12]:
from haystack.nodes import DensePassageRetriever

In [13]:
retriever = DensePassageRetriever(document_store=document_store, use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
IN

In [14]:
# Update embeddings for DPR
document_store.update_embeddings(retriever)

INFO - haystack.document_stores.elasticsearch -  Updating embeddings for all 1023 docs ...


Updating embeddings:   0%|          | 0/1023 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/1024 [00:00<?, ? Docs/s]

### Load QA model

In [5]:
from haystack.nodes import FARMReader

In [6]:
# Use a standard model for Extractive QA
# It would be better to finetune a model over these documents, but for a demo is enough
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True, num_processes=1)

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2 locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2
INFO - haystack.modeling.logger -  ML Logging is turned off. No parameters, metrics or artifacts will be logged to MLFlow.
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1


### Create the QA Pipe

In [7]:
from haystack.pipelines import ExtractiveQAPipeline

In [8]:
pipe = ExtractiveQAPipeline(reader, retriever)

### Ask!

In [9]:
from haystack.utils import print_answers

In [10]:
question = "What should operational systems record?"
# Document: JO 7110.10BB 
# Subsection: 2−1−8.LOGGING PILOT BRIEFINGS 
# the facility/sector, date, position, time, and specialist identification for each logged briefing
prediction = pipe.run(
    query=question
)

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  3.58 Batches/s]


In [11]:
[(answer.answer, answer.meta) for answer in prediction['answers']]

[('the facility/sector, date,\nposition, time, and specialist identification',
  {'chapter': 'Pilot Briefing',
   'subsection': '2-1-8. LOGGING PILOT BRIEFINGS',
   'subsection_number': '2-1-8.',
   'chapter_number': '2',
   'section': 'General',
   'section_number': '1',
   'name': '7110.10BB_Basic_w_Chg_1_dtd_12-2-21.pdf'}),
 ('Aircraft contact information',
  {'chapter': 'ICAO designators and',
   'subsection': '5-1-5. METHODS OF RECORDING DATA',
   'subsection_number': '5-1-5.',
   'chapter_number': '5',
   'section': 'General',
   'section_number': '1',
   'name': '7110.10BB_Basic_w_Chg_1_dtd_12-2-21.pdf'}),
 ('Aircraft contact information',
  {'chapter': 'Inflight Services',
   'subsection': '3-2-2. METHODS OF RECORDING DATA',
   'subsection_number': '3-2-2.',
   'chapter_number': '3',
   'section': 'Data Recording',
   'section_number': '2',
   'name': '7110.10BB_Basic_w_Chg_1_dtd_12-2-21.pdf'}),
 ('Flight plan information',
  {'chapter': 'Inflight Services',
   'subsection': '3

In [None]:
answers = []
for answer in prediction['answers'][:10]:
    answers.append({
        'answer': answer.answer,
        'chapter': f"Chapter {answer.meta['chapter_number']}: {answer.meta['chapter']}",
        'section': f"Section {answer.meta['section_number']}: {answer.meta['section']}",
        'subsection': f"subsection {answer.meta['subsection_number']}: {answer.meta['subsection']}"
    })