In [1]:
! pip install datasets
! pip install farm-haystack[elasticsearch]
! pip install farm-haystack[inference]

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering

# Checkpoints available for the extractive-QA experiments (approximate download size).
MINI_CKPT = 'deepset/minilm-uncased-squad2'  # 0.1 GB
ROBERTA_CKPT = 'deepset/roberta-base-squad2' # 0.5 GB

# Checkpoint actually used by the rest of the notebook.
model_ckpt = MINI_CKPT

# SubjQA, "electronics" configuration.
subjqa = load_dataset('subjqa', name='electronics')

# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


Downloading builder script:   0%|          | 0.00/9.12k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/21.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1295 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/358 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/255 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

## Example

In [3]:
# Toy questions about `context`; the trailing comment on each line is the expected answer.
question1 = "What mechanism did the W-Flyer use for control?" # ailerons
question2 = "What Wright brothers used to fly?" # the W-Flyer
question3 = "When Wright brothers flew?" # december 17, 1903
question4 = "How many horsepower had W-Flyer?" # 12

# The passage the answers are extracted from (two sentences, joined into one string).
context = (
    "The Wright brothers flew the motor-operated airplane on December 17, 1903. "
    "Their aircraft, the W-Flyer, used ailerons for control and had a 12-horsepower engine."
)

# Encode the (question, context) pair as PyTorch tensors.
inputs = tokenizer(question1, context, return_tensors='pt')

# Show every field produced by the tokenizer ...
for field_name, field_tensor in inputs.items():
    print(field_name, ':', field_tensor)

# ... and the text obtained by decoding the ids back.
first_sequence_ids = inputs['input_ids'][0]
print(tokenizer.decode(first_sequence_ids))

input_ids : tensor([[  101,  2054,  7337,  2106,  1996,  1059,  1011, 23821,  2224,  2005,
          2491,  1029,   102,  1996,  6119,  3428,  5520,  1996,  5013,  1011,
          3498, 13297,  2006,  2285,  2459,  1010,  5778,  1012,  2037,  2948,
          1010,  1996,  1059,  1011, 23821,  1010,  2109,  9932,  3917,  5644,
          2005,  2491,  1998,  2018,  1037,  2260,  1011, 15149,  3194,  1012,
           102]])
token_type_ids : tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])
attention_mask : tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])
[CLS] what mechanism did the w - flyer use for control? [SEP] the wright brothers flew the motor - operated airplane on december 17, 1903. their aircraft, the w - flyer, used ai

## Model

In [4]:
# Load the QA head for the selected checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

# Answer each toy question by taking the highest-scoring start and end token.
for question in (question1, question2, question3, question4):
    inputs = tokenizer(question, context, return_tensors='pt')

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Most likely start / end token positions for the single sequence in the batch.
    start = torch.argmax(outputs.start_logits[0])
    end = torch.argmax(outputs.end_logits[0])
    print(start, end)

    # Decode the inclusive token span [start, end] back to text.
    answer_ids = inputs['input_ids'][0, start:end + 1]
    model_answer = tokenizer.decode(answer_ids)
    print(f'{question=}\n{model_answer=}\n')

Downloading model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor(37) tensor(39)
question='What mechanism did the W-Flyer use for control?'
model_answer='ailerons'

tensor(27) tensor(30)
question='What Wright brothers used to fly?'
model_answer='the w - flyer'

tensor(17) tensor(20)
question='When Wright brothers flew?'
model_answer='december 17, 1903'

tensor(42) tensor(42)
question='How many horsepower had W-Flyer?'
model_answer='12'



### (Linux) Set up the Elasticsearch server:

In [6]:
import os
import requests
import time
from subprocess import Popen, PIPE, STDOUT

# Run Elasticsearch as a background process
url = """https://artifacts.elastic.co/downloads/elasticsearch/\
elasticsearch-7.9.2-linux-x86_64.tar.gz"""
!wget -nc -q {url}
!tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-7.9.2

es_server = Popen(args=['elasticsearch-7.9.2/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
time.sleep(30)

response = requests.get('http://localhost:9200')
assert response.status_code == 200, "Elasticsearch connection is not set ;("


### Instantiate document store and load SubjQA

In [7]:
from haystack.document_stores import ElasticsearchDocumentStore

# Ask the store to return document embeddings too (for later use with a dense retriever).
document_store = ElasticsearchDocumentStore(return_embedding=True)

# One list of document dicts per dataset split.
document_store_data = {"train": [], "test": [], "validation": []}

for split, dataset in subjqa.flatten().items():
    # De-duplicate reviews within the split: the first row seen for a given
    # context wins, and insertion order is preserved.
    first_row_by_context = {}
    for row in dataset:
        first_row_by_context.setdefault(row['context'], row)

    # Turn every unique review into a document dict with its metadata.
    document_store_data[split].extend(
        {
            "content": review_text,
            "meta": {
                "item_id": first_row["title"],
                "question_id": first_row["id"],
                "split": split,
            },
        }
        for review_text, first_row in first_row_by_context.items()
    )

    # Index the whole split in one call.
    document_store.write_documents(document_store_data[split], index="document")

print(f"Loaded {document_store.get_document_count()} documents")

Loaded 1615 documents


### Init Elastic Retriever & Reader

In [8]:
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader

# Sparse (BM25) retriever backed by the document store.
retriever = BM25Retriever(document_store=document_store)

# Sanity check: fetch the three best-matching reviews for a sample query.
query = "what is the length of the cord?"
retrieved_docs = retriever.retrieve(query=query, top_k=3)

# Print each hit as "<score> <text>" followed by a blank line.
for hit in retrieved_docs:
    print(f'{hit.score:.3f} {hit.content}', end='\n\n')

0.822 I got a pair of these headphones a few months ago and they still provide a fantastic range of sound. Aside from the quality of the sound these earbuds deliver, they seem to fit my ears superbly and that makes a huge difference for me when it comes to headphones.The highs are crisp. The mids sound great. The lows aren't shake your skull with vibrations good, but the CX 300's provide a clean low with no distortion and a little kick in the ear. I've played with equalizers and bass heavy music and you can reach a deep low that the headphones manage really well.The length of the left ear cord is half the length of the right ear cord. This is to wrap the right ear cord around the back of your neck and keep the whole cord running down only one side of your body. I personally like this feature, but it's worth mentioning if you don't want that difference and would prefer equal length cords.These headphones come with a small carrying case that I use daily. It's a nice touch to throw in wit

Excellent! The retriever managed to retrieve related reviews where a potential answer might be present.

In [9]:
# Reader built on the same checkpoint as the raw model above.
# max_seq_len / doc_stride are passed through to FARMReader; return_no_answer=True
# lets the reader also return a "no answer" prediction.
reader = FARMReader(model_name_or_path=model_ckpt, progress_bar=False,
                    max_seq_len=384, doc_stride=128,
                    return_no_answer=True)

# Sanity check on the toy context. The question is named explicitly instead of
# reusing `question`, a loop variable left over from the raw-model cell, so this
# cell no longer depends on which iteration that loop happened to finish on.
reader.predict_on_texts(question=question4, texts=[context], top_k=1)

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'query': 'How many horsepower had W-Flyer?',
 'no_ans_gap': 15.891523361206055,
 'answers': [<Answer {'answer': '12', 'type': 'extractive', 'score': 0.9852672219276428, 'context': 'brothers flew the motor-operated airplane on December 17, 1903. Their aircraft, the W-Flyer, used ailerons for control and had a 12-horsepower engine.', 'offsets_in_document': [{'start': 140, 'end': 142}], 'offsets_in_context': [{'start': 129, 'end': 131}], 'document_ids': ['2d4760ca8fb01286b6b9b08ef938761'], 'meta': {}}>]}

The reader works well!

Now, let's combine everything together.

In [18]:
from haystack.pipelines import ExtractiveQAPipeline

# Maximum number of answers requested from the reader and printed below.
N_ANSWERS = 3
pipe = ExtractiveQAPipeline(reader, retriever)

# Amazon Kindle Fire HD 7" e-book
item_id = "B0074BW614"
query = "Is it good for reading?"

# Restrict retrieval to training-split reviews of this one product.
params = {
    "Retriever": {
        "top_k": 3,
        "filters": {
            "item_id": [item_id],
            "split": ["train"],
        },
    },
    "Reader": {
        "top_k": N_ANSWERS,
    },
}
preds = pipe.run(query=query, params=params)

print(f"Question: {preds['query']} \n")

# Iterate over the answers actually returned (capped at N_ANSWERS) rather than
# indexing 0..N_ANSWERS-1, which raises IndexError when fewer answers come back.
for idx, answer in enumerate(preds['answers'][:N_ANSWERS], start=1):
    print(f"Answer {idx}: {answer.answer}")
    print(f"Review snippet: ...{answer.context}...")
    print("\n\n")

Question: Is it good for reading? 

Answer 1: it is great for reading books when no light is available
Review snippet: ...ecoming addicted to hers! Our son LOVES it and it is great for reading books when no light is available. Amazing sound but I suggest good headphones t...



Answer 2: I mainly use it for book reading
Review snippet: ... is my third one.  I never thought I would want a fire for I mainly use it for book reading.  I decided to try the fire for when I travel I take my la...



Answer 3: 
Review snippet: ...None...





In [19]:
# Clone the nlphub repository into the current working directory.
! git clone https://github.com/nikitakapitan/nlphub.git

Cloning into 'nlphub'...
remote: Enumerating objects: 386, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 386 (delta 26), reused 32 (delta 6), pack-reused 332[K
Receiving objects: 100% (386/386), 58.35 MiB | 24.40 MiB/s, done.
Resolving deltas: 100% (166/166), done.


In [20]:
# Set the global git author e-mail and name for this runtime.
!git config --global user.email "nickapch2@gmail.com"
!git config --global user.name "Nickapch 2"

In [21]:
# Show the current working directory.
!pwd

/content


In [22]:
# Move the notebook into the cloned repository, then register the fork as an
# additional remote named `my_fork`.
%cd /content/nlphub
!git remote add my_fork https://github.com/NickCaptain/nlphub.git


/content/nlphub


In [23]:
# Change the Python process's working directory to the cloned repository.
# NOTE(review): an earlier cell already ran `%cd /content/nlphub`, so this looks redundant -- confirm.
import os
os.chdir('/content/nlphub')


In [25]:
# Set the global git author e-mail and name again.
# NOTE(review): an earlier cell already configured both values with a different
# user.name ("Nickapch 2"); the name written here is the one left in effect.
!git config --global user.email "nickapch2@gmail.com"
!git config --global user.name "Nick Captain"


In [26]:
# NOTE(review): the same `git remote add my_fork ...` command already ran in an
# earlier cell, so this repeat fails with "remote my_fork already exists"
# (see the recorded output of this cell).
!git remote add my_fork https://github.com/NickCaptain/nlphub.git


error: remote my_fork already exists.


In [27]:
# Create a new local branch named `my_new_branch` and switch to it.
!git checkout -b my_new_branch


Switched to a new branch 'my_new_branch'
