In [None]:
# %pip install "farm-haystack[elasticsearch]" datasets

In [1]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering

subjqa = load_dataset('subjqa', name='electronics')

MINI_CKPT = 'deepset/minilm-uncased-squad2'  # 0.1 GB
ROBERTA_CKPT = 'deepset/roberta-base-squad2' # 0.5 GB

model_ckpt = MINI_CKPT

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


Found cached dataset subjqa (C:/Users/nikit/.cache/huggingface/datasets/subjqa/electronics/1.1.0/2c12e496c4c675ab4a57ffb5d3f538f2e7b89793956e50da37126393ce23b6c6)


  0%|          | 0/3 [00:00<?, ?it/s]

## Example

In [2]:
question1 = "What mechanism did the W-Flyer use for control?" # ailerons
question2 = "What Wright brothers used to fly?" # the W-Flyer
question3 = "When Wright brothers flew?" # december 17, 1903
question4 = "How many horsepower had W-Flyer?" # 12


context = "The Wright brothers flew the motor-operated airplane on December 17, 1903. Their aircraft, the W-Flyer, used ailerons for control and had a 12-horsepower engine."
inputs = tokenizer(question1, context, return_tensors='pt')

for key, item in inputs.items():
    print(key, ':', item)
print(tokenizer.decode(inputs['input_ids'][0]))

input_ids : tensor([[  101,  2054,  7337,  2106,  1996,  1059,  1011, 23821,  2224,  2005,
          2491,  1029,   102,  1996,  6119,  3428,  5520,  1996,  5013,  1011,
          3498, 13297,  2006,  2285,  2459,  1010,  5778,  1012,  2037,  2948,
          1010,  1996,  1059,  1011, 23821,  1010,  2109,  9932,  3917,  5644,
          2005,  2491,  1998,  2018,  1037,  2260,  1011, 15149,  3194,  1012,
           102]])
token_type_ids : tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])
attention_mask : tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])
[CLS] what mechanism did the w - flyer use for control? [SEP] the wright brothers flew the motor - operated airplane on december 17, 1903. their aircraft, the w - flyer, used ai

## Model

In [3]:
def best_answer_span(start_logits, end_logits):
    """Pick the highest-scoring answer span whose start is not after its end.

    Taking the argmax of the start and end logits independently can yield an
    end index smaller than the start index, which slices to an empty answer.
    Scoring every (start, end) pair jointly and masking the invalid pairs
    avoids that. Whenever the two independent argmaxes already form a valid
    span, the same pair is returned (barring exact ties in the scores).

    Args:
        start_logits: 1-D tensor with one start score per token.
        end_logits: 1-D tensor with one end score per token (same length).

    Returns:
        Tuple ``(start, end)`` of 0-d integer tensors with ``start <= end``;
        ``end`` is inclusive.
    """
    # pair_scores[i, j] = score of the span starting at token i, ending at j.
    pair_scores = start_logits[:, None] + end_logits[None, :]
    # The upper triangle (including the diagonal) holds the pairs with i <= j.
    valid_pairs = torch.ones_like(pair_scores, dtype=torch.bool).triu()
    pair_scores = pair_scores.masked_fill(~valid_pairs, float('-inf'))

    # argmax without `dim` works on the flattened matrix; unravel it by hand.
    flat_best = pair_scores.argmax()
    n_tokens = end_logits.shape[0]
    start = torch.div(flat_best, n_tokens, rounding_mode='floor')
    end = flat_best % n_tokens
    return start, end


model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

for question in [question1, question2, question3, question4]:

    inputs = tokenizer(question, context, return_tensors='pt')

    # Inference only, so no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Index 0 selects the single sequence in the batch.
    start, end = best_answer_span(outputs.start_logits[0], outputs.end_logits[0])
    print(start, end)
    # `end` is inclusive, hence the +1 when slicing the token ids.
    model_answer = tokenizer.decode(inputs['input_ids'][0][start:end+1])
    print(f'{question=}\n{model_answer=}\n')

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor(37) tensor(39)
question='What mechanism did the W-Flyer use for control?'
model_answer='ailerons'

tensor(27) tensor(30)
question='What Wright brothers used to fly?'
model_answer='the w - flyer'

tensor(17) tensor(20)
question='When Wright brothers flew?'
model_answer='december 17, 1903'

tensor(42) tensor(42)
question='How many horsepower had W-Flyer?'
model_answer='12'



### Set up the Elasticsearch server (Linux only)

In [None]:
url = """https://artifacts.elastic.co/downloads/elasticsearch/\
elasticsearch-7.9.2-linux-x86_64.tar.gz"""
!wget -nc -q {url}
!tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz

import os
import requests
from subprocess import Popen, PIPE, STDOUT

# Run Elasticsearch as a background process
!chown -R daemon:daemon elasticsearch-7.9.2
es_server = Popen(args=['elasticsearch-7.9.2/bin/elasticsearch'],
                  stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
# Wait until Elasticsearch has started
!sleep 30


response = requests.get('http://localhost:9200')
assert response.status_code == 200, "Elasticsearch connection is not set ;("


### Instantiate document store and load SubjQA

In [95]:
from haystack.document_stores import ElasticsearchDocumentStore


def build_documents(rows, split):
    """Turn dataset rows into document dicts, one per distinct context.

    Keeping this in a function stops the loop variables from overwriting
    notebook-level names, in particular the `context` passage that the
    question-answering example cells rely on.

    Args:
        rows: iterable of mappings providing "context", "title" and "id".
        split: name of the split; stored in every document's metadata.

    Returns:
        List of ``{"content": ..., "meta": {...}}`` dicts in first-seen order.
        When a context occurs more than once, the metadata of its first row
        is kept.
    """
    documents_by_context = {}
    for row in rows:
        text = row["context"]

        # skip duplicate reviews
        if text in documents_by_context:
            continue

        documents_by_context[text] = {
            "content": text,
            "meta": {
                "item_id": row["title"],
                "question_id": row["id"],
                "split": split
            }
        }
    return list(documents_by_context.values())


# Return the document embedding for later use with dense retriever
document_store = ElasticsearchDocumentStore(return_embedding=True)

# Documents of each split, kept around after they are written to the store
document_store_data = {"train": [], "test": [], "validation": []}

# Load data to document_store; duplicates are removed within a split only
for split, dataset in subjqa.flatten().items():
    document_store_data[split] = build_documents(dataset, split)

    # write all documents to the document_store for current split
    document_store.write_documents(document_store_data[split], index="document")

print(f"Loaded {document_store.get_document_count()} documents")

ModuleNotFoundError: No module named 'haystack.document_store'