In [1]:
from matplotlib import pyplot as plt
from datasets import get_dataset_config_names
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
domains=get_dataset_config_names("subjqa")
domains

['books', 'electronics', 'grocery', 'movies', 'restaurants', 'tripadvisor']

In [3]:
subjqa=load_dataset("subjqa",name="electronics")

subjqa

DatasetDict({
    train: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 1295
    })
    test: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 358
    })
    validation: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 255
    })
})

In [4]:
dfs={label:data.to_pandas() for label,data in subjqa.flatten().items()}

for split,df in dfs.items():
    print(f"{split} has {len(df)} number of elements")

train has 1295 number of elements
test has 358 number of elements
validation has 255 number of elements


In [5]:
qa_columns=["title","question","answers.text","answers.answer_start","context"]

sample=dfs["train"][qa_columns].sample(2,random_state=7)

In [6]:
from transformers import AutoTokenizer

model_name="deepset/minilm-uncased-squad2"
tokenizer=AutoTokenizer.from_pretrained(model_name)



In [7]:
question = "How much music can this hold?"
context = """An MP3 is about 1 MB/minute, so about 6000 hours depending on \
file size."""


In [8]:
!curl -X GET "http://localhost:9200"


{
  "name" : "8484cf283848",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "4G6Yu1Q5R86hprBmt3km6w",
  "version" : {
    "number" : "8.15.1",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "253e8544a65ad44581194068936f2a5d57c2c051",
    "build_date" : "2024-09-02T22:04:47.310170297Z",
    "build_snapshot" : false,
    "lucene_version" : "9.11.1",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


In [9]:
import torch
from transformers import AutoModelForQuestionAnswering

model=AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
inputs=tokenizer(question,context,return_tensors="pt")


In [11]:
with torch.no_grad():
    outputs=model(**inputs)

In [12]:
print(outputs)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-0.9862, -4.7750, -5.4025, -5.2378, -5.2863, -5.5117, -4.9819, -6.1880,
         -0.9862,  0.2596, -0.2144, -1.7136,  3.7806,  4.8561, -1.0546, -3.9097,
         -1.7374, -4.5944, -1.4278,  3.9949,  5.0391, -0.2018, -3.0193, -4.8549,
         -2.3107, -3.5110, -3.5713, -0.9862]]), end_logits=tensor([[-0.9623, -5.4733, -5.0326, -5.1639, -5.4278, -5.5151, -5.1749, -4.6233,
         -0.9623, -3.7855, -0.8715, -3.7745, -3.0162, -1.1780,  0.1758, -2.7365,
          4.8934,  0.3046, -3.1761, -3.2762,  0.8937,  5.6606, -0.3623, -4.9554,
         -3.2531, -0.0914,  1.6211, -0.9623]]), hidden_states=None, attentions=None)


In [13]:
import os
import time
import requests
from subprocess import Popen, PIPE, STDOUT

# Define the path to the Elasticsearch binary
es_path = '/usr/share/elasticsearch/bin/elasticsearch'  # Update this to the correct path

# Change ownership of Elasticsearch files without requiring a password
#os.system('sudo chown -R daemon:daemon /usr/share/elasticsearch/bin/elasticsearch')  # Update this path as well

# Start Elasticsearch server
es_server = Popen(args=[es_path],
                  stdout=PIPE, stderr=STDOUT)

# Wait for Elasticsearch to start
time.sleep(30)

# Check if Elasticsearch is up and running
try:
    response = requests.get("http://localhost:9200/", auth=('elastic', '8Ffm8AV5DvjhzB3NsQVY'))  # Update the password
    response.raise_for_status()  # Raise an error for bad responses
    print("Elasticsearch is running:", response.json())
except requests.exceptions.RequestException as e:
    print("Error connecting to Elasticsearch:", e)

Elasticsearch is running: {'name': '8484cf283848', 'cluster_name': 'docker-cluster', 'cluster_uuid': '4G6Yu1Q5R86hprBmt3km6w', 'version': {'number': '8.15.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '253e8544a65ad44581194068936f2a5d57c2c051', 'build_date': '2024-09-02T22:04:47.310170297Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [14]:
from haystack import Document
from haystack.nodes import BM25Retriever
from haystack.document_stores import ElasticsearchDocumentStore

In [15]:
document_store=ElasticsearchDocumentStore(
    host="localhost",
    scheme="http",
    index="document"
)

In [16]:
retriever=BM25Retriever(document_store)

In [17]:
for split, df in dfs.items():
    docs = [
        Document(content= row["context"],
    meta={"item_id": row["title"], "question_id": row["id"],
            "split": split}
                )
        for _,row in df.drop_duplicates(subset="context").iterrows()]
    document_store.write_documents(docs)

  return bulk(*args, **kwargs)


In [18]:
document_count = document_store.client.count(index='default')["count"]
print(f"Number of documents is {document_count}")

item_id = "B0074BW614"
query = "Is it good for reading?"

ret_doc=retriever.retrieve(
    query=query,top_k=3,filters={"item_id":[item_id],"split":["train"]}
)

Number of documents is 1875


In [19]:
for i, doc in enumerate(ret_doc):
    print(f"Document {i + 1}:")
    print(f"Item ID: {doc.meta['item_id']}")
    print(f"Split: {doc.meta['split']}")
    print(f"Question ID: {doc.meta['question_id']}")
    print(f"Content: {doc.content}")
    print(f"Score:{doc.score} ")
    print("\n")

Document 1:
Item ID: B0074BW614
Split: train
Question ID: 868e311275e26dbafe5af70774a300f3
Content: This is a gift to myself.  I have been a kindle user for 4 years and this is my third one.  I never thought I would want a fire for I mainly use it for book reading.  I decided to try the fire for when I travel I take my laptop, my phone and my iPod classic.  I love my iPod but watching movies on the plane with it can be challenging because it is so small. Laptops battery life is not as good as the Kindle.  So the Fire combines for me what I needed all three to do. So far so good.
Score:0.6859896945894133 


Document 2:
Item ID: B0074BW614
Split: train
Question ID: 998d564607f10bf6dbbd20b33b8fbbf1
Content: Plays Netflix great, WiFi capability has great range. Resolution on the screen is AMAZING! For the price you cannot go wrong. Bought one for my spouse and myself after becoming addicted to hers! Our son LOVES it and it is great for reading books when no light is available. Amazing soun

In [20]:
from haystack.nodes import FARMReader

model_ckpt = "deepset/minilm-uncased-squad2"
max_seq_length, doc_stride = 384, 128
reader = FARMReader(model_name_or_path=model_ckpt, progress_bar=False,
 max_seq_len=max_seq_length, doc_stride=doc_stride,
 return_no_answer=True)

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
print(reader.predict_on_texts(question=question,texts=[context],top_k=1))

{'query': 'How much music can this hold?', 'no_ans_gap': 12.648091793060303, 'answers': [<Answer {'answer': '6000 hours', 'type': 'extractive', 'score': 0.5293058156967163, 'context': 'An MP3 is about 1 MB/minute, so about 6000 hours depending on file size.', 'offsets_in_document': [{'start': 38, 'end': 48}], 'offsets_in_context': [{'start': 38, 'end': 48}], 'document_ids': ['e344757014e804eff50faa3ecf1c9c75'], 'meta': {}}>]}


In [22]:
from haystack.pipelines import ExtractiveQAPipeline

pipeline=ExtractiveQAPipeline(reader,retriever)

n_answers = 3
preds = pipeline.run(
    query=query,
    params={
        "Retriever": {
            "top_k": 3,
            "filters": {"item_id": [item_id], "split": ["train"]}
        },
        "Reader": {"top_k": n_answers}
    }
)

In [23]:
for idx in range(n_answers):
    answer=preds['answers'][idx]
    print(f"Answer {idx+1}: {answer.answer}")
    print(f"Review snippet: ... {answer.context} ")
    print("\n\n")

Answer 1: it is great for reading books when no light is available
Review snippet: ... ecoming addicted to hers! Our son LOVES it and it is great for reading books when no light is available. Amazing sound but I suggest good headphones t 



Answer 2: I mainly use it for book reading
Review snippet: ...  is my third one.  I never thought I would want a fire for I mainly use it for book reading.  I decided to try the fire for when I travel I take my la 



Answer 3: 
Review snippet: ... None 





In [26]:
from haystack.pipelines import Pipeline
    
#from haystack.evaluation import DocumentRecallEvaluator

In [29]:
import haystack
print(dir(haystack))

['Answer', 'BaseComponent', 'Document', 'EvaluationResult', 'Label', 'MultiLabel', 'Pipeline', 'Span', 'TableCell', '__annotations__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'document_stores', 'environment', 'errors', 'hash128', 'haystack', 'lazy_imports', 'metadata', 'mmh3', 'modeling', 'nodes', 'pipelines', 'schema', 'set_pytorch_secure_model_loading', 'silenceable_tqdm', 'telemetry', 'utils']
