In [1]:
from matplotlib import pyplot as plt
from datasets import get_dataset_config_names
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
domains=get_dataset_config_names("subjqa")
domains

['books', 'electronics', 'grocery', 'movies', 'restaurants', 'tripadvisor']

In [3]:
subjqa=load_dataset("subjqa",name="electronics")

subjqa

DatasetDict({
    train: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 1295
    })
    test: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 358
    })
    validation: Dataset({
        features: ['domain', 'nn_mod', 'nn_asp', 'query_mod', 'query_asp', 'q_reviews_id', 'question_subj_level', 'ques_subj_score', 'is_ques_subjective', 'review_id', 'id', 'title', 'context', 'question', 'answers'],
        num_rows: 255
    })
})

In [4]:
dfs={label:data.to_pandas() for label,data in subjqa.flatten().items()}

for split,df in dfs.items():
    print(f"{split} has {len(df)} number of elements")

train has 1295 number of elements
test has 358 number of elements
validation has 255 number of elements


In [5]:
qa_columns=["title","question","answers.text","answers.answer_start","context"]

sample=dfs["train"][qa_columns].sample(2,random_state=7)

In [6]:
from transformers import AutoTokenizer

model_name="deepset/minilm-uncased-squad2"
tokenizer=AutoTokenizer.from_pretrained(model_name)

In [7]:
question = "How much music can this hold?"
context = """An MP3 is about 1 MB/minute, so about 6000 hours depending on \
file size."""


In [8]:
!curl -X GET "http://localhost:9200"

{
  "name" : "8484cf283848",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "4G6Yu1Q5R86hprBmt3km6w",
  "version" : {
    "number" : "8.15.1",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "253e8544a65ad44581194068936f2a5d57c2c051",
    "build_date" : "2024-09-02T22:04:47.310170297Z",
    "build_snapshot" : false,
    "lucene_version" : "9.11.1",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


# lets make a question and answering pipeline

In [9]:
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
from haystack import Document

## initiate the elastic search document

In [10]:
#initialize document store
document_store=ElasticsearchDocumentStore(hosts="http://localhost:9200")

In [11]:
!curl -X GET "http://localhost:9200/_cat/indices?v"

health status index   uuid                   pri rep docs.count docs.deleted store.size pri.store.size dataset.size
yellow open   default Ae3l6f9fS4CZDT9EO53L2w   1   1                                                               


In [16]:
#for delete the previously hold data
!curl -X DELETE "http://localhost:9200/default"

{"acknowledged":true}

In [17]:
#load data into document store
for split, df in dfs.items():
    docs = [
        Document(content= row["context"],
    meta={"item_id": row["title"], "question_id": row["id"],
            "split": split}
                )
        for _,row in df.drop_duplicates(subset="context").iterrows()
    ]

    document_store.write_documents(docs)

# initiate retriver

In [18]:
retriever=ElasticsearchBM25Retriever(document_store=document_store)

In [26]:
item_id = "B0074BW614"
query = "Is it good for reading?"

filters = {
  "conditions": [
    {"field": "item_id", "value": ["B0074BW614"]},  # Corrected syntax
    {"field": "split", "value": ["train"]}
  ],
  "operator": "AND"  # or "OR" depending on your requirement
}

In [27]:
ret_doc=retriever.run(
    query=query,top_k=3,filters=filters
)

FilterError: 'operator' key missing in {'field': 'item_id', 'value': ['B0074BW614']}

In [29]:
import haystack
print(haystack.__version__)