In [None]:
from matplotlib import pyplot as plt
from datasets import get_dataset_config_names
from datasets import load_dataset
import pandas as pd

In [None]:
domains=get_dataset_config_names("subjqa")
domains

In [None]:
subjqa=load_dataset("subjqa",name="electronics")

In [None]:
subjqa

In [None]:
print(subjqa["train"][1].keys())

In [None]:
print(subjqa["train"][1]["question"])

In [None]:
print(subjqa["train"][1]["answers"])

In [None]:
print(subjqa["train"]["answers"][1].keys())

# turn the data into dataframe

In [None]:
dfs={label:data.to_pandas() for label,data in subjqa.flatten().items()}

In [None]:
type(dfs)

In [None]:
for split,df in dfs.items():
    print(f"{split} has {len(df)} number of elements")

In [None]:
qa_columns=["title","question","answers.text","answers.answer_start","context"]

sample=dfs["train"][qa_columns].sample(2,random_state=7)

In [None]:
sample

In [None]:
#lets extracte the text 
start_index=sample["answers.answer_start"].iloc[0][0]
end_index=start_index+len(sample["answers.text"].iloc[0][0])

In [None]:
sample["context"].iloc[0][start_index:end_index]

In [None]:
end_index,start_index

In [None]:
sample['context'].iloc[0]

In [None]:
dfs["train"].head()

# count question that start with certain phrases

In [None]:
counts = {}
question_types = ["What", "How", "Is", "Does", "Do", "Was", "Where", "Why"]

In [None]:
for q in question_types:
    counts[q]=dfs["train"]["question"].str.startswith(q).value_counts()[True]

## ploting 

In [None]:
pd.Series(counts).sort_values().plot.barh()
plt.title("frequency of question")
plt.show()

In [None]:
few_question=["How","What","Is"]

for questions in few_question:
    for question in (dfs["train"][dfs["train"].question.str.startswith(questions)].sample(n=3,random_state=42)['question']):
        print(question)

# lets load the model

In [None]:
from transformers import AutoTokenizer

In [None]:
model_name="deepset/minilm-uncased-squad2"
tokenizer=AutoTokenizer.from_pretrained(model_name)

In [None]:
import torch

In [None]:
question = "How much music can this hold?"
context = """An MP3 is about 1 MB/minute, so about 6000 hours depending on \
file size."""

inputs=tokenizer(question,context,return_tensors="pt")

In [None]:
inputs

In [None]:
tokenizer.decode(inputs["input_ids"][0])

In [None]:
tokenizer.decode(inputs["input_ids"][0])

# pass input to the model

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

In [None]:
model=AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
##pass the inputs
with torch.no_grad():
    outputs=model(**inputs)

print(outputs)

In [None]:
#we get the start and end token
start_logits=outputs.start_logits
end_logits=outputs.end_logits

In [None]:
print(start_logits.shape)
print(end_logits.shape)
print(inputs['input_ids'].shape)

In [None]:
print(start_logits)
print()
print(end_logits)

## extract answer

In [None]:
start_idx=torch.argmax(start_logits)
end_idx=torch.argmax(end_logits)+1
answer_span=inputs['input_ids'][0][start_idx:end_idx]
answer=tokenizer.decode(answer_span)

print(f'question is {question}')
print(f'answer is {answer}')

# automate it with piepline

In [None]:
from transformers import pipeline

In [None]:
pipe=pipeline("question-answering",model=model,tokenizer=tokenizer)

In [None]:
pipe(question=question,context=context,top_k=3)

# more context size than model can handlem

In [None]:
example=dfs['train'].iloc[0][['question','context']]

In [None]:
tokenized_example=tokenizer(example['question'],example['context'],
                           return_overflowing_tokens=True,max_length=100,stride=25
                           )

In [None]:
tokenized_example

In [None]:
for idx,tokens in enumerate(tokenized_example['input_ids']):
    print(f"{idx} has {len(tokens)} number of tokens ")

In [None]:
for window in tokenized_example['input_ids']:
    print(f"{tokenizer.decode(window)}")

# lets install some stuff

## create a sub process

In [None]:
!chown -R daemon:daemon elasticsearch-8.15.1

In [None]:
#new_password

In [None]:
import os
import time
import requests
from subprocess import Popen, PIPE, STDOUT

# Define the path to the Elasticsearch binary
es_path = '/usr/share/elasticsearch/bin/elasticsearch'  # Update this to the correct path

# Change ownership of Elasticsearch files without requiring a password
#os.system('sudo chown -R daemon:daemon /usr/share/elasticsearch/bin/elasticsearch')  # Update this path as well

# Start Elasticsearch server
es_server = Popen(args=[es_path],
                  stdout=PIPE, stderr=STDOUT)

# Wait for Elasticsearch to start
time.sleep(30)

# Check if Elasticsearch is up and running
try:
    response = requests.get("http://localhost:9200/", auth=('elastic', 'peQBjqFfi-FSUMDfjORi'))  # Update the password
    response.raise_for_status()  # Raise an error for bad responses
    print("Elasticsearch is running:", response.json())
except requests.exceptions.RequestException as e:
    print("Error connecting to Elasticsearch:", e)

In [None]:
#!curl -k -u -X GET "https://localhost:9200/?pretty"

In [None]:
!curl -k -u elastic:peQBjqFfi-FSUMDfjORi -X GET "https://localhost:9200/?pretty"

#
* password is:peQBjqFfi-FSUMDfjORi

# add the document store

In [None]:
new_password="peQBjqFfi-FSUMDfjORi"

In [None]:
!curl -k -u elastic:peQBjqFfi-FSUMDfjORi -X GET "https://localhost:9200/?pretty"

In [None]:
!curl -X GET "localhost:9200/"

In [None]:
from haystack.document_stores import ElasticsearchDocumentStore

In [None]:
from haystack.telemetry import send_event

In [None]:
import haystack
print(haystack.__version__)

In [None]:
#!systemctl start elasticsearch

In [None]:
import warnings
from urllib3.exceptions import InsecureRequestWarning

warnings.filterwarnings("ignore", category=InsecureRequestWarning)
from elasticsearch import Elasticsearch

es = Elasticsearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
    scheme='https',
    verify_certs=False,
    http_auth=('elastic','peQBjqFfi-FSUMDfjORi')
)


In [None]:
# Check connection
es.info()

In [None]:
!curl -X GET "http://localhost:9200"

In [3]:
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack import Document
#from elasticsearch import Elasticsearch

# Initialize the Elasticsearch Document Store
document_store = ElasticsearchDocumentStore(
    hosts=["http://localhost:9200"]
)

documents = [
    Document(content="This is the first document."),
    Document(content="This is the second document.")
]

In [4]:
document_store.delete_by_id(ids=["44c92a45a97fb23eaed24a20fc80da937d53735b23ea60550fea1c4654a8a370", "cda6bc67977d185948c791d9ef01d11a42b2457c503df2808ddb44d5a2b6cfb1"]})

TypeError: ElasticsearchDocumentStore.delete_documents() got an unexpected keyword argument 'filters'

In [2]:
document_store.write_documents(documents)
print(document_store.count_documents())

NameError: name 'document_store' is not defined