In [1]:
import uuid
import os
import json
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema.document import Document
from datasets import Dataset
from langchain_community.llms import Ollama
from langchain.llms import Ollama
import sys



In [2]:
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [3]:
def read_txtelements(output_folder, filename, delimiter):
    with open(os.path.join(output_folder, filename), "r") as f:
        return [s.strip() for s in f.read().split(delimiter) if s.strip()]

def read_jsonelements(output_folder, filename):
    with open(os.path.join(output_folder, filename), "r") as json_file:
        return json.load(json_file)

In [4]:
def add_data(image_text_summaries, image_text_elements, image_summaries, retriever):
    # Add texts
    # if text_summaries:
    #     doc_ids = [str(uuid.uuid4()) for _ in text_elements]
    #     summary_texts = [
    #         Document(page_content=s, metadata={id_key: doc_ids[i]})
    #         for i, s in enumerate(text_summaries)]
    #     retriever.vectorstore.add_documents(summary_texts)
    #     retriever.docstore.mset(list(zip(doc_ids, text_elements)))

    # Add image texts
    if image_text_summaries:
        doc_ids = [str(uuid.uuid4()) for _ in image_text_elements]
        summary_texts = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(image_text_summaries)]
        retriever.vectorstore.add_documents(summary_texts)
        retriever.docstore.mset(list(zip(doc_ids, image_text_elements)))

    # Add tables
    # if table_summaries:
    #     table_ids = [str(uuid.uuid4()) for _ in table_elements]
    #     summary_tables = [
    #         Document(page_content=s, metadata={id_key: table_ids[i]})
    #         for i, s in enumerate(table_summaries)]
    #     retriever.vectorstore.add_documents(summary_tables)
    #     retriever.docstore.mset(list(zip(table_ids, table_elements)))

    # Add images
    if image_summaries:
        img_ids = [str(uuid.uuid4()) for _ in image_summaries]
        summary_img = [
            Document(page_content=s, metadata={id_key: img_ids[i]})
            for i, s in enumerate(image_summaries)]
        retriever.vectorstore.add_documents(summary_img)
        retriever.docstore.mset(list(zip(img_ids, image_summaries)))  
    return retriever

In [5]:
def process_subfolder(subfolder_path, retriever):
    print(f"Processing subfolder: {subfolder_path}")
    delimiter = "~~~"
    # text_elements = read_elements(output_folder, "text_elements.txt")
    # table_elements = read_elements(output_folder, "table_elements.txt")
    # text_summaries = read_elements(output_folder, "text_summaries.txt", delimiter)
    # table_summaries = read_elements(output_folder, "table_summaries.txt", delimiter)
    image_text_elements = read_jsonelements(subfolder_path, "imagestexts.json")
    image_text_elements = [i['text'] for i in image_text_elements]
    image_text_summaries = read_txtelements(subfolder_path, "imagestexts_summary.txt", delimiter)
    image_summaries = read_txtelements(subfolder_path, "image_summaries.txt", delimiter)

    retriever = add_data(image_text_summaries, image_text_elements, image_summaries, retriever)
    return retriever

In [6]:
vectorstore = Chroma(collection_name="summaries", embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))
store = InMemoryStore()  
id_key = "doc_id"
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key,)


In [7]:
output_folder = "/home/vqa/RAG/10_manuals_256_summaries"

In [8]:
for item in os.listdir(output_folder):
    item_path = os.path.join(output_folder, item)
    if os.path.isdir(item_path):
        # Now item_path is a subdirectory within output_folder
        # For each subdirectory, perform the operations you need
        process_subfolder(item_path, retriever)

Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/samsung_cell phone accessories manuals_de30be54-38b5-46f4-8dc4-a2376a6e571d_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/bose_headphones manuals_9917ef89-897c-6524-2502-2b0a91ec7d62_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/owg_en_wms_soundlink_adapterkit_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/bose_headphones_manuals_6d12e20d-cf8d-4337-b073-53d7e85e4163_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/samsung_video gaming accessories manuals_f0ee75e7-18ff-4260-ac5c-3db2ec0f8fd4_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/sony_laptop_manuals_a02cf092-3538-4646-ab93-8cae84a07ad2_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/og_wave-bma_en_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/samsung_audio_box_eo-sb330_um_sea_rev.1.0_140728_screen_extrac

In [9]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

In [10]:
# Prompt template
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
Answer:
"""

In [11]:
# prompt = PromptTemplate.from_template(template)
prompt = ChatPromptTemplate.from_template(template)

In [12]:
model = Ollama(model="llama2:7b-chat")
# model = Ollama(model="llama2", verbose=True)

In [13]:
# RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser())

In [14]:
question = "What should I do after removing a screw?"



answer = chain.invoke(question)
print(answer)


After removing a screw, you should ensure that the component or part being secured by the screw is properly secured and stable before proceeding with any further steps. This may involve tightening other screws or bolts to maintain the stability of the component, or it may involve replacing the removed screw with a new one to secure the component in place. It is important to follow proper safety protocols when working with mechanical components and to consult a professional if you are unsure about how to properly secure a component after removing a screw.


In [15]:
from datasets import Dataset
import pandas as pd
import json
from tabulate import tabulate

EVALUATION

In [16]:

data = []
# with open('10_manuals.jsonl', 'r') as f:
with open('10_manuals.jsonl', 'r') as f:
    for line in f:

        json_data = json.loads(line)
        id = json_data['id'][:-6]
        qa_data = json_data['qa_data']
        for i in qa_data:
            question = i['question']['text']
            answer = i['answer']['text']
        data.append({'id': id, 'question' : question, 'ground_truth': answer})

df = pd.DataFrame(data)
df = pd.DataFrame(data, columns = ['id', 'question', 'ground_truth'])
print(tabulate(df, headers='keys', tablefmt='psql'))

n = len(pd.unique(df['id']))
print("No.of.unique values :", n)

+-----+-------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     | id                                                                            | question                                                                                                                        | ground_truth                                                                                                                                                                                                                                                                      

In [17]:
import pandas as pd


unique_values = (pd.unique(df['id']))
print(unique_values)

unique_values_df = pd.DataFrame({'unique_values': unique_values})

# Merge the original DataFrame with the DataFrame containing unique values
merged_df = pd.merge(df, unique_values_df, how='inner', left_on='id', right_on='unique_values')

# Group by the unique values and select the first 5 occurrences of each
df = merged_df.groupby('unique_values').head(10)

['bose_headphones manuals_9917ef89-897c-6524-2502-2b0a91ec7d62'
 'bose_headphones_manuals_6d12e20d-cf8d-4337-b073-53d7e85e4163'
 'dell_cell phone manuals_4686e2e1-87a4-4f6a-bf20-61c646c11bb9'
 'og_wave-bma_en' 'owg_en_wms_soundlink_adapterkit'
 'samsung_audio_box_eo-sb330_um_sea_rev.1.0_140728_screen'
 'samsung_cell phone accessories manuals_de30be54-38b5-46f4-8dc4-a2376a6e571d'
 'samsung_vacuum cleaner manuals_8cb9360e-cafe-4c53-9d35-ef193667e586'
 'samsung_video gaming accessories manuals_f0ee75e7-18ff-4260-ac5c-3db2ec0f8fd4'
 'sony_laptop_manuals_a02cf092-3538-4646-ab93-8cae84a07ad2']


In [18]:
questions = df['question'].tolist()
ground_truth = df['ground_truth'].tolist()
ground_truths = [[value] for value in ground_truth]

print(questions)
print(ground_truths)
answers = []
contexts = []

[['You can search the website at www.Bose.com/compliance.'], ['It includes on-ear headphones, a headphone cord with an inline microphone, four 3.5mm to 2.5mm mobile phone adapters, and a carrying case.'], ['Each adapter has a symbol of a circle, triangle, diamond, or square.'], ['You should find the appropriate adapter and connect the headphone cord plug to the adapter. Then you should connect the adapter to your phone.  '], ['You can rotate the earcups until the cushions face you, lay the headset down in front of you, and pivot the earcups into the headband and place the headset in the case.'], ['You can lift the headset out of the case and pivot the earcups out of the headband, hold the headset with the headband up, and rotate the earcups until the cushions face teacher other.'], ['You need to be sure the earcup ports are kept clear and that no moisture is allowed to get inside the earcup.'], ['You can make sure the cushion is flush all the way around and that there are no gaps betwe

In [19]:
print(len(ground_truths))

100


In [20]:
num_lists = len(questions) // 5 + (len(questions) % 5 > 0)  # Calculate the number of lists needed
splitted_questions = [questions[i * 5 : (i + 1) * 5] for i in range(num_lists)] 
empty_lists = [f'a{i} = []' for i in range(1, 21)]

In [21]:
splitted_questions[1]

['How can I take the headset out of the case?',
 'How can I clean the headset?',
 'What should I make sure of when reattaching earcup cushions?',
 'What can I do if I still need help?',
 'What is the official website of Bose in Japan?']

In [22]:
answers = []

In [23]:
for sublist_index, sublist in enumerate(splitted_questions, start=1):
    if sublist_index > 10:
        break
    for index, question in enumerate(sublist):
        print(f"Index: {index}")
        print(question)
        answer = chain.invoke(question)
        print(answer)
        answers.append(answer)

Index: 0
Where can I know the complete Declaration of Conformity?
Based on the context provided, the complete Declaration of Conformity can be found in the image with the word "Specifications" at the bottom.
Index: 1
What does my Bose mobile on-ear headset include?


KeyboardInterrupt: 

: 

In [None]:
answers2 = []

In [None]:
for sublist_index, sublist in enumerate(splitted_questions[10:], start=11):
    for index, question in enumerate(sublist):
        print(f"Index: {index}")
        print(question)
        answer = chain.invoke(question)
        print(answer)
        answers2.append(answer)

In [None]:
print(len(answers))
print(len(answers2))
print(len(questions))


In [None]:
final = answers + answers2

In [None]:
data = {
    "question": questions,
    "answer": final,
    "ground_truths": ground_truths,
    # "contexts": contexts,
}

In [None]:
# convert dict to dataset
dataset = Dataset.from_dict(data)
print(dataset)

In [None]:
print(answers[1:])

In [None]:
from bert_score import score

# Example texts
generated = final

# Calculate BERTScore
P, R, F1 = score(generated, ground_truths, lang='en', verbose=True)

# Print scores
print(f"Precision: {P.mean()}")
print(f"Recall: {R.mean()}")
print(f"F1 Score: {F1.mean()}")