In [1]:
import uuid
import os
import json
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema.document import Document
from datasets import Dataset
from langchain_community.llms import Ollama
from langchain.llms import Ollama
import sys



In [2]:
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [3]:
def read_txtelements(output_folder, filename, delimiter):
    with open(os.path.join(output_folder, filename), "r") as f:
        return [s.strip() for s in f.read().split(delimiter) if s.strip()]

def read_jsonelements(output_folder, filename):
    with open(os.path.join(output_folder, filename), "r") as json_file:
        return json.load(json_file)

In [4]:
def add_data(image_text_summaries, image_text_elements, image_summaries, retriever): 
    # Add texts
    # if text_summaries:
    #     doc_ids = [str(uuid.uuid4()) for _ in text_elements]
    #     summary_texts = [
    #         Document(page_content=s, metadata={id_key: doc_ids[i]})
    #         for i, s in enumerate(text_summaries)]
    #     retriever.vectorstore.add_documents(summary_texts)
    #     retriever.docstore.mset(list(zip(doc_ids, text_elements)))

    # Add image texts
    if image_text_summaries:
        doc_ids = [str(uuid.uuid4()) for _ in image_text_elements]
        summary_texts = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(image_text_summaries)]
        retriever.vectorstore.add_documents(summary_texts)
        retriever.docstore.mset(list(zip(doc_ids, image_text_elements)))

    # Add tables
    # if table_summaries:
    #     table_ids = [str(uuid.uuid4()) for _ in table_elements]
    #     summary_tables = [
    #         Document(page_content=s, metadata={id_key: table_ids[i]})
    #         for i, s in enumerate(table_summaries)]
    #     retriever.vectorstore.add_documents(summary_tables)
    #     retriever.docstore.mset(list(zip(table_ids, table_elements)))

    # Add images
    if image_summaries:
        img_ids = [str(uuid.uuid4()) for _ in image_summaries]
        summary_img = [
            Document(page_content=s, metadata={id_key: img_ids[i]})
            for i, s in enumerate(image_summaries)]
        retriever.vectorstore.add_documents(summary_img)
        retriever.docstore.mset(list(zip(img_ids, image_summaries)))  
    return retriever

In [5]:
def process_subfolder(subfolder_path, retriever):
    print(f"Processing subfolder: {subfolder_path}")
    delimiter = "~~~"
    # text_elements = read_elements(output_folder, "text_elements.txt")
    # table_elements = read_elements(output_folder, "table_elements.txt")
    # text_summaries = read_elements(output_folder, "text_summaries.txt", delimiter)
    # table_summaries = read_elements(output_folder, "table_summaries.txt", delimiter)
    image_text_elements = read_jsonelements(subfolder_path, "imagestexts.json")
    image_text_elements = [i['text'] for i in image_text_elements]
    image_text_summaries = read_txtelements(subfolder_path, "imagestexts_summary.txt", delimiter)
    image_summaries = read_txtelements(subfolder_path, "image_summaries.txt", delimiter)

    retriever = add_data(image_text_summaries, image_text_elements, image_summaries, retriever)
    return retriever

In [6]:
vectorstore = Chroma(collection_name="summaries", embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))
store = InMemoryStore()  
id_key = "doc_id"
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key,)
retriever.search_kwargs['k'] = 2


In [7]:
output_folder = "/home/vqa/RAG/10_manuals_256_summaries"

In [8]:
for item in os.listdir(output_folder):
    item_path = os.path.join(output_folder, item)
    if os.path.isdir(item_path):
        # Now item_path is a subdirectory within output_folder
        # For each subdirectory, perform the operations you need
        process_subfolder(item_path, retriever)

Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/samsung_cell phone accessories manuals_de30be54-38b5-46f4-8dc4-a2376a6e571d_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/bose_headphones manuals_9917ef89-897c-6524-2502-2b0a91ec7d62_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/owg_en_wms_soundlink_adapterkit_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/bose_headphones_manuals_6d12e20d-cf8d-4337-b073-53d7e85e4163_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/samsung_video gaming accessories manuals_f0ee75e7-18ff-4260-ac5c-3db2ec0f8fd4_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/sony_laptop_manuals_a02cf092-3538-4646-ab93-8cae84a07ad2_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/og_wave-bma_en_extracted
Processing subfolder: /home/vqa/RAG/10_manuals_256_summaries/samsung_audio_box_eo-sb330_um_sea_rev.1.0_140728_screen_extrac

In [9]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# from langchain_core.output_parsers import OutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

In [10]:
# Prompt template
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
Answer:
"""

In [11]:
# prompt = PromptTemplate.from_template(template)
prompt = ChatPromptTemplate.from_template(template)

In [12]:
model = Ollama(model="llama2:7b-chat")
# model = Ollama(model="llama2", verbose=True)

In [13]:
# RAG pipeline
chain = (
    {"context": retriever, 
     "question": RunnablePassthrough(),}
    | prompt
    | model
    | StrOutputParser())

In [14]:
from datasets import Dataset
import pandas as pd
import json
from tabulate import tabulate

In [15]:

data = []
with open('10_manuals.jsonl', 'r') as f:
# with open('20_manuals.jsonl', 'r') as f:
    for line in f:

        json_data = json.loads(line)
        id = json_data['id'][:-6]
        qa_data = json_data['qa_data']
        for i in qa_data:
            question = i['question']['text']
            answer = i['answer']['text']
        data.append({'id': id, 'question' : question, 'ground_truth': answer})

df = pd.DataFrame(data)
df = pd.DataFrame(data, columns = ['id', 'question', 'ground_truth'])
print(tabulate(df, headers='keys', tablefmt='psql'))

n = len(pd.unique(df['id']))
print("No.of.unique values :", n)

+-----+-------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|     | id                                                                            | question                                                                                                                        | ground_truth                                                                                                                                                                                                                                                                      

In [16]:
import pandas as pd

unique_values = (pd.unique(df['id']))
print(unique_values)
unique_values_df = pd.DataFrame({'unique_values': unique_values})
merged_df = pd.merge(df, unique_values_df, how='inner', left_on='id', right_on='unique_values')
df = merged_df.groupby('unique_values').head(10)



['bose_headphones manuals_9917ef89-897c-6524-2502-2b0a91ec7d62'
 'bose_headphones_manuals_6d12e20d-cf8d-4337-b073-53d7e85e4163'
 'dell_cell phone manuals_4686e2e1-87a4-4f6a-bf20-61c646c11bb9'
 'og_wave-bma_en' 'owg_en_wms_soundlink_adapterkit'
 'samsung_audio_box_eo-sb330_um_sea_rev.1.0_140728_screen'
 'samsung_cell phone accessories manuals_de30be54-38b5-46f4-8dc4-a2376a6e571d'
 'samsung_vacuum cleaner manuals_8cb9360e-cafe-4c53-9d35-ef193667e586'
 'samsung_video gaming accessories manuals_f0ee75e7-18ff-4260-ac5c-3db2ec0f8fd4'
 'sony_laptop_manuals_a02cf092-3538-4646-ab93-8cae84a07ad2']


In [17]:
df.drop(columns=['unique_values'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['unique_values'], inplace=True)


In [18]:
# MAKE THE DATEFRAME INTO A LIST OF TUPLES

qa_list = [(row['question'], row['ground_truth']) for index, row in df.iterrows()]
print(qa_list)



In [19]:
answer = chain.invoke("Which battery does the CD player use?")
print(answer)

The CD player in the 3D diagram of the watch uses its own built-in battery. The CD symbol located above the main row of buttons indicates that the remote control has the ability to control music playback from a CD player or other similar devices, but it does not use the battery of the watch itself.


In [20]:
docs = retriever.invoke("Which battery does the CD player use?")
len(docs)

2

In [21]:
print(len(retriever.get_relevant_documents("Which battery does the CD player use?")))

2


In [None]:

import os

# Update with your API URL if using a hosted instance of Langsmith.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGCHAIN_API_KEY"] = "ls__9478a4fed44b477bbd2a4040c50cc935"  # Cissis
os.environ["LANGCHAIN_API_KEY"] = "ls__dfec5f97d1de407f93106c572de7ca06" # Martas
project_name = "summary_rag"  # Update with your project name

In [None]:
from langsmith import Client

client = Client()

In [None]:
import uuid

dataset_name = f"Retrieval QA Questions {str(uuid.uuid4())}"
dataset = client.create_dataset(dataset_name=dataset_name)
for q, a in qa_list:
    client.create_example(
        inputs={"question": q}, outputs={"answer": a}, dataset_id = dataset.id
    )

In [None]:
print(qa_list)

In [None]:
from langchain.smith import RunEvalConfig
from langchain.evaluation import EvaluatorType

eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.LabeledCriteria("relevance"),
        RunEvalConfig.LabeledCriteria("coherence"), 
        "cot_qa"
        ],
    eval_llm = ChatOllama(model="llama2"),
)

In [None]:
gen_ans = []
questions = []

In [None]:
def predict(inputs: dict):
    gen_an = chain.invoke(inputs["question"])
    questions.append(inputs["question"])
    gen_ans.append(gen_an)
    return gen_an

In [None]:
print(gen_ans)

In [None]:
_ = await client.arun_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory= predict, #lambda: chain,
    evaluation=eval_config,
)


__________________

In [None]:
print(len(gen_ans))

In [None]:
print(len(questions))

In [None]:
questions

In [None]:
filtered_df = df[df['question'].isin(questions)]

# Reorder the rows based on the order of questions in 'qs'
ordered_df = filtered_df.set_index('question').reindex(questions).reset_index()

ordered_df

In [None]:
ground_truth = ordered_df['ground_truth'].tolist()
ground_truths = [[value] for value in ground_truth]
print(ground_truths)

In [None]:
from bert_score import score

# Example texts
generated = gen_ans

# Calculate BERTScore
P, R, F1 = score(generated, ground_truths, lang='en', verbose=True)

# Print scores
print(f"Precision: {P.mean()}")
print(f"Recall: {R.mean()}")
print(f"F1 Score: {F1.mean()}")