In [1]:
import sys
import sqlite3
import os
import streamlit as st
from dotenv import load_dotenv, find_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA, LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import  ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain

In [2]:
# SECURITY: API keys must never be written into a notebook. The key that was
# previously hard-coded in this cell has been exposed to everyone with access
# to the file and should be revoked in the OpenAI dashboard.
#
# The key is now read from the process environment, optionally populated from
# a git-ignored `.env` file. `usecwd=True` makes the search start from the
# notebook's working directory. `load_dotenv` does not override variables that
# are already set, so an exported OPENAI_API_KEY takes precedence.
load_dotenv(find_dotenv(usecwd=True))

# Fail fast with a readable message instead of an authentication error deep
# inside the first OpenAI call.
if not os.environ.get("OPENAI_API_KEY"):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Export it in your shell or add "
        "OPENAI_API_KEY=<your key> to a .env file next to this notebook."
    )

In [3]:
class LLM_QA:
    """
    Defines the methods to perform QA over a given source of data using LLM models.

    A chat model can be supplied when the object is created. When none is given,
    the prompt/QA helpers fall back to a module-level ``llm`` variable (the way
    this class was originally used) and, if that does not exist either, to a
    freshly created default chat model.
    """

    # Chat model name used whenever a model has to be created on the fly.
    DEFAULT_MODEL = 'gpt-3.5-turbo'

    def __init__(self, llm=None):
        """
        Creates the QA helper.

        Args:
            llm: <chat model> optional LangChain chat model used by all methods
        """
        self.llm = llm

    def _default_llm(self):
        """Builds the default chat model (DEFAULT_MODEL, temperature 0)."""
        return ChatOpenAI(model=self.DEFAULT_MODEL, temperature=0)

    def _resolve_llm(self):
        """
        Picks the chat model used by the language-detection, translation and
        retrieval-QA helpers.

        Resolution order: the model given to the constructor, then a module-level
        ``llm`` (legacy behaviour, kept so existing notebooks keep working), then
        a newly created default model.

        Returns:
            llm: <chat model> the chat model to use
        """
        # getattr keeps this safe for instances created without __init__.
        explicit = getattr(self, 'llm', None)
        if explicit is not None:
            return explicit
        legacy_global = globals().get('llm')
        if legacy_global is not None:
            return legacy_global
        return self._default_llm()

    def _run_retrieval_qa(self, vector_store, query, k, chain_type_kwargs=None):
        """Runs a "stuff" RetrievalQA chain over the top-k similar chunks and returns the answer."""
        retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
        # Only forward chain_type_kwargs when the caller supplied them, so the plain
        # QA path calls from_chain_type exactly as before.
        extra = {} if chain_type_kwargs is None else {'chain_type_kwargs': chain_type_kwargs}
        chain = RetrievalQA.from_chain_type(
            llm=self._resolve_llm(), chain_type="stuff", retriever=retriever, **extra
        )
        return chain.run(query)

    def load_document(self, file):
        """
        Loads PDF, DOCX and TXT files as LangChain Documents

        Args:
            file: <str> path of the input file; the format is chosen from its
                  extension (matched case-insensitively)
        Returns:
            data: the result of the loader's load() call, or None when the
                  extension is not supported
        """
        _, extension = os.path.splitext(file)
        extension = extension.lower()  # accept '.PDF', '.Txt', ...

        # Loader classes are imported lazily so that only the dependencies of the
        # format actually used are required.
        if extension == '.pdf':
            from langchain.document_loaders import PyPDFLoader
            loader = PyPDFLoader(file)
        elif extension == '.docx':
            from langchain.document_loaders import Docx2txtLoader
            loader = Docx2txtLoader(file)
        elif extension == '.txt':
            from langchain.document_loaders import TextLoader
            loader = TextLoader(file, encoding='utf8')
        else:
            print('Document format is not supported!')
            return None

        print(f'Loading {file}')
        return loader.load()

    def chunk_data(self, data, chunk_size=256, chunk_overlap=0):
        """
        Splits the data into chunks

        Args:
            data: the documents returned by load_document
            chunk_size: <int> chunk size handed to the text splitter
            chunk_overlap: <int> chunk overlap handed to the text splitter
        Returns:
            chunks: the documents produced by the splitter
        """
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return text_splitter.split_documents(data)

    def create_embeddings(self, chunks):
        """
        Create embeddings using OpenAIEmbeddings() and save them in a Chroma vector store

        Args:
            chunks: the chunked documents to embed
        Returns:
            vector_store: <vector store> a vector store which stores the generated embeddings
        """
        embeddings = OpenAIEmbeddings()
        return Chroma.from_documents(chunks, embeddings)

    def calculate_embedding_cost(self, texts, price_per_1k_tokens=0.0004):
        """
        Calculate embedding cost using tiktoken

        Args:
            texts: iterable of objects exposing a ``page_content`` string
            price_per_1k_tokens: <float> price charged per 1000 tokens; the default
                                 is the value this method has always used
                                 (NOTE(review): confirm against current pricing)
        Returns:
            total_tokens: <int> total tokens present in the input texts
            embedding cost: <float> the estimated embedding cost
        """
        import tiktoken
        enc = tiktoken.encoding_for_model('text-embedding-ada-002')
        total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
        return total_tokens, total_tokens / 1000 * price_per_1k_tokens

    def identify_source_language(self, query):
        """
        Identifies the language of the query

        Args:
            query: <str> the input query
        Returns:
            source_language: <str> language of the query
        """
        system_template_lang_detection = "I want you act as a language detector. I will input a sentence in any language and you will answer me in which language the sentence I wrote is in. Do not write any explanations or other words, just reply with the language name"
        human_template_lang_detection = "Sentence: {source_sentence}"

        lang_detection_prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(system_template_lang_detection),
            HumanMessagePromptTemplate.from_template(human_template_lang_detection),
        ])

        lang_detection_chain = LLMChain(llm=self._resolve_llm(), prompt=lang_detection_prompt)
        return lang_detection_chain.run(source_sentence=query)

    def translate_text(self, query, source_language, target_language):
        """
        Translate the given query from source_language to target_language

        Args:
            query: <str> the input query which needs to be translated
            source_language: <str> source language of the query
            target_language: <str> target language of the query
        Returns:
            target_sentence: <str> the translated sentence in target language
        """
        system_template_translation = "You are a good Translator. Translate this sentence from {source_language} to {target_language}. Do not write any other words, just reply with the translated sentence."
        human_template_translation = "Sentence: {source_sentence}"

        translation_prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(system_template_translation),
            HumanMessagePromptTemplate.from_template(human_template_translation),
        ])
        translation_chain = LLMChain(llm=self._resolve_llm(), prompt=translation_prompt)
        return translation_chain.run(
            source_language=source_language,
            source_sentence=query,
            target_language=target_language,
        )

    def ask_and_get_answer(self, vector_store, query, k=3):
        """
        Given a vector store and input query use LLM Chain to return the answer

        Args:
            vector_store: <vector store> a vector store which stores the embeddings
            query: <str> the input query
            k: <int> integer determining number of documents to be retrieved
        Returns:
            answer: <str> the generated answer
        """
        return self._run_retrieval_qa(vector_store, query, k)

    def ask_and_get_answer_only_if_present(self, vector_store, query, k=3):
        """
        Given a vector store and input query use LLM Chain to return the answer with minimization of hallucinations

        The chain is given a prompt that tells the model to say it does not know
        rather than make up an answer.

        Args:
            vector_store: <vector store> a vector store which stores the embeddings
            query: <str> the input query
            k: <int> integer determining number of documents to be retrieved
        Returns:
            answer: <str> the generated answer
        """
        # The template text (including its indentation) is sent to the model
        # verbatim, so it is intentionally left untouched.
        prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

        {context}

        Question: {question}
        Answer:"""
        prompt = PromptTemplate(
            template=prompt_template, input_variables=["context", "question"]
        )
        return self._run_retrieval_qa(vector_store, query, k, chain_type_kwargs={"prompt": prompt})

    def ask_with_memory(self, vector_store, question, chat_history=None, k=3):
        """
        Given a vector store, input question and a chat history use the LLM Chain to return the answer

        Args:
            vector_store: <vector store> a vector store which stores the embeddings
            question: <str> the input query
            chat_history: <list> list of (question, answer) tuples from earlier turns;
                          a new empty list is used when omitted. A list passed in is
                          extended in place with the new turn.
            k: <int> integer determining number of documents to be retrieved
        Returns:
            answer: <str> the generated answer
            chat_history: <list> the history including the new (question, answer) turn
        """
        # A `[]` default would be created once and shared by every call that omits
        # the argument, leaking history between unrelated conversations.
        if chat_history is None:
            chat_history = []

        # This method has always built its own default model rather than reading the
        # module-level `llm`; only an explicitly supplied instance model overrides it.
        llm = getattr(self, 'llm', None)
        if llm is None:
            llm = self._default_llm()
        retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

        crc = ConversationalRetrievalChain.from_llm(llm, retriever)
        print('question: ', question)
        result = crc({'question': question, 'chat_history': chat_history})
        chat_history.append((question, result['answer']))
        return result['answer'], chat_history


In [4]:
llm_obj = LLM_QA()
# Kept at module level: LLM_QA's helper methods look up the name `llm` in the
# module globals, so it must exist before they are called.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)

file_name = "data//MFL71521482_new_lower.txt"

chunk_size = 512

data = llm_obj.load_document(file_name)
# load_document returns None for unsupported extensions; stop here with a clear
# message instead of failing later inside the text splitter.
if data is None:
    raise ValueError(f'Could not load {file_name}: unsupported document format')

chunks = llm_obj.chunk_data(data, chunk_size=chunk_size)
print(f'Chunk size: {chunk_size}, Chunks: {len(chunks)}')

tokens, embedding_cost = llm_obj.calculate_embedding_cost(chunks)
print(f'Embedding cost: ${embedding_cost:.4f}')

# creating the embeddings and returning the Chroma vector store
vector_store = llm_obj.create_embeddings(chunks)

# One query per line. The context manager closes the file; previously the handle
# from open(...).readlines() was never closed. Blank lines are deliberately kept
# so that `answers` stays aligned line-for-line with the query file.
with open('data/TestCases/combined_data.txt') as query_file:
    queries = [line.strip() for line in query_file]

answers = []
for query in queries:
    # Every query starts from an empty history, so each one is answered
    # independently of the previous questions.
    answer, chat_history = llm_obj.ask_with_memory(vector_store, query, chat_history=[])
    print('Answer:', answer)
    print('\n')
    answers.append(answer)


Chunk size: 512, Chunks: 296
Embedding cost: $0.0094
question:  Why does my washer have very little water in it? How does it wash clothes?
Answer: Your washer has very little water because it is a high-efficiency washer designed to use a minimum amount of water and energy. Despite the low water volume, the small amount of detergent used is more concentrated in the water, resulting in excellent cleaning performance. The washer rotates the laundry in a way that allows it to use less water while still fully saturating the clothing. It is normal to not see much water during the wash cycle.


question:  Please explain why my washer makes such loud banging noises when the water fills up?
Answer: The loud banging noises that your washer makes while it is filling with water are actually caused by a plumbing issue in your home. When the water valve in the washer shuts off, it can cause the pipes in your home to move or flex, resulting in the loud banging noise. This effect is known as water ham

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


Answer: I'm sorry, but as a text-based AI, I am unable to display any visual content. However, I can provide you with the safety messages in text format.


question:  what to avoid when using the product
Answer: To avoid any potential hazards or damage when using the product, you should:

1. Follow all instructions provided in the product's owner's manual.
2. Use the product only for normal household use and avoid using it for commercial or industrial purposes.
3. Use only approved and authorized accessories, components, and cleaning products recommended by the manufacturer.
4. Avoid using the product in locations or settings that are not outlined in the product's owner's manual, such as commercial offices, recreational facilities, or vehicles.
5. Ensure that the product is installed properly according to the instructions in the owner's manual.
6. Do not attempt to remove or reinstall the product if it is in an inaccessible location or if you are not familiar with the proper procedure.

In [5]:
# Write each generated answer followed by a newline to the answers file.
with open('data/TestCases/combined_data_answers.txt', 'w') as answers_file:
    answers_file.writelines(ans + '\n' for ans in answers)

In [6]:
import pandas as pd

In [7]:
# Read the test-case workbook, attach the generated answers as the 'LLM'
# column (pandas raises if the lengths differ), and write it back to the
# same file without the index column.
df = pd.read_excel('data/TestCases/combined_data.xlsx').assign(LLM=answers)
df.to_excel('data/TestCases/combined_data.xlsx', index=False)