# Setup

In [1]:
# Import necessary modules
import getpass
import os
from enum import Enum
from operator import itemgetter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


USER_AGENT environment variable not set, consider setting it to identify your requests.


# Loading / Splitting dataset

In [5]:
# Load and split documents
# Every PDF in `pdf_directory` is loaded and cut into overlapping chunks; the chunks of
# all files are collected in `all_docs` for the embedding step.
pdf_directory = "./PDF_FILES"
recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000, add_start_index=True)

all_docs = []
# os.listdir returns names in arbitrary order; sorting keeps the chunk order (and the
# printed file list) identical from run to run.
for filename in sorted(os.listdir(pdf_directory)):
    # Compare the extension case-insensitively so files such as "MANUAL.PDF" are not skipped.
    if filename.lower().endswith(".pdf"):
        print(filename)
        pdfloader = PyPDFLoader(os.path.join(pdf_directory, filename))
        docs = pdfloader.load_and_split(text_splitter=recursive_splitter)
        all_docs.extend(docs)
        

AS-BIKE-SHOP-WEB-MANUAL.pdf
dyson_contrarotator.pdf


# Embedding Phase


In [6]:

# Embedding Phase
# Embed every chunk in `all_docs` with the sentence-transformer model named below and
# index the result in a Chroma vector store.
embedding_model_name = "all-MiniLM-L6-v2"
sentence_transformer_ef = SentenceTransformerEmbeddings(model_name=embedding_model_name)
vectorstore = Chroma.from_documents(
    embedding=sentence_transformer_ef,
    documents=all_docs,
)


# Alternative embedding models can be found in:
# - the LangChain embedding model integrations (lang_chain_embedding_models)
# - the sentence-transformers library
# - Kaggle (hosts ML models in general, not only embedding models)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
2024-06-19 11:14:50.361574: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Retrieving Phase

In [7]:
# Retrieving Phase
# Wrap the vector store in a retriever configured for similarity search with k = 6.
top_k = 6
retriever = vectorstore.as_retriever(
    search_kwargs={"k": top_k},
    search_type="similarity",
)

# Retrieve test: run one sample query and show the text of each returned chunk.
test_query = "what does happen to bracket components over time?"
retrieved_docs = retriever.invoke(test_query)
print("Context retrieved: ")
for doc in retrieved_docs:
    print(doc.page_content)

Context retrieved: 
bracket bearings. During rotation, the bearings roll on the bottombracket cup surfaces. Over time, the bottom bracketsystem canbecome loose and this can accelerate the wear on thebearings andcup surfaces. It is important to properly maintainbottom bracketcomponents to extend their lifetime.Typically, inthe shop weseepress-fittedone-piece bottom brackets(common onbeach cruisers),English threaded three-piece bottom brackets, andEnglishthreadedcartridge bottom brackets.
One-Piece Bottom Brackets
In one-piece bottom brackets, one S-shaped piece ofsteel forms thecrank arms and goes from the pedal, through the bottombracket, tothe other pedal. This crank also acts as the bottombracket bearingaxle. The crank arm is threaded in the middle andacts as the bottombracket’s spindle. The crank drive side threadingis right-handthreaded, and the non-drive side (left side) is left-handthreaded. It isnecessary to first remove the left side pedal in orderto remove thecrank.
One-Piece 

# Answer Generation Phase

In [8]:

# Ask for the OpenAI API key through a hidden input prompt, but only when the key is not
# already present in the environment. Re-running the cell (or exporting the variable
# before starting the kernel) therefore neither prompts again nor overwrites a valid key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Allowed grade labels for an evaluated LLM answer. Because `str` is mixed in, each
# member compares equal to its plain string value (gradeEnum.correct == "correct").
# NOTE(review): documented with comments rather than a docstring because pydantic can copy
# a class docstring into the JSON schema generated for LLMEvalResult, which would change
# the format instructions sent to the LLM — confirm before converting to a docstring.
class gradeEnum(str, Enum):
    correct = "correct"
    incorrect = "incorrect"

# Result schema for the LLM-based evaluation of an answer, given its question and context.
# JsonOutputParser is built from this model, and the parser's format instructions are
# inserted into the evaluation prompts.
# NOTE(review): documented with comments rather than a docstring because pydantic can copy
# a model's docstring into the generated schema, which would alter those format
# instructions — confirm before converting to a docstring.
class LLMEvalResult(BaseModel):
    # Verdict label; restricted to the gradeEnum values.
    grade: gradeEnum = Field(description="Final grade label. Accepted labels: correct, incorrect")
    # Short justification for the verdict.
    description: str = Field(description="Explanation of why the specific grade was assigned. Must be concise. Not more than 2 sentences")
    # Optional, defaults to ""; process_eval_output sets it for answers graded "correct".
    llm_answer: str = Field(description="The original LLM answer evaluated", default="")
    # Optional, defaults to ""; process_eval_output overwrites it with the context used.
    context: str = Field(description="The context used for the evaluation", default="")

# LLM for answering questions and evaluating Q-A pairs
# `llm` writes the answer; `llm_selfeval` grades it and is pinned to temperature 0.
llm_selfeval = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

# Parser for the grader's JSON reply; LLMEvalResult supplies the schema used to build
# the format instructions that are embedded in the evaluation prompts.
json_parser = JsonOutputParser(pydantic_object=LLMEvalResult)

# Define prompts
# Answering prompt: one human message that carries the instructions, the question and
# the retrieved context.
qa_prompt_text = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Question: {question} Context: {context} Answer:"
)
qa_prompt = ChatPromptTemplate.from_messages([("human", qa_prompt_text)])

# Evaluation prompt without context: the grader sees only the question and the answer.
# The template starts and ends with a newline, and the first sentence keeps its trailing
# space, so the rendered text is unchanged.
qa_eval_prompt_text = (
    "\n"
    "You are a teacher evaluating a test. \n"
    "You are provided with a question along with an answer for the question written by a student. "
    "Evaluate the question-answer pair and provide feedback.\n"
    "{format_instructions}\n"
    "Question: {question}\n"
    "Answer: {answer}\n"
)

# {format_instructions} is pre-filled from the JSON parser; callers supply the rest.
qa_eval_prompt = PromptTemplate(
    input_variables=["question", "answer"],
    partial_variables={"format_instructions": json_parser.get_format_instructions()},
    template=qa_eval_prompt_text,
)

# Evaluation prompt with context: the grader must judge the answer against the retrieved
# context. The template starts and ends with a newline, and the first sentence keeps its
# trailing space, so the rendered text is unchanged.
qa_eval_prompt_with_context_text = (
    "\n"
    "You are a teacher evaluating a test. \n"
    "You are provided with a question along with an answer for the question written by a student. "
    "Evaluate the question-answer pair using the provided context and provide feedback. "
    "Only mark the answer as correct if it agrees with the provided context.\n"
    "{format_instructions}\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer: {answer}\n"
)

# {format_instructions} is pre-filled from the JSON parser; callers supply the rest.
qa_eval_prompt_with_context = PromptTemplate(
    input_variables=["question", "answer", "context"],
    partial_variables={"format_instructions": json_parser.get_format_instructions()},
    template=qa_eval_prompt_with_context_text,
)

# Define RAG Chain
def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by one blank line;
        an empty string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

def retrieve_answer(output):
    """Return the ``content`` attribute of a model response.

    Args:
        output: Object exposing a ``content`` attribute (here, the value produced
            by the ``qa_prompt | llm`` step of the chain).

    Returns:
        The value of ``output.content``, unchanged.
    """
    answer_text = output.content
    return answer_text

def process_eval_output(output, llm_answer, context):
    """Attach the evaluated answer and its context to a parsed evaluation result.

    Args:
        output: Parsed evaluation result as a dict; must contain a 'grade' key.
            The dict is updated in place.
        llm_answer: The answer text that was evaluated.
        context: The context string that was given to the evaluator.

    Returns:
        The same ``output`` dict, now always containing both 'llm_answer' and
        'context'. 'llm_answer' is set to ``llm_answer`` when the grade is
        "correct"; otherwise an existing value is kept, or "" is used if absent.

    Raises:
        KeyError: If ``output`` has no 'grade' key.
    """
    if output['grade'] == "correct":
        output['llm_answer'] = llm_answer
    else:
        # The parsed result is a plain dict, so an 'llm_answer' the evaluator did not
        # emit is simply absent (the default declared on LLMEvalResult is not applied).
        # Fall back to that default so callers can always read the key.
        output.setdefault('llm_answer', "")
    output['context'] = context
    return output

# Stage 1: turn the incoming question into {"context", "question"}; the retrieved
# documents are collapsed into a single string by format_docs.
retrieval_stage = RunnableParallel(
    context=retriever | format_docs,
    question=RunnablePassthrough()
)

# Stage 2: generate the answer while carrying the question and context forward.
answer_stage = RunnableParallel(
    answer=qa_prompt | llm | retrieve_answer,
    question=itemgetter("question"),
    context=itemgetter("context")
)

# Stage 3: grade the answer against the context; the inputs are kept next to the verdict.
evaluation_stage = RunnableParallel(
    eval_result=qa_eval_prompt_with_context | llm_selfeval | json_parser,
    context=itemgetter("context"),
    question=itemgetter("question"),
    answer=itemgetter("answer")
)


def _merge_eval_result(stage_output):
    """Fold the answer and context of the last stage into the parsed evaluation dict."""
    return process_eval_output(
        stage_output['eval_result'],
        stage_output['answer'],
        stage_output['context'],
    )


# Full pipeline: question -> retrieval -> answer -> evaluation -> merged result dict.
rag_chain = retrieval_stage | answer_stage | evaluation_stage | _merge_eval_result

# Get user prompt and invoke RAG chain
user_prompt = "what does happen to bracket components over time?"
json_answer = rag_chain.invoke(user_prompt)

# Build the report one section at a time; sections are separated by a blank line.
report_sections = [
    f"Prompt: {user_prompt}",
    f"Context: {json_answer['context']}",
    f"Answer: {json_answer['llm_answer']}",
    f"Evaluation: {json_answer['grade']}",
    f"Answer description: {json_answer['description']}",
]
print("\n\n".join(report_sections))


Prompt: what does happen to bracket components over time?

Context: bracket bearings. During rotation, the bearings roll on the bottombracket cup surfaces. Over time, the bottom bracketsystem canbecome loose and this can accelerate the wear on thebearings andcup surfaces. It is important to properly maintainbottom bracketcomponents to extend their lifetime.Typically, inthe shop weseepress-fittedone-piece bottom brackets(common onbeach cruisers),English threaded three-piece bottom brackets, andEnglishthreadedcartridge bottom brackets.
One-Piece Bottom Brackets
In one-piece bottom brackets, one S-shaped piece ofsteel forms thecrank arms and goes from the pedal, through the bottombracket, tothe other pedal. This crank also acts as the bottombracket bearingaxle. The crank arm is threaded in the middle andacts as the bottombracket’s spindle. The crank drive side threadingis right-handthreaded, and the non-drive side (left side) is left-handthreaded. It isnecessary to first remove the left s