In [1]:
import os
import getpass

# Ask for the OpenAI key interactively (input is not echoed) and expose it
# to this process through the environment.
os.environ.update(OPENAI_API_KEY=getpass.getpass("OpenAI API Key: "))

In [2]:
import os
import getpass

# Ask for the Pinecone key interactively (input is not echoed) and expose it
# to this process through the environment.
os.environ.update(PINECONE_API_KEY=getpass.getpass("Pinecone API Key:"))

In [4]:
from langchain.document_loaders import PyMuPDFLoader
from dotenv import load_dotenv
import os

# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# The manual's location is read from the `pdfurl` variable. To use a local
# copy instead, set it to a file path such as 'docs/owner_manual.pdf'.
pdf_source = os.environ.get('pdfurl')
if not pdf_source:
    # Fail here with a clear message rather than passing None to the loader.
    raise ValueError(
        "Environment variable 'pdfurl' is not set; "
        "define it in .env or the environment before loading the manual."
    )

car_manual = PyMuPDFLoader(pdf_source)

In [5]:
# Read the PDF into a list of documents.
car_manual_data = car_manual.load()

# Report only how many documents were loaded; printing the whole list would
# dump the full text of the manual into the cell output.
print(f"Loaded {len(car_manual_data)} documents")



In [6]:
# Keep only the first 20 loaded documents so the later steps stay small.
partial_car_manual_data = car_manual_data[0:20]

# Peek at the first document to check its content and metadata.
print(partial_car_manual_data[0])

page_content="XC60\nOWNER'S MANUAL\n" metadata={'source': 'https://az685612.vo.msecnd.net/pdfs/20w17/XC60_OwnersManual_MY21_en-GB_TP32005/XC60_OwnersManual_MY21_en-GB_TP32005.pdf', 'file_path': 'https://az685612.vo.msecnd.net/pdfs/20w17/XC60_OwnersManual_MY21_en-GB_TP32005/XC60_OwnersManual_MY21_en-GB_TP32005.pdf', 'page': 0, 'total_pages': 720, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'GPL Ghostscript 9.06', 'creationDate': "D:20200421104728+02'00'", 'modDate': "D:20200421104728+02'00'", 'trapped': ''}


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

def tiktoken_len(text):
    """Return how many tokens ``text`` occupies under the gpt-3.5-turbo encoding.

    Args:
        text: The string to measure.

    Returns:
        The number of tokens produced by encoding ``text``.
    """
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return len(encoder.encode(text))

# Lengths are measured with tiktoken_len, so chunk_size and chunk_overlap
# are expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=tiktoken_len,
    chunk_overlap=50,
    chunk_size=400,
)

# Split the selected documents into chunks and display the result.
car_manual_chunks = text_splitter.split_documents(partial_car_manual_data)
car_manual_chunks

[Document(page_content="XC60\nOWNER'S MANUAL", metadata={'source': 'https://az685612.vo.msecnd.net/pdfs/20w17/XC60_OwnersManual_MY21_en-GB_TP32005/XC60_OwnersManual_MY21_en-GB_TP32005.pdf', 'file_path': 'https://az685612.vo.msecnd.net/pdfs/20w17/XC60_OwnersManual_MY21_en-GB_TP32005/XC60_OwnersManual_MY21_en-GB_TP32005.pdf', 'page': 0, 'total_pages': 720, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'GPL Ghostscript 9.06', 'creationDate': "D:20200421104728+02'00'", 'modDate': "D:20200421104728+02'00'", 'trapped': ''}),
 Document(page_content="VÄLKOMMEN!\nWe hope you will enjoy many years of driving pleasure in your Volvo.\nThe car has been designed for the safety and comfort of you and\nyour passengers. Volvo strives to design one of the safest cars in the\nworld. Your Volvo is also designed to meet applicable safety and\nenvironmental requirements.\nTo increase your enjoyment of your Volvo, we recommend that you\nread the ins

In [8]:
# Sanity check: find the largest chunk, measured in tokens, so it can be
# compared against the splitter's configured chunk size.
max_chunk_length = 0

for chunk in car_manual_chunks:
    chunk_tokens = tiktoken_len(chunk.page_content)
    if chunk_tokens > max_chunk_length:
        max_chunk_length = chunk_tokens

max_chunk_length

392

In [9]:
import time

from pinecone import Pinecone, PodSpec

# Client constructed without arguments
# (presumably reads PINECONE_API_KEY from the environment — confirm in the client docs).
pinecone_client = Pinecone()

# Resolve the index name once and fail early if it is missing, instead of
# sending None to the Pinecone API.
index_name = os.environ.get('index')
if not index_name:
    raise ValueError("Environment variable 'index' is not set; cannot create the Pinecone index.")

# Only create the index when it does not exist yet, so this cell can be
# re-run (e.g. Restart & Run All) without failing on a name conflict.
if index_name not in pinecone_client.list_indexes().names():
    pinecone_client.create_index(
        name=index_name,
        # Vector size accepted by the index; it has to match the output size
        # of the embedding model used to populate it.
        dimension=1536,
        metric="cosine",
        spec=PodSpec(
            environment="gcp-starter"
        )
    )

# Poll until the index reports ready, pausing between requests so the loop
# does not hammer the API or flood the cell output.
while not pinecone_client.describe_index(index_name).status['ready']:
    print("loading")
    time.sleep(1)

print(f"index created: {index_name}")

index created: mechainic


In [17]:
# NOTE: this rebinds the name `Pinecone` to LangChain's vector-store class;
# the index-creation cell imports a different `Pinecone` from the `pinecone` package.
from langchain.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed the chunks and upload them to the index. The index name comes from
# the same `index` environment variable used when the index was created,
# falling back to the previously hard-coded name when it is unset.
vector_store = Pinecone.from_documents(
    car_manual_chunks,
    embedding_model,
    index_name=os.environ.get('index') or "mechainic",
)

In [18]:
# Expose the vector store through its retriever interface so it can be
# used as a step in a chain.
retriever = vector_store.as_retriever()

In [48]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt layout: the context first, then the user's query, then the
# instruction restricting the answer to that context.
RAG_PROMPT = """

CONTEXT:
{context}

QUERY:
{question}

You are a car specialist and can only provide your answers based on the context

"""

# Chat prompt template built from the text above; the {context} and
# {question} placeholders are filled in when the chain is invoked.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [49]:
from langchain_openai import ChatOpenAI
# Chat model that receives the filled-in prompt and produces the answer.
model = ChatOpenAI(model="gpt-3.5-turbo")

In [50]:
from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Input:  {"question": <str>}
# Output: {"response": <model reply>, "context": <retrieved documents>}
retrieval_augmented_qa_chain = (
    # Step 1: look up documents for the question and carry the question along.
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # Step 2: fill the prompt and call the model, while also returning the
    # retrieved documents so the caller can inspect what the answer was based on.
    # (The former intermediate step only re-assigned "context" to itself and
    # has been dropped; the chain's input and output are unchanged.)
    | {"response": rag_prompt | model, "context": itemgetter("context")}
)


In [51]:
# Run the full chain for one question; the result is a dict with the
# model's reply under "response" and the retrieved documents under "context".
response = retrieval_augmented_qa_chain.invoke({"question" : "What is the purpose of Event Data Recorder?"})

In [52]:
# Display the model's reply; the retrieved documents are under response["context"].
response["response"]

