## LLM + Retrieval Augmented Generation (RAG)

## Install additional libraries

In [None]:
# Install Required Libraries (only if not already installed)
import sys
import subprocess
import importlib.util

def install_if_missing(import_name, pip_name=None):
    """Install a package with pip unless its module can already be found.

    Args:
        import_name: Module name as written in an ``import`` statement
            (for example ``"faiss"``).
        pip_name: Distribution name to pass to pip when it differs from the
            import name (for example ``"faiss-cpu"``). Falls back to
            ``import_name`` when omitted or empty.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    try:
        already_available = importlib.util.find_spec(import_name) is not None
    except ModuleNotFoundError:
        # For a dotted name, find_spec imports the parent package first and
        # raises when that parent is absent; treat that as "not installed".
        already_available = False

    if already_available:
        return

    # Invoke pip through the running interpreter (sys.executable) so the
    # package is installed for this Python, not whichever pip is on PATH.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", pip_name or import_name]
    )

    # The import system caches directory contents; refresh them so the
    # freshly installed package can be imported in the same session.
    importlib.invalidate_caches()

# Maps the name used in ``import`` to the name pip installs it under
packages = dict(faiss="faiss-cpu", tiktoken="tiktoken")

# Install whichever of these cannot be found yet
for import_name, pip_name in packages.items():
    install_if_missing(import_name, pip_name=pip_name)


## Imports and API keys

In [None]:
# Imports and API Keys
import os
import json
import textwrap
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Read OpenAI API key
# Bind the name up front so that, if loading fails, later cells see None
# instead of failing with an unrelated NameError.
api_key = None
try:
    with open('./data/credentials.json', encoding='utf-8') as f:
        credentials = json.load(f)
    api_key = credentials['openai']['api_key']
except (OSError, ValueError, KeyError, TypeError) as err:
    # OSError: file missing or unreadable.
    # ValueError: malformed JSON or undecodable bytes (json.JSONDecodeError
    #   and UnicodeDecodeError are both subclasses).
    # KeyError / TypeError: JSON loaded but lacks the openai -> api_key path.
    print("Please provide your OpenAI API key in the credentials.json file.")
    print(f"Reason: {type(err).__name__}: {err}")

# Suppress all warnings (every Warning subclass) from this point on
import warnings
warnings.filterwarnings(action="ignore", category=Warning)

## Load PDF and convert to LangChain documents

In [None]:
# Load PDF and Convert to LangChain Documents
def load_pdf_with_pypdf(path):
    """Read a PDF with PyPDF2 and wrap each non-empty page in a LangChain Document.

    Pages whose extracted text is empty (or None) are skipped. Every returned
    Document stores the 1-based page number under the ``"page"`` metadata key.
    """
    # Lazily pair each page number with its extracted text.
    page_texts = (
        (page_number, pdf_page.extract_text())
        for page_number, pdf_page in enumerate(PdfReader(path).pages, start=1)
    )
    return [
        Document(page_content=content, metadata={"page": page_number})
        for page_number, content in page_texts
        if content
    ]

# Path of the PDF to load, relative to the working directory
pdf_path = "./data/entire-vw-ar23.pdf"

# One Document per page that yielded text
documents = load_pdf_with_pypdf(path=pdf_path)

## Split text into chunks

In [None]:
# Break the page documents into chunks of size 1000 with an overlap of 200
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = splitter.split_documents(documents)

## Create and store embeddings

In [None]:
# Embedding client configured with the OpenAI API key
embeddings = OpenAIEmbeddings(openai_api_key=api_key)

# Build a FAISS vector store from the chunks using that embedding client
vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)

## Convert the vector store into a retriever object and initialize the LLM

In [None]:
# Convert the vector store into a retriever object
retriever = vectorstore.as_retriever()

# Chat model used to generate the answers
llm = ChatOpenAI(
    openai_api_key=api_key,
    model_name="gpt-3.5-turbo",
    temperature=0,
)

# RetrievalQA chain combining the model and the retriever; the result
# also includes the source documents
qa_chain = RetrievalQA.from_chain_type(
    return_source_documents=True,
    retriever=retriever,
    llm=llm,
)

## Ask questions about the PDF

In [None]:
# Question to put to the indexed PDF
query = """What was the total sales revenue and operating result for the 
           Passenger Cars and Light Commercial Vehicles segment in 2023? In 
           which section can I find this information?"""

# Run the question through the RetrievalQA chain
result = qa_chain(query)

# Wrap the answer text to 80 columns and print it under a heading
wrapped = textwrap.fill(result["result"], width=80)
print("Answer:", wrapped, sep="\n")

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Collect the footer lines first, then emit them in a single print call
rule = '-----------------------------------'
footer_lines = [
    rule,
    os.name.upper(),
    f"{platform.system()} | {platform.release()}",
    f"Datetime: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Python Version: {python_version()}",
    rule,
]
print("\n".join(footer_lines))