### 01_load2vectordb

#### Overview
This notebook is part of a pipeline that processes PDF documents and stores their content in a vector database for easy retrieval and analysis. It performs the four key steps listed below, after the use cases.

#### Use Cases
- Enhance search capabilities within a document database.
- Enable similarity-based document analysis.
- Facilitate advanced text analytics in a variety of domains.

##### 1. Data Loading from PDFs
##### 2. Text Splitting
##### 3. Embedding Creation
##### 4. Context-based chatbot


##### Imports

In [2]:
from dotenv import load_dotenv
import os
from langchain.chains import create_extraction_chain
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
from langchain_openai import OpenAIEmbeddings
import PyPDF2


##### Load my api key

In [3]:
# Read the .env file so its variables become available to this process.
load_dotenv()

# Look up the OpenAI key; the result is None when the variable is not set.
myapikey = os.environ.get('OPENAI_API_KEY')

##### 1. Data Loading from PDFs

In [5]:
# Folder that holds the source PDF files (relative to the working directory).
folder_path = '../data/cz_vfr_manual/'

# Collect the PDF files in that folder.
# - os.listdir() returns entries in arbitrary order, so the names are sorted to
#   make chunk order and ingestion order reproducible between runs.
# - The extension check is case-insensitive so files named "*.PDF" are not
#   silently skipped.
pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))

# Full path of every PDF, in the same (sorted) order as pdf_files.
pdf_paths = [os.path.join(folder_path, file) for file in pdf_files]

##### 2. Text Splitting

In [21]:


# Encoding object that tiktoken associates with the 'gpt-4' model.
# Despite its name, this variable holds the encoding itself, not a string.
tokenizer_name = tiktoken.encoding_for_model('gpt-4')
tokenizer_name.name  # bare expression; not the last line of the cell, so nothing is displayed


def tiktoken_len(text):
    """Return how many tokens `text` encodes to with `tokenizer_name`.

    `disallowed_special=()` is forwarded to `encode` unchanged; per the tiktoken
    documentation this turns off the check that rejects special-token text.
    """
    return len(tokenizer_name.encode(text, disallowed_special=()))

# Splitter used for every document. Sizes are measured with tiktoken_len,
# i.e. in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["."],              # the only separator offered to the splitter
    is_separator_regex=False,      # the separator is a literal string, not a regex
    length_function=tiktoken_len,
    chunk_size=256,                # upper bound per chunk, in length_function units
    chunk_overlap=64,              # overlap between neighbouring chunks, same units
)

def extract_cz_tags_from_filename(filename):
    """Return the first ``lk``-prefixed code found in `filename`, upper-cased.

    The code is the first substring made of the lowercase letters ``lk``
    followed by two to six further lowercase letters. Matching is
    case-sensitive, so an upper-case code in the filename is not detected.

    Returns:
        The matched substring converted to upper case, or None when the
        filename contains no such substring.
    """
    found = re.search(r'lk[a-z]{2,6}', filename)
    if found is None:
        return None
    return found.group().upper()

OpenAI Chat model

In [None]:
# Chat model handle: gpt-4 with temperature 0, authenticated with `myapikey`.
llm = ChatOpenAI(
    model_name="gpt-4",
    temperature=0,
    openai_api_key=myapikey,
)

OpenAI Embeddings model

In [22]:


# Name of the OpenAI embedding model used for the chunks.
model_name = 'text-embedding-ada-002'

# Embedding client configured with that model and the key held in `myapikey`.
embed = OpenAIEmbeddings(openai_api_key=myapikey, model=model_name)

Load each PDF into text chunks, each prefixed with the ICAO code taken from the filename

In [25]:
# Build one flat list of tagged text chunks covering every PDF in pdf_paths.
#
# Each PDF is read once, the text of its usable pages is concatenated, and the
# complete document text is split exactly once. Splitting inside the page loop
# would re-split the growing text on every page and append the chunks of the
# earlier pages again and again.

# Pages whose extracted text is shorter than this many characters are skipped.
MIN_PAGE_CHARS = 200

chunked_texts_with_tags = []
for path in pdf_paths:
    filename = os.path.basename(path)

    # Code derived from the filename; None when the filename contains no match.
    pdf_tags = extract_cz_tags_from_filename(filename)

    pdf_reader = PyPDF2.PdfReader(path)

    page_texts = []
    for page in pdf_reader.pages:
        # Extract once per page. Fall back to "" so a page that yields no text
        # is treated as a short page instead of breaking len().
        page_text = page.extract_text() or ""
        if len(page_text) < MIN_PAGE_CHARS:
            continue
        # Line breaks are removed without inserting a space, so the text of
        # adjacent lines is joined directly.
        page_texts.append(page_text.replace('\n', ''))

    # A PDF without any usable page contributes no chunks.
    if not page_texts:
        continue

    # The document text starts with the filename, followed by all kept pages.
    full_text = f"Document: {filename}\n" + "".join(page_texts)

    # Prefix every chunk with the tag. When pdf_tags is None the prefix is the
    # literal text "None".
    for chunk in text_splitter.split_text(full_text):
        chunk_with_tags = f"{(pdf_tags)}, {chunk}"
        chunked_texts_with_tags.append(chunk_with_tags)

##### 3. Embedding Creation

In [27]:
from langchain.vectorstores import Chroma

# Embed the tagged chunks with `embed` and store them in a Chroma database
# persisted under ../data/vectordb/.
# from_texts is a classmethod that builds and returns the populated store, so it
# is called on the class directly. Creating a separate Chroma instance first
# would open a second client on the same directory with its own, separately
# configured embedding function.
# NOTE(review): re-running this cell presumably adds the same texts to the
# persisted collection again - confirm before repeated runs.
db2 = Chroma.from_texts(
    chunked_texts_with_tags,
    embed,
    persist_directory="../data/vectordb/",
)

# Alias so that code referring to `vectorstore` keeps working.
vectorstore = db2

##### 4. Context-based chatbot

In [36]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI  # no longer used in this cell; kept in case other cells reference it

# Question-answering chain over the vector store.
# - The retriever hands the 5 most similar chunks to the model (k=5).
# - The chain reuses `llm`, the chat model configured earlier in this notebook
#   (gpt-4, temperature 0, explicit API key), instead of constructing a
#   default-configured OpenAI() model.
# - return_source_documents=True is requested so the result also carries the
#   retrieved chunks.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db2.as_retriever(search_kwargs={'k': 5}),
    return_source_documents=True,
)

# Run one query against the chain. invoke() replaces the deprecated direct call
# chain(inputs) and returns the same kind of result dictionary.
result = qa_chain.invoke({'query': 'Altitude of Letnany Aiport'})
print(result['result'])

 The altitude of Letnany Airport is 912 ft / 278 m above mean sea level.
