### Local RAG Part 1: RAG Pipeline Initialization and Creation

- Gathering all text documents in the provided local directory
- Processing the files, extracting and embedding the text content
- Creating a new vector database and saving the embeddings
- Visualizing the created vector database

In [1]:
# imports

import os
import glob
from typing import List
import gradio as gr
from pathlib import Path
from dotenv import load_dotenv
import chardet  # For detecting file encoding

In [2]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [3]:
# environment
# Read variables from a .env file (if any), then make sure both API key
# variables exist in the process environment. When a key is not set, the
# placeholder string below is written instead, so replace it with a real key
# if you are not using a .env file.

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')

In [5]:
# Price is a factor for our company, so we're going to use a low-cost model

OPENAI_MODEL = "gpt-4o-mini"  # alternative: "gpt-4o"
CLAUDE_MODEL = "claude-3-5-sonnet-20240620"
MODEL = "gpt-4o-mini"  # same value as OPENAI_MODEL

# Want to keep costs ultra-low? Uncomment these lines
# (OPENAI_MODEL is already set to the mini model above):
# OPENAI_MODEL = "gpt-4o-mini"
# CLAUDE_MODEL = "claude-3-haiku-20240307"

# Directory in which the Chroma vector store is persisted
db_name = "mj_vector_db"

In [6]:
def find_documents(directory: str) -> List[str]:
    """
    Recursively collect the PDF and Word files located beneath a directory.

    Directories whose name starts with '.' are never descended into, and
    files whose name starts with '.', '~' or '$' are ignored. Extension
    matching is case-insensitive. Every match is printed as it is found,
    followed by a summary line with the total.

    Args:
        directory (str): Root directory of the search

    Returns:
        List[str]: Sorted list of paths to the matching files
    """
    wanted_suffixes = ('.pdf', '.doc', '.docx')
    ignored_prefixes = ('.', '~', '$')
    matches = []

    for current_dir, subdirs, filenames in os.walk(directory):
        # Prune hidden directories in place so os.walk does not enter them
        subdirs[:] = [name for name in subdirs if not name.startswith('.')]

        for filename in filenames:
            # Hidden / temporary files are never reported
            if filename.startswith(ignored_prefixes):
                continue
            # Only PDF and Word extensions are of interest
            if not filename.lower().endswith(wanted_suffixes):
                continue

            found = os.path.join(current_dir, filename)
            matches.append(found)
            print(f"Found document: {found}")

    # Return the paths in a consistent order
    matches = sorted(matches)

    print(f"\nTotal document files found: {len(matches)}")
    return matches

In [7]:
def load_documents(document_paths: List[str]) -> List[Document]:
    """
    Load documents using the LangChain loader that matches each file extension.

    Paths whose extension is not .pdf, .docx or .doc are skipped. A file that
    fails to load is reported with a printed message and skipped; it does not
    abort the remaining files. Every returned document has its metadata
    updated with "source", "doc_folder", "doc_name" and "file_type".

    Args:
        document_paths (List[str]): List of file paths to load

    Returns:
        List[Document]: List of LangChain Document objects
    """
    from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader

    loaders_by_extension = {
        '.pdf': PyPDFLoader,
        '.docx': Docx2txtLoader,
        '.doc': Docx2txtLoader  # Note: .doc files might need a different loader
    }

    documents = []

    for file_path in document_paths:
        file_extension = os.path.splitext(file_path)[1].lower()
        loader_cls = loaders_by_extension.get(file_extension)
        if loader_cls is None:
            continue

        try:
            # Both constructing the loader and loading can fail on a bad file
            loaded_docs = loader_cls(file_path).load()
        except Exception as e:
            # Best effort: report the failure and carry on with the next file
            print(f"Error loading {file_path}: {str(e)}")
            continue

        # os.path.basename handles the platform's path separator, unlike
        # splitting on '/' which returns the whole path on Windows.
        file_metadata = {
            "source": file_path,
            "doc_folder": os.path.basename(os.path.dirname(file_path)),
            "doc_name": os.path.basename(file_path),
            "file_type": file_extension
        }

        # Add the same file-level metadata to each document from this file
        for doc in loaded_docs:
            doc.metadata.update(file_metadata)
            documents.append(doc)

    print(f"\nTotal documents loaded: {len(documents)}")
    return documents

In [29]:
# Directory to search
# NOTE(review): this is an absolute path on one specific machine; change it
# to the folder you want to index before running this cell.
search_directory =  "/Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering"

# Find all documents
print("Finding documents...")
document_paths = find_documents(search_directory)
print(f"Found {len(document_paths)} document files")

# Then load them into LangChain Document objects
documents = load_documents(document_paths)

# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200);
# `chunks` is what gets embedded and stored in the vector database.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

Finding documents...
Found document: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/SETUP-PC.pdf
Found document: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/all_summaries copy.docx
Found document: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/reqdoc.docx
Found document: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/AISera_2024.pdf
Found document: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/SETUP-linux.pdf
Found document: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/SETUP-mac.pdf
Found document: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/all_summaries.docx
Found document: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/mj_rag/files/all_summaries_copy.docx
Found document: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/mj_rag/fi

In [9]:
documents[0]

Document(metadata={'source': '/Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/udemy_summarizer/all_summaries copy.docx', 'doc_folder': 'udemy_summarizer', 'doc_name': 'all_summaries copy.docx', 'file_type': '.docx'}, page_content="128. Day 1 - Fine-Tuning Large Language Models- From Inference to Training\n\nSummary\n\nIntroduction\n\nThe speaker is moving from the excitement around the topic of inference in earlier weeks to the world of training models. Training models involves going into the specifics of how to enhance their performance at inference. \n\nFocus on Data\n\nThe speaker indicates that understanding the data is essential. This involves curating the data, cleaning it, and visualizing it to ensure that it is in an optimal state for model training. Achieving project success and how to gauge it is also discussed.\n\nReview of Past Weeks\n\nThe speaker then reviews the previous weeks:\n\nWeek 1: Introduced frontier models.\n\nWeek 2: Covered using mul

In [10]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLite

# embeddings = OpenAIEmbeddings()

# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers
# Then replace embeddings = OpenAIEmbeddings()
# with:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Delete the collection if the persist directory already exists,
# before the vector store is created again below

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore
# Note: the count printed below is the number of stored chunks, since
# `chunks` (not the original files) is what is passed in as documents.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Vectorstore created with 95 documents


In [11]:
# Get one vector and find how many dimensions it has
# `collection` (the vector store's private `_collection` attribute) is reused
# by the visualization cells further down.

collection = vectorstore._collection
# Fetch a single record, asking only for its embedding
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


### Visualizing the Vector Store

Let's take a minute to look at the documents and their embedding vectors to see what's going on.

In [12]:
# Prework
# Pull every embedding, chunk text and metadata record out of the collection.
# Note: `documents` is rebound here to the stored chunk texts; it no longer
# refers to the list returned by load_documents().

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
# One label per chunk: the "doc_name" metadata value of the chunk's source file
doc_types = [metadata['doc_name'] for metadata in result['metadatas']]

In [13]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

# random_state is fixed so that the layout is reproducible between runs
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot; hovering a point shows the source file name
# and the first 100 characters of the chunk text
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles of a 2D plot are set with xaxis_title / yaxis_title;
# `scene` only configures 3D plots, so titles placed there are not shown.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
