<a href="https://colab.research.google.com/github/mickymultani/pinecone-RAG/blob/main/Copy_of_Working_RAG_Pinecone_and_Llama_Index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Pinecone & Llama Index


# Set up environment
Library and module versions are accurate as of July 2, 2024.


In [None]:
!pip install -q llama-index-vector-stores-pinecone llama-index-readers-file pymupdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.9/215.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.5/327.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [None]:
!pip install -q llama-index==0.10.36 pinecone-client[grpc]==3.0.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.4/201.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import re
import urllib.parse
from getpass import getpass
from pathlib import Path

import fitz
from google.colab import files
from IPython.display import Markdown, display
from ipywidgets import widgets, interactive, Output
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.file import (
    PDFReader,
    DocxReader,
    HWPReader,
    EpubReader,
    FlatReader,
    HTMLTagReader,
    ImageReader,
    IPYNBReader,
    MarkdownReader,
    MboxReader,
    PptxReader,
    PandasCSVReader,
    PyMuPDFReader,
    XMLReader,
    PagedCSVReader,
    CSVReader,
)
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone


In [None]:
# Read the API keys from the environment. If one is missing, prompt for it without
# echoing, so real keys are never typed into (and saved with) the notebook itself.
# The values are also exported to os.environ for code that reads the variables directly.
for _key_name in ("PINECONE_API_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f"{_key_name}: ")

pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [None]:
# Initialize Pinecone and OpenAI
pc = Pinecone(api_key=pinecone_api_key)
index_name = "llama-rag"

# create_index raises when the index already exists, which made this cell fail on
# every re-run; only create the index the first time.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        # Must equal the embedding vector size; 1536 presumably matches the default
        # OpenAIEmbedding model used below — update both together if the model changes.
        dimension=1536,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index, wrapped as a LlamaIndex vector store for ingestion and querying.
pinecone_index = pc.Index(index_name)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
embed_model = OpenAIEmbedding(api_key=openai_api_key)




In [None]:
# Define the ingestion pipeline: semantic chunking -> embedding -> upsert into Pinecone.
# The embedding model configured during setup is reused for both steps so chunking and
# indexing always share one configuration (previously two extra, separately constructed
# OpenAIEmbedding instances were created here).
pipeline = IngestionPipeline(
    transformations=[
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95,
            embed_model=embed_model,
        ),
        embed_model,
    ],
    vector_store=vector_store,  # nodes produced by the transformations are written here
)

# Sanitize extracted text before it is chunked and embedded
def clean_up_text(content: str) -> str:
    r"""Normalize raw text extracted from a document.

    Steps, in order:
      1. Re-join words hyphenated across a line break ("exam-\nple" -> "example").
      2. Turn every remaining newline into a space. (Newlines used to be deleted
         outright, which fused the last word of a line with the first word of the
         next: "hello\nworld" became "helloworld".)
      3. Remove layout artifacts: runs of em dashes, literal "\uXXXX" escape text,
         and the characters U+F075 / U+F0B7.
      4. Remove whitespace around hyphens that sit between word characters.
      5. Collapse every whitespace run to a single space.

    Args:
        content: Text to clean.

    Returns:
        The cleaned text. Leading/trailing whitespace is collapsed, not stripped.
    """
    # 1. De-hyphenate words split across lines before the newlines disappear.
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # 2. Keep a word boundary where each line break was.
    content = content.replace("\n", " ")

    # 3. Longer dash runs are listed first so they are removed whole.
    unwanted_patterns = [
        "  —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # 4. "well - known" -> "well-known"
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)

    # 5. Normalize all remaining whitespace.
    content = re.sub(r'\s+', ' ', content)
    return content

# Load, clean, tag and index a batch of files in Pinecone
def process_documents(file_paths, file_extension, reader):
    """Parse files with the given reader and push them through the ingestion pipeline.

    Args:
        file_paths: Iterable of paths handed one by one to the reader's ``load_data``.
        file_extension: Extension string; only used in the returned status message.
        reader: Zero-argument callable (a reader class) whose instance exposes
            ``load_data(path)`` and returns an iterable of documents.

    Returns:
        A status message naming the file extension that was indexed.
    """
    loader = reader()
    collected = []

    for path in file_paths:
        for document in loader.load_data(path):
            # Clean the text in place and attach the placeholder metadata.
            document.text = clean_up_text(document.text)
            document.metadata.update({"authors": [], "title": "Uploaded Document"})
            collected.append(document)

    # A single pipeline run handles every document gathered above.
    pipeline.run(documents=collected)

    return f"Files with {file_extension} uploaded and indexed successfully!"



# Function to handle file upload event
def _iter_uploaded_files(value):
    """Yield ``(filename, content)`` pairs from a FileUpload widget value.

    Handles the mapping form ``{filename: {'content': ...}}`` that this notebook was
    written against, and also a sequence of dicts carrying ``'name'`` and ``'content'``
    keys (the shape newer ipywidgets releases use — confirm against the installed version).
    """
    if isinstance(value, dict):
        for name, info in value.items():
            yield name, info['content']
    else:
        for info in value:
            yield info['name'], info['content']


def on_file_upload(change):
    """Save each uploaded file to disk and index it with the reader for its extension.

    Args:
        change: Traitlets change dict from the FileUpload widget; ``change['new']``
            holds the uploaded files.

    Each file is written to the current working directory under its uploaded name,
    then passed to ``process_documents``. Files whose extension has no registered
    reader are reported and skipped.
    """
    # Built once per upload event. '.csv' used to be listed three times
    # (PandasCSVReader, PagedCSVReader, CSVReader); only the last entry of a dict
    # literal survives, so CSVReader was the reader actually used and is kept here.
    reader_map = {
        '.pdf': PyMuPDFReader,
        '.docx': DocxReader,
        '.hwp': HWPReader,
        '.epub': EpubReader,
        '.txt': FlatReader,
        '.html': HTMLTagReader,
        '.jpg': ImageReader,
        '.jpeg': ImageReader,
        '.png': ImageReader,
        '.ipynb': IPYNBReader,
        '.md': MarkdownReader,
        '.mbox': MboxReader,
        '.pptx': PptxReader,
        '.xml': XMLReader,
        '.csv': CSVReader,
    }

    for filename, content in _iter_uploaded_files(change['new']):
        with open(filename, 'wb') as f:
            f.write(content)

        # The extension is resolved per file, so one upload may mix file types.
        file_extension = Path(filename).suffix.lower()
        reader = reader_map.get(file_extension)
        if reader is None:
            print(f"Unsupported file format: {file_extension}")
            continue

        upload_message = process_documents([filename], file_extension, reader)
        print(upload_message)

# Render an upload control and route every change of its value to on_file_upload
upload_widget = widgets.FileUpload(multiple=True, accept='.pdf')  # widen `accept` to allow other file types
upload_widget.observe(on_file_upload, names='value')
display(upload_widget)

FileUpload(value={}, accept='.pdf', description='Upload', multiple=True)

Upserted vectors:   0%|          | 0/100 [00:00<?, ?it/s]

Files with .pdf uploaded and indexed successfully!


In [None]:
# Query function
def query_data(query):
    """Answer a question from the documents indexed in Pinecone.

    Args:
        query: The question text.

    Returns:
        The ``response`` attribute of the query engine's result.
    """
    # Pass the configured embedding model explicitly so the query is embedded with the
    # model set up at the top of the notebook rather than an implicit global default.
    vector_index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
    )
    # Retrieve the 5 most similar chunks as context for the answer.
    retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)
    query_engine = RetrieverQueryEngine(retriever=retriever)
    return query_engine.query(query).response

# Controls for the query UI: a text box, a trigger button, and an area for the answer
query_input = widgets.Text(
    description='Query:',
    placeholder='Type your query here',
    disabled=False,
)

query_button = widgets.Button(
    icon='search',
    tooltip='Click to query',
    description='Query',
)

output_area = widgets.Output()

def on_query_button_clicked(b):
    """Run the typed query and show the answer in ``output_area``.

    Args:
        b: The clicked button (unused; required by the ``on_click`` callback signature).
    """
    output_area.clear_output()
    # Do all the work inside the Output context so that anything printed — and any
    # error raised while querying — is rendered in the UI next to the button.
    with output_area:
        query_text = query_input.value.strip()
        if not query_text:
            # Avoid sending a blank query to the embedding/LLM backends.
            print("Please enter a query.")
            return
        print(query_data(query_text))

# Hook the click handler up to the button
query_button.on_click(on_query_button_clicked)

# Render the controls top to bottom: input box, button, answer area
query_controls = (query_input, query_button, output_area)
display(*query_controls)


Text(value='', description='Query:', placeholder='Type your query here')

Button(description='Query', icon='search', style=ButtonStyle(), tooltip='Click to query')

Output()