# Vectorize PDF File with LlamaIndex and Pinecone


## Setup


In [None]:
# Install libraries
# This cell is a single IPython `%pip` magic command; the trailing backslashes
# continue it across lines, so every package below is installed in one call.
%pip install -U llama-index \
    llama-index-readers-file \
    llama-index-vector-stores-pinecone \
    pinecone-client \
    arxiv==2.1.0 \
    python-dotenv \
    setuptools  # (Optional)

In [None]:
import os
from dotenv import load_dotenv

# Load settings from a .env file (if present) before reading them below
load_dotenv()

# Read the API keys plus the index name, source file path, and document title
# from the environment in one pass. Any variable that is unset comes back as None.
pinecone_api_key, openai_api_key, index_name, file_path, doc_title = (
    os.environ.get(variable)
    for variable in (
        "PINECONE_API_KEY",
        "OPENAI_API_KEY",
        "INDEX_NAME",
        "FILE_PATH",
        "DOCUMENT_TITLE",
    )
)

## Process PDF with LlamaIndex


### Import file


In [None]:
from pathlib import Path
from llama_index.readers.file import PDFReader

# Build the reader, then load the PDF located at `file_path`
loader = PDFReader()
pdf_path = Path(file_path)
documents = loader.load_data(file=pdf_path)

In [None]:
# Inspect imported file: display the first item the loader returned
documents[0]

### Clean up document content


In [None]:
import re


def clean_up_text(content: str) -> str:
    """
    Strip extraction noise from a piece of text and tidy its spacing.

    The steps run in this order: rejoin words that were hyphenated across a
    line break, replace each unwanted pattern with a space, tighten the
    spacing around hyphens that sit between word characters, and collapse
    every run of whitespace into a single space.

    :param content: Text input.

    :return: Cleaned version of original text input.
    """

    # Every entry is treated as a regular expression. They are applied one
    # after another, in this order, and each match becomes a single space.
    noise_patterns = (
        "\\n",
        "  —",
        "——————————",
        "—————————",
        "—————",
        r"\\u[\dA-Fa-f]{4}",
        r"\uf075",
        r"\uf0b7",
    )

    # Rejoin words that were split with a hyphen at the end of a line
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", content)

    for noise in noise_patterns:
        text = re.compile(noise).sub(" ", text)

    # Drop stray whitespace around hyphens, e.g. "well - known" -> "well-known"
    text = re.sub(r"(\w)\s*-\s*(\w)", r"\1-\2", text)

    # Finally squeeze each whitespace run down to one space
    return re.sub(r"\s+", " ", text)


# Clean each document's text in place, collecting the documents as we go
cleaned_docs = []
for doc in documents:
    doc.text = clean_up_text(doc.text)
    cleaned_docs.append(doc)

# Inspect output
cleaned_docs[0].get_content()

### Add metadata


In [None]:
# New key:value pairs to attach to every document in `cleaned_docs`
metadata_additions = {"title": doc_title}

# Update each metadata dict in place. A plain loop is used because
# `dict.update` returns None, so a list comprehension here would only build
# a throwaway list of None values.
for cd in cleaned_docs:
    cd.metadata.update(metadata_additions)

# Let's confirm everything worked:
cleaned_docs[0].metadata

In [None]:
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline

# A single embedding model is shared by the semantic splitter and by the
# embedding step of the pipeline
embed_model = OpenAIEmbedding(api_key=openai_api_key)

# Node parser that splits documents using the embedding model above
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model,
)

# Define the initial pipeline: split first, then embed
pipeline = IngestionPipeline(transformations=[semantic_splitter, embed_model])

## Upsert vectors to Pinecone


In [None]:
from pinecone import Pinecone, ServerlessSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore

# Open a client connection to Pinecone
pc = Pinecone(api_key=pinecone_api_key)

# Collect the names of the indexes that already exist
existing_indexes = [info["name"] for info in pc.list_indexes()]

# Create the index only when it is not there yet
if index_name not in existing_indexes:
    # NOTE(review): 1536 must equal the vector size produced by the embedding
    # model used for ingestion — confirm if the model is ever changed.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-west-2")
    pc.create_index(
        index_name,
        dimension=1536,
        spec=serverless_spec,
    )

# Handle to the index (pre-existing or just created)
pinecone_index = pc.Index(index_name)

# Wrap the index in a LlamaIndex vector store.
# OPTIONAL: to upsert into a namespace, comment out the first line below and
# uncomment the second.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
# vector_store = PineconeVectorStore(pinecone_index=pinecone_index, namespace="<NAMESPACE_NAME>")

In [None]:
# Same transformations as before: semantic splitting followed by embedding
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model,
)
transformations = [semantic_splitter, embed_model]

# Rebuild the pipeline, this time attaching our PineconeVectorStore
pipeline = IngestionPipeline(
    transformations=transformations,
    vector_store=vector_store,  # Our new addition
)

# Now we run our pipeline!
pipeline.run(documents=cleaned_docs)

In [None]:
# Display the statistics Pinecone reports for the index, as a check after the upsert
pinecone_index.describe_index_stats()

## Send a test query


In [None]:
# Define a query relevant to your document content.
# NOTE: the placeholder below is an empty string; replace it with a real
# question before running the query cell that follows.
test_query = ""

In [None]:
from llama_index.core import VectorStoreIndex
from IPython.display import display

# Build a VectorStoreIndex on top of the existing vector_store object
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Send the test query through a query engine created with default settings
query_engine = vector_index.as_query_engine()
response = query_engine.query(test_query)

# Inspect results
answer_text = response.response
display(answer_text)