In [17]:
import pinecone
from pinecone import Pinecone, Index, ServerlessSpec
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
import pandas as pd
from dotenv import load_dotenv
import os
from typing import List
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [20]:
load_dotenv()

True

In [21]:
api_key = os.getenv("PINECONE_API_KEY")
pinecone = Pinecone(api_key=api_key)

# Books

In [3]:
index_name = "books"

In [4]:
if index_name not in [pinecone.list_indexes().indexes[i].name for i in range(len(pinecone.list_indexes().indexes))]:
    pinecone.create_index(index_name, dimension=1536, spec=ServerlessSpec(cloud="aws", region="us-east-1"))

In [8]:
books = pd.read_csv(r'..\data\final_data\books.csv')[['book_id','Title','description']]

In [9]:
books.head()

Unnamed: 0,book_id,Title,description
0,1,History of Magic and the Occult,"See the history of witchcraft, magic and super..."
1,2,By Honor Bound (The Lassiter Law) (Silhouette ...,A comprehensive guide that defines the literat...
2,3,Usa Laminated Map,"The United States ""Explorer"" map is a classic ..."
3,4,America at 1750: A Social Portrait,Demonstrates how the colonies developed into t...
4,5,How to Discipline Kids without Losing Their Lo...,Imagine... No More Arguing. Imagine... No More...


In [10]:
index: Index = pinecone.Index("books")
vector_store = PineconeVectorStore(
    index=index, embedding=OpenAIEmbeddings(model="text-embedding-ada-002")
)

In [11]:
titles= [
    Document(
        page_content=title,  #embedding
        metadata={"book_id": book_id, "type": "title"}  
    )
    for book_id, title in zip(books['book_id'], books['Title'])
]

In [12]:
vector_store.add_documents(documents=titles, ids=["title_" + str(book_id) for book_id in books['book_id']])

['title_1',
 'title_2',
 'title_3',
 'title_4',
 'title_5',
 'title_6',
 'title_7',
 'title_8',
 'title_9',
 'title_10',
 'title_11',
 'title_12',
 'title_13',
 'title_14',
 'title_15',
 'title_16',
 'title_17',
 'title_18',
 'title_19',
 'title_20',
 'title_21',
 'title_22',
 'title_23',
 'title_24',
 'title_25',
 'title_26',
 'title_27',
 'title_28',
 'title_29',
 'title_30',
 'title_31',
 'title_32',
 'title_33',
 'title_34',
 'title_35',
 'title_36',
 'title_37',
 'title_38',
 'title_39',
 'title_40',
 'title_41',
 'title_42',
 'title_43',
 'title_44',
 'title_45',
 'title_46',
 'title_47',
 'title_48',
 'title_49',
 'title_50',
 'title_51',
 'title_52',
 'title_53',
 'title_54',
 'title_55',
 'title_56',
 'title_57',
 'title_58',
 'title_59',
 'title_60',
 'title_61',
 'title_62',
 'title_63',
 'title_64',
 'title_65',
 'title_66',
 'title_67',
 'title_68',
 'title_69',
 'title_70',
 'title_71',
 'title_72',
 'title_73',
 'title_74',
 'title_75',
 'title_76',
 'title_77',
 'title_

In [13]:
descriptions = [
    Document(
        page_content=description,  #embedding
        metadata={"book_id": book_id, "type": "description"} 
    )
    for book_id, description in zip(books['book_id'], books['description'])
]

In [14]:
vector_store.add_documents(documents=descriptions, ids=["desc_" + str(book_id) for book_id in books['book_id']])

['desc_1',
 'desc_2',
 'desc_3',
 'desc_4',
 'desc_5',
 'desc_6',
 'desc_7',
 'desc_8',
 'desc_9',
 'desc_10',
 'desc_11',
 'desc_12',
 'desc_13',
 'desc_14',
 'desc_15',
 'desc_16',
 'desc_17',
 'desc_18',
 'desc_19',
 'desc_20',
 'desc_21',
 'desc_22',
 'desc_23',
 'desc_24',
 'desc_25',
 'desc_26',
 'desc_27',
 'desc_28',
 'desc_29',
 'desc_30',
 'desc_31',
 'desc_32',
 'desc_33',
 'desc_34',
 'desc_35',
 'desc_36',
 'desc_37',
 'desc_38',
 'desc_39',
 'desc_40',
 'desc_41',
 'desc_42',
 'desc_43',
 'desc_44',
 'desc_45',
 'desc_46',
 'desc_47',
 'desc_48',
 'desc_49',
 'desc_50',
 'desc_51',
 'desc_52',
 'desc_53',
 'desc_54',
 'desc_55',
 'desc_56',
 'desc_57',
 'desc_58',
 'desc_59',
 'desc_60',
 'desc_61',
 'desc_62',
 'desc_63',
 'desc_64',
 'desc_65',
 'desc_66',
 'desc_67',
 'desc_68',
 'desc_69',
 'desc_70',
 'desc_71',
 'desc_72',
 'desc_73',
 'desc_74',
 'desc_75',
 'desc_76',
 'desc_77',
 'desc_78',
 'desc_79',
 'desc_80',
 'desc_81',
 'desc_82',
 'desc_83',
 'desc_84',
 

In [15]:
response = index.describe_index_stats()
print(response["namespaces"])

{'': {'vector_count': 39484}}


# Documents

In [18]:
index_name = "documents"

In [22]:
if index_name not in [pinecone.list_indexes().indexes[i].name for i in range(len(pinecone.list_indexes().indexes))]:
    pinecone.create_index(index_name, dimension=1536, spec=ServerlessSpec(cloud="aws", region="us-east-1"))

In [23]:
def get_text_from_pdf(pdf_file: str) -> List[Document]:
    """
    Extracts text from a PDF file and returns it as a list of Document objects.

    Args:
        pdf_file (str): Path to the PDF file to be processed.

    Returns:
        List[Document]: A list of Document objects, where each represents a page in the PDF.
    """
    load_dotenv()
    # Initialize the PyMuPDFLoader with the given PDF file
    loader = PyMuPDFLoader(pdf_file)

    # Initialize an empty list to store the pages
    pages: List[Document] = []

    # Iterate over each loaded page and add it to the list
    for page in loader.load():
        pages.append(page)

    # Return the list of extracted pages
    return pages

In [24]:
def extract_pdf_text(pdf_files: List[str]) -> List[Document]:
    """
    Extracts text from multiple PDF files and returns it as a list of Document objects.

    Args:
        pdf_files (List[str]): A list of paths to the PDF files to be processed.
  

    Returns:
        List[Document]: A list of Document objects, where each represents a page in the PDF files.
    """
    # Initialize an empty list to store the extracted docs
    docs: List[Document] = []

    # Iterate over each PDF file and extract the text from it
    for pdf_file in pdf_files:
        file_path = os.path.join('..','data', 'pdfs', pdf_file)
        docs.extend(get_text_from_pdf(file_path))


    # Return the list of extracted pages
    return docs

In [None]:
pdf_files = [f for f in os.listdir('../data/pdfs') if f.endswith(".pdf")]
docs= extract_pdf_text(pdf_files) 
    
text_splitter = RecursiveCharacterTextSplitter(
        separators="\n",  # Split on newlines
        chunk_size=1000,  # Maximum size of each chunk
        chunk_overlap=250,  # Overlap between chunks to preserve context
        add_start_index=True)  # Include the starting index of each chunk 
all_splits = text_splitter.split_documents(docs)
    
pc = Pinecone()
index: Index = pc.Index("documents")
vector_store = PineconeVectorStore(index=index, embedding=OpenAIEmbeddings(model="text-embedding-ada-002"))


ids = [str(i) for i in range(len(all_splits))]
vector_store.add_documents(documents=all_splits, ids=ids)

In [33]:
response = index.describe_index_stats()
print(response["namespaces"])

{'': {'vector_count': 62}}
