# All imports and inits

In [None]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from pinecone import Pinecone, ServerlessSpec
import os

# Load environment variables from .env file
load_dotenv()

DATA_PATH = os.getenv("DATA_PATH")
PINECONE_API = os.getenv("PINECONE_API")

print("PINECONE_API", PINECONE_API)



# PDF loader

In [None]:


def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()


documents = load_documents()
documents[0]


# Text Splitting \ Chunking using Langchain

In [30]:


def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        chunk_size = 800,
        chunk_overlap  = 80,
        length_function = len,
        is_separator_regex = False # considers separators like '\n\n'if true
    )
    docs = text_splitter.split_documents(documents)
    return docs


chunks = split_documents(documents)
chunks[0]


Document(metadata={'producer': 'Nitro Pro  (11. 0. 1. 10)', 'creator': 'Nitro Pro  (11. 0. 1. 10)', 'creationdate': '2025-03-09T10:44:04+00:00', 'moddate': '2025-03-09T15:44:28+05:00', 'title': 'PowerPoint Presentation', 'author': 'James Kurose', 'source': 'D:\\Disrupt Labs\\Rag Experiments\\env\\Rag-pipelines-experiments\\data\\4.1_video_slides.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1'}, page_content='Network Layer:\nData Plane\n\uf0a7 Overview of Network Layer\n\uf0a7 What’s Inside a Router?\n\uf0a7 The Internet Protocol: IPv4, Addressing, NAT\nIPv6\n\uf0a7 Generalized Forwarding and SDN\n\uf0a7 Middleboxes\n\uf0a7 Summary\nCOMPSCI 453 Computer Networks\nProfessor Jim Kurose\nCollege of Information and Computer Sciences\nUniversity of Massachusetts\nClass textbook:\nComputer Networking: A Top-\nDown Approach (8th ed.)\nJ.F. Kurose, K.W . Ross\nPearson, 2020\nhttp://gaia.cs.umass.edu/kurose_ross')

# Creating Embeddings and Index via Pinecone 

- Creating a serverless index

In [None]:
from pinecone import Pinecone, ServerlessSpec


#  --------------- initialize pinecone -----------------------------
pc = Pinecone(api_key=PINECONE_API)
print(PINECONE_API)
pc.create_index_for_model(
    name="test-index",
    cloud="aws",
    region="us-east-1",
    embed={
        "model":"llama-text-embed-v2",
        "field_map":{"text": "page_content"}
    }
)


# Upsert Data to Pinecone

In [None]:
import itertools

index = pc.Index(host="https://llama-text-embed-v2-ai-chatbot-xjnfxjq.svc.aped-4627-b74a.pinecone.io")

def chunker(iterable, batch_size=200):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

# # Function to upsert chunks to Pinecone
def upsert_chunks_to_pinecone(chunks, index_name="test-index"):
    for i, chunk in enumerate(chunks):
        # Prepare the data for upserting
        vectors = []
        vector = {
            "id": f"chunk-{i}",          # Unique ID for each chunk
            "values": chunk.page_content,  # The chunk content (this will be embedded automatically)
            "metadata": chunk.metadata  # Optional: Store any metadata (e.g., page numbers)
        }
        vectors.append(vector)
        # Upsert the chunk into Pinecone
        # pc.upsert(
        #     index=index_name,
        #     vectors=[vector]  # We pass a list of vectors, here it's just one
        # )
    # Upsert data with 200 vectors per upsert request
    # for ids_vectors_chunk in chunker(vectors, batch_size=200):
    #     index.upsert(vectors=ids_vectors_chunk}) 
    index.upsert_records(vectors)

# Upsert the chunks to Pinecone
upsert_chunks_to_pinecone(chunks)




ListConversionException: Expected a list or list-like data structure, but got: Link Layer
COMPSCI 453 Computer Networks
Professor Jim Kurose
College of Information and Computer Sciences
University of Massachusetts
Class textbook:
Computer Networking: A Top-
Down Approach (8th ed.)
J.F. Kurose, K.W . Ross
Pearson, 2020
http://gaia.cs.umass.edu/kurose_ross
Video:     2020, J.F. Kurose, All Rights Reserved
Powerpoint:    1996-2020, J.F. Kurose, K.W. Ross, All Rights Reserved
 Introduction to the Link Layer
 Error-detection and -correction Techniques
 Multiple Access Links and Protocols
 Switched Local Area Networks
 Link Virtualization: a Network as a Link Layer
 Data Center Networking
 Retrospective: A Day in the Life of a Web Page Request