In [1]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from tqdm.autonotebook import tqdm
import pinecone

  from tqdm.autonotebook import tqdm


In [2]:
from dotenv import load_dotenv

# Run python-dotenv before any later cell reads os.environ (the OpenAI and
# Pinecone settings are looked up there).
load_dotenv()

True

In [3]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [38]:
def process_pdf_documents(pdf_folder_path: str):
    """Load every PDF in a folder and split the loaded pages into chunks.

    Args:
        pdf_folder_path: Directory scanned (non-recursively) for files whose
            name ends in ``.pdf``.

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.split_documents``
        over all loaded pages, or an empty list when the folder has no PDFs.
    """
    documents = []
    # os.listdir yields names in arbitrary order; sorting keeps the chunk
    # order stable between runs, so positional lookups stay reproducible.
    for file in sorted(os.listdir(pdf_folder_path)):
        if file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder_path, file)
            loader = PyPDFLoader(pdf_path)
            documents.extend(loader.load())

    if not documents:
        # Nothing was loaded, so there is nothing to split.
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
        separators=["\n\n", "\n", " ", ""]
    )
    return text_splitter.split_documents(documents)

In [60]:
# Load and chunk every PDF under ./new_dataset; `data` is what the batch
# embedding loop further down iterates over.
data = process_pdf_documents('./new_dataset')

In [61]:
# Sanity check on the chunking: overall chunk count, then the length of one
# sample chunk (position 30).
chunk_count = len(data)
print(f'You have {chunk_count} document(s) in your data')
sample_chunk = data[30]
print(f'There are {len(sample_chunk.page_content)} characters in your document')

You have 1468 document(s) in your data
There are 645 characters in your document


In [62]:
from langchain.embeddings.openai import OpenAIEmbeddings

# The API key is read from the environment rather than written into the notebook.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
model_name = 'text-embedding-ada-002'

# Embedding client reused by the indexing cells below.
embed = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)

In [63]:
# Authenticate the Pinecone client; both settings come from the environment.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENV"])

# Index name used by the cells that follow.
index_name = "esg-index"

In [43]:
# Only create the index when it does not exist yet.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # 1536 is the vector size of text-embedding-ada-002.
    pinecone.create_index(name=index_name, dimension=1536, metric='cosine')

In [64]:
# Open a gRPC handle to the index named above.
index = pinecone.GRPCIndex(index_name)
# Last expression of the cell, so the stats dict is what gets displayed.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 8757}},
 'total_vector_count': 8757}

In [65]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    # Slicing clamps at the end of the list, so the final batch may be shorter.
    batch = data[i:i + batch_size]

    # Store each chunk's text under the 'text' metadata key.
    batch_texts = [record.page_content for record in batch]
    metadatas = [{'text': text} for text in batch_texts]

    # One embedding per chunk, in the same order as batch_texts.
    embeds = embed.embed_documents(batch_texts)

    # NOTE: ids are random, so re-running this cell adds a fresh copy of every
    # chunk instead of overwriting the vectors stored by an earlier run.
    ids = [uuid4().hex for _ in embeds]

    # Pass a concrete list of (id, vector, metadata) tuples rather than a
    # one-shot zip iterator.
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))

  0%|          | 0/15 [00:00<?, ?it/s]

In [52]:
# Destructive reset: delete_all=True asks Pinecone to drop all stored vectors
# (no namespace argument is passed here). Do not run this as part of a normal
# top-to-bottom execution.
index.delete(delete_all=True)



In [None]:
import openai

# Model id passed as `engine` by create_embeddings below (same value as
# `model_name` in the earlier cell).
MODEL = "text-embedding-ada-002"

# Define a function to create embeddings
def create_embeddings(texts):
    """Embed each text with ``openai.Embedding.create``, one request per text.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        A list with one embedding per input text, in input order.
    """
    embeddings_list = []
    for text in texts:
        res = openai.Embedding.create(input=[text], engine=MODEL)
        # Exactly one input was sent, so its result is the first data entry.
        embeddings_list.append(res['data'][0]['embedding'])
    return embeddings_list

# Define a function to upsert embeddings to Pinecone
def upsert_embeddings_to_pinecone(index, embeddings, ids):
    """Pair every id with its embedding and upsert the pairs into `index`.

    Pairing stops at the shorter of `ids` and `embeddings`.
    """
    id_vector_pairs = list(zip(ids, embeddings))
    index.upsert(vectors=id_vector_pairs)


In [22]:
import re
import openai


# Rebuild the embedding client; model_name and OPENAI_API_KEY are defined in
# an earlier cell.
embed = OpenAIEmbeddings(model=model_name, openai_api_key=OPENAI_API_KEY)

# Configure the plain openai client as well
openai.api_key = os.environ["OPENAI_API_KEY"]
MODEL = "text-embedding-ada-002"

# Define a function to preprocess text
def preprocess_text(text):
    """Collapse every run of whitespace in `text` into a single space."""
    return re.sub(r'\s+', ' ', text)

def process_pdf(file_path):
    """Load one PDF, split it into chunks and return the chunk texts.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        A list of strings, one per chunk, in the order the splitter
        produced them.
    """
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
        separators=["\n\n", "\n", " ", ""])
    chunks = text_splitter.split_documents(pages)
    # Unwrap to plain strings: callers in this notebook feed the result
    # straight to embed.embed_documents, which works on text, not on the
    # chunk objects themselves.
    return [chunk.page_content for chunk in chunks]

# Define a function to upsert embeddings to Pinecone
def upsert_embeddings_to_pinecone(index, embeddings, ids):
    """Pair every id with its embedding and upsert the pairs into `index`.

    Pairing stops at the shorter of `ids` and `embeddings`.
    """
    id_vector_pairs = list(zip(ids, embeddings))
    index.upsert(vectors=id_vector_pairs)

pdf_folder_path = './dataset'

for file in os.listdir(pdf_folder_path):
    if not file.endswith('.pdf'):
        continue
    file_path = os.path.join(pdf_folder_path, file)
    # Split the PDF into chunks
    texts = process_pdf(file_path)
    # embed_documents works on plain strings; unwrap page_content when the
    # chunks are objects rather than text.
    chunk_texts = [getattr(chunk, 'page_content', chunk) for chunk in texts]
    if not chunk_texts:
        # No text came out of this file, so there is nothing to embed.
        continue
    embeds = embed.embed_documents(chunk_texts)

    # One id per chunk. The upsert helper zips ids with embeddings, so a
    # single id would keep only the first vector of each file.
    ids = [f'{file_path}-{n}' for n in range(len(embeds))]
    upsert_embeddings_to_pinecone(index, embeds, ids)

In [26]:
# Count how many chunks the whole folder yields, without embedding anything.
total = 0
for file in os.listdir(pdf_folder_path):
    if not file.endswith('.pdf'):
        continue
    file_path = os.path.join(pdf_folder_path, file)
    texts = process_pdf(file_path)
    total = total + len(texts)

print(total)

2752


In [None]:
import os
import pandas as pd
from uuid import uuid4
from tqdm.auto import tqdm

def process_pdf_documents(pdf_folder_path: str):
    """Load every PDF in a folder into a DataFrame, one row per loaded document.

    Columns:
        title: file name without its extension.
        context: the object returned by the loader for that document.
        file_name: the PDF's file name.
    """
    pdf_names = [name for name in os.listdir(pdf_folder_path) if name.endswith('.pdf')]
    records = [
        {
            'title': os.path.splitext(name)[0],
            'context': loaded_doc,
            'file_name': name,
        }
        for name in pdf_names
        for loaded_doc in PyPDFLoader(os.path.join(pdf_folder_path, name)).load()
    ]
    return pd.DataFrame(records)

def split_and_embed_documents(dataframe, embed, index):
    """Chunk every document in `dataframe`, embed the chunks and upsert them.

    Args:
        dataframe: Frame with 'title', 'context' and 'file_name' columns,
            where 'context' holds the loaded document to split.
        embed: Object exposing ``embed_documents(list_of_str)``.
        index: Object exposing ``upsert(vectors=...)``.

    Returns:
        None. Each stored vector is an (id, embedding, metadata) tuple whose
        metadata carries the chunk's own text plus its source title and
        file name.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
        separators=["\n\n", "\n", " ", ""]
    )

    # Build text and metadata together, one entry per chunk, so the two lists
    # stay aligned no matter how many chunks a document produces.
    chunk_texts = []
    chunk_metadatas = []
    for _, record in dataframe.iterrows():
        for chunk in text_splitter.split_documents([record['context']]):
            text = chunk.page_content
            if not text:
                continue
            chunk_texts.append(text)
            chunk_metadatas.append({
                'title': record['title'],
                'text': text,
                'file_name': record['file_name']
            })

    # Batch over chunks (not source rows) so each embed/upsert call has a
    # bounded size.
    batch_size = 100
    for start in tqdm(range(0, len(chunk_texts), batch_size)):
        batch_texts = chunk_texts[start:start + batch_size]
        batch_metadatas = chunk_metadatas[start:start + batch_size]

        embeds = embed.embed_documents(batch_texts)

        # Ids are sent as hex strings rather than raw UUID objects.
        ids = [uuid4().hex for _ in embeds]

        index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))

# Example run: load the PDFs into a DataFrame ...
pdf_folder_path = './dataset/'
data = process_pdf_documents(pdf_folder_path)

# ... then chunk, embed and upsert them. `embed` is the embedding client and
# `index` the Pinecone index, both created in earlier cells.
split_and_embed_documents(dataframe=data, embed=embed, index=index)


In [53]:
# Display the index statistics again (last expression of the cell).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.1,
 'namespaces': {},
 'total_vector_count': 0}

In [57]:
from langchain.document_loaders import WebBaseLoader

# NOTE(review): both URLs are empty strings here, presumably removed before
# sharing; supply real URLs before re-running this cell.
loader = WebBaseLoader(["", ""])
docs = loader.load()
# Last expression of the cell: the number of loaded documents is displayed.
len(docs)

2