In [2]:
# All the import statements
import os
import re
import unicodedata

import cohere
import numpy as np
import pinecone as pc
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain_community.embeddings import CohereEmbeddings
from langchain_community.vectorstores import Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI
from pinecone import Pinecone as PineconeClient

  from tqdm.autonotebook import tqdm


In [3]:
# Set up the external service clients; all credentials come from the environment
load_dotenv()  # python-dotenv: read a local .env file into the process environment

pinecone = PineconeClient(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENVIRONMENT'),
)
co = cohere.Client(os.environ.get("COHERE_API_KEY"))
# Handle to the Pinecone index named by PINECONE_INDEX_NAME
index = pinecone.Index(os.environ.get('PINECONE_INDEX_NAME'))

In [9]:
# Extracting texts from pdfs

# Reading pdfs
def read_pdf(file_path):
    """Load the PDF at ``file_path`` and return the result of ``PyPDFLoader.load_and_split``."""
    return PyPDFLoader(file_path).load_and_split()


# Process pdfs
def process_documents(documents):
    """Clean every loaded document chunk and merge the chunks into one string.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string
            (for example the list returned by ``read_pdf``).

    Returns:
        The cleaned chunk texts joined by single spaces. Chunks that are
        empty after cleaning are skipped; an empty input yields ``''``.
    """
    cleaned_chunks = (clean_text(doc.page_content) for doc in documents)
    # Join with a space: clean_text strips each chunk, so plain concatenation
    # would fuse the last word of one chunk with the first word of the next.
    return ' '.join(chunk for chunk in cleaned_chunks if chunk)

# Preprocess the text
def clean_text(text):
    """Normalise raw text into lowercase ASCII letters/digits separated by single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string: private-use characters and everything that is not
        an ASCII letter, digit or whitespace removed, whitespace runs
        (including newlines) collapsed to one space, leading/trailing
        whitespace stripped, and the result lower-cased.
    """
    # Remove unknown characters (Unicode private-use code points, category 'Co')
    text = ''.join(c for c in text if unicodedata.category(c) != 'Co')
    # Remove non-alphanumeric characters (whitespace is kept as a word separator)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse whitespace *after* stripping punctuation: removing a symbol that
    # stood between two spaces (e.g. "a - b") would otherwise leave a double
    # space, and a leading/trailing symbol would leave a stray edge space.
    # '\s+' also covers newlines, so no separate newline replacement is needed.
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    return text.lower()
    


# Load and process all PDF files in the directory
pdf_dir_path = "pdfs/"

all_texts = []
# Sort the listing: os.listdir returns entries in arbitrary order, and the
# position of each text in all_texts is what the upload step uses as its id,
# so a stable order keeps ids reproducible across runs.
for filename in sorted(os.listdir(pdf_dir_path)):
    # Compare the extension case-insensitively so e.g. "REPORT.PDF" is not skipped
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(pdf_dir_path, filename)
        documents = read_pdf(file_path)
        texts = process_documents(documents)
        all_texts.append(texts)

In [10]:
# Create embeddings
def embed(text, batch_size=96):
    """Embed documents with Cohere's ``embed-english-v3.0`` model.

    Args:
        text: A sequence of strings to embed. A single string is treated as
            a one-element list.
        batch_size: Maximum number of texts sent per API request. Defaults
            to 96, the per-call limit documented for the Cohere embed endpoint.

    Returns:
        A list holding the embeddings returned by the API, concatenated in
        input order. Empty input returns ``[]`` without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # The API expects a list of strings; wrap a bare string instead of
    # letting it be sent (or iterated) as-is.
    texts = [text] if isinstance(text, str) else list(text)

    embeds = []
    # Send the texts in slices so large corpora stay under the per-request limit
    for start in range(0, len(texts), batch_size):
        response = co.embed(
            texts=texts[start:start + batch_size],
            model='embed-english-v3.0',
            input_type='search_document',
            truncate='END'
        )
        embeds.extend(response.embeddings)
    return embeds

# Embed the processed PDF texts collected in all_texts
embeds = embed(all_texts)

In [11]:
# Check the shape of the embeddings; the printed tuple is used to pick the
# dimension when creating the Pinecone index.
# (numpy is imported as np in the notebook's import cell.)
shape = np.array(embeds).shape
print(shape)

(24, 1024)


In [12]:
# Upsert the embedded documents into the Pinecone index, batch by batch.
# Relies on `index`, `embeds`, `shape` and `all_texts` defined in earlier cells.

batch_size = 128

# string ids "0", "1", ... - one per embedding row
ids = list(map(str, range(shape[0])))
# metadata dictionaries carrying each document's text
meta = [dict(text=text) for text in all_texts]

# (id, vector, metadata) tuples to be upserted
to_upsert = [record for record in zip(ids, embeds, meta)]

for start in range(0, shape[0], batch_size):
    # slicing clamps at the end of the list, so the final batch may be shorter
    index.upsert(vectors=to_upsert[start:start + batch_size])

# view the index statistics after the upload
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 1e-05,
 'namespaces': {'': {'vector_count': 1}},
 'total_vector_count': 1}


In [5]:
# Wrap the existing Pinecone index in a LangChain vector store and expose it as a retriever
embeddings = CohereEmbeddings(model="embed-english-v3.0")
vectorstore = Pinecone.from_existing_index(
    embedding=embeddings,
    index_name=os.getenv('PINECONE_INDEX_NAME'),
)
retriever = vectorstore.as_retriever()

In [6]:
# RAG prompt
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# RAG
model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")


def format_docs(docs):
    """Return the ``page_content`` of the retrieved documents joined by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever yields a list of Document objects. Piping it through
# format_docs puts plain text into {context}; without it the prompt would
# contain the Python repr of the list (class names, metadata, escaped quotes).
chain = (
    RunnableParallel({"context": retriever | format_docs, "question": RunnablePassthrough()})
    | prompt
    | model
    | StrOutputParser()
)

In [16]:
# Ask the RAG chain a multiple-choice question; as the last expression of the
# cell, the chain's answer is displayed as the cell output.
chain.invoke("""Are the hidden states of the Hidden Markov Model discrete or continuous?

  Continuous 
  Could be either discrete or continuous 
  Discrete""")

'Discrete'