In [None]:
# %pwd

'/home/makesh/RAG/AI-Chatbot'

In [None]:
# import os
# os.chdir('/home/makesh/RAG/AI-Chatbot')

In [None]:
# %pwd

'/home/makesh/RAG/AI-Chatbot'

In [69]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [70]:
def pdf_loader(data):
    """Load the PDF files found under the ``data`` directory.

    ``data`` is handed to ``DirectoryLoader`` together with the ``*.pdf``
    glob, and every matching file is read with ``PyPDFLoader``.

    Returns whatever ``DirectoryLoader.load()`` produces for those files.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()


In [71]:
# Load the PDFs from the 'data' directory (path is relative to the working directory).
extract_data = pdf_loader('data')

In [73]:
len(extract_data)

492

In [74]:
# extract_data

In [75]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Strip each document down to its text and its origin.

    For every entry in ``docs`` a fresh ``Document`` is built that carries the
    original ``page_content`` and a metadata dict holding only the ``'source'``
    key (``None`` when the input metadata has no such key). The input list and
    its documents are left untouched.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={'source': original.metadata.get('source')},
        )
        for original in docs
    ]

In [76]:
# Keep only page_content and the 'source' metadata of every loaded page.
minimal_docs = filter_to_minimal_docs(extract_data)
# minimal_docs

In [77]:
# split the documents into smaller chunks

def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks.

    Args:
        minimal_docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The result of ``split_documents`` for ``minimal_docs``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [78]:
# Chunk the minimal documents and report how many chunks were produced.
text_split_docs = text_split(minimal_docs)
print(f"Number of split documents: {len(text_split_docs)}")

Number of split documents: 2872


In [79]:
# text_split_docs

In [80]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Build the HuggingFace embedding wrapper used by this notebook.

    The model is ``sentence-transformers/all-MiniLM-L6-v2`` and it is
    configured with ``device`` set to ``"cpu"``.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    device_settings = {"device": "cpu"}
    return HuggingFaceEmbeddings(
        model_name=model_id,
        model_kwargs=device_settings,
    )
# Module-level embedding model reused by the vector store cells below.
embeddings = download_embeddings()

In [81]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}, multi_process=False, show_progress=False)

In [82]:
# Embed a sample query to inspect the resulting vector.
vector = embeddings.embed_query("Hello world")
vector

[-0.034477245062589645,
 0.031023195013403893,
 0.006735047325491905,
 0.0261090025305748,
 -0.039362046867609024,
 -0.16030250489711761,
 0.06692401319742203,
 -0.006441461853682995,
 -0.047450561076402664,
 0.014758877456188202,
 0.07087534666061401,
 0.05552760511636734,
 0.019193338230252266,
 -0.026251371949911118,
 -0.010109527967870235,
 -0.026940524578094482,
 0.022307442501187325,
 -0.02222667634487152,
 -0.14969265460968018,
 -0.017493028193712234,
 0.007676188834011555,
 0.05435232073068619,
 0.003254490904510021,
 0.03172596916556358,
 -0.08462143689393997,
 -0.02940598502755165,
 0.05159567669034004,
 0.04812401160597801,
 -0.0033147495705634356,
 -0.05827918276190758,
 0.04196929931640625,
 0.02221069484949112,
 0.12818889319896698,
 -0.022338949143886566,
 -0.011656278744339943,
 0.06292832642793655,
 -0.03287630155682564,
 -0.09122612327337265,
 -0.03117535449564457,
 0.05269956216216087,
 0.04703480005264282,
 -0.08420300483703613,
 -0.030056221410632133,
 -0.020744783

In [83]:
print(f"Vector length: {len(vector)}")

Vector length: 384


In [108]:
from dotenv import load_dotenv
import os
load_dotenv()

python-dotenv could not parse statement starting at line 1


True

In [109]:
# Read both API keys from the process environment (load_dotenv() has already run).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# os.environ only accepts str values, so assigning a missing (None) key would
# raise an opaque TypeError. Fail early with a message naming what is missing.
missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GROQ_API_KEY", GROQ_API_KEY),
    )
    if not key_value
]
if missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_keys)
        + ". Define them in the environment or in the .env file."
    )

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [110]:
from pinecone import Pinecone
# Local alias of the key read from the environment.
Pinecone_api_key = PINECONE_API_KEY

# Pinecone client used for index management below.
pc = Pinecone(api_key=Pinecone_api_key)

In [87]:
pc

<pinecone.pinecone.Pinecone at 0x713a9dca23c0>

In [88]:
from pinecone import ServerlessSpec

# Name of the Pinecone index used throughout the notebook.
index_name = "ai-chatbot"

# Create the index only when no index with this name is listed yet, so
# re-running the cell does not try to create it a second time.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # same length as the vector returned by embed_query above
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index named above.
index = pc.Index(index_name)

In [89]:
from langchain_pinecone import PineconeVectorStore

# from_documents() embeds and inserts every chunk each time it runs, so
# re-running this cell stores the same chunks again and retrieval then returns
# duplicate passages. Only ingest when the index holds no vectors; otherwise
# attach to the already-populated index.
index_stats = index.describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_split_docs,
        embedding=embeddings,
        index_name=index_name)
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings)

In [90]:
# Load the existing index.
# Rebinds docsearch to a vector store built from the index named by index_name,
# using the same embedding model.

from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [91]:
# add more data to the existing pinecone index
# docsearch.add_documents(text_split_docs)

In [92]:
# Similarity-search retriever over the vector store. "k" is forwarded as a
# search kwarg (presumably the number of documents returned per query).
retriver = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 3})

In [93]:
# Sanity-check the retriever with a sample question and display what comes back.
retriver_docs = retriver.invoke("What is AI")
retriver_docs

[Document(id='9ec9b71f-3648-40d4-978a-5a38c122a6bd', metadata={'source': 'data/ai-at-the-edge-solving-real-world-problems-with-embedded-machine-learning-9781098120207_compress.pdf'}, page_content='There are two points we are trying to make here: the first is that intelligence is quite hard todefine, and many rather simple problems require a degree of intelligence to solve. The second isthat the programs that implement this intelligence do not necessarily need to be particularlycomplex. Sometimes, a slime mold will do.\nSo, what is AI? In simple terms, it’s an artificial system that makes intelligent decisions based onsome kind of input. And one way to create AI is with machine learning.'),
 Document(id='6ab6e46a-2a90-444e-8d6b-2f91e74efe3d', metadata={'source': 'data/ai-at-the-edge-solving-real-world-problems-with-embedded-machine-learning-9781098120207_compress.pdf'}, page_content='There are two points we are trying to make here: the first is that intelligence is quite hard todefine, 

In [115]:
from langchain_groq import ChatGroq

# Groq-hosted chat model; the API key is read from the environment.
chatModel = ChatGroq(
    model="llama-3.3-70b-versatile",
    groq_api_key=os.environ.get("GROQ_API_KEY")  
)


In [116]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System prompt for the RAG chain. Adjacent string literals are concatenated
# with nothing in between, so every fragment that continues a sentence ends
# with an explicit space.
system_prmpt = (
    "You are an assistant for question answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, just say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system prompt (with its {context} placeholder)
# followed by the human message carrying the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prmpt),
        ("human", "{input}"),
    ]
)

In [118]:
# Chain that combines the chat model with the prompt defined above.
question_answer_chain = create_stuff_documents_chain(
    llm=chatModel,
    prompt=prompt)
# Retrieval chain: wires the retriever in front of the question-answer chain.
rag_chain = create_retrieval_chain(
    retriver,
    question_answer_chain)

In [119]:
# Ask the RAG chain a question; the generated text is under the "answer" key.
response = rag_chain.invoke({"input": "What is edge computing?"})

print(response["answer"]) 

I don't know the exact definition of edge computing from the given context, but it appears to be related to moving compute to the edge, increasing access to technology, and differing from traditional AI. Edge AI development involves exploration, data analysis, and machine learning. The context doesn't provide a direct definition of edge computing.


In [120]:
# Second sample question; overwrites the previous `response`.
response = rag_chain.invoke({"input": "What is microcontroller?"})

print(response["answer"]) 

A microcontroller is a small computer used for single-purpose applications, like controlling machinery. It is simpler than other computers and typically doesn't use an operating system. It has components like a processor, memory, and communication technologies implemented on a single piece of silicon.
