In [None]:
from pinecone import Pinecone, ServerlessSpec
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from langchain_groq import ChatGroq



import time

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
groq_key = os.environ["GROQ_KEY"]

llm = ChatGroq(
    api_key=groq_key,
    name="meta-llama/Llama-3.1-8B"
)

In [None]:
# read the document
def read_doc(directory):
    file_loader = PyPDFLoader(directory)
    documents = file_loader.load()
    return documents 

In [None]:
doc = read_doc('documents/linear_algebra_book.pdf')
len(doc)
print(doc[20])

In [None]:
# divide docs into chunks
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
        )
    chunked_docs = text_splitter.split_documents(docs)
    return chunked_docs

In [None]:
chunked_documents = chunk_data(doc)
len(chunked_documents)

In [None]:
# embedding by openai
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embeddings

In [None]:
vectors = embeddings.embed_query("heyy there")
len(vectors)

In [None]:
# vector search db in pinecone
pinecone = Pinecone(
    api_key=os.environ["PINECONE_KEY"],
)

index_name = "langchainvector"

existing_indexes = [index_info["name"] for index_info in pinecone.list_indexes()]

if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pinecone.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pinecone.Index(index_name)

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

ids = vector_store.add_documents(
    documents=chunked_documents
)

In [None]:
print(vector_store._index.describe_index_stats())  # Confirm documents are stored

In [None]:
def retreive_query(query, k=2):
    results = vector_store.similarity_search(
        query=query,
        k=3,
    )

    return results

In [None]:
results = retreive_query("What's algebra'?", k=3)
results

In [None]:
retreiver = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k":2},
)

system_template = """<|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>
    You are an encouraging maths tutor for STEM students. Follow these rules:
    1. Explain concepts in the user INPUT below, using simple terms with real-world examples
    2. Break problems into step-by-step solutions
    3. Use LaTeX formatting for equations: $x^2 + 3x = 0$
    4. Ask guiding questions instead of giving direct answers
    5. Explain concepts using examples from CONTEXT below
    6. Cite sources when using specific formulas/theorems
    7. Combine CONTEXT with your general math knowledge

    CONTEXT: {context}
    INPUT: {input}<|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>
"""

prompt = ChatPromptTemplate.from_template(system_template)

In [None]:
def format_docs(docs):
    return"\n\n".join([
            doc.page_content
            if hasattr(doc, "page_content")
            else str(doc.get("page_content", "")) 
            for doc in docs 
    ])

In [None]:
from langchain.schema import HumanMessage

def convert_to_chat_message(state):
    input_text = state.get("input", "")
    state["input"] = [HumanMessage(content=input_text)]
    return state

In [73]:
chain = (
    {
        "context": RunnablePassthrough() | retreiver,
        "input": RunnablePassthrough()
    }
    | prompt
    | llm
    
)



In [76]:
result = chain.invoke("what does an euclidian space mean?")
print(result)

content="An Euclidean space is a mathematical concept that generalizes the properties of our three-dimensional world. It's named after the ancient Greek mathematician Euclid, who laid the foundations of geometry. \n\nIn simpler terms, an Euclidean space is a set of points, along with a system of measurements for distances and angles between these points. The most common example is the three-dimensional space we live in, where the measurements are given by the familiar coordinates $(x, y, z)$.\n\nThis concept is not limited to three dimensions. For instance, the Euclidean plane (a two-dimensional space) is an example of an Euclidean space, as well as the one-dimensional space of real numbers.\n\nIn the context of your document, the Euclidean space is likely referring to the real number space $\\mathbb{R}^n$, where $n$ is the number of dimensions. This is often used in linear algebra and other mathematics fields.\n\nDo you have any specific properties or aspects of Euclidean spaces you'd

In [77]:
result = chain.invoke("can u explain eugn values and vectors")
print(result)

content='Sure, I\'d be happy to explain!\n\nIn the context of linear algebra, eigenvalues and eigenvectors are special values and vectors associated with a linear transformation (represented by a matrix). \n\nLet\'s break this down:\n\n1. **Vector**: A vector is a mathematical object that has both magnitude (size) and direction. You can visualize a vector as an arrow in space.\n\n2. **Linear Transformation**: This is a function that maps vectors to other vectors. In other words, it changes the magnitude and/or direction of a vector. \n\n3. **Eigenvector and Eigenvalue**: When a linear transformation is applied to an eigenvector, the resulting vector is just a scaled version of the original eigenvector. The amount by which the eigenvector is scaled is the eigenvalue. \n\nFor example, if we have a matrix A and an eigenvector v, applying A to v gives us Av = λv, where λ is the eigenvalue.\n\nIn the context of your documents, the Singular Value Decomposition (SVD) is being discussed. The S

In [None]:
test_docs = retreiver.get_relevant_documents("test query")
test_docs

In [None]:
output=llm.invoke([HumanMessage(content="Hello!")])
print(output)