In [21]:
import os
from dotenv import load_dotenv

# Pull variables from a .env file into the process environment before reading them.
load_dotenv()

# os.getenv returns None when the variable is unset; the key is consumed later
# when the Pinecone client is constructed.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Name of the chat model handed to Ollama; the OpenAI model name is kept as a
# commented-out alternative.
# MODEL = "gpt-3.5-turbo"
MODEL = "llama3.1"


In [22]:
from langchain_community.llms import Ollama # type: ignore
#from langchain_openai.chat_models import ChatOpenAI
from langchain_community.embeddings import OllamaEmbeddings
#from langchain_openai.embeddings import OpenAIEmbeddings

# Chat model (selected by MODEL) and a separate embedding model; `embeddings`
# is what the vector-store cells pass as their `embedding=` argument.
model = Ollama(model=MODEL)
embeddings = OllamaEmbeddings(model='mxbai-embed-large')

# Disabled smoke test for the bare model:
#model.invoke("Who was Napoleon")

In [3]:
from langchain_core.output_parsers import StrOutputParser # type: ignore

# Output parser appended to every chain in this notebook.
parser = StrOutputParser()

# Minimal chain: model output piped straight into the parser.
chain = model | parser 
# Disabled smoke test for the minimal chain:
#chain.invoke("Who was Napoleon?")

In [4]:
from langchain.prompts import PromptTemplate # type: ignore

# Prompt text with two placeholders, {context} and {question}; it instructs the
# model to reply "I don't know" if it can't answer.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
# Render once with placeholder values to inspect the final prompt string.
prompt.format(context="Here is some context", question="Here is a question")

'\nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Here is some context\n\nQuestion: Here is a question\n'

In [24]:
# Prompt -> model -> parser pipeline, exercised with a hand-written context
# so the answer can be checked by eye.
chain = prompt | model | parser

sample_inputs = {
    "context": "My parents named me Umar",
    "question": "What's my name'?",
}
chain.invoke(sample_inputs)

'Your name is Umar.'

In [None]:
import json
from pathlib import Path
from pprint import pprint
from langchain_community.document_loaders import JSONLoader


# JSON dataset file, resolved relative to the notebook's working directory
# (presumably the SQuAD v2.0 dev split, judging by the file name — confirm).
file_path='dev-v2.0.json'
# Decode explicitly as UTF-8: JSON text is UTF-8 by specification, and relying
# on the platform default encoding can raise UnicodeDecodeError (e.g. on Windows).
data = json.loads(Path(file_path).read_text(encoding="utf-8"))
# Show only the top two nesting levels; printing the entire dataset floods the
# notebook output and bloats the saved file.
pprint(data, depth=2)


In [None]:
from langchain.schema import Document

def create_documents(squad_data):
    """Wrap every paragraph context found in ``squad_data`` in a ``Document``.

    Args:
        squad_data: Mapping with a ``'data'`` list; each entry holds a
            ``'paragraphs'`` list whose items carry a ``'context'`` value.

    Returns:
        A list with one ``Document`` per paragraph, in encounter order, whose
        ``page_content`` is that paragraph's ``'context'``.
    """
    return [
        Document(page_content=paragraph['context'])
        for entry in squad_data['data']
        for paragraph in entry['paragraphs']
    ]

# One Document per paragraph context from the parsed JSON held in `data`.
documents = create_documents(data)

In [None]:
# Dump each document's text into a single UTF-8 text file
def save_documents_to_txt(documents, file_name):
    """Write the ``page_content`` of every document to ``file_name``.

    Args:
        documents: Iterable of objects exposing a string ``page_content``.
        file_name: Destination path; the file is created or overwritten.

    Each entry is followed by a blank line (``"\\n\\n"``), including the last.
    """
    with open(file_name, 'w', encoding='utf-8') as out_file:
        out_file.writelines(doc.page_content + "\n\n" for doc in documents)

# Persist the extracted contexts to squad.txt so a later cell can reload them as plain text
save_documents_to_txt(documents, "squad.txt")


In [114]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
# squad.txt is written with encoding='utf-8' by save_documents_to_txt, so read it
# back with the same encoding instead of the platform default (which can fail
# or mis-decode non-ASCII text, e.g. on Windows).
documents = TextLoader("squad.txt", encoding="utf-8").load()
# NOTE(review): chunk_size=10000 characters is large for an embedding model's
# input window — confirm the embedder does not truncate these chunks.
text_splitter = CharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
docs = text_splitter.split_documents(documents)



In [115]:
from langchain_community.vectorstores import DocArrayInMemorySearch # type: ignore

# Build an in-memory vector store from the chunks in `docs`, embedding them
# with whatever `embeddings` is currently bound to.
vectorstore = DocArrayInMemorySearch.from_documents(docs, embedding=embeddings)

In [None]:
# Expose the vector store as a retriever and sanity-check it with a one-word query.
retriever = vectorstore.as_retriever()
retriever.invoke("Normans")

In [119]:
from operator import itemgetter

# The chain is invoked with {"question": ...}. That single value is used twice:
# fed through the retriever to produce "context", and passed through unchanged
# as "question" — the two variables the prompt template expects.
retrieval_inputs = {
    "context": itemgetter("question") | retriever,
    "question": itemgetter("question"),
}

chain = retrieval_inputs | prompt | model | parser

In [None]:
# Ask the retrieval chain a question and print its answer.
question = """ Who are Normans?
"""
answer = chain.invoke({'question': question})
print(f"Answer: {answer}")

## Test

In [33]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF (path is relative to the working directory) and split it into
# Document objects; the cell output shows each one tagged with source/page metadata.
loader = PyPDFLoader("data/privacy policy.pdf")
pages = loader.load_and_split()
# Bare expression: displays every loaded Document as the cell output.
pages

[Document(metadata={'source': 'data/privacy policy.pdf', 'page': 0}, page_content='GOOGLE PRIVACY POLICY\nWhen you use our services, you’re trusting us\nwith your information. We understand this is a big\nresponsibility and work hard to protect your\ninformation and put you in control.\nThis Priv acy P olicy is meant t o help y ou understand what information we collect, why we\ncollect it, and how y ou can update, manage, expor t, and delete y our information.\nPrivacy Checkup\nLooking t o change y our priv acy settings?\nTake the Priv acy Checkup\nEffectiv e September 16, 2024 | Archived versions9/18/24, 10:49 PM Privacy Policy – Privacy & Terms – Google\nhttps://policies.google.com/privacy?hl=en-US 1/51'),
 Document(metadata={'source': 'data/privacy policy.pdf', 'page': 1}, page_content='We build a r ange of ser vices that help millions of people daily t o explor e and inter act with\nthe world in new wa ys. Our ser vices include:\nGoogle apps, sites, and de vices, lik e Sear ch, Y o

In [34]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Re-chunk the PDF pages into pieces of at most 500 characters with a 20-character overlap.
# Note: this rebinds `text_splitter` and `docs` from the earlier text-file cells.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
docs = text_splitter.split_documents(pages)

In [35]:
from langchain_community.vectorstores import DocArrayInMemorySearch # type: ignore

# Rebuild the in-memory vector store, this time from the PDF chunks in `docs`.
vectorstore = DocArrayInMemorySearch.from_documents(docs, embedding=embeddings)

In [36]:
# Retriever over the PDF vector store, checked with a one-word query.
retriever = vectorstore.as_retriever()
retriever.invoke("information")

[Document(metadata={'source': 'data/privacy policy.pdf', 'page': 2}, page_content='INFORMATION GOOGLE COLLECTS\nWe want you to understand the types of\ninformation we collect as you use our services\nWe collect information t o provide better ser vices t o all our users — fr om \x00guring out\nbasic stuff lik e which language y ou speak, t o mor e complex things lik e which ads y ou’ll\n\x00nd most useful , the people who matter most t o you online , or which Y ouTube videos y ou\nmight lik e. The information Google collects, and how that information is used, depends'),
 Document(metadata={'source': 'data/privacy policy.pdf', 'page': 3}, page_content='you might choose t o provide us with information — lik e an email addr ess t o\ncommunicate with Google or r eceiv e updates about our ser vices.\nWe also collect the content y ou cr eate, upload, or r eceiv e from others when using our\nservices. This includes things lik e email y ou write and r eceiv e, phot os and videos y ou\nsave, doc

In [37]:
from operator import itemgetter

# Rebuild the retrieval chain so it binds the current `retriever`: the incoming
# "question" value is both routed through the retriever (as "context") and
# forwarded untouched (as "question") before reaching the prompt.
retrieval_inputs = {
    "context": itemgetter("question") | retriever,
    "question": itemgetter("question"),
}

chain = retrieval_inputs | prompt | model | parser

In [39]:
# Query the retrieval chain about the PDF and print its answer.
question = """ What information does google collects from user?
"""
answer = chain.invoke({'question': question})
print(f"Answer: {answer}")

Answer: According to the documents, Google collects various types of information from users, including:

* Unique identifiers
* Browser type and settings
* Device type and settings
* Operating system
* Mobile network information (including carrier name and phone number)
* Application version number
* Language spoken
* Information for showing relevant ads (e.g., which ads you'll find most useful)
* YouTube video preferences
* Activity data (e.g., what websites you visit, what searches you perform)

Note that Google also collects information stored with the user's Google Account when they are signed in.


# Pinecone testing

In [14]:

from pinecone import Pinecone, ServerlessSpec

# Pinecone client; PINECONE_API_KEY comes from os.getenv and is None if the variable is unset.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [17]:

index_name = "quickstart2"

# create_index fails when an index with this name already exists, so guard the
# call to keep the cell re-runnable (Restart & Run All).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1024, # Replace with your model dimensions
        metric="cosine", # Replace with your model metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
# Sample records to index: each has an "id" and a "text" field.
# NOTE(review): this rebinds `data` (earlier the parsed JSON dataset) and, below,
# `embeddings` (earlier the OllamaEmbeddings object passed to the vector stores).
# Re-running the earlier cells after this one will pick up these values instead.
data = [
    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
    {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
    {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
    {"id": "vec6", "text": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership."}
]

# Embed every record's text in one request, using the "passage" input type.
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[d['text'] for d in data],
    parameters={"input_type": "passage", "truncate": "END"}
)
# Print the first embedding as a quick sanity check.
print(embeddings[0])

In [None]:
import time  # this cell calls time.sleep, but the notebook never imported `time`

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

# Pair each source record with its embedding (same order as the embed inputs)
# and keep the original text as metadata so query results are readable.
vectors = [
    {
        "id": d['id'],
        "values": e['values'],
        "metadata": {'text': d['text']}
    }
    for d, e in zip(data, embeddings)
]

index.upsert(
    vectors=vectors,
    namespace="ns1"
)

In [None]:
# Print the index statistics to check the result of the upsert.
print(index.describe_index_stats())

In [191]:
# Embed the search query with the same model used for the records, but with
# input_type "query" instead of "passage".
query = "Tell me about the tech company known as Apple."

embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

In [None]:
# Query namespace "ns1" for the 3 nearest records to the query embedding,
# requesting metadata (which holds the original text) but not the vector values.
results = index.query(
    namespace="ns1",
    vector=embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)