In [96]:
import os
from dotenv import load_dotenv

# Read settings from a local .env file (python-dotenv) before anything looks
# them up through os.getenv.
load_dotenv()

# Hosted OpenAI alternative, currently disabled in favour of a local model:
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# MODEL = "gpt-3.5-turbo"
# Name of the model handed to Ollama when the LLM is constructed.
MODEL = "gemma:2b"


In [97]:
from langchain_community.llms import Ollama
#from langchain_openai.chat_models import ChatOpenAI
from langchain_community.embeddings import OllamaEmbeddings
#from langchain_openai.embeddings import OpenAIEmbeddings

# LLM served by Ollama, selected by the MODEL constant.
model = Ollama(model=MODEL)
# Embedding model passed to the in-memory vector stores built later on.
embeddings = OllamaEmbeddings(model='mxbai-embed-large')

# Smoke test: call the bare model, with no prompt template or output parser.
model.invoke("Who was Napolean")

'Napolean was a powerful military genius and one of the greatest commanders in history. He was born in Corsica in 1769 and died in 1821. He led the French army to many victories, including the conquest of Italy and the capture of the Turkish city of Constantinople. He was also a brilliant administrator and statesman, and he was a key figure in the rise of France to global dominance in the early 19th century.'

In [80]:
from langchain_core.output_parsers import StrOutputParser # type: ignore

parser = StrOutputParser()

# Pipe the model into the string output parser and run the two-step chain.
chain = model | parser 
chain.invoke("Who was Napoleon?")

'Napoleon Bonaparte was a Corsican general and statesman who rose to prominence in the French Revolution and Napoleonic Wars. Born in Corsica in 1769, he fought in several conflicts before leading the French forces to a series of major victories against the British Empire, culminating in his rise to power in the French Empire. He was a brilliant strategist, a skilled military leader, and a charismatic public figure who captured the imagination of the people of France and Europe. Napoleon was eventually defeated and killed by Russian forces at the age of 32 in 1821.'

In [43]:
from langchain.prompts import PromptTemplate # type: ignore

# Prompt with two placeholders, {context} and {question}. The model is told to
# answer only from the supplied context and to reply "I don't know" otherwise.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
# Render the template once with dummy values to check the substitution.
prompt.format(context="Here is some context", question="Here is a question")

'\nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Here is some context\n\nQuestion: Here is a question\n'

In [None]:
# Full pipeline: fill the prompt from the input dict, send it to the model,
# then parse the model output.
chain = prompt | model | parser

# The input dict must provide both template variables: "context" and "question".
chain.invoke({"context": "My parents named me Santiago", "question": "What's your name'?"})

In [None]:
import json
from pathlib import Path
from pprint import pprint
from langchain_community.document_loaders import JSONLoader


file_path='dev-v2.0.json'
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which differs between operating systems.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))
# Show only the top-level structure; pretty-printing the entire dataset would
# flood the notebook output. Nested entries are abbreviated as {...}.
pprint(data, depth=2)


In [None]:
from langchain.schema import Document

def create_documents(squad_data):
    """Build one Document per paragraph found in ``squad_data``.

    Args:
        squad_data: Mapping with a ``'data'`` list; each entry holds a
            ``'paragraphs'`` list whose items carry a ``'context'`` string.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the paragraph
        context, in the order the paragraphs appear. No other paragraph fields
        are read.
    """
    return [
        Document(page_content=paragraph['context'])
        for entry in squad_data['data']
        for paragraph in entry['paragraphs']
    ]

documents = create_documents(data)

In [None]:
# Dump the text of every document into one plain-text file
def save_documents_to_txt(documents, file_name):
    """Write each document's ``page_content`` to ``file_name`` as UTF-8.

    Every document's text is followed by a blank line ("\\n\\n"). An existing
    file at ``file_name`` is overwritten.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        file_name: Path of the text file to create.
    """
    with open(file_name, 'w', encoding='utf-8') as out:
        out.writelines(doc.page_content + "\n\n" for doc in documents)

# Persist the paragraph contexts to squad.txt so a later cell can reload them
# through TextLoader.
save_documents_to_txt(documents, "squad.txt")


In [114]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
# squad.txt is written as UTF-8 by save_documents_to_txt, so read it back with
# the same encoding rather than the platform default (which can fail or
# mis-decode non-ASCII text on some systems).
documents = TextLoader("squad.txt", encoding="utf-8").load()
# NOTE(review): 10,000-character chunks are large for retrieval. The
# "Who are Normans?" run further down answered that the context held no such
# information, which may be related. Consider smaller chunks. TODO confirm.
text_splitter = CharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
docs = text_splitter.split_documents(documents)



In [115]:
from langchain_community.vectorstores import DocArrayInMemorySearch # type: ignore

# Build an in-memory vector store from the split SQuAD chunks, using the
# Ollama embedding model for the vectors.
vectorstore = DocArrayInMemorySearch.from_documents(docs, embedding=embeddings)

In [116]:
# Expose the store through the retriever interface and sanity-check it with a
# one-word query.
retriever = vectorstore.as_retriever()
retriever.invoke("Normans")

[Document(metadata={'source': 'squad.txt'}, page_content='The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.\n\nThe Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian

In [119]:
from operator import itemgetter

# Retrieval-augmented chain. The input is a dict with a "question" key.
chain = (
    {
        # The question text is sent to the retriever; its result fills {context}.
        "context": itemgetter("question") | retriever,
        # The question itself is passed through unchanged to fill {question}.
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

In [120]:
# Ask a question that the SQuAD contexts cover (see the retriever check above).
question = """ Who are Normans?
"""
print(f"Answer: {chain.invoke({'question': question})}")

Answer: The context does not provide any information about Normans, so I cannot answer this question from the provided context.


## Test: question answering over a PDF

In [174]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF and split it into Document chunks; each chunk carries
# 'source' and 'page' metadata (see the output below).
loader = PyPDFLoader("privacy policy.pdf")
pages = loader.load_and_split()
pages

[Document(metadata={'source': 'privacy policy.pdf', 'page': 0}, page_content='GOOGLE PRIVACY POLICY\nWhen you use our services, you’re trusting us\nwith your information. We understand this is a big\nresponsibility and work hard to protect your\ninformation and put you in control.\nThis Priv acy P olicy is meant t o help y ou understand what information we collect, why we\ncollect it, and how y ou can update, manage, expor t, and delete y our information.\nPrivacy Checkup\nLooking t o change y our priv acy settings?\nTake the Priv acy Checkup\nEffectiv e September 16, 2024 | Archived versions9/18/24, 10:49 PM Privacy Policy – Privacy & Terms – Google\nhttps://policies.google.com/privacy?hl=en-US 1/51'),
 Document(metadata={'source': 'privacy policy.pdf', 'page': 1}, page_content='We build a r ange of ser vices that help millions of people daily t o explor e and inter act with\nthe world in new wa ys. Our ser vices include:\nGoogle apps, sites, and de vices, lik e Sear ch, Y ouTube, and

In [175]:
from langchain_community.vectorstores import DocArrayInMemorySearch # type: ignore

# Rebind `vectorstore` to a new in-memory store built from the PDF chunks;
# the SQuAD-backed store is no longer reachable through this name.
vectorstore = DocArrayInMemorySearch.from_documents(pages, embedding=embeddings)

In [176]:
# Rebind `retriever` to the PDF-backed store and sanity-check it.
retriever = vectorstore.as_retriever()
retriever.invoke("information")

[Document(metadata={'source': 'privacy policy.pdf', 'page': 2}, page_content='INFORMATION GOOGLE COLLECTS\nWe want you to understand the types of\ninformation we collect as you use our services\nWe collect information t o provide better ser vices t o all our users — fr om \x00guring out\nbasic stuff lik e which language y ou speak, t o mor e complex things lik e which ads y ou’ll\n\x00nd most useful , the people who matter most t o you online , or which Y ouTube videos y ou\nmight lik e. The information Google collects, and how that information is used, depends\non how y ou use our ser vices and how y ou manage y our priv acy contr ols.\nWhen y ou’re not signed in t o a Google Account, we st ore the information we collect with\nunique identi\x00ers  tied t o the br owser , application, or device you’re using. This allows us\nto do things lik e maintain y our pr eferences acr oss br owsing sessions, such as y our\npreferred language or whether t o show y ou mor e relevant sear ch result

In [178]:
from operator import itemgetter

# Rebuild the chain so it uses the PDF-backed retriever: a chain built before
# `retriever` was rebound still holds the earlier retriever object.
chain = (
    {
        # The question text is sent to the retriever; its result fills {context}.
        "context": itemgetter("question") | retriever,
        # The question itself is passed through unchanged to fill {question}.
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

In [180]:
# Ask a question that the privacy-policy PDF should be able to answer.
question = """ What information does google collects?
"""
print(f"Answer: {chain.invoke({'question': question})}")

Answer: Sure, here's a summary of the information that Google collects from users:

- Personal information such as name, email address, and password.
- Location information such as IP address, device type, and carrier name.
- Activity information such as search queries, videos watched, and communications made.
- Purchase history.
- People with whom users communicate or share content.
- Call and message log information.


# Pinecone testing

In [186]:
import os

from pinecone import Pinecone, ServerlessSpec

# Credentials must not live in the notebook. Read the key from the environment
# (a .env file works, since load_dotenv() runs in the first cell).
# The key that used to be hardcoded here should be treated as leaked and rotated.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Fail fast with a clear message instead of an opaque auth error later.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

pc = Pinecone(api_key=PINECONE_API_KEY)

In [187]:

index_name = "quickstart"

# Only create the index when it does not exist yet, so the cell can be re-run
# (e.g. after a kernel restart) without failing on an already-existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1024,  # must match the size of the vectors upserted into this index
        metric="cosine",  # similarity metric used when querying
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [188]:
# Sample passages mixing the two senses of "Apple" (fruit vs. company).
# NOTE(review): this rebinds `data` and `embeddings`, which earlier cells use
# for the SQuAD JSON and the Ollama embedding model. Re-running those earlier
# cells after this one will pick up these values instead.
data = [
    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
    {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
    {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
    {"id": "vec6", "text": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership."}
]

# Embed every passage text through Pinecone's inference API.
# "input_type": "passage" marks these as documents to be indexed; the query
# cell uses "query" instead. "truncate": "END" presumably trims over-long
# inputs at the end -- TODO confirm against the Pinecone docs.
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[d['text'] for d in data],
    parameters={"input_type": "passage", "truncate": "END"}
)
# Inspect the first embedding; it exposes its vector under 'values'.
print(embeddings[0])

{'values': [0.04913330078125, -0.0131988525390625, ..., -0.0196990966796875, -0.011016845703125]}


In [189]:
# `time` is not imported anywhere else in the notebook; without this import the
# polling loop raises NameError whenever the index is not ready yet.
import time

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

# One record per passage: its id, its embedding vector, and the original text
# stored as metadata so query results can show it.
vectors = [
    {
        "id": d['id'],
        "values": e['values'],
        "metadata": {'text': d['text']}
    }
    for d, e in zip(data, embeddings)
]

# Last expression of the cell, so the upsert response is displayed.
index.upsert(
    vectors=vectors,
    namespace="ns1"
)

{'upserted_count': 6}

In [190]:
# Confirm the upsert landed: the stats report the vector count per namespace.
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 6}},
 'total_vector_count': 6}


In [191]:
query = "Tell me about the tech company known as Apple."

# Embed the query with the same model as the passages, but with
# input_type "query" rather than "passage".
embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

In [192]:
# Search the "ns1" namespace for the 3 nearest vectors to the query embedding.
# Vector values are omitted from the response; metadata (the original text)
# is included so the matches are readable.
results = index.query(
    namespace="ns1",
    vector=embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [{'id': 'vec2',
              'metadata': {'text': 'The tech company Apple is known for its '
                                   'innovative products like the iPhone.'},
              'score': 0.861149192,
              'values': []},
             {'id': 'vec6',
              'metadata': {'text': 'Apple Computer Company was founded on '
                                   'April 1, 1976, by Steve Jobs, Steve '
                                   'Wozniak, and Ronald Wayne as a '
                                   'partnership.'},
              'score': 0.846173108,
              'values': []},
             {'id': 'vec4',
              'metadata': {'text': 'Apple Inc. has revolutionized the tech '
                                   'industry with its sleek designs and '
                                   'user-friendly interfaces.'},
              'score': 0.841096759,
              'values': []}],
 'namespace': 'ns1',
 'usage': {'read_units': 6}}
