In [27]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from firebase import initialize_firebase_app
from firebase_admin import credentials, storage
import firebase_admin
from dotenv import load_dotenv
import os


load_dotenv()

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../firebase-service-account.json"
os.environ["FIREBASE_SERVICE_ACCOUNT_FILE"] = "../firebase-service-account.json"
os.environ["FIREBASE_STORAGE_BUCKET_NAME"] = "gs://sample-firebase-ai-app-7d7d8.appspot.com"


In [29]:
cred = credentials.Certificate(os.getenv("FIREBASE_SERVICE_ACCOUNT_FILE"))
firebase_admin.initialize_app(
    cred,
    {"storageBucket": os.getenv("FIREBASE_STORAGE_BUCKET_NAME")},
    "sample-firebase-ai-app-7d7d8"
)

<firebase_admin.App at 0x27b127dd240>

In [3]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

In [10]:
messages = [
    (
        "system",
        "You are a helpful assistant that translates English to Vietnamese. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]

ai_msg = llm.invoke(messages)

In [11]:
ai_msg
# print(ai_msg)

AIMessage(content='Tôi yêu lập trình. \n', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE', 'blocked': False}]}, id='run-3188dd8b-f4c1-462a-90ab-dbe2fec3eb7d-0', usage_metadata={'input_tokens': 21, 'output_tokens': 5, 'total_tokens': 26})

In [12]:
print(ai_msg.content)

Tôi yêu lập trình. 



In [7]:
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf_embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [18]:
text = "The effective date of the contract is October 20, 2024."

sentence_embeddings = hf_embedding.embed_query(text)

In [21]:
print(len(sentence_embeddings))
print(sentence_embeddings)

1024
[0.0025073369033634663, -0.029914017766714096, -0.047713752835989, 0.05490971729159355, -0.007672500796616077, 0.020584454759955406, -0.02137702889740467, -0.013789522461593151, 0.03160515055060387, 0.040291283279657364, 0.03415193408727646, 0.00101476872805506, 0.008601142093539238, 0.0013447335222736, -0.03391849994659424, 0.04106535390019417, -0.016040492802858353, 0.041280556470155716, 0.0008175491821020842, 0.08231118321418762, 0.031972650438547134, 0.0593823567032814, -0.05489709973335266, -0.027793100103735924, -0.03643123805522919, 0.022062398493289948, 0.018496155738830566, -0.0361071340739727, 0.06050730124115944, 0.029101330786943436, -0.04677126556634903, -0.026466336101293564, -0.01449707429856062, -0.032529883086681366, 0.01701241172850132, -0.006003469228744507, 0.04559257999062538, -0.010234677232801914, -0.019754085689783096, -0.05166472867131233, 0.02540542371571064, 0.039858851581811905, 0.021001655608415604, -0.023927051573991776, 0.004645569249987602, -0.04305

# Vector Store with Pinecone and Hugging Face

The below code follows this docs https://python.langchain.com/docs/integrations/vectorstores/pinecone/

In [30]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [46]:
import time

# Before initializing our vector store, let's connect to a Pinecone index.
# If one named index_name doesn't exist, it will be created.
index_name = "langchain-test-index"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024, # The dimension of the embedding model in below cell
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [47]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

In [48]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [61]:
from uuid import uuid4
from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

index.delete(delete_all=True)
vector_store.add_documents(documents=documents, uuids=uuids)

['011cb4d9-961b-4af1-9481-2274e89f51e7',
 '14e3ed81-0929-49d8-a752-5ffe1fd80de1',
 'd80d1c2c-2d68-435f-9c45-c0f05f6bfd05',
 '81c38741-e6c6-45ca-81fe-7a451c0bcc06',
 '6cac8255-af56-49cd-9b01-98350d88ed4c',
 'df2165db-566b-4520-a008-850065ca3b4c',
 '8f1a191d-8055-4f41-889e-990476f83513',
 '3ee2de5c-43e0-40c6-b37a-b912294fbda5',
 'b28e8b21-8cf3-4657-8ff7-d8de2eb844f0',
 'e9a9a343-e3df-4032-95e9-66eda49c8dec']

In [62]:
# Delete items from vector store
vector_store.delete(ids=[uuids[-1]])

In [63]:
# Query vector store directly
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    filter={"source": "tweet"},
)

for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]


In [65]:
# Similarity search with score
results = vector_store.similarity_search_with_score(
    "Will it be hot tomorrow?",
    k=1,
    filter={"source": "news"},
)

for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.672703] The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]


In [69]:
# Query by turning into retriever
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 1, "score_threshold": 0.5},
)

retriever.invoke("Stealing from a bank is a crime", filter={"source": "news"})

[Document(id='81c38741-e6c6-45ca-81fe-7a451c0bcc06', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.')]