In [1]:
import setup
setup.init_django()

In [2]:
from decouple import config
from blog.models import BlogPost, EMBEDDING_LENGTH
from blog import services

In [3]:
qs=BlogPost.objects.filter(can_delete=True)
qs

<QuerySet [<BlogPost: BlogPost object (5)>, <BlogPost: BlogPost object (6)>, <BlogPost: BlogPost object (7)>, <BlogPost: BlogPost object (8)>]>

!pip install llama-index sqlalchemy llama-index-vector-stores-postgres

In [4]:
#!pip install llama-index sqlalchemy llama-index-vector-stores-postgres

In [5]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

In [6]:
LLM_MODEL = config("LLM_MODEL", default="gpt-4o") # not in use use
EMBEDDING_LENGTH = config("EMBEDDING_LENGTH", default=1536, cast=int)
EMBEDDING_MODEL =config("EMBEDDING_MODEL", default="text-embedding-3-small")
OPENAI_API_KEY = config("OPENAI_API_KEY")
llm = OpenAI(model=LLM_MODEL, api_key=OPENAI_API_KEY)
embed_model = OpenAIEmbedding(model=EMBEDDING_MODEL, api_key=OPENAI_API_KEY)

In [28]:
from typing import List

class MyOpenAIEmbedding(OpenAIEmbedding):
    def _get_query_embedding(self, query: str) -> List[float]:
        """Get query embedding."""
        print('my query', query)
        return super()._get_query_embedding(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        """Get text embedding."""
        print("texts",text)
        return super()._get_text_embedding(text)

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Get text embeddings.

        By default, this is a wrapper around _get_text_embedding.
        Can be overridden for batch queries.

        """
        print("texts",texts)
        return super()._get_text_embeddings(texts)
        
embed_model = MyOpenAIEmbedding(model=EMBEDDING_MODEL, api_key=OPENAI_API_KEY)    

In [29]:
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

In [30]:
vector_db_name="vector_db"
vector_db_table_name="blogpost"

In [31]:
DATABASE_URL = config("DATABASE_URL_POOL")
if DATABASE_URL.startswith("postgres://"):
    DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql://", 1)

In [32]:
DATABASE_URL

'postgresql://neondb_owner:npg_eixHPS8AM7hZ@ep-holy-paper-a66bzvpf-pooler.us-west-2.aws.neon.tech/vector_db?sslmode=require'

In [33]:
# create a new database
from sqlalchemy import create_engine, text

engine = create_engine(DATABASE_URL, isolation_level="AUTOCOMMIT")
with engine.connect() as connection:
    result = connection.execute(text("SELECT 1 FROM pg_database WHERE datname = :db_name"), {"db_name": vector_db_name})
    db_exists = result.scalar() == 1
    if not db_exists:
        session.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
        connection.execute(text(f"CREATE DATABASE {vector_db_name}"))

In [34]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

url = make_url(DATABASE_URL)
vector_store = PGVectorStore.from_params(
    database=vector_db_name,
    host=url.host,
    password=url.password,
    port=url.port or 5432,
    user=url.username,
    table_name=vector_db_table_name,
    embed_dim=EMBEDDING_LENGTH,  
    hnsw_kwargs={
        "hnsw_m": 16,
        "hnsw_ef_construction": 64,
        "hnsw_ef_search": 40,
        "hnsw_dist_method": "vector_cosine_ops",
    },
)

In [35]:
from llama_index.core import VectorStoreIndex, StorageContext

storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context, show_progress=True
)
query_engine = index.as_query_engine()

In [36]:
response=query_engine.query("My query")

my query My query


In [40]:
from llama_index.core import Document
qs=BlogPost.objects.filter(can_delete=True)
docs=[]
for obj in qs:
    docs.append(Document(text=f"{obj.get_embedding_text_raw()}", 
                         doc_id=str(obj.id), 
                         embedding=obj.embedding.tolist(),
                         metadata={"pk": obj.pk, "title":obj.title}))
#docs                
    

In [39]:
for doc in docs:
    index.delete_ref_doc(f"{doc.id_}", delete_from_docstore=True)
    index.insert(doc)
    

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

texts ['pk: 5\ntitle: Blog Post 1\n\nThe dog jumped over the cat']


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

texts ['pk: 6\ntitle: Blog Post 2\n\nThe cat jumped over the dog']


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

texts ['pk: 7\ntitle: Blog Post 3\n\nThe weather is very hot']


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

texts ['pk: 8\ntitle: Blog Post 4\n\nthe cat is yellow and dog is red']


In [18]:
query_engine = index.as_query_engine()
response=query_engine.query("The dog jumped")

my query The dog jumped


In [19]:
response

Response(response='The phrase "The dog jumped" is part of the content in "Blog Post 1."', source_nodes=[NodeWithScore(node=TextNode(id_='ec49581f-d896-4c1e-afb5-34340017dfb5', embedding=None, metadata={'pk': 6, 'title': 'Blog Post 2'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6', node_type='4', metadata={'pk': 6, 'title': 'Blog Post 2'}, hash='2af3f298807390e7f0ace006e7f1317efc941c64ce5cec99f42c967c5cee4c80')}, metadata_template='{key}: {value}', metadata_separator='\n', text='The cat jumped over the dog', mimetype='text/plain', start_char_idx=0, end_char_idx=27, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=0.7849359768705452), NodeWithScore(node=TextNode(id_='b5c80adf-08c0-4137-9f2e-6aa9b099f20f', embedding=None, metadata={'pk': 5, 'title': 'Blog Post 1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: 

In [20]:
for k in response.metadata.keys():
    for subk, v in response.metadata[k].items():
        print(subk,v)

pk 6
title Blog Post 2
pk 5
title Blog Post 1


In [21]:
str(response.response)

'The phrase "The dog jumped" is part of the content in "Blog Post 1."'

In [22]:
port = url.port or 5432
db_url = f"postgresql://{url.username}:{url.password}@{url.host}:{port}/{vector_db_name}"

from sqlalchemy import create_engine, text
import numpy as np

# Create the SQLAlchemy engine
engine = create_engine(db_url)

with engine.connect() as connection:
    # Define the SQL query to select only the id and embedding columns
    query = text(f"SELECT * FROM data_{vector_db_table_name}")
    query = text(f"SELECT metadata_, embedding FROM data_{vector_db_table_name}")
    
    # Execute the query
    result = connection.execute(query)
    
    # Fetch all rows
    rows = result.fetchall()

In [23]:
def calculate_cosine_metrics(v1, v2):
    dot_product = np.dot(v1, v2)
    magnitude1 = np.linalg.norm(v1)
    magnitude2 = np.linalg.norm(v2)
    cosine_similarity = dot_product / (magnitude1 * magnitude2)
    cosine_distance = 1 - cosine_similarity
    return int(cosine_similarity* 100), int(cosine_distance * 100)

In [24]:
for row in rows:
    metadata_,embedding=row[0],row[1]
    #print(metadata_)
    blog_post_pk=metadata_.get("pk")
    obj=BlogPost.objects.get(pk=blog_post_pk)
    embedding_array=np.array(embedding.strip('[]').split(','),dtype=float)
    print(calculate_cosine_metrics((obj.embedding).shape, embedding_array.shape))
  

(100, 0)
(100, 0)
(100, 0)
(100, 0)
