In [1]:
import os
from typing import Optional

from pgvector.sqlalchemy import Vector
from sqlalchemy import select, MetaData, create_engine, Integer, String, func
from sqlalchemy.dialects.postgresql import TEXT, DATE, TIME, TIMESTAMP, ARRAY, JSONB, FLOAT
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import declarative_base, mapped_column, Session
from sqlalchemy.orm import relationship
from sqlalchemy.sql.expression import join
from sqlalchemy.sql.schema import ForeignKey


In [2]:

# Read the connection URL from the environment so that no database password
# is stored in the notebook source (and therefore in version control).
# Expected form: postgresql+pg8000://<user>:<password>@<host>/<database>
DATABASE_URL = os.environ.get("ICOGNITION_DATABASE_URL")
if not DATABASE_URL:
    # Fail early with an actionable message instead of a confusing driver error.
    raise RuntimeError(
        "Set the ICOGNITION_DATABASE_URL environment variable, e.g. "
        "postgresql+pg8000://<user>:<password>@localhost/icognition"
    )

engine = create_engine(DATABASE_URL)

# Standalone MetaData object plus the declarative base the model classes inherit from.
metadata_obj = MetaData()
Base = declarative_base()


In [3]:
# Show the engine repr to confirm the connection target (the repr masks the password).
engine

Engine(postgresql+pg8000://app:***@localhost/icognition)

In [8]:
class Bookmark(Base):
    """ORM mapping for the ``bookmarks`` table in the ``bmks`` schema."""
    __tablename__ = "bookmarks"
    # "extend_existing" allows this cell to be re-run against the same Base
    # without SQLAlchemy complaining that the table is already defined.
    __table_args__ = {"schema": "bmks", "extend_existing": True}
    id = mapped_column(Integer, primary_key=True)
    url = mapped_column(String(255))
    # Only an ON UPDATE value is configured: the column is set to now() when a
    # row is updated; no insert-time default is declared here.
    update_at = mapped_column(TIMESTAMP, onupdate=func.now())
    # NOTE(review): presumably refers to Document.id, but no ForeignKey is
    # declared, so the database does not enforce it — confirm.
    document_id = mapped_column(Integer)

class Document(Base):
    """ORM mapping for the ``documents`` table in the ``bmks`` schema."""
    __tablename__ = "documents"
    # "extend_existing" allows this cell to be re-run against the same Base
    # without SQLAlchemy complaining that the table is already defined.
    __table_args__ = {"schema": "bmks", "extend_existing": True}
    id = mapped_column(Integer, primary_key=True)
    title = mapped_column(String(255))
    url = mapped_column(String(255))
    # One-dimensional PostgreSQL array of strings.
    authors = mapped_column(ARRAY(String, dimensions=1))
    # Unbounded text column, unlike the 255-character title/url columns.
    summary_generated = mapped_column(TEXT)
    publication_date = mapped_column(DATE())
    

class Keyphrase(Base):
    """ORM mapping for the ``keyphrases`` table in the ``bmks`` schema.

    Stores a phrase and its context together with a pgvector embedding of each.
    """
    __tablename__ = "keyphrases"
    # "extend_existing" allows this cell to be re-run against the same Base
    # without SQLAlchemy complaining that the table is already defined.
    __table_args__ = {"schema": "bmks", "extend_existing": True}
    id = mapped_column(Integer, primary_key=True)
    word = mapped_column(String())
    # 384-dimensional pgvector column.
    # NOTE(review): presumably the embedding of `word`; 384 must match the
    # output size of whatever model produces it — confirm.
    word_vec = mapped_column(Vector(384))
    # NOTE(review): presumably offsets of the phrase within its source text — confirm.
    start = mapped_column(Integer)
    end = mapped_column(Integer)
    score = mapped_column(FLOAT)
    # Shadows the built-in name `type` only inside this class body.
    type = mapped_column(String())
    entity_group = mapped_column(String(50))
    context = mapped_column(String(255))
    # 384-dimensional pgvector column (presumably the embedding of `context` — confirm).
    context_vec = mapped_column(Vector(384))
    # NOTE(review): presumably refers to Document.id, but no ForeignKey is
    # declared, so the database does not enforce it — confirm.
    document_id = mapped_column(Integer)


In [9]:
# Destructive reset, left disabled: uncomment to drop every table registered on
# Base.metadata before recreating them.
#Base.metadata.drop_all(engine)
# Emit CREATE TABLE for the models registered on Base.metadata.
Base.metadata.create_all(engine)

In [55]:
# Rebinds `metadata_obj` and `Base` to brand-new objects. The new Base has no
# models registered on it, which is why the table listing below is empty.
# NOTE(review): classes defined earlier stay attached to the previous Base, and
# any model defined after this cell lands in a different MetaData — confirm the
# reset is intentional before relying on Run All.
metadata_obj = MetaData()
Base = declarative_base()
Base.metadata.tables

FacadeDict({})

In [22]:
str_query = 'machine learning'
# Embed the query text once, up front; the same vector is used for the SQL
# ordering and for the per-row similarity below.
vec_query = sentence_transformer.encode(str_query, show_progress_bar=False)

with Session(engine) as session:
    # Two stored rows closest to the query by pgvector cosine distance.
    nearest_stmt = (
        select(ItemVector)
        .order_by(ItemVector.text_vec.cosine_distance(vec_query))
        .limit(2)
    )
    items = session.scalars(nearest_stmt).all()

    for item in items:
        # util.cos_sim yields a similarity (higher is closer), despite the name.
        distance = util.cos_sim(item.text_vec, vec_query)
        print(
            '------ Word -------',
            f"word: {str_query}",
            item.text,
            distance,
            sep='\n',
        )



------ Word -------
word: machine learning
Machine Learning
tensor([[1.0000]])
------ Word -------
word: machine learning
machine learning
tensor([[1.0000]])


In [19]:
# Encode the same phrase in two casings and compare the resulting vectors.
a, b = (
    sentence_transformer.encode(phrase, show_progress_bar=False)
    for phrase in ('machine learning', 'Machine Learning')
)
# util.cos_sim yields a similarity, despite the variable name.
distance = util.cos_sim(a, b)
print(distance)

tensor([[1.]])


In [12]:
# Run the two counting statements built in other cells (`done_stmt`, `total_stmt`).
with Session(engine) as session:
    # Scalar value of the first column of the first row.
    number_done = session.execute(done_stmt).scalar()
    print(type(number_done))

    total = session.execute(total_stmt).scalar()
    print(total)


<class 'int'>
5370004


In [8]:
# Ids that already have a row in the vectors table.
subquery = select(ItemVector.id)

# Up to 10 items that have a description but no vector row yet.
stmt = (
    select(Item)
    .where(
        Item.id.not_in(subquery),
        Item.description.is_not(None),  # renders as "IS NOT NULL"
    )
    .limit(10)
)
print(stmt)

SELECT wikidata.items.id, wikidata.items.label, wikidata.items.description, wikidata.items.wikipedia_page_id, wikidata.items.views, wikidata.items.links 
FROM wikidata.items 
WHERE (wikidata.items.id NOT IN (SELECT wikidata.items_vectors.id 
FROM wikidata.items_vectors)) AND wikidata.items.description IS NOT NULL
 LIMIT :param_1


In [21]:
# Inner join of items to their vector rows on the shared id.
j = join(Item, ItemVector, onclause=Item.id == ItemVector.id)

# Counting the joined rows gives the number of items that already have a vector.
done_stmt = select(func.count()).select_from(j)

print(done_stmt)

SELECT count(*) AS count_1 
FROM wikidata.items JOIN wikidata.items_vectors ON wikidata.items.id = wikidata.items_vectors.id


In [152]:
# Build one ItemVector per row returned by `stmt` and persist them in a single commit.
# NOTE(review): relies on `stmt` still being the select over Item from the
# earlier cell; another cell rebinds `stmt` to a select over ItemVector — confirm
# execution order before running.
with Session(engine) as session:
    results = []
    for r in session.scalars(stmt):
        iv = ItemVector()
        iv.id, iv.text = r.id, r.label
        # The label (not the description) is what gets embedded.
        iv.text_vec = sentence_transformer.encode(r.label)
        results.append(iv)

    session.add_all(results)
    session.commit()

print("done")




done


In [112]:
# Rebinds the shared name `stmt` to a 10-row select over ItemVector and prints
# the SQL it compiles to (nothing is executed here).
# NOTE(review): other cells read `stmt` expecting a select over Item — confirm
# the intended execution order.
stmt = select(ItemVector).limit(10)
print(stmt)

SELECT wikidata.items_vectors.id, wikidata.items_vectors.text, wikidata.items_vectors.keywords, wikidata.items_vectors.text_vec, wikidata.items_vectors.keywords_vec 
FROM wikidata.items_vectors
 LIMIT :param_1


In [150]:
# Print the stored embedding of up to 10 ItemVector rows as a sanity check.
# Each vector prints in full, so the output is long.
with Session(engine) as session:
    for row in session.scalars(select(ItemVector).limit(10)):
        print(row.text_vec)

[-6.54455051e-02  1.63151175e-02 -1.48429293e-02  3.58537063e-02
 -6.71814010e-02 -3.87995765e-02  1.36350796e-01  8.17985181e-03
  2.76977122e-02 -7.43086636e-02  3.80756259e-02 -5.41808419e-02
  4.96185347e-02 -3.96770984e-02 -8.67176354e-02  3.54869813e-02
  6.92783445e-02  2.33764648e-02  5.83562478e-02  1.65943597e-02
  3.76055855e-03  6.50990056e-03  4.02431786e-02  4.13679369e-02
  4.04583737e-02  3.26571576e-02  2.46169996e-02  2.09292816e-03
 -2.18593329e-02 -8.23428705e-02  5.58406040e-02 -5.08324709e-03
  3.18289809e-02  3.92273851e-02  3.97116691e-02  4.41422760e-02
 -4.38304469e-02 -2.16891710e-02  1.80840250e-02  8.48890021e-02
 -6.27726540e-02  4.99085803e-03 -8.75579417e-02 -4.33810949e-02
  2.71884780e-02 -1.14898188e-02  3.16205509e-02  3.49126793e-02
 -1.16132563e-02 -1.14094764e-02 -5.48579358e-02 -1.11365326e-01
 -9.00912061e-02  1.99680962e-02  2.49293260e-02  1.17959818e-02
 -9.64732841e-03  1.55592160e-02  2.80425306e-02  3.65109518e-02
  6.69692755e-02  5.00974

In [174]:
with Session(engine) as session:
    query = sentence_transformer.encode("rifle")

    # Five stored rows closest to the query by pgvector cosine distance.
    by_cosine_distance = ItemVector.text_vec.cosine_distance(query)
    items = session.scalars(
        select(ItemVector).order_by(by_cosine_distance).limit(5)
    ).all()

    for item in items:
        # util.cos_sim yields a similarity (higher is closer), despite the name.
        distance = util.cos_sim(item.text_vec, query)
        print(item.text, distance)


sniper rifle tensor([[0.8239]])
sedan tensor([[0.3184]])
invoice tensor([[0.2445]])
Mikkeli tensor([[0.2444]])
Love Actually tensor([[0.2257]])
