# Query

## Load data

In [1]:
import pandas as pd

# Read the stashed table; it holds the feature columns plus three
# non-feature columns: 'title', 'y' and 'c'.
Xy = pd.read_csv('./stash/Xytc.csv')

# Pull the non-feature columns out as individual Series ...
t, y, c = (Xy[name] for name in ('title', 'y', 'c'))
# ... and keep everything else as the feature frame / matrix.
X = Xy.drop(columns=['title', 'y', 'c'])
M = X.values

# Shapes of every piece, as a quick consistency check.
Xy.shape, X.shape, M.shape, t.shape, y.shape, c.shape

((90, 1539), (90, 1536), (90, 1536), (90,), (90,), (90,))

In [2]:
# Peek at the three columns that were excluded from the feature frame X.
Xy[['title', 'y', 'c']].head()

Unnamed: 0,title,y,c
0,The Hundred-Page Machine Learning Book,machine_learning,7
1,Programming Python,python,11
2,The Java Language Specification,java,12
3,Artificial Intelligence: A Modern Approach,computer_science,7
4,Speaking JavaScript: An In-Depth Guide for Pro...,javascript,5


## Manual similarity

In [3]:
from openai import OpenAI
import numpy as np

def embed_func(docs, model='text-embedding-ada-002'):
    """Embed a batch of texts through the OpenAI embeddings endpoint.

    Parameters
    ----------
    docs : iterable of str
        Texts to embed. Newlines in each text are replaced by spaces
        before the request is sent.
    model : str
        Name of the embedding model to use.

    Returns
    -------
    list
        The ``embedding`` of every item in the response's ``data``.
        An empty ``docs`` yields an empty list without contacting the API.
    """
    docs = [d.replace('\n', ' ') for d in docs]
    if not docs:
        # Nothing to embed: skip building a client and sending a request.
        return []

    client = OpenAI()
    res = client.embeddings.create(input=docs, model=model)
    return [d.embedding for d in res.data]

def get_query(doc):
    """Embed one text and return it as a 2-D array with a single row.

    The text is sent to ``embed_func`` as a one-element batch; the first
    (only) embedding is reshaped to ``(1, n)``.
    """
    first_embedding = embed_func([doc])[0]
    as_row = np.array(first_embedding).reshape((1, -1))
    return as_row

In [4]:
# Embed the search text 'java' as a single-row query array.
q = get_query('java')

In [5]:
# One row; the column count must equal M's for the comparison below.
q.shape

(1, 1536)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the query row and every row of M
# (result: one row for q, one column per row of M).
cosine_similarity(q, M)

array([[0.75620226, 0.82191143, 0.85270945, 0.77329626, 0.81568013,
        0.83236009, 0.78334782, 0.77840476, 0.79508663, 0.80488755,
        0.79164975, 0.73128099, 0.78161609, 0.75559099, 0.81313488,
        0.76628753, 0.7614802 , 0.78256975, 0.75278296, 0.73819919,
        0.73481335, 0.77438821, 0.7849678 , 0.7358972 , 0.76641887,
        0.78284678, 0.73030078, 0.77287208, 0.75486842, 0.76532978,
        0.76187055, 0.77997775, 0.75899437, 0.8462908 , 0.75206765,
        0.77664705, 0.76767852, 0.8108311 , 0.78038733, 0.881127  ,
        0.75612125, 0.79508663, 0.80369215, 0.74262596, 0.74555309,
        0.79233768, 0.77471897, 0.75338057, 0.7803994 , 0.85961638,
        0.79324922, 0.7915885 , 0.78046965, 0.74783312, 0.77565827,
        0.78968255, 0.83236009, 0.81233559, 0.79691991, 0.83185281,
        0.7583959 , 0.78431428, 0.77444394, 0.73503698, 0.81339892,
        0.75255622, 0.76638075, 0.77589243, 0.82187042, 0.76227486,
        0.85538339, 0.79819681, 0.74123089, 0.81

## LangChain vector search similarity

In [7]:
import pathlib
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

def get_db(db_path=pathlib.Path('./book_vdb')):
    """Open the LangChain ``Chroma`` store persisted at ``db_path``.

    The store is configured with an ``OpenAIEmbeddings`` embedding function
    using the 'text-embedding-ada-002' model.
    """
    return Chroma(persist_directory=str(db_path), embedding_function=OpenAIEmbeddings(model='text-embedding-ada-002'))

# Open the store at the default location ('./book_vdb').
db = get_db()

In [8]:
# Plain similarity search for 'java'.
# Note: `docs` is reassigned by the 'mmr' search in the next cell.
docs = db.search('java', search_type='similarity')
docs

[Document(page_content='The Java™ Programming Language', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='Effective Java', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='Thinking in Java', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='Head First Java', metadata={'subject': 'java', 'topic': 12})]

In [9]:
# Same query with search_type='mmr' (maximal marginal relevance in LangChain).
# This overwrites `docs`; the topic lookup below reads from this result.
docs = db.search('java', search_type='mmr')
docs

[Document(page_content='The Java™ Programming Language', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='Effective Java', metadata={'subject': 'java', 'topic': 12}),
 Document(page_content='JavaScript: The Good Parts', metadata={'subject': 'javascript', 'topic': 13}),
 Document(page_content='Python Crash Course', metadata={'subject': 'python', 'topic': 11})]

In [10]:
# Topic id stored in the metadata of the first hit in `docs`.
# Used later to build the evidence key f'c{topic}' and to exclude that
# node from the lift ranking.
topic = docs[0].metadata['topic']
topic

12

## Chroma search similarity

In [11]:
import chromadb
from chromadb.config import Settings

# On-disk Chroma client rooted at ./vdb, with anonymized telemetry turned off.
client = chromadb.PersistentClient(path='./vdb', settings=Settings(anonymized_telemetry=False))

# get_or_create=True asks for the existing 'books' collection when there is one.
collection = client.create_collection(name='books', get_or_create=True)

In [12]:
def collection_query(txt, n_results=5, where=None):
    """Query the Chroma ``collection`` with the embedding of ``txt``.

    Parameters
    ----------
    txt : str
        Text to search for; embedded with ``get_query``.
    n_results : int
        Number of results requested from the collection.
    where : dict, optional
        Metadata filter forwarded unchanged to ``collection.query``
        (e.g. ``{'topic': {'$eq': 6}}``). ``None`` applies no filter.

    Returns
    -------
    dict
        The raw ``collection.query`` response. Only metadatas are
        requested via ``include``.
    """
    return collection.query(
        query_embeddings=get_query(txt),
        n_results=n_results,
        where=where,
        include=['metadatas']
    )

# Query for 'java' with the helper's default n_results (5); only metadatas are requested.
collection_query('java')

{'ids': [['39', '79', '49', '70', '2']],
 'distances': None,
 'metadatas': [[{'subject': 'java', 'topic': 12},
   {'subject': 'java', 'topic': 12},
   {'subject': 'java', 'topic': 12},
   {'subject': 'java', 'topic': 12},
   {'subject': 'java', 'topic': 12}]],
 'embeddings': None,
 'documents': None,
 'uris': None,
 'data': None}

## Probabilistic search similarity

In [13]:
from pybbn.serde import dict_to_model
import json

# Read the serialized model from disk, then rebuild it with pybbn.
with open('./stash/model.json', 'r') as fp:
    _model_dict = json.load(fp)

model = dict_to_model(_model_dict)

In [14]:
def get_lift(m, p):
    """Return ``m`` with its ``__p__`` column replaced by the ratio ``p / m``.

    Rows of ``m`` and ``p`` are paired on their index. The result keeps
    exactly the columns of ``m``, in the same order; neither input is
    modified.
    """
    # Side-by-side probabilities: '__p__lhs' comes from m, '__p__rhs' from p.
    paired = m.join(p[['__p__']], lsuffix='lhs', rsuffix='rhs')
    paired['__p__'] = paired['__p__rhs'] / paired['__p__lhs']
    return paired[m.columns]

In [15]:
# Query the model with no evidence supplied.
# NOTE(review): the result is indexed by node name below (e.g. mq['c0']) and
# each entry has a '__p__' column -- confirm the exact return type in pybbn.
mq = model.pquery()

In [16]:
# Build evidence that sets node f'c{topic}' to 1, then query again with it.
# NOTE(review): presumably 1 means "topic present" -- confirm against the model.
e = model.e({f'c{topic}': 1})
pq = model.pquery(evidences=e)

In [17]:
# For every key of pq: with-evidence probability divided by no-evidence
# probability (get_lift computes p / m row by row).
lq = {k: get_lift(mq[k], pq[k]) for k in pq}

In [18]:
# Entry for 'c0' from the no-evidence query.
mq['c0']

Unnamed: 0,c0,__p__
0,0,0.011807
1,1,0.988193


In [19]:
# Entry for 'c0' from the query made with evidence on f'c{topic}'.
pq['c0']

Unnamed: 0,c0,__p__
0,0,0.002822
1,1,0.997178


In [20]:
# Lift for 'c0': the '__p__' of pq['c0'] divided by that of mq['c0'], row by row.
lq['c0']

Unnamed: 0,c0,__p__
0,0,0.238969
1,1,1.009093


In [21]:
# Largest lift across the rows of 'c0' -- the per-node score used by the ranking cell below.
lq['c0']['__p__'].max()

1.0090930330818166

In [22]:
# Rank the other topic nodes by how much the evidence raises them.
# A node's score is the largest value in the '__p__' column of its lift frame.
MIN_LIFT = 1.5  # keep only topics whose best lift exceeds this threshold

# One record per node, skipping the node the evidence was set on.
# Keys look like 'c12'; the numeric part is the topic id.
topics = [
    {'topic': int(_k[1:]), 'lift': _df['__p__'].max()}
    for _k, _df in lq.items()
    if _k != f'c{topic}'
]
# Drop weak topics, strongest first. `topics` stays a plain list, so it can be
# reused after the DataFrame is built (a lazy map/filter chain would be
# exhausted by then).
topics = sorted(
    (_rec for _rec in topics if _rec['lift'] > MIN_LIFT),
    key=lambda rec: rec['lift'],
    reverse=True,
)

# Explicit columns keep the frame well-formed even when nothing passes the threshold.
pd.DataFrame(topics, columns=['topic', 'lift'])

Unnamed: 0,topic,lift
0,6,9.103931
1,3,8.829003
2,7,1.923106


## Search other topics

In [23]:
# 'mmr' search for 'java' with a metadata filter on topic 6
# (the highest-lift topic in the table above).
db.search('java', search_type='mmr', filter={'topic': 6})

[Document(page_content='The Art of Computer Programming', metadata={'subject': 'computer_science', 'topic': 6}),
 Document(page_content='Structure and Interpretation of Computer Programs', metadata={'subject': 'computer_science', 'topic': 6}),
 Document(page_content='Introduction to Algorithms', metadata={'subject': 'computer_science', 'topic': 6})]

In [27]:
# Same 'mmr' search with the metadata filter on topic 3.
db.search('java', search_type='mmr', filter={'topic': 3})

[Document(page_content='Operating System Concepts', metadata={'subject': 'computer_science', 'topic': 3}),
 Document(page_content='Computer Networking: A Top-Down Approach', metadata={'subject': 'computer_science', 'topic': 3}),
 Document(page_content='Principles: Life and Work', metadata={'subject': 'finance', 'topic': 3}),
 Document(page_content='Machine Learning: A Probabilistic Perspective', metadata={'subject': 'machine_learning', 'topic': 3})]

In [28]:
# Same 'mmr' search with the metadata filter on topic 7.
db.search('java', search_type='mmr', filter={'topic': 7})

[Document(page_content='Spring in Action', metadata={'subject': 'java', 'topic': 7}),
 Document(page_content='Machine Learning Yearning', metadata={'subject': 'machine_learning', 'topic': 7}),
 Document(page_content='Artificial Intelligence: A Modern Approach', metadata={'subject': 'computer_science', 'topic': 7}),
 Document(page_content='Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow', metadata={'subject': 'machine_learning', 'topic': 7})]

In [24]:
# Topic-6 filter again, this time with search_type='similarity' for comparison with 'mmr'.
db.search('java', search_type='similarity', filter={'topic': 6})

[Document(page_content='The Art of Computer Programming', metadata={'subject': 'computer_science', 'topic': 6}),
 Document(page_content='Structure and Interpretation of Computer Programs', metadata={'subject': 'computer_science', 'topic': 6}),
 Document(page_content='Introduction to Algorithms', metadata={'subject': 'computer_science', 'topic': 6})]

In [26]:
# Topic-6 query issued directly against the Chroma collection.
# `include` is not passed here, so (unlike collection_query) the response
# below also carries distances and documents.
collection.query(
    query_embeddings=get_query('java'),
    where={'topic': {'$eq': 6}},
    n_results=3
)

{'ids': [['57', '45', '6']],
 'distances': [[0.3753288051549073, 0.41532464663156055, 0.43330439504246915]],
 'metadatas': [[{'subject': 'computer_science', 'topic': 6},
   {'subject': 'computer_science', 'topic': 6},
   {'subject': 'computer_science', 'topic': 6}]],
 'embeddings': None,
 'documents': [['The Art of Computer Programming',
   'Structure and Interpretation of Computer Programs',
   'Introduction to Algorithms']],
 'uris': None,
 'data': None}