In [16]:
import sqlite3
import json
import umsgpack
import numpy as np
import os
os.environ["TOKENIZERS_PARALLELISM"] = "True"
from datetime import datetime
from pathlib import Path


In [5]:
import glob

inmem_sorted = sorted(glob.glob("inmem-*.db"))
inmem_sorted

['inmem-0.db',
 'inmem-1.db',
 'inmem-2.db',
 'inmem-3.db',
 'inmem-4.db',
 'inmem-5.db',
 'inmem-6.db']

In [12]:
select_triple = "select priority, id, title, embedding from title_embedding"

totaldb = [
    (priority, id, title, umsgpack.unpackb(embedding_bytes))
    for dbname in inmem_sorted
    for (priority, id, title, embedding_bytes) in sqlite3.connect(dbname).execute(select_triple).fetchall()
]


In [13]:
totaldb[0:2]

[(0,
  21013155,
  'List of Supernatural and The Winchesters characters',
  [19,
   109,
   105,
   38,
   61,
   9,
   103,
   74,
   19,
   35,
   33,
   87,
   64,
   102,
   120,
   100,
   37,
   69,
   9,
   46,
   54,
   57,
   126,
   36,
   35,
   6,
   67,
   45,
   24,
   36,
   60,
   35,
   93,
   87,
   57,
   112,
   35,
   113,
   127,
   0,
   92,
   115,
   111,
   56,
   56,
   98,
   87,
   38]),
 (1,
  32770618,
  'List of characters in mythology novels by Rick Riordan',
  [0,
   42,
   57,
   38,
   65,
   70,
   103,
   80,
   5,
   35,
   58,
   50,
   64,
   88,
   94,
   100,
   127,
   65,
   39,
   58,
   54,
   116,
   17,
   82,
   8,
   6,
   32,
   84,
   14,
   100,
   18,
   18,
   93,
   13,
   11,
   121,
   32,
   125,
   120,
   115,
   92,
   115,
   109,
   79,
   15,
   38,
   35,
   9])]

In [14]:
from sentence_transformers import SentenceTransformer

minilml6v2 = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

onomato_query = minilml6v2.encode("word that sounds like what it represents, like meow, oink, woof")
cheeses_query = minilml6v2.encode("cheeses")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
np.array([[[1.1]]]).shape

(1, 1, 1)

In [36]:
codewords = np.array(json.loads(Path("codewords.json").read_text()), dtype=np.float32)

(cwM, cwKs, cwDs) = codewords.shape
(cwM, cwKs, cwDs)

(48, 128, 8)

In [37]:
cheeses_query.reshape(cwM, 1, -1)

array([[[-6.28128201e-02, -1.18328584e-02, -4.85378765e-02,
          7.56069869e-02, -6.29612729e-02,  5.24101360e-03,
          8.88406560e-02,  3.82924452e-02]],

       [[ 1.63012110e-02, -2.92984396e-02,  3.20276879e-02,
         -3.97614948e-02, -2.54703797e-02, -8.77302373e-04,
         -7.12979510e-02, -3.26210000e-02]],

       [[ 7.33206421e-03,  7.21905679e-02, -9.11571644e-03,
         -4.77604903e-02, -4.73998226e-02, -2.30509676e-02,
         -2.15109996e-02,  3.86618674e-02]],

       [[-1.05217239e-02,  6.76695630e-02,  1.06834043e-02,
         -8.60001333e-03,  3.92565504e-04, -5.64423129e-02,
         -2.41308101e-02, -4.65472043e-02]],

       [[-1.24787949e-02, -4.98961918e-02, -2.42104828e-02,
         -7.50152569e-04,  9.32364836e-02,  2.36256863e-03,
          5.46098687e-02, -3.35524371e-03]],

       [[-1.13846706e-02, -2.73707714e-02,  2.27206759e-02,
          7.84932077e-02, -5.59639558e-02,  2.68044788e-03,
         -4.89668660e-02,  1.22303493e-01]],

    

In [38]:
import nanopq

In [39]:
pq = nanopq.PQ(M=cwM, Ks=cwKs)
pq.codewords = codewords
pq.Ds = cwDs

M: 48, Ks: 128, code_dtype: <class 'numpy.uint8'>


In [40]:
cheeses_pqdtable = pq.dtable(cheeses_query).dtable

In [41]:
cheeses_dtable = np.linalg.norm(codewords - cheeses_query.reshape(cwM, 1, cwDs), axis=2) ** 2

In [42]:
cheeses_pqdtable[0]

array([0.0262751 , 0.03831497, 0.0366899 , 0.03406782, 0.04419767,
       0.03547133, 0.01783223, 0.0373632 , 0.03374524, 0.03435639,
       0.0246892 , 0.02875292, 0.01850583, 0.05410058, 0.03671422,
       0.02606186, 0.03303837, 0.02666951, 0.03046086, 0.01841877,
       0.01954821, 0.0654664 , 0.03382247, 0.05513705, 0.04551951,
       0.03587476, 0.02982153, 0.01598961, 0.01280518, 0.03174484,
       0.03387584, 0.02099134, 0.01880165, 0.0278201 , 0.03136155,
       0.05293519, 0.05020154, 0.01787505, 0.0538577 , 0.03260572,
       0.0301367 , 0.03906617, 0.03312015, 0.04770687, 0.01603811,
       0.02959405, 0.00786629, 0.02820772, 0.02024977, 0.03012272,
       0.02078309, 0.04480306, 0.03642677, 0.04678442, 0.03439843,
       0.02726365, 0.0221417 , 0.00929736, 0.02796467, 0.0238557 ,
       0.02362383, 0.04583019, 0.03425609, 0.02687342, 0.01939722,
       0.03475128, 0.03128236, 0.02832198, 0.04937307, 0.02205935,
       0.0396001 , 0.0169442 , 0.03516842, 0.05096008, 0.04400

In [43]:
cheeses_dtable[0]

array([0.0262751 , 0.03831497, 0.0366899 , 0.03406782, 0.04419767,
       0.03547133, 0.01783223, 0.0373632 , 0.03374524, 0.03435639,
       0.0246892 , 0.02875292, 0.01850583, 0.05410058, 0.03671422,
       0.02606186, 0.03303837, 0.02666951, 0.03046086, 0.01841877,
       0.01954821, 0.0654664 , 0.03382247, 0.05513705, 0.04551951,
       0.03587476, 0.02982153, 0.01598961, 0.01280518, 0.03174484,
       0.03387584, 0.02099134, 0.01880165, 0.0278201 , 0.03136155,
       0.05293519, 0.05020154, 0.01787505, 0.0538577 , 0.03260572,
       0.0301367 , 0.03906617, 0.03312015, 0.04770687, 0.01603811,
       0.02959405, 0.00786629, 0.02820772, 0.02024977, 0.03012272,
       0.02078309, 0.04480306, 0.03642677, 0.04678442, 0.03439843,
       0.02726365, 0.0221417 , 0.00929736, 0.02796467, 0.0238557 ,
       0.02362383, 0.04583019, 0.03425609, 0.02687342, 0.01939722,
       0.03475128, 0.03128236, 0.02832198, 0.04937307, 0.02205935,
       0.0396001 , 0.0169442 , 0.03516842, 0.05096008, 0.04400

In [52]:
cheeses_dtable.shape

(48, 128)

In [45]:
np.array_equal(cheeses_dtable, cheeses_pqdtable)

True

In [50]:
cwfirst3 = np.array([cw for (priority, id, title, cw) in totaldb[0:3]], dtype=np.uint8)

In [51]:
cwfirst3

array([[ 19, 109, 105,  38,  61,   9, 103,  74,  19,  35,  33,  87,  64,
        102, 120, 100,  37,  69,   9,  46,  54,  57, 126,  36,  35,   6,
         67,  45,  24,  36,  60,  35,  93,  87,  57, 112,  35, 113, 127,
          0,  92, 115, 111,  56,  56,  98,  87,  38],
       [  0,  42,  57,  38,  65,  70, 103,  80,   5,  35,  58,  50,  64,
         88,  94, 100, 127,  65,  39,  58,  54, 116,  17,  82,   8,   6,
         32,  84,  14, 100,  18,  18,  93,  13,  11, 121,  32, 125, 120,
        115,  92, 115, 109,  79,  15,  38,  35,   9],
       [ 12,   8,  54,  75,   5,  58,  38,  58,  52, 123,  93, 119,  14,
        107, 111, 123,  53, 125,  28,  53,  74,  96,  40, 119, 122,  85,
         32,  37, 110,  57,   2, 115, 127,  49,  93,  67,  29,   0,  23,
         48,  85,  84,  98,  54,  49,  22,  40,  38]], dtype=uint8)

In [53]:
np.arange(5)[:, None]

array([[0],
       [1],
       [2],
       [3],
       [4]])

In [79]:
tinycw = np.array([
    [0, 10000, 20000, 30000, 40000, 50000],
    [0, 100, 200, 300, 400, 500],
    [0, 1,2,3,4,5],
])
tinycw

array([[    0, 10000, 20000, 30000, 40000, 50000],
       [    0,   100,   200,   300,   400,   500],
       [    0,     1,     2,     3,     4,     5]])

In [82]:
tinycodes = np.array([
    [1,2,3,],
    [2,3,4,],
    [1,1,1,],
    [4,4,4,],
])

In [83]:
tinycw[np.arange(tinycw.shape[0]), tinycodes]

array([[10000,   200,     3],
       [20000,   300,     4],
       [10000,   100,     1],
       [40000,   400,     4]])

In [84]:
tinycw[0][tinycodes[:, 0]]

array([10000, 20000, 10000, 40000])

In [95]:
np.choose(tinycodes.reshape(-1), tinycw)

ValueError: shape mismatch: objects cannot be broadcast to a single shape.  Mismatch is between arg 0 with shape (6,) and arg 3 with shape (12,).

In [96]:
cheeses_query

array([-6.28128201e-02, -1.18328584e-02, -4.85378765e-02,  7.56069869e-02,
       -6.29612729e-02,  5.24101360e-03,  8.88406560e-02,  3.82924452e-02,
        1.63012110e-02, -2.92984396e-02,  3.20276879e-02, -3.97614948e-02,
       -2.54703797e-02, -8.77302373e-04, -7.12979510e-02, -3.26210000e-02,
        7.33206421e-03,  7.21905679e-02, -9.11571644e-03, -4.77604903e-02,
       -4.73998226e-02, -2.30509676e-02, -2.15109996e-02,  3.86618674e-02,
       -1.05217239e-02,  6.76695630e-02,  1.06834043e-02, -8.60001333e-03,
        3.92565504e-04, -5.64423129e-02, -2.41308101e-02, -4.65472043e-02,
       -1.24787949e-02, -4.98961918e-02, -2.42104828e-02, -7.50152569e-04,
        9.32364836e-02,  2.36256863e-03,  5.46098687e-02, -3.35524371e-03,
       -1.13846706e-02, -2.73707714e-02,  2.27206759e-02,  7.84932077e-02,
       -5.59639558e-02,  2.68044788e-03, -4.89668660e-02,  1.22303493e-01,
        5.38281687e-02, -9.03165713e-03, -5.56312799e-02, -4.10511345e-02,
       -9.82284639e-03, -

In [97]:
Path("./cheeses_query.json").write_text(json.dumps(
    cheeses_query.astype(np.float64).tolist()
))

8420

In [99]:
list(range(5,10))

[5, 6, 7, 8, 9]

In [100]:
list(zip(range(10), ["a", "b", "c"]))

[(0, 'a'), (1, 'b'), (2, 'c')]

In [105]:
np.save("a123.npy", np.array([1,2,3]))

In [106]:
json.dump??

In [109]:
a = np.array([[1,2],[3,4]])
a

array([[1, 2],
       [3, 4]])

In [111]:
a.reshape(-1)

array([1, 2, 3, 4])