In [28]:
import json
import time
from tqdm import tqdm, trange
import numpy as np
import pandas as pd
import redis
from redis.commands.search.field import TextField, NumericField, TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import NumericFilter, Query

import requests
import glob
from PIL import Image
from io import BytesIO
from vectorize.embed import TextVectorizingModel, ImageVectorizingModel


In [5]:
text_vectorizer = TextVectorizingModel(filename="./vectorize/bge.quant.onnx")
image_vectorizer = ImageVectorizingModel(filename="./vectorize/mobileclip_s0.pt")


In [7]:
def get(url, cache='data/images'):
    filename=url.split("/")[-1]
    if filename in list(map(lambda x: x.split('/')[-1], glob.glob(f'{cache}/*'))):
        im = Image.open(f'{cache}/{filename}')
    else:
        response = requests.get(url)
        if response.status_code != 400:
            im=Image.open(BytesIO(response.content))
            im.save(f'{cache}/{filename}')
    return im

In [8]:
cache='data/images'

In [33]:
df = pd.read_csv("./products_with_categories.csv")
df = df.drop(['Unnamed: 0'], axis=1)#,'asin', 'stars', 'listPrice', 'reviews', 'productURL', 'isBestSeller', 'boughtInLastMonth'], axis=1)
N = len(df)
df.head()

text_embeddings = np.zeros((N, 1024))
image_embeddings = np.zeros((N, 512))
#df=df.drop('id', axis=1)

In [35]:
for i in trange(N):
    row = df.iloc[i]
    redis_key = f"product:{i:07}"
    im=get(f"{row['imgUrl']}")
    image_embedding = image_vectorizer.vectorize(f"{cache}/{row['img']}")[0].tolist()
    text_embedding = text_vectorizer.vectorize(row['title'])[0].tolist()
    text_embeddings[i] = text_embedding
    image_embeddings[i] = image_embedding
    
np.save('text_embeddings_v2.npy', text_embeddings)
np.save('image_embeddings_v2.npy', image_embeddings)


100%|██████████| 2542/2542 [19:35<00:00,  2.16it/s]  


In [19]:
im=get(f"{cache}/{row['img']}")

#image_embedding = image_vectorizer.vectorize(im)[0].tolist()

PIL.JpegImagePlugin.JpegImageFile

In [23]:
client= redis.Redis(host='localhost', port=6380)
pipeline = client.pipeline()

for i in trange(N):
    row = df.iloc[i]
    redis_key = f"product:{i:07}"
    #im=get(f"{cache}/{row['img']}")
    to_add = row.to_dict()
    to_add['text_embedding'] = []
    to_add['image_embedding'] = []
    
    pipeline.json().set(redis_key, "$", to_add)
    pipeline.json().set(redis_key, "$.text_embedding", text_embeddings[i].tolist())
    pipeline.json().set(redis_key, "$.image_embedding", image_embeddings[i].tolist())

res = pipeline.execute()

100%|██████████| 4425/4425 [00:02<00:00, 1637.12it/s]


In [131]:
schema = (
    TextField("$.title", no_stem=True, as_name="title"),
    TextField("$.img", no_stem=True, as_name="path to image"),
    NumericField("$.price", as_name="example price"),
    TextField("$.category"),
    VectorField(
        "$.text_embedding",
        "FLAT",
        {
            "TYPE": "FLOAT32",
            "DIM": 1024,
            "DISTANCE_METRIC": "L2",
        },
        as_name="text_vector",
    ),
    VectorField(
        "$.image_embedding",
        "FLAT",
        {
            "TYPE": "FLOAT32",
            "DIM": 512,
            "DISTANCE_METRIC": "L2",
        },
        as_name="image_vector",
    )
)

definition = IndexDefinition(prefix=["product:"], index_type=IndexType.JSON)
res = client.ft("idx:product_vss").create_index(
    fields=schema, definition=definition
)

In [132]:
info = client.ft("idx:product_vss").info()
num_docs = info["num_docs"]
indexing_failures = info["hash_indexing_failures"]

In [146]:
assert indexing_failures == '0'
print(f"{num_docs} documents indexed with {indexing_failures} failures")

4439 documents indexed with 0 failures


In [27]:
queries = image_vectorizer.vectorize(Image.open("pics/dewalt.jpeg"))

UnboundLocalError: local variable 'im' referenced before assignment

In [25]:
query = (
    Query('(*)=>[KNN 3 @image_vector $query_vector AS vector_score]')
     .sort_by('vector_score')
     .return_fields('vector_score', 'title', 'price', 'img', 'category')
     .dialect(2)
)

In [26]:
client.ft("idx:product_vss").search(query, { 
    'query_vector': queries.tobytes() 
    }).docs


NameError: name 'queries' is not defined

In [140]:
queries = text_vectorizer.vectorize("Нужна дрель для стены")

query = (
    Query('(*)=>[KNN 3 @text_vector $query_vector AS vector_score]')
     .sort_by('vector_score')
     .return_fields('vector_score', 'title', 'price', 'img', 'category')
     .dialect(2)
)

In [141]:
client.ft("idx:product_vss").search(query, { 
    'query_vector': queries.tobytes() 
    }).docs


[Document {'id': 'product:0002648', 'payload': None, 'vector_score': '0.866307914257', 'title': 'Quickpoint Mortar Gun Drill Adaptor - Perfect For tuckpointing, brick work, stone work, thin brick, grouting, crack repair, and glass block'},
 Document {'id': 'product:0003281', 'payload': None, 'vector_score': '0.875742018223', 'title': 'Dremel MM50-01 Multi-Max Oscillating DIY Tool Kit with Tool-LESS Accessory Change- 5 Amp, 30 Accessories- Compact Head & Angled Body- Drywall, Nails, Remove Grout & Sanding'},
 Document {'id': 'product:0001350', 'payload': None, 'vector_score': '0.875928580761', 'title': 'Right Angle Drill, 1/2 In, 355/750 RPM'}]