In [10]:
import json
import time
from tqdm import tqdm
import numpy as np
import pandas as pd
import redis
from redis.commands.search.field import TextField, NumericField, TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import NumericFilter, Query

import requests
import glob
from PIL import Image
from io import BytesIO
from models.embed import TextVectorizingModel, ImageVectorizingModel


In [6]:
text_vectorizer = TextVectorizingModel(filename="models/bge.quant.onnx")
image_vectorizer = ImageVectorizingModel(filename="./models/mobileclip_s0.pt")


In [None]:
def get(url, cache='data/images'):
    filename=url.split("/")[-1]
    if url in glob.glob(f'{cache}/*'):
        im = Image.open(f'{cache}/{filename}')
    else:
        response = requests.get(url)
        if response.status_code != 400:
            im=Image.open(BytesIO(response.content))
            im.save(f'{cache}/{filename}')
    return im


In [7]:
df = pd.read_csv("./data/archive-3/amazon_best.csv")
df.head()
#df=df.drop('id', axis=1)

Unnamed: 0.1,Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,0,B08XWVX62X,Eureka NEC185 Cordless Stick Vacuum Cleaner Co...,https://m.media-amazon.com/images/I/716X0z1KIL...,https://www.amazon.com/dp/B08XWVX62X,4.1,27918,152.99,169.99,175,False,600
1,1,B009ZJ2M7G,Hoover Power Scrub Deluxe Carpet Cleaner Machi...,https://m.media-amazon.com/images/I/71zMtxB9xT...,https://www.amazon.com/dp/B009ZJ2M7G,4.5,47104,181.36,239.99,175,False,4000
2,2,B07PN7DPKX,Prolux Core Floor Buffer - Heavy Duty Single P...,https://m.media-amazon.com/images/I/71+t-F84jD...,https://www.amazon.com/dp/B07PN7DPKX,4.2,497,499.99,549.99,175,False,200
3,3,B005KMDV9A,Shark NV356E Navigator Lift-Away Professional ...,https://m.media-amazon.com/images/I/61OsbGCnrq...,https://www.amazon.com/dp/B005KMDV9A,4.5,34651,199.99,0.0,175,False,5000
4,4,B09QQ9T4XY,Tineco iFLOOR 3 Breeze Complete Wet Dry Vacuum...,https://m.media-amazon.com/images/I/71rvT22VH3...,https://www.amazon.com/dp/B09QQ9T4XY,4.4,25614,279.99,0.0,175,False,6000


In [40]:
client= redis.Redis(host='localhost', port=6379)
pipeline = client.pipeline()

for i,row in tqdm(df.iterrows()):
    redis_key = f"intent:{i:07}"
    #pipeline.hset(name=redis_key, mapping=row.to_dict())
    #pipeline.hset(name=redis_key, key='example', value=row['example'])
    #pipeline.hset(name=redis_key, key="embedding", value=model.embed(row['example'])[0].tolist())
    to_add = row.to_dict()
    row['embedding'] = []
    pipeline.json().set(redis_key, "$", row.to_dict())
    pipeline.json().set(redis_key, "$.embedding", model.embed(row['example'])[0].tolist())

res = pipeline.execute()

3665it [01:17, 47.57it/s]


In [36]:
client.json().get(redis_key)

{'intent': ' E-mailing || Рассылка.',
 'code': '8-0066-fr',
 'example': 'Je ne veux plus recevoir des notification de ce site comment faire?.',
 'embedding': [-0.03903038799762726,
  -0.01650109328329563,
  -0.024391859769821167,
  -0.02092803455889225,
  -0.027954423800110817,
  -0.06993748992681503,
  0.0007089392165653408,
  0.0031620466616004705,
  0.024129526689648628,
  -0.02306586503982544,
  0.01765548065304756,
  0.021001743152737617,
  -0.027201101183891296,
  0.015077567659318449,
  0.03482069447636604,
  -0.0010734883835539222,
  0.042918600142002106,
  -0.03636159002780914,
  -0.01982710510492325,
  -0.010700585320591928,
  -0.03615438938140869,
  -0.021005872637033463,
  -0.021019378677010536,
  -0.003207245608791709,
  0.05386440455913544,
  0.019644845277071,
  -0.014997811056673528,
  -0.010476098395884035,
  0.02879909425973892,
  -0.017999282106757164,
  0.041480861604213715,
  -0.05662843957543373,
  0.04239679127931595,
  -0.0253051295876503,
  0.000966892053838819

In [96]:
client.hget(name=redis_key, key='example')

'Bonjour je voudrai représenter votre marque dans ma région.'

In [37]:
schema = (
    TextField("$.intent", no_stem=True, as_name="intent"),
    TextField("$.code", no_stem=True, as_name="code"),
    TextField("$.example", as_name="example"),
    VectorField(
        "$.embedding",
        "FLAT",
        {
            "TYPE": "FLOAT32",
            "DIM": 1024,
            "DISTANCE_METRIC": "COSINE",
        },
        as_name="vector",
    ),
)

definition = IndexDefinition(prefix=["intent:"], index_type=IndexType.JSON)
res = client.ft("idx:intent_vss").create_index(
    fields=schema, definition=definition
)

In [38]:
info = client.ft("idx:intent_vss").info()
num_docs = info["num_docs"]
indexing_failures = info["hash_indexing_failures"]

In [39]:
print(f"{num_docs} documents indexed with {indexing_failures} failures")

12 documents indexed with 0 failures


In [None]:
queries = [
    "Bike for small kids",
    "Best Mountain bikes for kids",
    "Cheap Mountain bike for kids",
    "Female specific mountain bike",
    "Road bike for beginners",
    "Commuter bike for people over 60",
    "Comfortable commuter bike",
    "Good bike for college students",
    "Mountain bike for beginners",
    "Vintage bike",
    "Comfortable city bike",
]