### Import Packages

In [32]:
import ast

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

### Read Product CSV File

In [33]:
# Location of the product catalogue CSV, relative to this notebook.
product_path = "../datasets/products.csv"

In [34]:
# Load the product catalogue into a DataFrame.
# NOTE(review): the preview of this frame shows "Unnamed: 0.1" and "Unnamed: 0"
# columns, which look like index columns persisted by earlier CSV exports — confirm
# whether they can be dropped upstream.
product_df = pd.read_csv(product_path)

In [35]:
# Preview the first row to check which columns were loaded.
product_df.head(1)

Unnamed: 0.1,Unnamed: 0,id,Product Name,Category,Description,Selling Price,Image,Dimensions,ItemWeight,ShippingWeight,ASIN,ItemModelNumber,RecommendedAge
0,8,862a5592-a1b7-4d9f-87bf-87f757ab3a9c,propel trampolines pts55re junior trampoline w...,"('Sports & Outdoors', 'Sports & Fitness', 'Lei...",a safe and fun trampoline for your little ones,129.99,https://images-na.ssl-images-amazon.com/images...,Unknown,Unknown,Unknown,B00VTKXSUE,Unknown,Unknown


### Build the DataFrame for the Vector Database

In [36]:
# Start from an empty frame; the text columns used for embedding are added to it
# one at a time in the following cells.
vector_df = pd.DataFrame()

In [37]:
# Each "Category" cell holds the textual form of a tuple of category names,
# e.g. "('Sports & Outdoors', 'Sports & Fitness', ...)". Parse it with
# ast.literal_eval, which only accepts Python literals and never executes code
# (unlike eval on file content), then flatten the names into one space-separated
# string.
vector_df["category"] = product_df["Category"].apply(
    lambda raw: " ".join(ast.literal_eval(raw))
)

In [38]:
# Preview the flattened category strings.
vector_df.head(3)

Unnamed: 0,category
0,Sports & Outdoors Sports & Fitness Leisure Spo...
1,Toys & Games Dress Up & Pretend Play Wigs
2,Toys & Games Stuffed Animals & Plush Toys Stuf...


In [39]:
# Build the free-text field from product name + description.
# Missing values are replaced with "" first: concatenating a string with NaN
# yields NaN, which would discard the whole text of any product that lacks
# either a name or a description.
vector_df["text"] = (
    product_df["Product Name"].fillna("")
    + " "
    + product_df["Description"].fillna("")
)

In [40]:
# Preview the new "text" column next to "category".
vector_df.head(3)

Unnamed: 0,category,text
0,Sports & Outdoors Sports & Fitness Leisure Spo...,propel trampolines pts55re junior trampoline w...
1,Toys & Games Dress Up & Pretend Play Wigs,rubies storybook princess childs costume wig b...
2,Toys & Games Stuffed Animals & Plush Toys Stuf...,hansa truetolife laying baby reindeer cuddly


In [41]:
# Join the free text and the flattened category, separated by a single space,
# into the one string per product that is later passed to the encoder.
vector_df["combined"] = vector_df["text"].str.cat(vector_df["category"], sep=" ")

In [42]:
# Preview all three text columns side by side.
vector_df.head()

Unnamed: 0,category,text,combined
0,Sports & Outdoors Sports & Fitness Leisure Spo...,propel trampolines pts55re junior trampoline w...,propel trampolines pts55re junior trampoline w...
1,Toys & Games Dress Up & Pretend Play Wigs,rubies storybook princess childs costume wig b...,rubies storybook princess childs costume wig b...
2,Toys & Games Stuffed Animals & Plush Toys Stuf...,hansa truetolife laying baby reindeer cuddly,hansa truetolife laying baby reindeer cuddly T...
3,Toys & Games Puzzles Jigsaw Puzzles,kodak premium puzzles hot air balloons inflate...,kodak premium puzzles hot air balloons inflate...
4,Toys & Games Baby & Toddler Toys,playgo rolln chime walk push along toy for ba...,playgo rolln chime walk push along toy for ba...


In [43]:
# Inspect the full, untruncated combined string of the first product.
# Selecting the column by name avoids copying the whole frame into an object
# array and does not depend on "combined" happening to be the third column.
vector_df["combined"].iloc[0]

'propel trampolines pts55re junior trampoline with enclosure 55 red a safe and fun trampoline for your little ones Sports & Outdoors Sports & Fitness Leisure Sports & Game Room Trampolines & Accessories Trampolines'

### Load the Embedding Model

In [44]:
# Name of the sentence-transformers model used to embed the product text.
model_name = "all-MiniLM-L6-v2"

In [45]:
# Instantiate the embedding model identified by model_name.
model = SentenceTransformer(model_name)

In [46]:
# Smoke test: encode one short string so the output's shape and dtype can be
# inspected before embedding the full product list.
test_encode = model.encode(
    ["Intent Search"],
    batch_size=256,
    show_progress_bar=True,
)

Batches: 100%|██████████| 1/1 [00:00<00:00, 50.35it/s]


In [47]:
# Display the raw embedding values of the test sentence.
test_encode

array([[ 5.27039357e-02, -1.87651347e-02, -1.23996204e-02,
        -6.82493523e-02, -1.60217378e-02,  6.08501323e-02,
         7.23666400e-02, -1.09136729e-02, -8.70088581e-03,
        -4.19286974e-02, -1.21289035e-02,  5.83972894e-02,
         2.52426174e-02,  3.68633308e-02, -1.48207117e-02,
        -6.92350976e-03,  3.93020138e-02, -6.47709565e-03,
        -4.87710126e-02, -1.81834754e-02,  9.50361863e-02,
         4.05716002e-02,  3.73550132e-02, -4.62344252e-02,
        -2.78359707e-02, -3.89942434e-03, -2.66977884e-02,
        -7.12052733e-02,  4.29163948e-02, -3.42337154e-02,
         7.72332102e-02,  9.65119302e-02,  1.56645626e-01,
         7.88405985e-02, -1.09409690e-02, -6.49769679e-02,
        -1.00952044e-01, -2.64214221e-02,  2.33069304e-02,
        -1.93701498e-02, -5.45176156e-02, -2.23043915e-02,
        -1.14275981e-02,  7.42796361e-02, -3.09175849e-02,
         4.47752187e-03, -4.82856892e-02, -4.90691960e-02,
         7.57272393e-02,  4.03110124e-03, -4.45104428e-0

In [48]:
# One input sentence -> one row; the second axis is the embedding width.
test_encode.shape

(1, 384)

In [49]:
# Check the element type of the vectors the encoder returns.
test_encode.dtype

dtype('float32')

In [50]:
# Make sure every entry handed to the encoder is a real string.
# A missing "combined" value is NaN (a float). Stringifying it would embed the
# literal text "nan", giving every such product the same meaningless vector.
# Fall back to the product's category text instead, then coerce to str so the
# encoder never receives a non-string value. Rows are kept (not dropped) so that
# row positions stay aligned with the product table.
vector_df["combined"] = (
    vector_df["combined"]
    .fillna(vector_df["category"])
    .astype(str)
)

### Embed the Vector DataFrame (`combined` Column)

In [51]:
# Show only the first few strings that will be embedded; listing every row
# floods the notebook output without adding information.
vector_df["combined"].head().to_list()

['propel trampolines pts55re junior trampoline with enclosure 55 red a safe and fun trampoline for your little ones Sports & Outdoors Sports & Fitness Leisure Sports & Game Room Trampolines & Accessories Trampolines',
 'rubies storybook princess childs costume wig black curls perfect for princess or other fairy tale costumes Toys & Games Dress Up & Pretend Play Wigs',
 'hansa truetolife laying baby reindeer cuddly Toys & Games Stuffed Animals & Plush Toys Stuffed Animals & Teddy Bears',
 'kodak premium puzzles hot air balloons inflate on the ground michigan jigsaw puzzle extra large fully interlocking unique shaped pieces Toys & Games Puzzles Jigsaw Puzzles',
 'playgo rolln chime walk  push along toy for baby toddler walker toy  learning and pretend play set customer satisfaction rolln chime walk and push along toy is designed for toddlers toddlers safety and customers satisfaction is our main concern Toys & Games Baby & Toddler Toys',
 'zing air zx crossbow in ffp orange air storm zx 

In [52]:
# Encode every combined product string, 256 strings per batch, showing a
# progress bar. The result holds one embedding row per product, in the same
# order as vector_df.
embeddings = model.encode(
    vector_df["combined"].to_list(),
    batch_size=256,
    show_progress_bar=True,
)

Batches: 100%|██████████| 38/38 [02:47<00:00,  4.42s/it]


In [53]:
# Display a summary of the embedding matrix (values, shape and dtype).
embeddings

array([[ 2.5710051e-03, -1.2196518e-02, -2.2223510e-02, ...,
         5.7801437e-02, -4.9295310e-02,  8.6548902e-02],
       [-4.6459176e-02, -6.5139825e-03,  2.9711807e-02, ...,
        -3.2299209e-02, -4.4088643e-02,  5.3605318e-02],
       [-7.2406217e-02,  1.1140852e-02,  8.3119972e-03, ...,
        -2.7846970e-02,  4.9052224e-02,  6.9327615e-02],
       ...,
       [-7.7467151e-02,  1.8878104e-02,  4.3766655e-02, ...,
         9.8159663e-02, -5.0983284e-02,  9.9907713e-03],
       [ 2.1768805e-02,  4.8336841e-02, -3.1546776e-06, ...,
        -1.3842543e-02,  1.6669162e-02,  2.5513478e-02],
       [-3.2675233e-02, -5.9388794e-02,  2.3606529e-02, ...,
        -4.6699516e-02,  6.1646211e-03,  8.3161950e-02]],
      shape=(9631, 384), dtype=float32)

### Save the Embeddings Into the Vector Database (FAISS)

In [54]:
# Take the vector width from the embeddings themselves so the index always
# matches the encoder's output size, instead of relying on a hard-coded 384
# that would silently go stale if model_name is changed.
dimension = embeddings.shape[1]

In [55]:
# Create a flat L2 FAISS index for vectors of width `dimension` and add all
# product embeddings to it.
# NOTE(review): presumably each vector's id is its row position in `embeddings`
# (and therefore in product_df) — confirm before mapping search hits back to products.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [56]:
# Display the index object to confirm it was created.
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x75472474bab0> >

In [57]:
# Persist the populated index to disk so it can be loaded without re-encoding.
# NOTE(review): assumes the ../database directory already exists — confirm.
faiss.write_index(index, "../database/products_vector.bin")