# Install dependencies

Install `txtai` and all dependencies.

In [1]:
%%capture
!pip install git+https://github.com/neuml/txtai

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# URL of the raw Myntra product catalog CSV hosted on GitHub
data_file = 'https://raw.githubusercontent.com/manaranjanp/GenAI_LLM/main/Transformers/myntra_products_catalog.csv'

In [4]:
# Load the catalog into a DataFrame straight from the URL held in data_file
products_df = pd.read_csv(data_file)

In [5]:
# Preview the first five rows to see which columns are available
products_df.head(5)

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [None]:
# (rows, columns) of the full catalog
products_df.shape

(12491, 8)

In [None]:
# Restrict to men's products and draw a reproducible 500-row sample.
# reset_index() renumbers rows from 0 and keeps the previous labels in an "index" column.
selected_products = (
    products_df
    .loc[products_df["Gender"] == "Men"]
    .sample(n=500, random_state=78)
    .reset_index()
)

In [None]:
# Check the sample size; reset_index() above added one column holding the old index
selected_products.shape

(500, 9)

In [None]:
from pprint import pprint

In [None]:
# Show the first ten product descriptions as a plain Python list
selected_products['Description'].head(10).tolist()

['A pair of grey comfort sandalsSynthetic upper with velcro closureCushioned footbedPatterned tpr outsoleWarranty: 1 monthWarranty provided by brand/manufacturer',
 'This set consists of a Code Copper Body Perfume and a Code Platinum Body PerfumeCode Copper Body PerfumeNotesWarm balsam and amber notesThe citrus notes of lemon and pineapple with woody hints of armoiseCode Platinum Body PerfumeNotesApple notes, muguet and cedar woodFeaturesContains zero gas, only perfumeFragrance is safe to use on skin and\xa0lasts for a long time',
 'Grey and Purple checked smart casual shirt, has a spread collar, short sleeves, button placket, and curved hem',
 'Green self-design formal shirt, has a spread collar, long sleeves, button placket, and curved hem',
 'Grey light wash 5-pocket low-rise jeans, clean look, heavy fade, has a button and zip closure, and waistband with belt loops',
 'Blue washed mid-rise denim shorts, has 5 pockets, and button closure',
 'Brown & white checked smart casual shirt, 

# Semantic Search

The first example we'll cover is semantic search. Semantic search applications have an understanding of natural language and identify results that have the same meaning, not necessarily the same keywords. While this produces higher quality results, one advantage of keyword search is that it's easy to understand why a result was selected. The keyword is there.

Let's see if we can gain a better understanding of semantic search output.

In [None]:
%%capture

from txtai.embeddings import Embeddings

# Create embeddings model, backed by sentence-transformers & transformers.
# "path" names the model to load (BAAI/bge-small-en-v1.5).
# "content": True enables content storage; the default behavior is to only store indexed vectors.
embeddings = Embeddings({"path": "BAAI/bge-small-en-v1.5", "content": True})

In [None]:
# Plain list of description strings; list positions line up with selected_products' row labels
data_v1 = selected_products["Description"].tolist()

In [None]:
# Create embeddings index with content enabled. The default behavior is to only store indexed vectors.
%%time
# Create an index for the list of text
embeddings.index([(uid, text, None) for uid, text in enumerate(data_v1)])

CPU times: user 1.25 s, sys: 259 ms, total: 1.51 s
Wall time: 3.44 s


In [None]:
query = 'casual shirt'

# Score the query against every description in data_v1 and keep the first 10 results.
# Each result is an (index into data_v1, score) tuple.
# NOTE(review): this passes the raw text list rather than querying the index built earlier — confirm that is intended.
uid = embeddings.similarity(query, data_v1)[0:10]

In [None]:
# Inspect the (index, score) pairs returned for the query
uid

[(494, 0.8226932287216187),
 (486, 0.8127150535583496),
 (322, 0.8113027215003967),
 (109, 0.8020534515380859),
 (456, 0.8006207346916199),
 (431, 0.7989739179611206),
 (156, 0.7988184094429016),
 (66, 0.7984649538993835),
 (404, 0.7964844703674316),
 (328, 0.7960006594657898)]

In [None]:
# Print the product name of every matched row (the first tuple element is the row label)
for row_idx, _score in uid:
  print(selected_products.loc[row_idx, 'ProductName'])

LA LOFT Men Charcoal Grey Regular Fit Printed Casual Shirt
Flying Machine Men Grey Slim Fit Printed Casual Shirt
Indian Terrain Men Yellow Slim Fit Solid Casual Shirt
WITH Men White Slim Fit Solid Casual Shirt
Calvin Klein Jeans Men Maroon Slim Fit Solid Casual Shirt
WROGN Men Burgundy Slim Fit Solid Casual Shirt
Indian Terrain Men White & Grey Slim Fit Printed Casual Shirt
Parx Men White & Blue Slim Fit Printed Casual Shirt
Basics Men Blue Slim Fit Solid Casual Shirt
Next Look Men White & Beige Regular Fit Printed Casual Shirt


In [None]:
query = 'winter wear'

# Keep the first five (index, score) results for the new query
uid = embeddings.similarity(query, data_v1)[:5]

# Print the matching description text for each result
for doc_idx, _score in uid:
  print(data_v1[doc_idx])

Keep yourself warm and look smart donning this grey sweater from GAS, featuring horizontal pinstripes. With full sleeves and knitted texture, this sweater is a great pick for mild winters. Team it with washed denims and sneakers. Contrast hem, cuffs and neck Knitted Horizontal pinstripes
Soft fleece knitLong sleevesRibbing at cuffs and hemDrawcord ties at hooded necklineGraphic at chest
Brown self-design  trousers and button closure
A pair of gold-toned and black geometric textured cufflinks
Grey sweater, has a V-neck, sleeveless


The `explain` method below runs an embeddings query like `search` but also analyzes each token to determine term importance. Looking at the fifth result, it appears that `sweater,` is the most important term. Let's visualize it.

In [None]:
# Run a search
embeddings.explain(query, limit=5)[4]

{'id': '350',
 'text': 'Grey sweater, has a V-neck, sleeveless',
 'score': 0.6849793791770935,
 'tokens': [('Grey', -0.020757973194122314),
  ('sweater,', 0.054792821407318115),
  ('has', -0.0016090869903564453),
  ('a', -0.004363834857940674),
  ('V-neck,', -0.005885779857635498),
  ('sleeveless', -0.018702149391174316)]}

In [None]:
from IPython.display import HTML

def plot(query, index=4, limit=5):
  """
  Build an HTML snippet that highlights the most important tokens of one explain result.

  Runs `embeddings.explain` (the notebook-level embeddings instance) for `query`,
  picks the result at position `index`, and wraps each token in a background color
  chosen from its score. If no token reaches the lowest threshold but the result
  score is at least 0.05, the highest-scoring token(s) get the lightest color.

  Args:
    query: query text; shown in bold as the first line of the snippet
    index: position of the result to render (default 4, i.e. the fifth result)
    limit: number of results requested from explain (default 5)

  Returns:
    HTML string: the escaped query in bold, a line break, then the escaped tokens,
    each followed by a single space

  Raises:
    IndexError: if explain returns `index` or fewer results
  """

  # Local import keeps this cell self-contained
  from html import escape

  result = embeddings.explain(query, limit=limit)[index]

  # (minimum score, background color) pairs, strongest highlight first
  shades = (
    (0.1, "#fdd835"),
    (0.075, "#ffeb3b"),
    (0.05, "#ffee58"),
    (0.02, "#fff59d"),
  )

  spans = []
  for token, score in result["tokens"]:
    # First threshold the score reaches wins; None means no highlight
    color = next((shade for floor, shade in shades if score >= floor), None)
    spans.append((token, score, color))

  # Nothing highlighted on a reasonably scored result: mark the top-scoring token(s).
  # The `spans` check avoids calling max() on an empty token list.
  if spans and result["score"] >= 0.05 and not any(color for _, _, color in spans):
    mscore = max(score for _, score, _ in spans)
    spans = [(token, score, "#fff59d" if score == mscore else color) for token, score, color in spans]

  # Escape query and tokens so characters such as & or < in the text cannot break the markup
  parts = [f"<b>{escape(query)}</b><br/>"]
  for token, _, color in spans:
    text = escape(token)
    if color:
      parts.append(f"<span style='background-color: {color}'>{text}</span> ")
    else:
      parts.append(f"{text} ")

  return "".join(parts)

# Render the highlighted-token markup for the current query as HTML in the cell output
HTML(plot(query))

Let's try some more queries!