# Install dependencies

Install `txtai` and all dependencies.

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install git+https://github.com/neuml/txtai

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
data_file = '/content/drive/MyDrive/NLP codes/d1/myntra_products_catalog.csv'

In [None]:
products_df = pd.read_csv(data_file)

In [None]:
products_df.head(5)

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [None]:
products_df.shape

(12491, 8)

In [None]:
# Fixed seed so the same 500 products are sampled on every run (keeps Restart & Run All reproducible).
# reset_index() renumbers the rows from 0 and keeps the original row label in an extra "index" column.
selected_products = products_df[products_df.Gender == 'Men'].sample(500, random_state=42).reset_index()

In [None]:
selected_products.shape

(500, 9)

In [None]:
from pprint import pprint

In [None]:
list(selected_products['Description'])[0:10]

['A pair of round-toe white sneakers, has mid-top styling, lace-up detailSynthetic Leather upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer',
 'Black solid low-rise denim shorts, has 5 pockets, and button closure Design Details-GYMINDIGO- STYLISED POCKET REGULAR FIT- MID RISE PERFORMANCE SHORTS WITH LAID BACK CONTROL OF JOGGER FABRIC TECHONOLOGY,  SMOKE BLACK  CLOUDY LOOK,  ZIP FLY WITH 5  BELT\xa0 LOOPS',
 'Blue solid casual shirt, has a spread collar, long sleeves, snap button placket, curved hem, and 2 flap pockets',
 'White and Black checked casual shirt, has a button-down collar, long sleeves, button placket, curved hem, and 1 patch pocket',
 'Grey and Rust red checked casual shirt, has a spread collar, long sleeves, button placket, and curved hem',
 'A pair of round-toe brown sneakers, has regular styling, lace-up detailSynthetic Leather upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty prov

# Semantic Search

The first example we'll cover is semantic search. Semantic search applications have an understanding of natural language and identify results that have the same meaning, not necessarily the same keywords. While this produces higher quality results, one advantage of keyword search is that it's easy to understand why a result was selected. The keyword is there.

Let's see if we can gain a better understanding of semantic search output.

In [None]:
%%capture

from txtai.embeddings import Embeddings

# Create embeddings model, backed by sentence-transformers & transformers
# "path" selects the BAAI/bge-small-en-v1.5 model; "content": True enables content storage
# (presumably what lets results carry the indexed text next to ids and scores - confirm in txtai docs)
embeddings = Embeddings({"path": "BAAI/bge-small-en-v1.5", "content": True})

In [None]:
# Plain Python list of the sampled product descriptions, in row order
data_v1 = selected_products["Description"].tolist()

In [None]:
# Create embeddings index with content enabled. The default behavior is to only store indexed vectors.
%%time
# Create an index for the list of text
embeddings.index([(uid, text, None) for uid, text in enumerate(data_v1)])

CPU times: user 864 ms, sys: 6.38 ms, total: 871 ms
Wall time: 839 ms


In [None]:
query = 'casual shirt'

# Score the query against every description in data_v1 and keep the first 10 results.
# Judging by the output shown for `uid`, each result is a (position in data_v1, score) tuple, best match first.
uid = embeddings.similarity(query, data_v1)[0:10]

In [None]:
uid

[(480, 0.8226931691169739),
 (259, 0.8175668716430664),
 (198, 0.8145919442176819),
 (207, 0.8127150535583496),
 (127, 0.8113968372344971),
 (126, 0.8051853179931641),
 (353, 0.8043985366821289),
 (157, 0.8027279376983643),
 (13, 0.7986592650413513),
 (325, 0.7978737950325012)]

In [None]:
# Print the product name for each (row position, score) hit.
# The tuple is unpacked instead of using a loop variable named `id`, which would
# rebind the builtin id() for the rest of the kernel session.
for row, _score in uid:
  print(selected_products.loc[row, 'ProductName'])

LA LOFT Men Charcoal Grey Regular Fit Printed Casual Shirt
Park Avenue Men Black Slim Fit Printed Casual Shirt
Flying Machine Men Black & White Slim Fit Printed Casual Shirt
Flying Machine Men Grey Slim Fit Printed Casual Shirt
Mast & Harbour Men Green Printed Casual Shirt
Parx Men Grey Slim Fit Printed Casual Shirt
Parx Men Grey Slim Fit Printed Casual Shirt
Basics Men Multicoloured Slim Fit Checked Casual Shirt
Basics Men Grey Slim Fit Solid Casual Shirt
Indian Terrain Men Blue Slim Fit Solid Casual Shirt


In [None]:
query = 'winter wear'

# Top 5 (position in data_v1, score) hits for the new query
uid = embeddings.similarity(query, data_v1)[0:5]

# Unpack each hit rather than naming the loop variable `id`, which would
# rebind the builtin id() for the rest of the kernel session.
for position, _score in uid:
  print(data_v1[position])

A pair of silver-toned and black geometric textured cufflinks
A pair of gold-toned and black geometric textured cufflinks
Brown self-design mid-rise trousers, button closure, and 4 pockets
Charcoal grey solid pullover sweater, has a round neck, long sleeves, and ribbed hem
Grey sweater, has a V-neck, sleeveless


The `explain` method below runs an embeddings query like `search` but also analyzes each token to determine term importance. Looking at the results, it appears that `cufflinks` is the most important term. Let's visualize it.

In [None]:
# Explain the top match for the current query: returns the result with a per-token importance score
embeddings.explain(query, limit=1)

[{'id': '308',
  'text': 'A pair of silver-toned and black geometric textured cufflinks',
  'score': 0.7100473642349243,
  'tokens': [('A', 0.011124789714813232),
   ('pair', 0.002017199993133545),
   ('of', 0.00806283950805664),
   ('silver-toned', 0.025922060012817383),
   ('and', 0.004504203796386719),
   ('black', -0.0033195018768310547),
   ('geometric', -0.004828095436096191),
   ('textured', -0.0001671314239501953),
   ('cufflinks', 0.04469132423400879)]}]

In [None]:
from IPython.display import HTML

def plot(query):
  """
  Build an HTML snippet that highlights the most important tokens of the top match.

  Runs `embeddings.explain(query, limit=1)` and wraps each token of the best
  result in a yellow background whose shade depends on the token's score.
  If no token reaches the lowest threshold but the result itself scored at
  least 0.05, the highest-scoring token(s) are highlighted instead.

  Args:
    query: query text passed to `embeddings.explain`

  Returns:
    HTML string: the query in bold, a line break, then the tokens, each
    followed by a single space. Only the bold query line is returned when
    explain yields no results.
  """

  # Local import keeps this cell self-contained
  import html

  # (minimum token score, background color), checked from strongest to weakest
  shades = ((0.1, "#fdd835"), (0.075, "#ffeb3b"), (0.05, "#ffee58"), (0.02, "#fff59d"))

  # Query and tokens are free text, so escape them before embedding in markup
  output = f"<b>{html.escape(str(query))}</b><br/>"

  results = embeddings.explain(query, limit=1)
  if not results:
    # Nothing matched: return just the header instead of raising IndexError
    return output

  result = results[0]

  spans = []
  for token, score in result["tokens"]:
    color = next((shade for threshold, shade in shades if score >= threshold), None)
    spans.append((token, score, color))

  # No token cleared a threshold: fall back to highlighting the top-scoring token(s).
  # The `spans` guard avoids calling max() on an empty token list.
  if spans and result["score"] >= 0.05 and not any(color for _, _, color in spans):
    mscore = max(score for _, score, _ in spans)
    spans = [(token, score, "#fff59d" if score == mscore else color) for token, score, color in spans]

  for token, _, color in spans:
    text = html.escape(str(token))
    if color:
      output += f"<span style='background-color: {color}'>{text}</span> "
    else:
      output += f"{text} "

  return output

# Render the highlighted top match for the current query inline
HTML(plot(query))