In [None]:
#@title Install Dependencies { form-width: "500%", display-mode: "form" }

#Import required libraries
!pip install sentence-transformers >> /dev/null
import os
import numpy as np
import pandas as pd
from datetime import datetime
from google.cloud import bigquery
from sentence_transformers import SentenceTransformer
from pprint import pprint

# Point the Google client libraries at the service-account key file so the
# BigQuery client created later can authenticate. The path is relative, so it
# is resolved against the current working directory.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mercor-387315-804b5f512963.json'

# Initialize the SentenceTransformer (BERT) model used to produce the text
# embeddings for the search string.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Known item-category keywords, scanned in this order by get_category().
# Order matters: the first keyword found in a description wins, so an earlier
# short keyword (e.g. 'shirt') takes precedence over a later longer one
# (e.g. 't-shirt').
items = ['tops', 'kurta-sets', 'tshirts', 'shirt', 'trousers', 'jackets',
         'jeans', 'sarees', 'shorts', 'sweatshirts', 'flats', 'dresses',
         'sports-shoes', 'belts', 'trouser', 'flip-flops', 'handbags',
         'kurtas', 'track pant', 'suits', 'casual-shoes', 'shirts',
         'track-pants', 'palazzos', 'tracksuits', 'formal-shoes', 'jean',
         'blazer', 'caps', 'top', 'suit', 'nightdress', 'bra', 'sweaters',
         'night-suits', 'wallets', 'heels', 'lounge-pants', 'tights',
         'tight', 'jegging', 'lounge-tshirts', 'short', 'kurtis',
         'boxers', 'leggings', 'churidar', 'skirts', 'ties', 'skirt',
         'coats', 't-shirt', 'sandals', 'swimwear', 'capri', 'blazers',
         'sunglasses', 'capris', 'saree-blouse', 'coat', 'cargo',
         'shapewear', 'pyjamas', 'harem-pants', 'sleepsuit', 'waistcoat',
         'rain-suit', 'swim-tops', 'salwar', 'hanger', 'thermal-set',
         'harem pant', 'rain-jacket', 'swim-bottoms', 'shawl', 'Blazer',
         'Bottomwear', 'Capri', 'Cargos', 'Coat', 'Harem Pants',
         'Jeans', 'Jeggings', 'T Shirt', 'TShirt', 'T-Shirt',
         'Shirt', 'Shorts', 'Skirts', 'Suit', 'Three-Fourth',
         'Tights', 'Top', 'Track Pant', 'Trouser', 'Trousers',
         'Waistcoat', 'Pants']


def get_category(desc):
    """Return the first known item category mentioned in ``desc``.

    Each keyword in ``items`` is tried in list order, both as written and
    lower-cased, using plain substring matching against ``desc``.

    Args:
        desc: Free-text product description or search string.

    Returns:
        The lower-cased keyword of the first match, or ``None`` when no
        keyword occurs in ``desc`` or ``desc`` is not a string.
    """
    # Missing descriptions (None, NaN, ...) cannot match anything; report
    # "no category" instead of raising TypeError on the ``in`` test.
    if not isinstance(desc, str):
        return None

    for keyword in items:
        # A plural check (keyword + 's') is unnecessary: any text containing
        # the plural also contains the keyword itself.
        if keyword in desc or keyword.lower() in desc:
            return keyword.lower()
    return None

#Function for fetching gender from the description
def get_gender(desc):
    """Infer the target gender from a product description.

    Matching is a case-insensitive substring test. The female keywords are
    checked first because 'women' itself contains 'men'.

    Args:
        desc: Free-text product description or search string.

    Returns:
        'Women' if the text mentions 'women' or 'girl', 'Men' if it mentions
        'men' or 'boy', otherwise ``None`` (also for non-string input).
    """
    # Missing descriptions (None, NaN, ...) carry no gender information.
    if not isinstance(desc, str):
        return None

    # Lower-case once so that 'Women', 'WOMEN', 'Boys', ... are recognised;
    # without this, 'Women' would fall through and match 'men'.
    lowered = desc.lower()
    if 'women' in lowered or 'girl' in lowered:
        return 'Women'
    if 'men' in lowered or 'boy' in lowered:
        return 'Men'
    return None

In [75]:
#@title Fetch Results { form-width: "50%", display-mode: "form" }

text = 'men joggers' #@param { type: 'string' }
number_of_result = 10 #@param { type: 'integer' }

# Coerce the form value once so that only an integer can reach the SQL text.
result_limit = int(number_of_result)

# Extract the text embedding from the SentenceTransformer model. It is stored
# as the string form of a Python list so the JavaScript UDF can JSON.parse it.
text_enc = str(model.encode(text).tolist())
category = get_category(text)
gender = get_gender(text)
# NOTE(review): `time` is not read anywhere in this cell; kept in case other
# cells rely on it.
time = str(datetime.now())

# Configuration for pushing the search row to BigQuery. The CSV is written and
# read through the same `file_name` so the two paths cannot drift apart
# (previously it was written to 'item.csv' but read from '/content/item.csv').
table_id = 'mercor-387315.mercor.clothing_description_search'
file_name = 'item.csv'

# Convert the search row into a dataframe and dump it to a file for uploading
# into BigQuery.
search_df = pd.DataFrame(
    [[text, text_enc, category, gender]],
    columns=['text', 'embeddings', 'category', 'gender'],
)
search_df.to_csv(file_name, index=False)

client = bigquery.Client()
# WRITE_TRUNCATE replaces the table contents, so the search table holds only
# the current query row after the load.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

with open(file_name, 'rb') as sf:
    job = client.load_table_from_file(sf, table_id, job_config=job_config)
# Block until the load job has finished so the query below sees the new row.
job.result()

# Body of the JavaScript UDF: parses both embeddings from their string form
# and returns their cosine similarity.
javascript_udf = r"""
    function cosine_similariy_js(vector1, vector2){
      vector1 = JSON.parse(vector1)
      vector2 = JSON.parse(vector2)

      var dotproduct = 0;
      var mA=0;
      var mB=0;

      for(i = 0; i < vector1.length; i++){
          dotproduct += (vector1[i] * vector2[i]);
          mA += (vector1[i] * vector1[i]);
          mB += (vector2[i] * vector2[i]);
      }

      mA = Math.sqrt(mA);
      mB = Math.sqrt(mB);

      var similarity = (dotproduct)/((mA)*(mB))
      return similarity;
    }

    return cosine_similariy_js(input_emb, search_vectors)
  """

# BigQuery SQL scoring every dataset embedding against the input embedding.
# Triple single quotes are used for the Python string so the SQL can carry the
# triple double quotes that delimit the UDF body.
# NOTE(review): `input.string_field_1` is presumably the auto-generated name of
# the second CSV column (the embedding) - confirm against the table schema.
query = f'''CREATE TEMP FUNCTION
  cosine_similarity(input_emb STRING, search_vectors STRING)
  RETURNS FLOAT64
  LANGUAGE js
  AS
"""{javascript_udf}""";
SELECT
    cosine_similarity(input.string_field_1, dataset.embeddings) as similarity,
    dataset.category,
    dataset.gender,
    dataset.product_url as product_link
  FROM
    `mercor.clothing_description_embeddings_list` as dataset,
    `mercor.clothing_description_search` as input

  ORDER BY similarity DESC
  LIMIT {result_limit}
'''

# Retrieve the result as a dataframe and output the product links as a
# rank -> URL mapping.
# NOTE(review): duplicates are dropped after LIMIT has been applied in SQL, so
# fewer than `number_of_result` links can be returned when the top rows share
# a product URL.
results = client.query(query).to_dataframe()
# Reset the index after de-duplication so the printed keys are contiguous
# ranks (0, 1, 2, ...) rather than the surviving original row numbers.
df = results.drop_duplicates(subset='product_link').reset_index(drop=True)
pprint(dict(df.head(result_limit)['product_link']))

{0: 'https://www.myntra.com/track-pants/herenow/herenow-men-maroon-joggers/5395427/buy',
 1: 'https://www.myntra.com/track-pants/herenow/herenow-men-maroon-jogger-/5395446/buy',
 2: 'https://www.myntra.com/trousers/highlander/highlander-men-taupe-joggers-trousers/16519874/buy',
 3: 'https://www.myntra.com/track-pants/slazenger/slazenger-men-grey-solid-running-joggers/16115240/buy',
 4: 'https://www.myntra.com/trousers/people/people-men-grey-joggers-trousers/16361354/buy',
 5: 'https://www.myntra.com/track-pants/slazenger/slazenger-men-teal-green-solid-running-joggers/16115190/buy',
 6: 'https://www.myntra.com/track-pants/nautica/nautica-men-multicolour-camaflauge-printed-joggers/15339924/buy',
 7: 'https://www.myntra.com/track-pants/hm/hm-boys-pack-of-3-joggers/14959436/buy',
 8: 'https://www.myntra.com/track-pants/herenow/herenow-men-navy-joggers/5395412/buy',
 9: 'https://www.myntra.com/track-pants/slazenger/slazenger-men-off-white-solid-joggers/15800222/buy'}
