### Installing the required python libraries (e.g. openai)

In [13]:
import os
import re
import sys
import json
import subprocess
import importlib.metadata
from time import sleep
from azure.identity import DefaultAzureCredential

# Packages this notebook needs; each one is pip-installed below when
# importlib.metadata cannot find an installed distribution with that name.
# NOTE(review): 'openai[embeddings]' is a requirement specifier with an extra,
# not a distribution name, so the lookup presumably never succeeds and pip is
# invoked for it on every run (which is what pulls in the extra's
# dependencies) — confirm this is intended.
required = ['openai', 'num2words', 'openai[embeddings]', 'transformers']
for pkg in required:
    print(f'Checking for {pkg}...')
    try:
        # Raises PackageNotFoundError when the distribution is not installed.
        importlib.metadata.version(pkg)
    except importlib.metadata.PackageNotFoundError:
        # Use the kernel's own interpreter so the package lands in the
        # environment this notebook is actually running in.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--quiet', pkg])

# NOTE(review): az_credential is not referenced anywhere else in the visible
# part of the notebook (requests below authenticate with API_KEY) — confirm
# whether it is still needed.
az_credential = DefaultAzureCredential()

Checking for openai...
Checking for num2words...
Checking for openai[embeddings]...
Checking for transformers...


### Importing the required python modules

In [45]:
import pandas as pd
import numpy as np
import openai
import requests
from num2words import num2words
from openai.embeddings_utils import get_embedding, cosine_similarity
from transformers import GPT2TokenizerFast

# ANSI escape sequences used to colour/style text printed to the console.
# Usage: f'{BLUE}some text{ENDC}' — always close with ENDC to reset styling.
BLUE = '\033[94m'       # bright blue foreground
CYAN = '\033[96m'       # bright cyan foreground
OKGREEN = '\033[92m'    # bright green foreground
WARNING = '\033[93m'    # bright yellow foreground
FAIL = '\033[91m'       # bright red foreground
ENDC = '\033[0m'        # reset all colours and attributes
BOLD = '\033[1m'        # bold text
UNDERLINE = '\033[4m'   # underlined text

### Configure your Azure Open AI Endpoint & API Key

You can find the Endpoint & Key in the **Keys and Endpoint** section of your Azure OpenAI resource, in the left navigation pane.

In [7]:
# Azure OpenAI connection settings.
# Set the AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY environment variables
# so that real credentials are never saved inside the notebook. The literals
# below are non-functional placeholders used only when the variables are unset.
RESOURCE_ENDPOINT = os.environ.get('AZURE_OPENAI_ENDPOINT', 'https://demo-1.openai.azure.com/')
API_KEY = os.environ.get('AZURE_OPENAI_API_KEY', 'example3xjqidkobjcod6fvnvexample')

# Point the openai client at the Azure resource instead of api.openai.com.
openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

Necessary functions which will be used later

- `normalize_text` - Function to normalize the input text to remove multiple spaces, additional punctuation, etc.

In [8]:
def normalize_text(s, sep_token = " \n "):
    """Clean up free text before tokenizing / embedding it.

    Steps, in order:
      1. collapse every run of whitespace (spaces, tabs, newlines) to one space
         and trim the ends;
      2. remove the literal sequence ". ," (full stop, space, comma);
      3. collapse ".." and ". ." to a single ".".

    Args:
        s: The input text.
        sep_token: Unused; kept so existing callers that pass it keep working.

    Returns:
        The normalized string.
    """
    # Collapse all whitespace runs; this also removes every newline.
    s = re.sub(r'\s+', ' ', s).strip()
    # The dot must be escaped: an unescaped '.' matches ANY character, which
    # silently deleted the character preceding every " ," in the text.
    s = re.sub(r"\. ,", "", s)
    s = s.replace("..", ".")
    s = s.replace(". .", ".")

    return s.strip()

### List your deployed Azure OpenAI models

In [25]:
# Ask the Azure OpenAI resource which model deployments exist.
# rstrip('/') prevents a double slash when the configured endpoint ends in '/',
# and the API version is taken from the client config so the two cannot drift.
url = f"{openai.api_base.rstrip('/')}/openai/deployments?api-version={openai.api_version}"
# A timeout keeps the cell from hanging forever on an unreachable endpoint.
r = requests.get(url, headers={"api-key": API_KEY}, timeout=30)
# Surface auth / endpoint problems as an HTTP error rather than a KeyError on 'data'.
r.raise_for_status()
deployment_data = r.json()['data']

# Print only deployments that finished provisioning successfully.
for deployment in deployment_data:
    if deployment['status'] == 'succeeded' and deployment['object'] == 'deployment':
        print(f'Model Name: {BLUE}{deployment["model"]}{ENDC} deployed with model Id: {OKGREEN}{BOLD}{deployment["id"]}{ENDC}')


Model Name: [94mtext-search-davinci-query-001[0m deployed with model Id: [92m[1mtest-davinci-search-1[0m
Model Name: [94mtext-search-davinci-doc-001[0m deployed with model Id: [92m[1mtest-davinci-search-doc-1[0m
Model Name: [94mtext-davinci-003[0m deployed with model Id: [92m[1mtest-davinci-003[0m
Model Name: [94mtext-davinci-002[0m deployed with model Id: [92m[1mtext-davinci-002[0m


Load the data as a pandas dataframe

In [26]:
# The file holds one JSON record per line (JSON Lines), hence lines=True.
df = pd.read_json('amazon_pqa_earbud_headphones.json', lines=True)
# Display five random rows as a quick look at the loaded data.
df.sample(n=5)

Unnamed: 0,question_id,question_text,asin,bullet_point1,bullet_point2,bullet_point3,bullet_point4,bullet_point5,product_description,brand_name,item_name,question_type,answer_aggregated,answers
288759,Tx15UASXFE58739,zagg,B00BG34UMY,2013 Model Ultra Clear HD Premium Quality Japa...,Bubble-Free and No Rainbow effects,Easy Installation Guaranteed,Lifetime Replacement Warranty,2 Screen Protectors Included,The iSmooth Apple iPad 5 Screen Protector Kit ...,,iSmooth Apple iPad Air (Apple iPad 5) and iPad...,WH,,[{'answer_text': 'Not Zagg but alot cheaper'}]
250219,TxW08WPWH3SI2G,are these good for music like adtr and asking ...,B011IH5ZHG,"Deep, clear sound enhanced by exclusive TriPor...","Proprietary Stay Hear tips, in three sizes, co...",Sweat and weather-resistant engineered and te...,Matching protective carrying case included,Headphone Type: Earbud Home Sport Headphones,,Bose,Bose SoundSport in-ear headphones - Charcoal,yes-no,neutral,[{'answer_text': 'My son uses them for EDM and...
264036,Tx11WCEFOKT5THS,Do these earbuds have a microphone?,B07SN9Q7JV,[CD-like Sound]: Demi wireless earbuds offer a...,[Industry Leading 8H Playtime]: 1 hour charge ...,[Seamless Setup]: Open the charging lid and De...,[IPX7 Waterproof and Rock-Solid Fit]: Demi is ...,"[Pioneering Features]: Change tracks, adjust v...",,,"GoNovate Wireless Earbuds, Demi Bluetooth 5.0 ...",yes-no,yes,[{'answer_text': 'Yes it does'}]
237807,Tx31YPBFPSPY8FM,Will there be a three button mic option for an...,B017LI9HCA,"Newly developed, outstanding dynamic 8mm drive...","Precision zinc alloy metal housing, casted by ...",Patented Active Flex sport ear hooks for best ...,Built-in 3-Button Remote Mic specially tuned f...,Tangle-free Diamondback cable is the perfect m...,,V-MODA,V-MODA Zn In-Ear Modern Audiophile Headphones ...,yes-no,neutral,[{'answer_text': 'That's very unlikely. They'r...
27543,Tx1LZ1QJXR58EUE,What size of the speaker?,B07D7TG6MK,DETACHABLE SPORT EARHOOK DESIGN-This sports he...,GOOD SOUND QUALITY FOR MUSIC-The sound of thes...,COMFOFRTABLE AND SECURE FIT: The uniquely desi...,SWEAT-RESISTANT&WATE PROOF: Never mind your he...,QUALITY WARRANTY & MONEY BACK GUARANTEE - MUCR...,,,Running Headphones Over Ear Sport Earphones wi...,WH,,[{'answer_text': 'The speakers's diam is 10mm ...


Create a new data frame only with the columns `asin`, `item_name`, and `product_description`.

In [27]:
# Keep all rows, but only the product id, title and description columns.
df_desc = df.loc[:, ['asin', 'item_name', 'product_description']]

Drop duplicate ASINs and drop rows with empty values

In [28]:
# One row per ASIN; empty strings are turned into NaN first so that dropna()
# also removes rows whose fields are blank rather than missing.
df_stage = (
    df_desc
    .drop_duplicates(subset='asin')
    .replace('', np.nan)
    .dropna()
)

Normalize the value of `product_description` column 

In [29]:
# Run normalize_text over every description, replacing the column in place.
df_stage['product_description'] = df_stage['product_description'].apply(normalize_text)

Reset the index of the dataframe

In [33]:
# Renumber the rows in place; drop=True discards the old index
# instead of inserting it as a new 'index' column.
df_stage.reset_index(inplace=True, drop=True)
# Display five random rows of the cleaned frame.
df_stage.sample(n=5)

Unnamed: 0,asin,item_name,product_description
10737,B07QXTZRXC,Stillbetter 3.5mm Jack Pink Wired Earbuds in-E...,Details Item: Wired In-ear Earbuds with Case C...
80,B06WLQB8JV,"Truly Wireless Earbuds, LightBiz Noise Cancell...","LightBiz Mini Bluetooth Earbud is Smallest, Po..."
1322,B07M8SZFF7,"Wireless Earbuds,WSCSR E18 Latest Bluetooth 5....","Color: Black when we say truly wireless "", we ..."
4750,B07H7NVB8M,Bluetooth Headphones Wireless Headphones Runni...,Bluetooth Headphones&Wireless HeadphonesThe bl...
5546,B0192H4U9I,Earsonics - S-EM9 in-ear Earphones,"Building on the S-EM6 success, Earsonics decid..."


Using the HuggingFace `GPT2TokenizerFast` tokenizer, count the tokens in each `product_description` and keep only the rows with fewer than 2000 tokens

In [34]:
# Load the pretrained GPT-2 tokenizer (downloaded on first use).
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
# Record how many tokens each description encodes to.
df_stage['n_tokens'] = df_stage['product_description'].apply(
    lambda text: len(tokenizer.encode(text))
)
# Keep only descriptions shorter than 2000 tokens, then show (rows, columns).
df_stage = df_stage[df_stage['n_tokens'] < 2000]
df_stage.shape

Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.57MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.13MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 2.79MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 349kB/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (1201 > 1024). Running this sequence through the model will result in indexing errors


(13475, 4)

Get the embeddings of the product descriptions and store them in a column named `davinci_search`.

In [21]:
# df_stage['davinci_search'] = df_stage['product_description'].apply(lambda x : get_embedding(x, engine = 'test-davinci-search-doc-1'))

If the generated embeddings are already available in a CSV file, load them from it. If not, store the generated embeddings as CSV.

In [98]:
import csv
from ast import literal_eval

# Cache of the computed embeddings, so the (slow, billed) embedding calls only
# have to be made once. The same path is used for both reading and writing.
EMBEDDINGS_CSV = 'search_embeddings_output.csv'

if os.path.exists(EMBEDDINGS_CSV):
    # Each embedding is stored as the text of a Python list; literal_eval
    # converts that text back into a list of floats while loading.
    df_stage = pd.read_csv(EMBEDDINGS_CSV, index_col=0, converters={'davinci_search': literal_eval})
else:
    # No cache yet: embed every description (one API call per row) and save
    # the result so later runs take the fast path above.
    print(f'{WARNING}{EMBEDDINGS_CSV} not found - generating embeddings, this may take a while...{ENDC}')
    df_stage['davinci_search'] = df_stage['product_description'].apply(
        lambda x: get_embedding(x, engine='test-davinci-search-doc-1')
    )
    df_stage.to_csv(EMBEDDINGS_CSV)

In [79]:
# Display five random rows, now including the embedding column.
df_stage.sample(n=5)

Unnamed: 0,asin,item_name,product_description,n_tokens,davinci_search
3698,B07DHG477F,ZOGO Bluetooth Headset V4.1 with Noise Reducti...,Technical Parameters Model: A10 Red Noise redu...,369,"[-0.01159856840968132, 0.0035404821392148733, ..."
7263,B0150265QA,WiNi-TECH Wireless Bluetooth Noise Cancelling ...,"Features: - Ergonomic earbud design, comfortab...",290,"[-0.01673363894224167, -0.001369908219203353, ..."
3115,B07H3TMJP9,"Wireless Earbuds,Fantime Bluetooth Wireless 5....",FANTIME is focusing on the design development ...,328,"[-0.015261711552739143, 0.0007672155043110251,..."
3864,B071G5JKYD,ADVANCED Evo X Hi-Fi Beryllium Driver Sports I...,A set of quick and transient beryllium dynamic...,286,"[-0.004853676538914442, -0.003414375940337777,..."
4436,B07QNJ6PZH,"KZ ZS10 Pro in Ear Monitor Earbuds Headphone, ...","KZ ZS10 Pro In Ear Monitor Earbuds Headphone, ...",239,"[-0.0046610222198069096, 0.002037414815276861,..."


Search for the products based on the prompt

In [82]:
# Rank product descriptions by embedding similarity to a free-text query.
def search_docs(df, user_query, top_n=3, to_print=True, engine='test-davinci-search-1'):
    """Return the `top_n` rows of `df` most similar to `user_query`.

    The query is embedded with `get_embedding` and compared against each row's
    precomputed `davinci_search` embedding using `cosine_similarity`.

    Args:
        df: DataFrame with a `davinci_search` column of embedding vectors.
            A `similarities` column is added to (or overwritten on) this
            frame as a side effect.
        user_query: Free-text search query.
        top_n: Number of best-matching rows to return.
        to_print: Currently unused; kept for backward compatibility.
        engine: Name of the deployment used to embed the query. Defaults to
            the deployment name this notebook was written against.

    Returns:
        The `top_n` rows of `df`, sorted by descending similarity.
    """
    query_embedding = get_embedding(user_query, engine=engine)
    df['similarities'] = df.davinci_search.apply(
        lambda doc_embedding: cosine_similarity(doc_embedding, query_embedding)
    )
    return df.sort_values('similarities', ascending=False).head(top_n)


res = search_docs(df_stage, "feel good ear phone with good sound quality", top_n=4)

In [94]:
# Print every hit as a coloured name line and description line, followed by
# an empty line to separate it from the next result.
for _, item in res.iterrows():
    print(
        f'{OKGREEN}Name:{ENDC} {BOLD}{item["item_name"]}{ENDC}',
        f'{CYAN}Description:{ENDC} {item["product_description"]}',
        '',
        sep='\n',
    )

[92mName:[0m [1mEarbuds, UROPHYLLA Earphones 4 Feet in-Ear Headphones with Microphone Heavy Bass Noise Cancelling Earphones Compatible with iPhone/iPod/iPad/Samsung/Android and More Audio Devices - White[0m
[96mDescription:[0m Excellent Sound Hearing As You Never Have Before. Stereo and clear sound, the basic demand of a pair of wonderful earphones. This headphone transfer the beautiful songs to your ears and make you enjoy it. That's what our UROPHYLLA earbuds provides. Most comfortable Three sizes of ear tips including S/M/L make sure the best fit for you. Built-in mic, 4 feet cable, enable you to enjoy hands-free calls with your family and friends conveniently. Widely Compatibility 3.5mm audio jack is able to compatible with almost all 3.5mm headphones port devices, such as most Smartphones, iPhones, iPads, iPods, MP3 player, Laptop, Tablets and so on. Specifications Frequency Response: 20~20KHz Speaker Impedance: 16 Sensiticity(at 1 KHz): 92 3dB Plug: 3.5mm Mic Size: 4.0mm Mi