In [3]:
# import streamlit as st
import random
import time
import openai
import os
from dotenv import load_dotenv
import requests
import json

# Load variables from a local .env file into the process environment before any
# client is built. SHOPIFY_TOKEN is read from the environment in call_search_api;
# presumably the OpenAI API key is supplied the same way (confirm).
load_dotenv()
# Module-level OpenAI client shared by the chat-completion calls in this notebook.
client=openai.OpenAI()

# Tool (function-calling) schema passed to the chat-completions API.
# It advertises a single function, `call_search_api`, taking one required
# string argument `keywords`.
tools = [
    dict(
        type="function",
        function=dict(
            name="call_search_api",
            description="Gets upto top 3 search results of products based of search keywords only if user query indicates intent to search for specific products",
            parameters=dict(
                type="object",
                properties=dict(
                    keywords=dict(
                        type="string",
                        description="A list of search keywords/tags eg. Red T-shirt, coffee mug etc.",
                    ),
                ),
                required=["keywords"],
            ),
        ),
    ),
]

def call_search_api(keywords, first=3, timeout=10):
    """Search the Shopify storefront for products matching ``keywords``.

    Posts a GraphQL ``search`` query (restricted to ``PRODUCT`` results) to the
    store's Storefront API endpoint, authenticated with the token held in the
    ``SHOPIFY_TOKEN`` environment variable.

    Args:
        keywords: Search text; converted to ``str`` before being sent.
        first: Maximum number of results to request (default 3).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list of ``edges`` from the search result. Each edge is a dict whose
        ``node`` carries the product ``id`` and ``title``. The list is empty
        when nothing matched.

    Raises:
        requests.HTTPError: if the endpoint answers with a non-2xx status.
        requests.RequestException: on connection problems or timeout.
        RuntimeError: if the response reports GraphQL errors or has no search data.
    """
    endpoint = 'https://quickstart-31717217.myshopify.com/api/2024-01/graphql.json'
    headers = {
        'Content-Type': 'application/json',
        'X-Shopify-Storefront-Access-Token': os.getenv('SHOPIFY_TOKEN')
    }
    payload = {
        'query': 'query searchProducts($query: String!, $first: Int) { search(query: $query, first: $first, types: PRODUCT) { edges { node { ... on Product { id title } } } } }',
        'variables': {
            "query": str(keywords),
            "first": first
        }
    }

    # `json=` lets requests serialise the payload; `timeout` prevents the
    # notebook from hanging forever on a stalled connection.
    response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()

    body = response.json()
    # GraphQL reports failures inside a 200 response; surface them explicitly
    # instead of failing later with an opaque KeyError/TypeError.
    if body.get('errors'):
        raise RuntimeError(f"Shopify search failed: {body['errors']}")
    search = (body.get('data') or {}).get('search')
    if search is None:
        raise RuntimeError("Shopify search response contained no 'search' data")
    return search.get('edges', [])


# One chat turn of the sales agent (the reply is returned whole, not streamed)
def response_generator(latest_user_message, previous_agent_response=None, messages=None):
    """Produce the sales agent's reply to the latest user message.

    The model may answer directly, or ask for a product search through the
    ``call_search_api`` tool. In the latter case the search is executed, the
    product titles are added to the conversation as a system message, and the
    model is asked again for the final reply.

    Args:
        latest_user_message: Text of the customer's newest message.
        previous_agent_response: The reply returned for the previous turn. It is
            appended to ``messages`` as the assistant message before the new
            user message (replies are not appended by this function itself).
            Ignored when ``messages`` is None.
        messages: Conversation history from the previous call, or None to start
            a new conversation. The list is extended in place.

    Returns:
        A JSON string with two keys: ``response`` (the reply text) and
        ``messages`` (the updated conversation history).
    """
    system_prompt = """You are a sales agent on an ecommerce platform, your job is to reply to customer queries just as a real life sales agent would. You will be given relevant info about the products and policies if and when required to be used to answer a query appropriately"""

    if messages is None:
        messages = [{"role": "system", "content": system_prompt},
                    {"role": "user", "content": latest_user_message}]
    else:
        # Skip the assistant entry when there is no previous reply to record,
        # rather than sending a message whose content is None.
        if previous_agent_response is not None:
            messages.append({"role": "assistant", "content": previous_agent_response})
        messages.append({"role": "user", "content": latest_user_message})

    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=messages,
        tools=tools,
        tool_choice="auto",
        max_tokens=4096
    )
    first_message = response.choices[0].message

    if first_message.content is not None:
        reply = first_message.content
    else:
        # No text content: look for a product-search tool call.
        search_response = []
        tool_calls = first_message.tool_calls or []
        search_call = next(
            (call for call in tool_calls if call.function.name == "call_search_api"),
            None,
        )
        if search_call is not None:
            try:
                arguments = json.loads(search_call.function.arguments)
            except (TypeError, ValueError):
                # Malformed tool arguments: treat as "nothing to search for".
                arguments = {}
            keywords = arguments.get('keywords') if isinstance(arguments, dict) else None
            if keywords:
                search_response = call_search_api(keywords)

        if not search_response:
            reply = "Hey sorry, we don't have that item"
        else:
            product_info = "".join(
                f"product_{idx}: {edge['node']['title']}\n"
                for idx, edge in enumerate(search_response)
            )
            messages.append({"role": "system","content": f"Here are some products that surfaced from the customer query: \n{product_info} Try to recommend these to the customer."})
            # The follow-up call deliberately offers no tools, so the model has
            # to answer in text instead of requesting another search (which
            # would leave the reply as None).
            followup = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=messages,
                max_tokens=4096
            )
            reply = followup.choices[0].message.content

    return_value = {'response': reply, 'messages': messages}
    return json.dumps(return_value)

In [5]:
user_query = "Hi, I need some help"
# First turn: only the user message is needed. The second positional parameter
# of response_generator is `previous_agent_response`, so `tools` must not be
# passed here; the function reads the module-level `tools` itself.
ans = response_generator(user_query)
print(ans)

{"response": "Of course! What do you need help with today?", "messages": [{"role": "system", "content": "You are a sales agent on an ecommerce platform, your job is to reply to customer queries just as a real life sales agent would. You will be given relevant info about the products and policies if and when required to be used to answer a query appropriately"}, {"role": "user", "content": "Hi, I need some help"}]}


In [6]:
# Check the return type: response_generator ends with json.dumps(...), so `ans`
# is a JSON string that must be decoded before its fields can be used.
print(type(ans))

<class 'str'>


In [12]:
user_query = "I need a snowboard"
# response_generator returns a JSON string, so it is decoded here instead of
# being tuple-unpacked. The state for this second turn is seeded from the first
# turn's result (`ans`), which also keeps the cell safe to re-run.
first_turn = json.loads(ans)
turn = json.loads(response_generator(user_query, first_turn['response'], first_turn['messages']))
previous_agent_response, messages = turn['response'], turn['messages']
print(previous_agent_response)

Great choice for some winter fun! Here are a few snowboards we have available:

1. **The Complete Snowboard** - Perfect for both beginners and experienced riders.
2. **The Hidden Snowboard** - Ideal if you're looking for something with a bit more performance and agility.
3. **Green Snowboard** - A fantastic eco-friendly option for those conscious of the environment.

Each of these boards offers unique features and benefits. Let me know if you need more details on any of these options!


In [14]:
user_query = "Do you have t-shits too for snowboarding?"
# Decode the JSON string returned by response_generator and carry the
# conversation state forward to the next turn.
turn = json.loads(response_generator(user_query, previous_agent_response, messages))
previous_agent_response, messages = turn['response'], turn['messages']
print(previous_agent_response)

Hey sorry, we don't have that item


In [15]:
user_query = "Oh nevermind, Thanks!"
# Decode the JSON string returned by response_generator and carry the
# conversation state forward to the next turn.
turn = json.loads(response_generator(user_query, previous_agent_response, messages))
previous_agent_response, messages = turn['response'], turn['messages']
print(previous_agent_response)

You're welcome! If you have any other questions or need further assistance in the future, feel free to reach out. Enjoy your snowboarding adventure!


In [None]:
!pip3 install qdrant-client

In [1]:
import numpy as np
import faiss
import random
import time
import openai
import os
from dotenv import load_dotenv
import requests
import json

# Load variables from .env and build the OpenAI client; `client` is the global
# used by create_response and check_similar_queries.
load_dotenv()
client=openai.OpenAI()

# Sample stored query/response pair. The names mirror the parameters of
# create_response; presumably intended as its arguments, though no visible cell
# passes them (confirm).
query = 'Where is my order?'
response = 'Your order is on the way'
expiry_date = 'NA'

# response_emb = client.embeddings.create(
#     input=query,
#     model="text-embedding-3-large"
# )

# embedding = response_emb.data[0].embedding

# Generate some sample data (replace this with your own vectors)



In [None]:


def get_score(row, grouped_order_df, query_emb):
    """Average similarity between a catalog embedding and its queries' embeddings.

    Args:
        row: Mapping with ``catalog_id`` and ``search_catalog_embedding``.
        grouped_order_df: Table with a ``catalog_id`` column and a ``query``
            column holding, per catalog id, an iterable of query strings.
        query_emb: Table with parallel ``query`` and ``search_query_embedding``
            columns.

    Returns:
        The mean score over the queries that have an embedding, or -1 when the
        catalog id is not present in ``grouped_order_df`` or none of its
        queries has an embedding.
    """
    catalog_id = row['catalog_id']
    emb1 = row['search_catalog_embedding']

    # Precompute norm of emb1
    norm_emb1 = np.linalg.norm(emb1)

    # Locate the catalog_id in grouped_order_df; an unknown id has no queries to score.
    matches = grouped_order_df[grouped_order_df['catalog_id'] == catalog_id].index
    if len(matches) == 0:
        return -1
    idx = matches[0]

    # Map each query string to its position in query_emb (last occurrence wins).
    query_indices = {query: pos for pos, query in enumerate(query_emb['query'])}

    score_list = []
    for query in grouped_order_df['query'][idx]:
        # Only a missing embedding is skipped; any other failure (e.g. a
        # malformed vector) now propagates instead of being silently swallowed.
        try:
            query_vec = query_emb['search_query_embedding'][query_indices[query]]
        except (KeyError, IndexError):
            continue
        # NOTE(review): cosine_similarity already divides by both norms, so the
        # extra division by norm_emb1 looks like a leftover - confirm intent.
        score_list.append(cosine_similarity(query_vec, emb1) / norm_emb1)

    return sum(score_list) / len(score_list) if score_list else -1

In [92]:
import json
import faiss
import numpy as np

def cosine_similarity(vec1, vec2):
    """Cosine similarity between two vectors.

    Args:
        vec1: Array-like vector (lists are accepted).
        vec2: Array-like vector. A 2-D row such as shape ``(1, d)`` is also
            accepted; the result then has shape ``(1,)``.

    Returns:
        ``dot(vec1, vec2.T) / (||vec1|| * ||vec2||)``. When either input has
        zero norm the similarity is undefined and 0 is returned instead of NaN.
    """
    # asarray lets plain Python lists (e.g. raw API embeddings) be used directly.
    vec1 = np.asarray(vec1)
    vec2 = np.asarray(vec2)

    dot_product = np.dot(vec1, vec2.T)  # Transpose vec2 to match the shape of vec1
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # Same shape as the normal result; `[()]` turns a 0-d array into a scalar.
        return np.zeros(np.shape(dot_product))[()]
    return dot_product / denominator


def create_response(query, response, expiry_date='NA'):
    """Store a query/response pair, with the query's embedding, in qr_dictionary.json.

    Args:
        query: The customer query to remember.
        response: The answer to give for that query.
        expiry_date: Value stored alongside the entry (default ``'NA'``).

    The entry is appended to the ``data`` list of the JSON file, which is then
    rewritten. A missing file is treated as an empty store.
    """
    store_path = 'qr_dictionary.json'

    # `with` closes the handle; the original left the read handle open.
    try:
        with open(store_path) as f:
            data = json.load(f)
    except FileNotFoundError:
        data = {'data': []}

    response_emb = client.embeddings.create(
        input=query,
        model="text-embedding-3-large"
    )
    embedding = response_emb.data[0].embedding

    data.setdefault('data', []).append(
        {'query': query, 'response': response, 'embedding': embedding, 'expiry_date': expiry_date}
    )
    with open(store_path, "w") as f:
        json.dump(data, f)

def check_similar_queries(new_query, threshold=0.6, top_k=3):
    """Look for a stored answer whose query has the same intent as ``new_query``.

    The stored queries in qr_dictionary.json are ranked by cosine similarity of
    their embeddings to the embedding of ``new_query``. For up to ``top_k``
    candidates scoring above ``threshold`` (best first), the chat model is
    asked whether the stored response may be reused.

    Args:
        new_query: The incoming customer query.
        threshold: Minimum cosine similarity for a stored query to be considered.
        top_k: Maximum number of candidates submitted to the chat model.

    Returns:
        ``(True, response)`` for the first candidate the model approves,
        otherwise ``(False, '')``.
    """
    # A missing or empty store simply means there is nothing to reuse.
    try:
        with open('qr_dictionary.json') as f:
            data = json.load(f)
    except FileNotFoundError:
        return False, ''

    entries = data['data']
    if not entries:
        # Avoid a pointless embedding API call when nothing is stored.
        return False, ''

    vector_data = np.array([np.array(entry['embedding']) for entry in entries])

    response_emb = client.embeddings.create(
        input = new_query,
        model="text-embedding-3-large"
    )
    query_vector = np.array(response_emb.data[0].embedding)

    cosine_similarities = np.array(
        [cosine_similarity(stored_vector, query_vector) for stored_vector in vector_data]
    )
    indices = np.where(cosine_similarities > threshold)[0]
    top_indices = sorted(indices, key=lambda i: cosine_similarities[i], reverse=True)[:top_k]

    for idx in top_indices:
        entry = entries[idx]
        response = entry['response']
        query = entry['query']
        # NOTE(review): entry['expiry_date'] is stored but never checked here.
        system_prompt = f"""You are a sales agent on an ecommerce platform, the shop owner gave the instruction that if a user asks - {query}, it is to be responded with - {response}. Now a new user has asked the query - {new_query}. Is it alright to respond to the new user's query the same way? Respond 'YES' or 'NO'. (Your decision should be based on whether both the queries the same intent as one another)"""

        messages = [{"role": "system","content": system_prompt}]
        response_gpt = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=messages,
            max_tokens=4096
        )
        # Normalise the verdict so answers such as "Yes" or "YES." are accepted too.
        decision = (response_gpt.choices[0].message.content or '').strip().upper()
        if decision.startswith('YES'):
            return True, response
    return False, ''
        


# Create a flat index
# index = faiss.IndexFlatIP(vector_data[0].shape[-1])

# # Or create an index with more advanced properties
# # nlist is a trade-off between speed and accuracy
# nlist = 100
# quantizer = faiss.IndexFlatIP(vector_data[0].shape[-1])
# index = faiss.IndexIVFFlat(quantizer, vector_data[0].shape[-1], nlist, faiss.METRIC_INNER_PRODUCT)

# faiss.write_index(index, 'index.faiss')
# np.save('metadata.npy', vector_metadata)


In [93]:
# Try a misspelled variant ("me" instead of "my") of the stored sample query.
# NOTE(review): the recorded `[0]` output line is not produced by any print in
# the visible check_similar_queries definition - presumably from an earlier
# version of the function.
check_similar_queries('Where is me order?')

[0]


(True, 'Your order is on the way')

In [76]:
import numpy as np

# Scratch cell: cosine similarity of query_vector against every row of vector_data.
# NOTE(review): `vector_data` and `query_vector` are not assigned at notebook
# level in any visible cell (they are locals of check_similar_queries), so this
# cell relies on leftover kernel state.


# Start with an empty list to collect the cosine similarity scores
cosine_similarities = []

# Compute cosine similarity between query_vector and each row in vector_data
for i in range(vector_data.shape[0]):
    cosine_sim = cosine_similarity(vector_data[i], query_vector)
    # The result is indexed with [0]; presumably query_vector was 2-D here - confirm.
    cosine_similarities.append(cosine_sim[0])

# Convert the list to a numpy array
cosine_similarities = np.array(cosine_similarities)

print(cosine_similarities)


[0.69877035 0.44789807]


In [71]:
import numpy as np

# Scratch cell: same selection logic as inside check_similar_queries.
# Find indices where value > 0.6
indices = np.where(np.array(cosine_similarities) > 0.6)[0]

# Sort the indices by their similarity value, highest first
indices_sorted = sorted(indices, key=lambda x: cosine_similarities[x], reverse=True)

# Get up to top 3 indices
top_indices = indices_sorted[:3]

print(top_indices)


[0]


In [77]:
# Display the indices selected by the thresholding cell.
top_indices

[0]

In [2]:
# NOTE(review): `product_df` is not created by any visible cell, so this depends
# on leftover kernel state. It also renders the whole frame; consider
# product_df.head() once the loading cell is restored.
product_df

Unnamed: 0,featuredImage,handle,id,isGiftCard,productType,seo,tags,title,totalInventory,vendor,variant_info
0,{'id': 'gid://shopify/ProductImage/37274220560...,the-videographer-snowboard,gid://shopify/Product/7723420025013,0.0,,"{'title': None, 'description': None}",[],The Videographer Snowboard,50.0,Quickstart (31717217),[{'id': 'gid://shopify/ProductVariant/43857055...
1,{'id': 'gid://shopify/ProductImage/37274220167...,the-complete-snowboard,gid://shopify/Product/7723420090549,0.0,snowboard,"{'title': 'Complete Snowboard', 'description':...","[Premium, Snow, Snowboard, Sport, Winter]",The Complete Snowboard,50.0,Snowboard Vendor,[{'id': 'gid://shopify/ProductVariant/43857056...
2,{'id': 'gid://shopify/ProductImage/37274220200...,the-collection-snowboard-hydrogen,gid://shopify/Product/7723420123317,0.0,,"{'title': None, 'description': None}","[Accessory, Sport, Winter]",The Collection Snowboard: Hydrogen,50.0,Hydrogen Vendor,[{'id': 'gid://shopify/ProductVariant/43857056...
3,{'id': 'gid://shopify/ProductImage/37274220232...,the-hidden-snowboard,gid://shopify/Product/7723420156085,0.0,,"{'title': 'Hidden Snowboard', 'description': '...","[Premium, Snow, Snowboard, Sport, Winter]",The Hidden Snowboard,50.0,Snowboard Vendor,[{'id': 'gid://shopify/ProductVariant/43857056...
4,{'id': 'gid://shopify/ProductImage/37274220265...,the-archived-snowboard,gid://shopify/Product/7723420188853,0.0,,"{'title': None, 'description': None}","[Archived, Premium, Snow, Snowboard, Sport, Wi...",The Archived Snowboard,50.0,Snowboard Vendor,[{'id': 'gid://shopify/ProductVariant/43857056...
5,{'id': 'gid://shopify/ProductImage/37274220298...,gift-card,gid://shopify/Product/7723420254389,1.0,,"{'title': None, 'description': None}",[],Gift Card,0.0,Snowboard Vendor,[{'id': 'gid://shopify/ProductVariant/43857056...
6,{'id': 'gid://shopify/ProductImage/37274220331...,the-draft-snowboard,gid://shopify/Product/7723420287157,0.0,,"{'title': None, 'description': None}",[],The Draft Snowboard,20.0,Snowboard Vendor,[{'id': 'gid://shopify/ProductVariant/43857056...
7,{'id': 'gid://shopify/ProductImage/37274220363...,the-out-of-stock-snowboard,gid://shopify/Product/7723420352693,0.0,,"{'title': None, 'description': None}","[Accessory, Sport, Winter]",The Out of Stock Snowboard,0.0,Quickstart (31717217),[]
8,{'id': 'gid://shopify/ProductImage/37274220429...,the-inventory-not-tracked-snowboard,gid://shopify/Product/7723420385461,0.0,,"{'title': None, 'description': None}","[Accessory, Sport, Winter]",The Inventory Not Tracked Snowboard,0.0,Quickstart (31717217),[{'id': 'gid://shopify/ProductVariant/43857056...
9,{'id': 'gid://shopify/ProductImage/37274220462...,the-multi-managed-snowboard,gid://shopify/Product/7723420418229,0.0,,"{'title': None, 'description': None}","[Premium, Snow, Snowboard, Sport, Winter]",The Multi-managed Snowboard,100.0,Multi-managed Vendor,[{'id': 'gid://shopify/ProductVariant/43857056...


In [5]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Hugging Face model id of the checkpoint loaded below.
model_id = 'naver/splade-cocondenser-ensembledistil'

# Tokenizer and masked-LM model for that checkpoint. Later cells turn the
# model's logits into one weight per vocabulary entry.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Tokenize one product title into PyTorch tensors and run it through the model.
tokens = tokenizer("The Videographer Snowboard", return_tensors='pt')
# No torch.no_grad() here, so the logits carry a grad_fn (visible in the output below).
output = model(**tokens)
output

MaskedLMOutput(loss=None, logits=tensor([[[ -5.9260,  -8.0902,  -7.6460,  ...,  -7.5764,  -7.5155,  -4.7057],
         [ -6.3694,  -8.2492,  -7.8823,  ...,  -7.7111,  -7.6492,  -5.1211],
         [-21.0173, -17.1963, -16.6255,  ..., -15.8963, -16.2647, -13.6399],
         ...,
         [-21.7755, -16.1834, -16.7062,  ..., -16.0607, -15.0108, -15.9868],
         [-12.5284, -11.5582, -11.0126,  ..., -11.7034, -11.1225,  -9.3539],
         [-19.5184, -15.8868, -15.3758,  ..., -15.6611, -14.7290, -16.9873]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [7]:
# Recorded shape is [1, 7, 30522]; presumably (batch, tokens, vocabulary size) - confirm.
output.logits.shape

torch.Size([1, 7, 30522])

In [9]:
import torch

# Collapse the per-token logits into one weight per vocabulary entry:
# log(1 + relu(logit)), zeroed where the attention mask is 0, then the maximum
# over the token axis. squeeze(0) drops only the batch axis, and detach()
# releases the autograd graph (the model was run without no_grad).
vec = torch.max(
    torch.log(
        1 + torch.relu(output.logits)
    ) * tokens.attention_mask.unsqueeze(-1),
dim=1)[0].squeeze(0).detach()

# Positions of the non-zero weights. squeeze(-1) rather than a bare squeeze()
# keeps a 1-D result even when exactly one entry is non-zero, so `cols` is
# always a list.
cols = vec.nonzero().squeeze(-1).cpu().tolist()
print(len(cols))

# extract the non-zero values
weights = vec[cols].cpu().tolist()
# use to create a dictionary of token ID to weight
sparse_dict = dict(zip(cols, weights))
sparse_dict

66


{1996: 0.5587583184242249,
 2002: 0.27983155846595764,
 2016: 0.1475370228290558,
 2136: 0.26124468445777893,
 2143: 0.6186653971672058,
 2208: 0.2733553946018219,
 2472: 0.17462411522865295,
 2604: 1.5755988359451294,
 2622: 0.040493570268154144,
 2640: 0.05562825873494148,
 2678: 2.35176157951355,
 2688: 0.030973238870501518,
 2694: 0.31905463337898254,
 2782: 0.09271848946809769,
 2944: 0.3809465765953064,
 2996: 0.01710648275911808,
 3063: 0.47387900948524475,
 3149: 0.41420480608940125,
 3242: 0.14893881976604462,
 3364: 0.3332504332065582,
 3410: 0.07996439188718796,
 3467: 1.2073038816452026,
 3509: 0.02473003789782524,
 3607: 0.10855190455913544,
 3626: 0.20963335037231445,
 3746: 0.009383267723023891,
 3910: 0.6331915259361267,
 3941: 0.2592417597770691,
 4007: 0.018952472135424614,
 4037: 0.3626019358634949,
 4368: 0.2452624887228012,
 4586: 2.4083690643310547,
 4950: 0.6255994439125061,
 4988: 0.9531188607215881,
 5855: 0.7590698003768921,
 6028: 0.0799088180065155,
 6085: 0

In [11]:
# Reverse the tokenizer vocabulary so a token id can be looked up as text.
idx2token = dict(
    (token_id, token) for token, token_id in tokenizer.get_vocab().items()
)

# Pair every non-zero vocabulary id with its weight (rounded to 2 decimals) and
# order the pairs from heaviest to lightest, so the most relevant tokens come first.
sparse_dict_tokens = dict(
    sorted(
        dict(
            (idx2token[token_id], round(weight, 2))
            for token_id, weight in zip(cols, weights)
        ).items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
sparse_dict_tokens

{'snow': 2.41,
 'video': 2.35,
 '##grapher': 1.93,
 '##board': 1.79,
 'board': 1.58,
 'photographer': 1.25,
 'winter': 1.21,
 'videos': 1.16,
 'journalist': 0.95,
 '##logist': 0.84,
 '##graphy': 0.82,
 'photography': 0.76,
 'founder': 0.63,
 'camera': 0.63,
 'film': 0.62,
 'the': 0.56,
 'artist': 0.47,
 'booth': 0.46,
 'robot': 0.42,
 'channel': 0.41,
 'model': 0.38,
 'website': 0.36,
 'photo': 0.36,
 'filming': 0.36,
 'presenter': 0.34,
 'actor': 0.33,
 'tv': 0.32,
 'he': 0.28,
 'game': 0.27,
 'recreation': 0.27,
 'team': 0.26,
 'equipment': 0.26,
 'sport': 0.25,
 'specialist': 0.25,
 'ski': 0.23,
 'logo': 0.22,
 'creator': 0.22,
 'crew': 0.21,
 'director': 0.17,
 'she': 0.15,
 'chair': 0.15,
 'fake': 0.15,
 'zoom': 0.15,
 'dj': 0.13,
 'invented': 0.13,
 'participant': 0.13,
 'russia': 0.11,
 'festival': 0.09,
 'champion': 0.08,
 'technique': 0.08,
 'blade': 0.07,
 'design': 0.06,
 'project': 0.04,
 'turner': 0.04,
 'bug': 0.04,
 'stunt': 0.04,
 'museum': 0.03,
 'youtube': 0.03,
 'foo

In [35]:
texts = [
   "Do you have good blue snow boards?",
   "maroon snowboard",
   "violet snowboard",
   "blue skateboard"
]
tokens = tokenizer(
    texts, return_tensors='pt',
    padding=True, truncation=True
)
# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    output = model(**tokens)
# Aggregate the token-level vectors into one sparse vector per text. There is no
# squeeze() here, so `vecs` stays 2-D (n_texts, vocab) even for a single text,
# which is what the similarity cell expects.
vecs = torch.max(
    torch.log(1 + torch.relu(output.logits)) * tokens.attention_mask.unsqueeze(-1), dim=1
)[0].detach().cpu().numpy()
vecs.shape

(4, 30522)

In [36]:
import numpy as np

# Pairwise cosine similarity between the rows of `vecs`.
# The norms of all rows are computed once instead of on every iteration.
row_norms = np.linalg.norm(vecs, axis=1)
sim = np.zeros((vecs.shape[0], vecs.shape[0]))

# The loop variable is named `row` so it does not overwrite the notebook-level
# `vec` tensor created in the single-text cell.
for i, row in enumerate(vecs):
    sim[i,:] = np.dot(row, vecs.T) / (
        np.linalg.norm(row) * row_norms
    )

In [37]:
# Display the similarity matrix; rows and columns follow the order of `texts`.
sim

array([[1.        , 0.42396614, 0.44735083, 0.53317469],
       [0.42396614, 1.        , 0.5332098 , 0.28310546],
       [0.44735083, 0.53320974, 1.00000012, 0.3148219 ],
       [0.53317469, 0.28310543, 0.3148219 , 0.99999988]])