In [21]:
import os
import pandas as pd
import glob, os
import tiktoken

from dotenv import load_dotenv
# Load settings from a .env file into the process environment; later cells read
# the Azure OpenAI configuration from it with os.getenv. override=True lets the
# values from .env replace variables that are already set in the environment.
load_dotenv(verbose=True, override=True)

In [14]:
# Merge data from cards rewards
# Sort the paths: glob returns them in arbitrary order, and the row order of
# the merged frame should be reproducible from run to run.
files = sorted(glob.glob('data/rewards/*.pkl'))
if not files:
    # Without this guard pd.concat fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No reward pickle files found matching 'data/rewards/*.pkl'")

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
dfs = [pd.read_pickle(fp) for fp in files]

# ignore_index gives the merged frame a unique 0..n-1 index instead of
# repeating each file's own row labels.
card_rewards = pd.concat(dfs, ignore_index=True)

card_rewards.head()


Unnamed: 0,CardName,Discount,Description
0,American Express Gold Card,€30,Get a €30 statement credit by spending €150 or...
1,American Express Gold Card,€25,Earn a €25 statement credit with a €99 purchas...
2,American Express Cash Magnet Card,1.5% cash back,Earn unlimited 1.5% cash back on all purchases.
3,American Express Platinum Card,5x points,Earn 5x Membership Rewards points on flights b...
4,American Express Blue Cash Preferred Card,6% cash back,Earn 6% cash back at U.S. supermarkets on up t...


I added a combined_info column where I merged all the textual information, so that we can embed only one column of the dataset.

In [15]:
def _describe_reward(row):
    """Build the single text string that gets embedded for one reward row.

    Args:
        row: one row of ``card_rewards``; its 'CardName', 'Discount' and
            'Description' fields are read.

    Returns:
        The three fields joined into one labelled sentence-like string.
    """
    name = row['CardName']
    discount = row['Discount']
    description = row['Description']
    return f"CardName: {name}. Discount: {discount} Description: {description}"


# Apply the helper row by row and keep the result in a new column.
card_rewards['combined_info'] = card_rewards.apply(_describe_reward, axis=1)
card_rewards.head(2)

Unnamed: 0,CardName,Discount,Description,combined_info
0,American Express Gold Card,€30,Get a €30 statement credit by spending €150 or...,CardName: American Express Gold Card. Discount...
1,American Express Gold Card,€25,Earn a €25 statement credit with a €99 purchas...,CardName: American Express Gold Card. Discount...


Then, I’ve added a column where I counted the number of tokens of each combined_info row, to make sure they do not exceed the maximum number of tokens accepted by Azure OpenAI embedding models (8191 tokens). For this purpose, I’ve used the cl100k_base encoding algorithm, the same used by the embedding models we are going to use.

In [18]:
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191; this cap stays below it

encoding = tiktoken.get_encoding(embedding_encoding)

# Count the tokens of every combined_info text...
card_rewards["n_tokens"] = card_rewards.combined_info.apply(lambda x: len(encoding.encode(x)))
# ...and omit descriptions that are too long to embed.
# .copy() makes the filtered result an independent frame, so the columns added
# or renamed in later cells cannot trigger pandas' SettingWithCopyWarning.
card_rewards = card_rewards[card_rewards.n_tokens <= max_tokens].copy()

In [20]:
# Preview the first five rows, now including the n_tokens column.
card_rewards.head(n=5)

Unnamed: 0,CardName,Discount,Description,combined_info,n_tokens
0,American Express Gold Card,€30,Get a €30 statement credit by spending €150 or...,CardName: American Express Gold Card. Discount...,42
1,American Express Gold Card,€25,Earn a €25 statement credit with a €99 purchas...,CardName: American Express Gold Card. Discount...,43
2,American Express Cash Magnet Card,1.5% cash back,Earn unlimited 1.5% cash back on all purchases.,CardName: American Express Cash Magnet Card. D...,33
3,American Express Platinum Card,5x points,Earn 5x Membership Rewards points on flights b...,CardName: American Express Platinum Card. Disc...,35
4,American Express Blue Cash Preferred Card,6% cash back,Earn 6% cash back at U.S. supermarkets on up t...,CardName: American Express Blue Cash Preferred...,46


Now that we have filtered our dataset, we need to embed our textual descriptions using Azure OpenAI embedding models. Generally speaking, embedding refers to the process of converting unstructured pieces of text into numerical representations in a multi-dimensional space, in such a way that the distance between them is a measure of their semantic similarity.

Among Azure OpenAI models, text-embedding-ada-002 does exactly this, and it is the model we are going to employ. To do so, we need to retrieve our keys and endpoint from our Azure OpenAI service.

In [23]:
from openai import AzureOpenAI

# Key, API version and endpoint are read from environment variables.
client = AzureOpenAI(
  api_key = os.getenv("AZURE_OPENAI_API_KEY"),
  api_version = os.getenv("AZURE_OPENAI_API_VERSION"),
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
)

def generate_embeddings(text, model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")): # model = "deployment_name"
    """Return the embedding of ``text`` produced by the Azure OpenAI deployment ``model``.

    Args:
        text: the string to embed; it is sent as a single-element input list.
        model: name of the embedding deployment. Defaults to the value of the
            AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME environment variable as it
            was when this function was defined.

    Returns:
        The ``embedding`` attribute of the first (only) item in the response data.

    Raises:
        ValueError: if no deployment name is available (``model`` is None or empty).
    """
    if not model:
        # Fail early with a clear message instead of sending model=None to the service.
        raise ValueError(
            "No embedding deployment name given; set AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME "
            "or pass model explicitly."
        )
    return client.embeddings.create(input = [text], model=model).data[0].embedding

# model should be set to the deployment name you chose when you deployed the
# text-embedding-ada-002 (Version 2) model. Resolve it once rather than once per row.
embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
card_rewards["embedding"] = card_rewards.combined_info.apply(
    lambda x: generate_embeddings(x, model=embedding_deployment)
)
card_rewards.head()

Unnamed: 0,CardName,Discount,Description,combined_info,n_tokens,embedding
0,American Express Gold Card,€30,Get a €30 statement credit by spending €150 or...,CardName: American Express Gold Card. Discount...,42,"[-0.028317386284470558, -0.01587647572159767, ..."
1,American Express Gold Card,€25,Earn a €25 statement credit with a €99 purchas...,CardName: American Express Gold Card. Discount...,43,"[-0.021910404786467552, -0.013033478520810604,..."
2,American Express Cash Magnet Card,1.5% cash back,Earn unlimited 1.5% cash back on all purchases.,CardName: American Express Cash Magnet Card. D...,33,"[-0.03010505624115467, -0.00792407151311636, 0..."
3,American Express Platinum Card,5x points,Earn 5x Membership Rewards points on flights b...,CardName: American Express Platinum Card. Disc...,35,"[-0.022897478193044662, -0.004563046619296074,..."
4,American Express Blue Cash Preferred Card,6% cash back,Earn 6% cash back at U.S. supermarkets on up t...,CardName: American Express Blue Cash Preferred...,46,"[-0.025143921375274658, -0.015797363594174385,..."


As you can see, we now have an additional column of vectors. We can then rename some columns (so that they can be mapped later on from our chain) and save the dataframe as a pickle file.

In [24]:
# Rename the columns to the names the chain maps onto later.
column_aliases = {'embedding': 'vector', 'combined_info': 'text'}
card_rewards.rename(columns=column_aliases, inplace=True)

# Persist the finished frame as a pickle under ./data in the current working directory.
output_path = os.path.join(os.getcwd(), "data", "card_rewards.pkl")
card_rewards.to_pickle(output_path)