# Uninstall all existing python packages in the runtime
This is a brute-force way to make sure that the Python runtime doesn't have any package or version conflicts. It removes every installed package, so only run it in a disposable runtime.

In [None]:
# Snapshot every installed distribution into requirements.txt, then pass each entry to
# `pip uninstall -y` one at a time (xargs -n 1).
# NOTE(review): destructive — this removes every package pip lists for this runtime.
!pip freeze > requirements.txt
!cat requirements.txt | xargs -n 1 pip uninstall -y

# Install dependencies 
Use the following shell command to install the Pinecone client and python-dotenv used for data ingestion. This notebook uses:

1. pinecone-client - for vector db upserts and queries
2. python-dotenv - for setting environment variables for openai and pinecone

In [None]:
!pip install -U "pinecone-client[grpc]" "python-dotenv" 

# OpenAI and Pinecone Settings

In [None]:
import openai
import os
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type

# Load the variables defined in the local .env file into os.environ.
load_dotenv('.env')

# The pre-1.0 OpenAI SDK reads OPENAI_API_KEY once, at `import openai` time, which happens
# before load_dotenv() runs in this cell. A key that lives only in .env therefore has to be
# assigned explicitly; if the variable is absent, keep whatever the SDK already picked up.
openai.api_key = os.environ.get('OPENAI_API_KEY', openai.api_key)

# Embedding model settings. EMBEDDING_MODEL is the default model of get_embedding().
# NOTE(review): EMBEDDING_CTX_LENGTH / EMBEDDING_ENCODING are not referenced in this
# notebook; presumably the model's max input tokens and tokenizer name — confirm.
EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'

# Pinecone connection and index settings; a missing variable raises KeyError immediately.
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_ENVIRONMENT = os.environ['PINECONE_ENVIRONMENT']
# Similarity metric and vector dimension passed to pinecone.create_index().
METRIC = os.environ['METRIC']
DIMENSIONS = int(os.environ['DIMENSIONS'])

# Read CSV and concatenate fields into a single text field for embeddings

In [None]:
import pandas as pd
# Read 'icecat_products.csv' into a pandas DataFrame
df = pd.read_csv('icecat_products.csv')

# Build the 'text' column that gets embedded: join name, title, short_description and
# supplier with single spaces, skipping any field that is null/NaN so that missing values
# do not leave leading, trailing or doubled spaces in the text.
text_columns = ['name', 'title', 'short_description', 'supplier']
df['text'] = [
    ' '.join(str(value) for value in fields if pd.notna(value))
    for fields in df[text_columns].itertuples(index=False, name=None)
]

# Only the last expression of a cell is echoed automatically, so the frame preview needs
# an explicit display() call; the first concatenated text is then shown as the cell output.
display(df.head())
df['text'].iloc[0]

In [None]:
# Retry with randomized exponential backoff (waits capped at 20 seconds), giving up after
# 6 attempts. openai.InvalidRequestError is excluded from retrying.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(openai.InvalidRequestError))
def get_embedding(text_or_tokens, model=EMBEDDING_MODEL):
    """Request embeddings for the given input and return them as a list.

    The result is built from the entries the API actually returned, not from
    ``len(text_or_tokens)``: for a single string or a single token list that length
    is the number of characters/tokens, not the number of embeddings.

    :param text_or_tokens: Input forwarded unchanged to ``openai.Embedding.create``.
    :param model: Embedding model name; defaults to ``EMBEDDING_MODEL``.
    :return: List of embedding vectors, ordered by each entry's ``index`` field so the
             output lines up with the order of the inputs.
    """
    data = openai.Embedding.create(input=text_or_tokens, model=model).get('data')
    # Sort by the response's own position marker rather than trusting arrival order.
    return [item['embedding'] for item in sorted(data, key=lambda item: item['index'])]

# Number of texts sent per embedding request, and how many requests that implies
# (ceiling division, so a final partial batch is included).
batch_size = 1000
num_batches = -(-len(df) // batch_size)

# Embeddings accumulated in row order across all batches
vectors = []

# Embed the 'text' column one positional slice at a time, printing the batch index as progress
for i in range(num_batches):
    start = i * batch_size
    batch_texts = df['text'].iloc[start:start + batch_size].tolist()
    vectors += get_embedding(batch_texts)
    print(i)

# Store each embedding next to the row it was computed from
df['vector'] = vectors


# Save the DataFrame to Parquet for later use in verizon_process_dataframe_for_pinecone.ipynb

In [None]:
# Persist the whole DataFrame — the CSV columns plus the derived 'text' and 'vector'
# columns — to Parquet; index=False keeps the row index out of the file.
df.to_parquet('icecat_products.parquet', index=False)

In [None]:
# Preview the first rows of the DataFrame (head() with no argument returns 5 rows, not 10)
df.head()

# Process metadata and vectors in preparation for loading to Pinecone

In [None]:
# Columns that feed the metadata dictionary; replace NaN/None in each of them with an
# empty string, modifying df in place.
metadata_columns = [
    'name', 'title', 'short_description', 'supplier',
    'img_high', 'img_low', 'img_thumb', 'img_500x500',
]
df.fillna(dict.fromkeys(metadata_columns, ''), inplace=True)

# Build the metadata dictionary for a single row
def create_metadata(row):
    """Return a dict with the metadata fields copied from ``row``.

    ``row`` is anything indexable by column name (for example a DataFrame row);
    a missing field raises the lookup error of ``row`` itself.
    """
    metadata_fields = (
        'name',
        'title',
        'short_description',
        'supplier',
        'img_high',
        'img_low',
        'img_thumb',
        'img_500x500',
    )
    return {field: row[field] for field in metadata_fields}

# Build one metadata dict per row by applying create_metadata row-wise
df['metadata'] = df.apply(create_metadata, axis='columns')

# Cast the 'id' column to string
df['id'] = df['id'].astype(str)

# Assemble df_2, the frame that gets upserted: keep id / vector / metadata and expose the
# embedding under the column name 'values'
upsert_columns = {'id': 'id', 'vector': 'values', 'metadata': 'metadata'}
df_2 = df[list(upsert_columns)].rename(columns=upsert_columns)

In [None]:
df_2.head()

# Begin loading to Pinecone

In [None]:
import pinecone

# Name of the Pinecone index this notebook creates (if needed) and loads
index_name = 'verizon-products-1'

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

# Create the index only when it is not already listed for this environment
if index_name in pinecone.list_indexes():
    print(f"Index {index_name} already exists")
else:
    pinecone.create_index(index_name, dimension=DIMENSIONS, metric=METRIC, pods=1, replicas=1, pod_type="p1.x1")

print(f"Index Description: {pinecone.describe_index(name=index_name)}")

# gRPC handle used by the upsert and query cells below
index = pinecone.GRPCIndex(index_name=index_name)

# Load Option #1 (GRPC Only)

In [None]:
# Clear previously loaded vectors before a fresh load (delete_all=True).
# NOTE(review): destructive — skip this cell if the index holds data that must be kept.
index.delete(delete_all=True)

In [None]:
%%time
import time

# Slice df_2 into consecutive pieces of at most 100000 rows
chunks = [df_2.iloc[start:start + 100000] for start in range(0, len(df_2), 100000)]

# Send every piece to the index; each upsert call is sub-batched 300 rows at a time
for chunk in chunks:
    index.upsert_from_dataframe(chunk, show_progress=True, batch_size=300)

# Load Option #2 (GRPC + Multiprocessing)

In [None]:
%%time

import pandas as pd
import itertools
from multiprocessing import Pool, cpu_count
import upsert_module  # Import the module containing your upsert_data function
from functools import partial

def chunk_dataframe(df, n):
    """
    Split a DataFrame into consecutive, positionally sliced chunks.

    :param df: DataFrame to split.
    :param n: Number of rows for each chunk; must be a positive integer.
    :return: List of DataFrame chunks in row order. The last chunk may hold fewer
             than ``n`` rows; an empty DataFrame yields an empty list.
    :raises ValueError: If ``n`` is not positive. A negative size would otherwise
             silently produce no chunks, so nothing would be upserted.
    """
    if n <= 0:
        raise ValueError(f"chunk size n must be a positive integer, got {n!r}")
    return [df.iloc[start:start + n] for start in range(0, len(df), n)]

# Rows per chunk; each chunk becomes one task handed to a worker process
chunk_size = 300
chunks = chunk_dataframe(df_2, chunk_size)

# Bind the Pinecone connection settings up front so each worker call only receives its chunk
pinecone_settings = {
    'index_name': index_name,
    'environment': PINECONE_ENVIRONMENT,
    'api_key': PINECONE_API_KEY,
}
partial_function = partial(upsert_module.upsert_data, **pinecone_settings)

# One worker process per CPU core; map() returns once every chunk has been processed
with Pool(processes=cpu_count()) as pool:
    results = pool.map(partial_function, chunks)

# Query

In [None]:
import openai

# Embedding settings repeated for the query section (same values as in the
# "OpenAI and Pinecone Settings" cell). EMBEDDING_MODEL is the default model of get_embedding().
# NOTE(review): EMBEDDING_CTX_LENGTH / EMBEDDING_ENCODING are not referenced in this
# notebook; presumably the model's max input tokens and tokenizer name — confirm.
EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'

In [None]:
def get_embedding(text_or_tokens, model=EMBEDDING_MODEL):
    """Request embeddings for the given input and return them as a list.

    NOTE(review): this redefines the get_embedding from the ingestion section and
    shadows it without the retry decorator — confirm that is intended.

    The result is built from the entries the API actually returned, not from
    ``len(text_or_tokens)``: for a single string or a single token list that length
    is the number of characters/tokens, not the number of embeddings.

    :param text_or_tokens: Input forwarded unchanged to ``openai.Embedding.create``.
    :param model: Embedding model name; defaults to ``EMBEDDING_MODEL``.
    :return: List of embedding vectors, ordered by each entry's ``index`` field so the
             output lines up with the order of the inputs.
    """
    data = openai.Embedding.create(input=text_or_tokens, model=model).get('data')
    # Sort by the response's own position marker rather than trusting arrival order.
    return [item['embedding'] for item in sorted(data, key=lambda item: item['index'])]

In [None]:
# Query text to search for (misspelled — presumably on purpose, to exercise semantic matching)
query_text = 'ipood green third generation kase for runing'
#query_text = 'water camera case for Sony SPK-HCE'

# Embed the query, then ask the index for the 5 closest matches together with their metadata
response = get_embedding([query_text])
search_results = index.query(response[0], top_k=5, include_metadata=True)

# Result

In [None]:
# Print supplier, title, short description and score for every returned match
for match in search_results.get('matches'):
    metadata = match.get('metadata')

    print('Supplier: ', metadata.get('supplier'))
    print('Title: ', metadata.get('title'))
    print('Short Description: ', metadata.get('short_description'))
    print('Score: ', match.get('score'))
    print('------------------')