# Uninstall all existing python packages in the runtime
This is a brute force way to make sure that the python runtime doesn't have any package/version conflicts. 

In [None]:
# Snapshot every package currently installed into requirements.txt so the list
# can be fed to the uninstaller on the next line.
!pip freeze > requirements.txt
# Feed the snapshot to `pip uninstall` one whitespace-separated token at a time
# (`xargs -n 1`); `-y` answers "yes" to every confirmation prompt.
# NOTE(review): the list is taken from the whole runtime, so this presumably
# also removes packages the notebook kernel itself relies on — confirm a kernel
# restart is expected after this cell.
!cat requirements.txt | xargs -n 1 pip uninstall -y

# Install dependencies 
Use the following shell command to install the pinecone client and llama-index for data ingestion. This notebook uses:

1. pinecone-client - for vector db upserts and queries
2. python-dotenv - for setting environment variables for openai and pinecone
3. llama-index and llama-hub - data pipeline framework that ingests a 150K record csv into pinecone. Also provides query wrappers for pinecone and openai.

In [None]:
# Install the notebook's dependencies, upgrading any that are already present
# (`-U`). No versions are pinned on this line.
!pip install -U "pinecone-client[grpc]" "python-dotenv" "llama-index" "llama-hub" 
# Overwrite requirements.txt with the packages/versions now installed, replacing
# the pre-uninstall snapshot written by the first cell.
!pip freeze > requirements.txt

# Create Pinecone index and llama_index vector store wrapper

In [None]:
from dotenv import load_dotenv
import os
import pinecone

# Read the settings stored in the local .env file.
load_dotenv('.env')

# Required settings: indexing os.environ directly means a missing variable
# raises KeyError right here instead of failing later in the notebook.
PINECONE_INDEX_NAME = os.environ['PINECONE_INDEX_NAME']
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_ENVIRONMENT = os.environ['PINECONE_ENVIRONMENT']
METRIC = os.environ['METRIC']
DIMENSIONS = int(os.environ['DIMENSIONS'])

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)

# Create the index only when it is not listed yet; otherwise just report it.
if PINECONE_INDEX_NAME in pinecone.list_indexes():
    print(f"Index {PINECONE_INDEX_NAME} already exists")
else:
    pinecone.create_index(
        PINECONE_INDEX_NAME,
        dimension=DIMENSIONS,
        metric=METRIC,
        pods=1,
        replicas=1,
        pod_type="p1.x1",
    )

index_description = pinecone.describe_index(name=PINECONE_INDEX_NAME)
print(f"Index Description: {index_description}")

# Handle used by the later cells for upserts and queries.
pinecone_index = pinecone.Index(PINECONE_INDEX_NAME)


# Parse CSV into llama_index 

1. Load csv file
2. Attach metadata that can be used in pinecone queries
3. End result is a list of `nodes` that can be upserted into pinecone with a simple vector_store.add() call

In [None]:
from pathlib import Path
from llama_index.schema import TextNode
from llama_index.node_parser import SimpleNodeParser
from tqdm.auto import tqdm 
from datetime import datetime
from llama_index import download_loader

# Default node parser. NOTE(review): `parser` is not referenced again in the
# cells visible here — confirm whether it is still needed.
parser = SimpleNodeParser.from_defaults()

# Obtain the PagedCSVReader loader class by name through download_loader
# (presumably fetched from llama-hub — verify).
PagedCSVReader = download_loader("PagedCSVReader")

# Read the product CSV as UTF-8 into `documents`; the loop further down turns
# each entry of `documents` into one TextNode.
loader = PagedCSVReader(encoding="utf-8")
documents = loader.load_data(file=Path('./icecat_products.csv'))

def get_metadata(docText):
    """Build a metadata dict from the ``key: value`` lines of one document.

    Each line of ``docText`` is split on the first ``": "``. Handling then
    depends on the key prefix:

    * keys starting with ``price`` are stored as ``float``;
    * keys starting with ``date_released`` are parsed as
      ``%Y-%m-%dT%H:%M:%SZ``, truncated to the calendar day, and stored as an
      integer Unix timestamp for midnight **UTC** of that day;
    * keys starting with ``ean`` are skipped;
    * every other key is stored with its raw string value.

    Lines without a ``": "`` separator, and price/date values that fail to
    parse, are skipped (best effort) rather than raising.

    Args:
        docText: Text of one document, one ``key: value`` pair per line.

    Returns:
        dict mapping each kept key to a ``float``, ``int`` or ``str`` value.
    """
    # Imported locally so this cell does not depend on edits to the import list.
    from datetime import timezone

    data_dict = {}
    for line in docText.strip().split('\n'):
        key, separator, value = line.partition(': ')
        if not separator:
            # Not a "key: value" line; nothing to record.
            continue

        if key.startswith('price'):
            try:
                data_dict[key] = float(value)
            except ValueError:
                # Non-numeric price: leave the field out.
                continue
        elif key.startswith('date_released'):
            try:
                released = datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ")
            except ValueError:
                # Unexpected date format: leave the field out.
                continue
            # The trailing "Z" marks the value as UTC. Anchoring the day to UTC
            # keeps the stored timestamp identical on every machine; a naive
            # datetime would be interpreted in the host's local timezone.
            day_start_utc = datetime(
                released.year, released.month, released.day, tzinfo=timezone.utc
            )
            data_dict[key] = int(day_start_utc.timestamp())
        elif not key.startswith('ean'):
            data_dict[key] = value
    return data_dict

def _document_to_node(doc):
    """Wrap one loaded document in a TextNode carrying its parsed metadata."""
    text_node = TextNode(text=doc.text)
    text_node.metadata = get_metadata(doc.text)
    return text_node


# One node per loaded document, with a progress bar over the whole list.
nodes = [_document_to_node(doc) for doc in tqdm(documents, total=len(documents))]

# Show the first node as a quick sanity check of text and metadata.
first_node = nodes[0]
print(first_node.text)
print(first_node.metadata)

# Generate keyword and question/answer examples for a subset of the data

This takes a long time to run and is expensive in terms of tokens. It is included mainly to show the full power of llama_index paired with Pinecone.

In [None]:
from llama_index.node_parser.extractors import MetadataExtractor, QuestionsAnsweredExtractor, KeywordExtractor
from llama_index.llms import OpenAI
import random

# LLM used by both extractors below.
llm = OpenAI(model="gpt-3.5-turbo")

metadata_extractor = MetadataExtractor(
    extractors=[
        # The keyword count is set with `keywords`; `nodes` is TitleExtractor's
        # parameter name and does not set the keyword count here.
        KeywordExtractor(keywords=5, llm=llm),
        QuestionsAnsweredExtractor(questions=3, llm=llm),
    ],
    in_place=False,
)

# Run the extractors on a small random sample only, to keep token usage low.
# random.sample raises ValueError when asked for more items than exist, so cap
# the sample size at the number of available nodes.
sample_size = min(3, len(nodes))
test_query_suggestions = metadata_extractor.process_nodes(random.sample(nodes, sample_size))
for tqs in test_query_suggestions:
    print(f"id: {tqs.metadata['id']}, name: {tqs.metadata['name']}")
    print(f"keywords: {tqs.metadata['excerpt_keywords']}")
    print(f"questions this excerpt can answer:\n{tqs.metadata['questions_this_excerpt_can_answer']}\n")
  

# Generate vector embeddings for all records

1. This takes a long time to run (~40 minutes) because OpenAI batch embeddings are slow
2. TODO - Need to explore parallelizing the code
3. TODO - Need to explore using `get_text_embedding_batch()`

In [None]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index.vector_stores import PineconeVectorStore
from llama_index.schema import MetadataMode

batch_size = 100
embed_model = OpenAIEmbedding(embed_batch_size=batch_size)
pinecone_vector_store = PineconeVectorStore(pinecone_index=pinecone_index, batch_size=batch_size)

# Number of batches needed to cover every node (ceiling division).
num_batches = -(-len(nodes) // batch_size)

# Walk the node list in strides of batch_size: embed each slice, attach the
# vectors to their nodes, then upsert the slice into Pinecone.
for start in tqdm(range(0, len(nodes), batch_size), total=num_batches):
    batch = nodes[start : start + batch_size]
    batch_texts = [node.text for node in batch]
    batch_embeddings = embed_model.get_text_embedding_batch(batch_texts)

    for node, embedding in zip(batch, batch_embeddings):
        node.embedding = embedding

    pinecone_vector_store.add(batch)


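# OPTIONAL - If the data pipeline only partially completes
1. Find out what percentage of batches failed (the cell below re-runs that share of batches, counted from the end of the batch list)
2. Change `percentage_to_reprocess` to the desired re-process percentage, expressed as a fraction (e.g. `0.03` for 3%)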

In [None]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index.vector_stores import PineconeVectorStore
from llama_index.schema import MetadataMode

import math

batch_size = 100
embed_model = OpenAIEmbedding(embed_batch_size=batch_size)
pinecone_vector_store = PineconeVectorStore(pinecone_index=pinecone_index, batch_size=batch_size)

# Calculate the number of batches
num_batches = len(nodes) // batch_size + (1 if len(nodes) % batch_size else 0)

# Fraction of batches, counted from the end of the list, to embed and upsert again.
percentage_to_reprocess = 0.03
# Round up so a partially covered batch is still reprocessed (truncating could
# skip a failed batch), and clamp to [0, num_batches] so the range below never
# starts at a negative index.
num_batches_to_reprocess = min(
    num_batches,
    max(0, math.ceil(num_batches * percentage_to_reprocess)),
)

# Re-run only the trailing batches: embed, attach vectors, upsert.
for i in tqdm(range(num_batches - num_batches_to_reprocess, num_batches), total=num_batches_to_reprocess):
    batch = nodes[i * batch_size : (i + 1) * batch_size]
    batch_embeddings = embed_model.get_text_embedding_batch(
        [node.text for node in batch]
    )

    for node, embedding in zip(batch, batch_embeddings):
        node.embedding = embedding

    pinecone_vector_store.add(batch)

In [None]:
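# DANGER: uncomment the next line only to wipe the index (called with delete_all=True) before re-ingesting.
#pinecone_index.delete(delete_all=True)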

# GenAI request

1. Convert query string into a vector 
2. Query pinecone to get LLM context
3. Pass context to LLM to generate response
4. Display response from LLM

In [None]:
from llama_index import VectorStoreIndex

# Question to answer from the ingested product data.
query_str = "Can you provide the id and image url of: Lenovo ThinkPad X100e"

# Wrap the existing Pinecone-backed vector store in an index and get a query
# engine from it.
index = VectorStoreIndex.from_vector_store(pinecone_vector_store)
query_engine = index.as_query_engine()

response = query_engine.query(query_str)

# print() applies str() to its argument, so this shows the response text.
print(response)