In [2]:
# 1. Import required libraries
import os
import time

import numpy as np
import pandas as pd
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

In [7]:
# 2. Initialize Pinecone
# The API key is read from the PINECONE_API_KEY environment variable so the
# secret never lives in the notebook. NOTE: the key that used to be hardcoded
# in this cell has been exposed and should be rotated.
# The legacy `environment` argument is no longer passed; the serverless index
# below gets its cloud and region from ServerlessSpec.
pinecone = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# 3. Create or connect to an index
index_name = "product-embeddings"  # Replace with your index name
dimension = 384  # Must match the output size of the embedding model

# Create the index only if it does not exist yet. list_indexes() yields index
# descriptions rather than plain strings, so membership is tested on .names().
if index_name not in pinecone.list_indexes().names():
    print(f"Creating new index '{index_name}'...")
    pinecone.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Index creation is asynchronous: poll until the index reports ready so
    # the first upsert/query does not hit a half-provisioned index.
    while not pinecone.describe_index(index_name).status["ready"]:
        time.sleep(1)
print("Connecting to index...")

# Connect to the index
index = pinecone.Index(index_name)

Creating new index 'product-embeddings'...
Connecting to index...


In [4]:
# 2. Initialize Pinecone (reconnect to an index that already exists)
# The API key comes from the PINECONE_API_KEY environment variable instead of
# being hardcoded. NOTE: the key previously committed in this cell has been
# exposed and should be rotated.
pinecone = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# 3. Name of the existing index to use
index_name = "product-embeddings"  # Replace with your index name

# Connect to the index
index = pinecone.Index(index_name)

In [3]:
# 4. Read the cleaned product dataset into a DataFrame
df = pd.read_csv(
    "../data/dataset/cleaned_dataset.csv"
)

# 5. Load the sentence-embedding model used for every product text
#    (all-MiniLM-L6-v2: a reasonable speed/quality trade-off for short texts)
model = SentenceTransformer(
    "all-MiniLM-L6-v2"
)



In [5]:
# 6. Create embeddings and upload to Pinecone
# Each product is embedded from one short sentence that combines its stock
# code, description and country.
def _build_product_text(row):
    """Return the sentence that gets embedded for one product row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing the
            'StockCode', 'Description' and 'Country' keys.

    Returns:
        str: The text passed to the embedding model.
    """
    return f"Product {row['StockCode']}: {row['Description']} from {row['Country']}"


def create_product_embedding(row):
    """Embed a single product row with the notebook-level `model`.

    Args:
        row: Mapping-like row providing 'StockCode', 'Description' and 'Country'.

    Returns:
        list: The embedding as a plain Python list (via ``.tolist()``).
    """
    return model.encode(_build_product_text(row)).tolist()


# 7. Process products in batches
print("Starting to process products...")
batch_size = 10000  # rows encoded per progress step
# A single upsert request is limited in record count and payload size, so each
# encoded batch is uploaded in smaller chunks.
UPSERT_CHUNK_SIZE = 200
# Ceiling division: avoids reporting one batch too many when len(df) is an
# exact multiple of batch_size.
total_batches = (len(df) + batch_size - 1) // batch_size

for i in range(0, len(df), batch_size):
    batch = df.iloc[i:i+batch_size]
    batch_number = i // batch_size + 1
    print(f"Processing batch {batch_number} of {total_batches}")

    # Materialise the rows once so texts, embeddings and metadata stay aligned.
    rows = [row for _, row in batch.iterrows()]

    # Encode the whole batch in one call: the model batches internally, which
    # is far faster than one encode() call per row.
    embeddings = model.encode([_build_product_text(row) for row in rows]).tolist()

    # Prepare vectors for upload
    # NOTE(review): the ID assumes a StockCode appears at most once per
    # invoice; repeated lines would overwrite each other on upsert — confirm.
    vectors = [
        {
            "id": f"{row['StockCode']}_{row['InvoiceNo']}",
            "values": embedding,
            "metadata": {
                "stock_code": row['StockCode'],
                "description": row['Description'],
                "country": row['Country'],
                "unit_price": float(row['UnitPrice']),
                "quantity": int(row['Quantity'])
            }
        }
        for row, embedding in zip(rows, embeddings)
    ]

    # Upload vectors to Pinecone in request-sized chunks
    for start in range(0, len(vectors), UPSERT_CHUNK_SIZE):
        index.upsert(vectors=vectors[start:start + UPSERT_CHUNK_SIZE])
    print(f"Uploaded batch {batch_number}")

print("All products processed and uploaded to Pinecone!")

Starting to process products...
Processing batch 1 of 54


KeyboardInterrupt: 

Cosine similarity is the best-suited metric for our e-commerce product embeddings because it measures the directional similarity between vectors rather than their magnitude. This matters when product descriptions vary in length: two descriptions such as "ceramic white mug" and "large ceramic white coffee mug with handle" are semantically very similar, yet with an embedding model that does not normalize its output their vector magnitudes could differ because of the difference in length or complexity. Cosine similarity removes that effect by measuring only the angle between the vectors, producing a score between -1 and 1, where 1 means the vectors point in exactly the same direction. In the 384-dimensional space produced by our SentenceTransformer model, the direction of a vector is what encodes the product's characteristics. Unlike Euclidean distance, which can be skewed by vector magnitudes, or dot product, which does not normalize its result, cosine similarity gives consistent and interpretable scores that reflect the semantic relationship between products, which makes it a good fit for both search relevance and recommendation accuracy. Note that the embeddings used in this notebook are already unit-length (in the comparisons below the cosine similarity and the dot product are identical), so for this particular model the three metrics rank product pairs in the same order; choosing cosine keeps the index correct and the scores interpretable even if the model is later replaced by one that does not normalize its output.

# Task3

In [6]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np

def compare_similarity_metrics(vec1, vec2):
    """Print cosine similarity, Euclidean distance and dot product of two vectors.

    Args:
        vec1: First vector (anything ``np.array`` accepts).
        vec2: Second vector, same length as ``vec1``.
    """
    # The sklearn pairwise helpers take 2-D input, so view each vector as one row.
    row_a, row_b = (np.array(vec).reshape(1, -1) for vec in (vec1, vec2))

    # Pairwise helpers return a 1x1 matrix; pull out its only entry.
    cosine_sim = cosine_similarity(row_a, row_b)[0, 0]
    euclidean_dist = euclidean_distances(row_a, row_b)[0, 0]
    # The dot product is taken on the vectors as passed in.
    dot_prod = np.dot(vec1, vec2)

    print(f"Cosine Similarity: {cosine_sim:.4f}")
    print(f"Euclidean Distance: {euclidean_dist:.4f}")
    print(f"Dot Product: {dot_prod:.4f}")

# Sample descriptions: two closely related drinkware items and one unrelated product
product1_text = "White ceramic coffee mug for hot beverages"
product2_text = "Ceramic tea cup in white color"
product3_text = "Garden tools set with metal rake"

# Embed every description individually with the shared model
vec1, vec2, vec3 = (
    model.encode(text)
    for text in (product1_text, product2_text, product3_text)
)

# Similar pair first, then the mug against the unrelated product
print("Comparing similar products:")
compare_similarity_metrics(vec1, vec2)

print("\nComparing dissimilar products:")
compare_similarity_metrics(vec1, vec3)

Comparing similar products:
Cosine Similarity: 0.7457
Euclidean Distance: 0.7132
Dot Product: 0.7457

Comparing dissimilar products:
Cosine Similarity: 0.0136
Euclidean Distance: 1.4046
Dot Product: 0.0136


In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np

def compare_similarity_metrics(vec1, vec2, product1_text=None, product2_text=None):
    """Print cosine similarity, Euclidean distance and dot product of two vectors.

    The two text labels are optional so that calls written for the earlier
    two-argument version of this function keep working after this definition
    replaces it in the kernel.

    Args:
        vec1: First vector (anything ``np.array`` accepts).
        vec2: Second vector, same length as ``vec1``.
        product1_text: Optional label printed for the first product.
        product2_text: Optional label printed for the second product.

    When both labels are given, the metrics are framed by a "Comparing:"
    header and a separator line; otherwise only the three metric lines are
    printed.
    """
    # Reshape vectors for sklearn functions (they expect 2-D input)
    v1 = np.array(vec1).reshape(1, -1)
    v2 = np.array(vec2).reshape(1, -1)

    # Calculate similarities using different metrics
    cosine_sim = cosine_similarity(v1, v2)[0][0]
    euclidean_dist = euclidean_distances(v1, v2)[0][0]
    dot_prod = np.dot(vec1, vec2)

    labelled = product1_text is not None and product2_text is not None
    if labelled:
        print("\nComparing:")
        print(f"Product 1: {product1_text}")
        print(f"Product 2: {product2_text}")
    print(f"Cosine Similarity: {cosine_sim:.4f}")
    print(f"Euclidean Distance: {euclidean_dist:.4f}")
    print(f"Dot Product: {dot_prod:.4f}")
    if labelled:
        print("-" * 80)

# Load the embedding model used for all examples in this cell
model = SentenceTransformer("all-MiniLM-L6-v2")

# Example 1: Very Similar Electronics
phone1, phone2 = (
    "iPhone 13 Pro Max 256GB in Graphite",
    "iPhone 13 Pro 256GB in Sierra Blue",
)
vec1, vec2 = (model.encode(text) for text in (phone1, phone2))
print("\n1. Very Similar Electronics:")
compare_similarity_metrics(vec1, vec2, phone1, phone2)

# Example 2: Related but Different Electronics
product1, product2 = (
    "Wireless Bluetooth headphones with noise cancellation",
    "Wired gaming headset with RGB lighting",
)
vec1, vec2 = (model.encode(text) for text in (product1, product2))
print("\n2. Related but Different Electronics:")
compare_similarity_metrics(vec1, vec2, product1, product2)

# Example 3: Similar Clothing Items
clothing1, clothing2 = (
    "Men's black cotton t-shirt with crew neck",
    "Men's navy blue cotton t-shirt with v-neck",
)
vec1, vec2 = (model.encode(text) for text in (clothing1, clothing2))
print("\n3. Similar Clothing Items:")
compare_similarity_metrics(vec1, vec2, clothing1, clothing2)

# Example 4: Related but Different Clothing (reuses the clothing1/clothing2 names)
clothing1, clothing2 = (
    "Women's winter wool coat in black",
    "Women's summer cotton dress in floral print",
)
vec1, vec2 = (model.encode(text) for text in (clothing1, clothing2))
print("\n4. Related but Different Clothing:")
compare_similarity_metrics(vec1, vec2, clothing1, clothing2)

# Example 5: Completely Different Categories
item1, item2 = (
    "Professional DSLR camera with 24MP sensor",
    "Organic green tea leaves, 100g package",
)
vec1, vec2 = (model.encode(text) for text in (item1, item2))
print("\n5. Completely Different Categories:")
compare_similarity_metrics(vec1, vec2, item1, item2)

# Example 6: Similar Kitchen Items
kitchen1, kitchen2 = (
    "Stainless steel cooking pot with glass lid, 5L",
    "Non-stick cooking pan with lid, 28cm",
)
vec1, vec2 = (model.encode(text) for text in (kitchen1, kitchen2))
print("\n6. Similar Kitchen Items:")
compare_similarity_metrics(vec1, vec2, kitchen1, kitchen2)

# Example 7: Same Product Different Brands
shoe1, shoe2 = (
    "Nike running shoes with mesh upper and foam sole",
    "Adidas running shoes with breathable mesh and rubber sole",
)
vec1, vec2 = (model.encode(text) for text in (shoe1, shoe2))
print("\n7. Same Product Different Brands:")
compare_similarity_metrics(vec1, vec2, shoe1, shoe2)


1. Very Similar Electronics:

Comparing:
Product 1: iPhone 13 Pro Max 256GB in Graphite
Product 2: iPhone 13 Pro 256GB in Sierra Blue
Cosine Similarity: 0.6937
Euclidean Distance: 0.7827
Dot Product: 0.6937
--------------------------------------------------------------------------------

2. Related but Different Electronics:

Comparing:
Product 1: Wireless Bluetooth headphones with noise cancellation
Product 2: Wired gaming headset with RGB lighting
Cosine Similarity: 0.4220
Euclidean Distance: 1.0752
Dot Product: 0.4220
--------------------------------------------------------------------------------

3. Similar Clothing Items:

Comparing:
Product 1: Men's black cotton t-shirt with crew neck
Product 2: Men's navy blue cotton t-shirt with v-neck
Cosine Similarity: 0.7852
Euclidean Distance: 0.6555
Dot Product: 0.7852
--------------------------------------------------------------------------------

4. Related but Different Clothing:

Comparing:
Product 1: Women's winter wool coat in bla