# Node2vec embedding

In this notebook, we create node embeddings for all product nodes using node2vec. Afterward, we upload the embeddings to the Pinecone vector database to facilitate searching.

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# can be copied to Drive later in the notebook.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install graphdatascience

In [None]:
# Client import
from neo4j import GraphDatabase

In [None]:
import os

# Neo4j Aura connection settings. They are read from the environment so real
# credentials never need to be saved in the notebook; the empty-string
# fallbacks keep the original placeholder values when a variable is unset.
AURA_CONNECTION_URI = os.environ.get("AURA_CONNECTION_URI", "")
AURA_USERNAME = os.environ.get("AURA_USERNAME", "")
AURA_PASSWORD = os.environ.get("AURA_PASSWORD", "")

# Connect to the Neo4j database; `driver` is the module-level handle used to
# open sessions against it.
driver = GraphDatabase.driver(
    AURA_CONNECTION_URI,
    auth=(AURA_USERNAME, AURA_PASSWORD)
)

In [None]:
def execute_cypher_command(command, parameters=None):
    """Run a single Cypher statement and return its rows.

    A new session is opened on the module-level ``driver`` for each call and
    closed again when the call returns.

    Args:
        command: Cypher query text to execute.
        parameters: Optional mapping of query parameters forwarded to
            ``session.run``. Defaults to ``None`` (no parameters), which
            matches calling the function with the query text only.

    Returns:
        Whatever ``result.data()`` yields for the query (the rows of the
        result).
    """
    with driver.session() as session:
        result = session.run(command, parameters)
        # Read the rows while the session is still open.
        return result.data()

In [None]:
# Drop any existing "KDD" projection first so this cell can be re-run without
# failing on "graph already exists". The second argument (`false`) tells GDS
# not to raise when no such graph is present.
execute_cypher_command("""
  CALL gds.graph.drop("KDD", false) YIELD graphName
""")

# Create the in-memory graph projection named "KDD" from the Product, Store
# and User nodes and the "belongs to" / "comment" relationships.
gds_graph_creation = """
  CALL gds.graph.project(
      "KDD",
      ["Product", "Store", "User"],
      ["belongs to", "comment"]
  ) YIELD
  graphName AS graph
"""

# Run the projection; the returned rows carry the projected graph's name
# under the "graph" key.
graph_name = execute_cypher_command(gds_graph_creation)

In [None]:
# Run node2vec on the "KDD" projection in write mode: the resulting vector is
# stored on each node under the `embedding` property. Only `writeProperty` is
# passed in the configuration map.
gds_node2vec_creation = """
CALL gds.node2vec.write("KDD", {writeProperty: "embedding"})
"""

# Execute the procedure; the returned row is displayed as the cell output.
execute_cypher_command(gds_node2vec_creation)

[{'nodeCount': 2853146,
  'nodePropertiesWritten': 2853146,
  'preProcessingMillis': 0,
  'computeMillis': 19140,
  'writeMillis': 42611,
  'configuration': {'writeProperty': 'embedding',
   'walkLength': 80,
   'walkBufferSize': 1000,
   'jobId': '279a3e79-cd5e-408c-af7f-ce5128b3c5be',
   'iterations': 1,
   'returnFactor': 1.0,
   'negativeSamplingRate': 5,
   'windowSize': 10,
   'sudo': False,
   'writeConcurrency': 4,
   'positiveSamplingFactor': 0.001,
   'inOutFactor': 1.0,
   'logProgress': True,
   'negativeSamplingExponent': 0.75,
   'nodeLabels': ['*'],
   'initialLearningRate': 0.025,
   'concurrency': 4,
   'relationshipTypes': ['*'],
   'walksPerNode': 10,
   'embeddingInitializer': 'NORMALIZED',
   'embeddingDimension': 128,
   'minLearningRate': 0.0001},
  'lossPerIteration': [0.0]}]

In [None]:
# Fetch only the Product nodes and keep their properties (which include the
# `embedding` written by node2vec) as plain dictionaries.
#
# The label filter is done in Cypher instead of in Python: this avoids
# streaming every node to the client, and it does not depend on the iteration
# order of a node's label set or fail on nodes that have no label.
# The `with` block guarantees the session is closed once the rows are read.
with driver.session() as session:
    cursor = session.run("MATCH (n:Product) RETURN n")
    product_embedding = [dict(record["n"].items()) for record in cursor]

In [None]:
import pandas as pd
import json

# Convert the list of per-node property dictionaries into a DataFrame
# (one row per product node, one column per property key).
df = pd.DataFrame(product_embedding)

In [None]:
# Save the product table to a local Parquet file using the pyarrow engine.
df.to_parquet('embedding.parquet', engine='pyarrow')

In [None]:
# Copy the Parquet file into the project's data folder on Google Drive.
# Both paths are relative, so this assumes the working directory is the one
# that contains the `drive` mount point.
!cp ./embedding.parquet ./drive/MyDrive/Colab_Notebooks/MACS_37000/final/data

## Upload Data to pinecone

After acquiring the embedding data, we upload it to the Pinecone database.

In [None]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-4.1.0-py3-none-any.whl (215 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/215.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m133.1/215.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.5/215.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pinecone-client
Successfully installed pinecone-client-4.1.0


In [None]:
# Mount Google Drive again so the saved Parquet file can be read back.
# NOTE(review): presumably repeated so this upload section can be run on its
# own in a fresh runtime — confirm.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the saved embedding file from Google Drive into the current working
# directory (relative paths; assumes `drive` is reachable from here).
!cp ./drive/MyDrive/Colab_Notebooks/MACS_37000/final/data/embedding.parquet .

In [None]:
import pandas as pd

# Load the product table (with its embedding column) from the local Parquet file.
df = pd.read_parquet('embedding.parquet')

In [None]:
# Preview the first rows to confirm the expected columns were loaded.
df.head()

Unnamed: 0,parent_asin,average_rating,embedding,title,price,description
0,B08BHN9PK5,4.6,"[0.0014340028865262866, 0.003454748773947358, ...",YUEDGE 5 Pairs Men's Moisture Control Cushione...,,
1,B08R39MRDW,4.1,"[0.0014305035583674908, 0.002265076618641615, ...",DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,,
2,B077KJHCJ4,4.3,"[0.001429803785867989, 0.003589642234146595, 0...",Pastel by Vivienne Honey Vanilla Girls' Trapez...,,
3,B0811M2JG9,2.0,"[0.0014319033361971378, -0.0003840545250568539...",Mento Streamtail,29.81,Slip on the Women's Mento and you're ready to ...
4,B07SB2892S,4.3,"[0.0014312033308669925, 0.0009405110613442957,...",RONNOX Women's 3-Pairs Bright Colored Calf Com...,17.99,Ronnox Calf Sleeves - Allowing Your Body to Pe...


In [None]:
# Check the length of the first row's embedding vector.
# NOTE(review): this presumably has to equal the dimension of the Pinecone
# index used below — confirm against the index configuration.
len(df.iloc[0]['embedding'])

128

In [None]:
from pinecone import Pinecone
from tqdm import tqdm
import math

import os

# Read the Pinecone API key from the environment so it never has to be saved
# in the notebook; the empty-string fallback keeps the original placeholder
# when the variable is unset.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))
# Handle to the existing "node-embed" index that receives the vectors.
index = pc.Index("node-embed")

In [None]:
# Turn the DataFrame into a list with one dict per row, keyed by column name.
products = df.to_dict(orient='records')

In [None]:
def _is_missing(value):
  """Return True when `value` is None or a NaN number."""
  if value is None:
    return True
  try:
    return math.isnan(value)
  except TypeError:
    # Strings and other non-numeric values are present, not missing.
    return False


def upload_batch(raw_items):
  """Upsert one batch of product records into the module-level `index`.

  Each record is turned into a vector with `parent_asin` as id, `embedding`
  as values, and title / description / average_rating / price as metadata.
  Missing values are replaced by sentinels before upload: a missing price
  becomes -1, a missing title or description becomes "". The input
  dictionaries are not modified.

  Args:
    raw_items: Iterable of dicts with the keys `parent_asin`, `embedding`,
      `title`, `description`, `average_rating` and `price`.

  Returns:
    True if the batch was upserted (or was empty), False if the upsert
    raised. Failures are reported on stdout instead of being re-raised so a
    single bad batch does not abort a long upload.
  """
  items_to_upsert = []
  for x in raw_items:
    # Work on local copies of the cleaned values so the caller's records
    # keep their original contents.
    price = -1 if _is_missing(x["price"]) else x["price"]
    description = "" if _is_missing(x['description']) else x['description']
    title = "" if _is_missing(x['title']) else x['title']

    items_to_upsert.append({
      "id": x['parent_asin'],
      "values": x['embedding'],
      "metadata": {
        "title": title,
        "description": description,
        "average_rating": x['average_rating'],
        "price": price}
    })

  if not items_to_upsert:
    # Nothing to send; skip the remote call entirely.
    return True

  try:
    index.upsert(items_to_upsert)
  except Exception as e:
    # Best-effort upload: report which batch was lost and carry on.
    print(
      f"Upsert failed for {len(items_to_upsert)} items "
      f"(ids {items_to_upsert[0]['id']} .. {items_to_upsert[-1]['id']}): {e}"
    )
    print()
    return False
  return True

In [None]:
# Send the product records to Pinecone in consecutive slices of
# `segment_size` records each.
segment_size = 100
# Ceiling division: a trailing partial slice still counts as one segment.
num_segments = -(-len(products) // segment_size)
segment_starts = range(0, len(products), segment_size)
for i in tqdm(segment_starts, total=num_segments, desc="Processing segments"):
    products_segment = products[i:i + segment_size]
    upload_batch(products_segment)

Processing segments: 100%|██████████| 8262/8262 [47:49<00:00,  2.88it/s]
