In [293]:
import weaviate 
import os
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
import pickle
from skipgram import SkipGram
import torch
import numpy as np
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import weaviate.classes as wvc
from weaviate.util import generate_uuid5
import random
import time

In [294]:
# Load environment variables from .env file
load_dotenv()

# Directories holding the raw dataset and the trained model artifacts
file_path = os.getenv("PATH_TO_ORIGINAL_DATA")
model_path = os.getenv("PATH_TO_MODELS")

# Fail early with a readable message instead of a TypeError while building the paths below
if not file_path or not model_path:
    raise EnvironmentError(
        "PATH_TO_ORIGINAL_DATA and PATH_TO_MODELS must be set (e.g. in the .env file)"
    )

# define file names and paths
dataset_file_name = "2019-Oct.csv"
pkl_file_name = "token_map.pkl"
model_file_name = "finished_OpenCDPEmbedding.pt"

# define the dataset and model file paths
# os.path.join inserts the separator when the configured directory lacks a trailing one
dataset = os.path.join(file_path, dataset_file_name)
vocab_map_file = os.path.join(model_path, pkl_file_name)
embd_model_file = os.path.join(model_path, model_file_name)

In [295]:
# Pandas dtype for every column that is read from the CSV
dtype_mapping = dict(
    event_type="category",
    product_id="UInt32",
    category_code="category",
)

# Columns to load, in the same order as the dtype mapping above
columns = list(dtype_mapping)

# Context size passed to the SkipGram model
context_size = 2

In [None]:
# Data Source: https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store?select=2019-Oct.csv
# Read only the configured columns of the first 3000 rows of the CSV
df = pd.read_csv(
    dataset,
    nrows=3000,
    usecols=columns,
    dtype=dtype_mapping,
)

print(df.head())

In [None]:
# Build the lookup token "<event_type><product_id>" for every row;
# this column is what gets looked up in the vocabulary mapping further down
df['merged'] = df['event_type'].astype(str) + df['product_id'].astype(str)

df['merged'].head()

In [None]:
# Load the checkpoint on the CPU to read the embedding dimension from the stored weights
embed_model = torch.load(embd_model_file, map_location=torch.device("cpu"))

# The dimension is taken from the second axis of the "embedding.weight" entry
# (assumes the weight is laid out as (vocab_size, embedding_dim) — TODO confirm)
embedding_weights = embed_model["model"]["embedding.weight"]
embedding_dim = embedding_weights.shape[1]

print(f"Embedding-Dimension: {embedding_dim}")

In [299]:
# Load the vocabulary mapping from the pickle file
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source
with open(vocab_map_file, "rb") as f:
    vocab_mapping = pickle.load(f)

In [None]:
# Create the embedding model from the checkpoint file, the vocabulary mapping,
# the embedding dimension read above and the configured context size
# (SkipGram is a project-local class; presumably this restores the trained weights — TODO confirm)
embedding = SkipGram.create_from_checkpoint(embd_model_file, vocab_mapping, embedding_dim, context_size)

In [301]:
def vectorize_item(item_id, model, vocab_map):
    """Return the embedding of ``item_id`` as a NumPy array.

    Args:
        item_id: Key to look up in ``vocab_map``.
        model: Object exposing ``embed(index_tensor)``.
        vocab_map: Mapping from item key to vocabulary index.

    Returns:
        The squeezed embedding vector, or ``None`` when ``item_id`` is not a
        key of ``vocab_map``.
    """
    if item_id in vocab_map:
        token = torch.tensor([vocab_map[item_id]], dtype=torch.long)
        # Inference only: no gradient tracking is needed for the lookup
        with torch.no_grad():
            return model.embed(token).squeeze().numpy()
    return None

In [302]:
# Look up the embedding for every row's merged token
# (entries are None where the token is not in the vocabulary)
vectors = [
    vectorize_item(item_id, embedding, vocab_mapping)
    for item_id in df["merged"]
]

df["vector"] = vectors

In [None]:
# Show only the rows for which an embedding vector was found
df[df['vector'].notna()]

In [None]:
# Preview the first rows; head must be called, a bare `df.head` only shows the bound method
df.head()

In [None]:
# Drop rows without a category_code
# NOTE: this rebinds `df`, so the unfiltered frame is no longer available after this cell
df = df.dropna(subset=['category_code'])
df

In [None]:
# Inspect the column dtypes of the filtered frame
print(df.dtypes)

In [None]:
# Drop the helper columns; what remains is product_id, category_code and the vector
df_product = df.drop(columns=['event_type', 'merged'])
df_product

In [None]:
# Drop rows with any missing value; this removes rows whose vector is None
# (vectorize_item returns None for tokens that are not in the vocabulary)
df_product = df_product.dropna()
df_product

In [None]:
# Create the client
# connect_to_local() is called without arguments, so the client library's default local connection settings apply
client = weaviate.connect_to_local()

# Print the readiness flag reported by the client before continuing
print(client.is_ready())

In [322]:
# Remove the "RecommenderDB" collection so the create cell below can be re-run
client.collections.delete("RecommenderDB")

In [323]:
# Create the "RecommenderDB" collection with its two properties; no vectorizer is
# configured because the vectors are supplied together with the objects
recommendations = client.collections.create(
    name="RecommenderDB",
    properties=[
        wvc.config.Property(name=prop_name, data_type=prop_type)
        for prop_name, prop_type in (
            ("product_id", wvc.config.DataType.INT),
            ("category_code", wvc.config.DataType.TEXT),
        )
    ],
    vectorizer_config=Configure.Vectorizer.none(),
)

In [None]:
# Fetch a handle to the collection for the import and query cells below
# NOTE(review): the name is misspelled ("recommencations"), but the later cells reference it as-is
recommencations = client.collections.get("RecommenderDB")

In [None]:
# Import every product together with its precomputed embedding vector
with recommencations.batch.dynamic() as batch:
    # index=True exposes the DataFrame index as row.Index, which gives every row a
    # stable UUID (with index=False, `row.index` is the tuple's .index method instead)
    for row in df_product.itertuples(index=True):
        recommendation_objs = {
            # Cast to a plain int: the column uses pandas' nullable UInt32 dtype
            "product_id": int(row.product_id),
            "category_code": row.category_code,
        }
        # Add object (including its real embedding vector) to the batch queue
        batch.add_object(
            properties=recommendation_objs,
            uuid=generate_uuid5(row.Index),
            vector=row.vector.tolist(),
        )

# Objects that could not be imported are collected on the batch wrapper; report them
failed_objects = recommencations.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")


In [None]:
# Build one DataObject per row and send them all in a single insert_many request.
# NOTE(review): this targets the same collection as the batch import cell above, so
# running both cells sends every row twice — confirm which import path is intended.
recommendation_objs = [
    wvc.data.DataObject(
        properties={
            # Cast to a plain int: the column uses pandas' nullable UInt32 dtype
            "product_id": int(row["product_id"]),
            "category_code": row["category_code"],
        },
        vector=row["vector"].tolist(),
    )
    for _, row in df_product.iterrows()
]

recommencations = client.collections.get("RecommenderDB")
insert_response = recommencations.data.insert_many(recommendation_objs)

# Per-object failures are reported on the response; surface them here
if insert_response.has_errors:
    print(f"Number of failed inserts: {len(insert_response.errors)}")
print(f"Inserted objects: {len(insert_response.uuids)}")

In [None]:
# testing query
# Read the first 5000 rows of the dataset and keep the last 10 of them as query candidates
last_10_rows = pd.read_csv(dataset, usecols=columns, dtype=dtype_mapping, nrows=5000).tail(10)
print(last_10_rows)

In [None]:
# Build the lookup token and drop incomplete rows. The result has to be assigned
# back because neither assign() nor dropna() modifies the frame in place.
last_10_rows = last_10_rows.assign(
    merged=last_10_rows['event_type'].astype(str) + last_10_rows['product_id'].astype(str)
).dropna()

last_10_rows

In [None]:
def vectorize_random_row(df, model, vocab_map):
    """Pick a random row that can be vectorized and return its embedding.

    Rows with a missing ``category_code`` or ``product_id`` are skipped, as are
    rows whose ``merged`` token is not known to ``vocab_map``.

    Args:
        df: DataFrame with ``category_code``, ``product_id`` and ``merged`` columns.
        model: Embedding model handed through to ``vectorize_item``.
        vocab_map: Mapping from merged token to vocabulary index.

    Returns:
        Tuple ``(vector, row)``: the embedding and the selected row as a Series.

    Raises:
        ValueError: If no row in ``df`` can be vectorized.
    """
    candidates = df.dropna(subset=['category_code', 'product_id'])
    if candidates.empty:
        raise ValueError("No rows with both category_code and product_id available")

    # sample(frac=1) visits every candidate exactly once in random order, so the
    # search terminates even when none of the tokens is in the vocabulary
    for _, random_row in candidates.sample(frac=1).iterrows():
        vector = vectorize_item(random_row['merged'], model, vocab_map)
        if vector is not None:
            return vector, random_row

    raise ValueError("None of the candidate rows has a token in the vocabulary")

# Vectorize a random row from last_10_rows; the resulting vector is used as the
# query vector for the similarity search below
q_vector, random_row = vectorize_random_row(last_10_rows, embedding, vocab_mapping)
print(f"Random Row: {random_row}")
print(f"Vector: {q_vector}")

In [None]:
time.sleep(1)  # Sleep so we don't query before async indexing finishes

# Vector search with the query vector; return at most 5 objects and only the
# two stored properties
product_recommendation = recommencations.query.near_vector(
    near_vector=q_vector,
    limit=5,
    return_properties=["product_id", "category_code"]
)

print(product_recommendation)



In [None]:
# Get a handle to the "RecommenderDB" collection
# (this does not fetch any objects yet; they are iterated in the next cell)
result = client.collections.get("RecommenderDB")

# Show the result
print(result)

In [None]:
# Walk over all stored objects and print each one's properties followed by its vector
for item in result.iterator(include_vector=True):
    print(item.properties, item.vector, sep="\n")

In [None]:
# Print product id and category of every recommended object
for hit in product_recommendation.objects:
    props = hit.properties
    print(props["product_id"], props["category_code"])

# Release the connection to Weaviate
client.close()