In [32]:
import weaviate
import os
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
import pickle
from skipgram import SkipGram
import torch
import numpy as np

In [33]:
# Load environment variables from the .env file
load_dotenv()

# Base directories for the raw data and the trained models come from the environment
file_path = os.getenv("PATH_TO_ORIGINAL_DATA")
model_path = os.getenv("PATH_TO_MODELS")

# os.getenv returns None for an unset variable; fail early with a readable
# message instead of a cryptic TypeError further down.
missing_env_vars = [
    var_name
    for var_name, var_value in (
        ("PATH_TO_ORIGINAL_DATA", file_path),
        ("PATH_TO_MODELS", model_path),
    )
    if var_value is None
]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

# File names of the dataset, the vocabulary mapping and the embedding checkpoint
dataset_file_name = "2019-Oct.csv"
pkl_file_name = "token_map.pkl"
model_file_name = "finished_OpenCDPEmbedding.pt"

# Build the full paths with os.path.join so the directory variables work with
# or without a trailing path separator.
dataset = os.path.join(file_path, dataset_file_name)
vocab_map_file = os.path.join(model_path, pkl_file_name)
embd_model_file = os.path.join(model_path, model_file_name)

In [34]:
# Columns to read from the CSV file
columns = [
    "event_type",
    "product_id",
    "category_code",
]

# pandas dtype to use for each of those columns
dtype_mapping = dict(
    event_type="category",
    product_id="UInt32",
    category_code="category",
)

# Context size passed to the SkipGram model
context_size = 2

In [None]:
# Data Source: https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store?select=2019-Oct.csv
# Read only the selected columns of the first 3000 rows of the CSV file
df = pd.read_csv(
    dataset,
    usecols=columns,
    dtype=dtype_mapping,
    nrows=3000,
)

# Show the first rows of the loaded frame
print(df.head())

In [None]:
# Concatenate the event type and the product id (both as strings, no separator)
# into one token per row.
event_type_text = df["event_type"].astype(str)
product_id_text = df["product_id"].astype(str)
df["merged"] = event_type_text + product_id_text

df["merged"].head()

In [None]:
# Load the checkpoint onto the CPU; it is inspected here only to read the
# embedding dimension.
cpu_device = torch.device("cpu")
embed_model = torch.load(embd_model_file, map_location=cpu_device)

# The size of axis 1 of the stored embedding weight is taken as the embedding dimension
model_state = embed_model["model"]
embedding_weights = model_state["embedding.weight"]
embedding_dim = embedding_weights.shape[1]

print(f"Embedding-Dimension: {embedding_dim}")

In [38]:
# Read the vocabulary mapping back from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code, so only load files from a trusted source.
with open(vocab_map_file, mode="rb") as vocab_file:
    vocab_mapping = pickle.load(vocab_file)

In [None]:
# Build the SkipGram embedding model from the saved checkpoint, using the
# vocabulary mapping, embedding dimension and context size defined above.
embedding = SkipGram.create_from_checkpoint(
    embd_model_file,
    vocab_mapping,
    embedding_dim,
    context_size,
)

In [40]:
def vectorize_item(item_id, model, vocab_map):
    """Return the embedding vector of a single item, or None if it is unknown.

    Args:
        item_id: Key to look up in ``vocab_map``.
        model: Object exposing an ``embed`` method that accepts a long tensor
            of indices.
        vocab_map: Mapping from item keys to integer indices.

    Returns:
        The output of ``model.embed`` for the item's index, squeezed and
        converted to a NumPy array, or ``None`` when ``item_id`` is not a key
        of ``vocab_map``.
    """
    if item_id in vocab_map:
        token_index = vocab_map[item_id]
        lookup = torch.tensor([token_index], dtype=torch.long)
        # Gradients are not needed for a pure lookup
        with torch.no_grad():
            embedded = model.embed(lookup)
            return embedded.squeeze().numpy()
    return None

In [41]:
# Compute one embedding vector (or None for unknown tokens) per merged token
# and store the result as a new column.
vectors = [
    vectorize_item(item_id, embedding, vocab_mapping)
    for item_id in df["merged"]
]

df["vector"] = vectors

In [None]:
# Keep only the rows for which an embedding vector was found
has_vector = df["vector"].notna()
df[has_vector]