In [4]:
import weaviate
import os
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
import pickle
from skipgram import SkipGram
import torch

In [5]:
# --- Artefact file names (combined with directories read from the environment) ---
file_name = "yoochoose-clicks.dat"
pkl_file_name = "yoochoose_trigrams.pkl"
checkpoint_path = "finished_embedding_YooChooseEmbedding.pt"

# Column names handed to pandas via `names=` when reading the click log.
columns = ["session_id", "ts", "item_id", "category_id"]

# Per-column dtypes handed to pandas via `dtype=`; `ts` is read as text and
# converted to epoch seconds in a later cell.
dtype_mapping = dict(
    session_id="UInt32",
    ts="str",
    item_id="UInt32",
    category_id="category",
)

# Context size forwarded to SkipGram.create_from_checkpoint.
context_size = 2

In [None]:
# Load environment variables from the .env file so the os.getenv lookups
# in the next cell can see PATH_TO_ORIGINAL_DATA and PATH_TO_MODELS.
load_dotenv()

In [7]:
# Read the data and model directories from the environment.
# Either value is None when its variable is unset; later cells concatenate
# these with file names, so both variables must be defined in .env.
file_path = os.environ.get("PATH_TO_ORIGINAL_DATA")
model_path = os.environ.get("PATH_TO_MODELS")

In [None]:
# Load the trained checkpoint, mapping all tensors onto the CPU.
# os.path.join is used instead of string concatenation so the path is correct
# whether or not PATH_TO_MODELS ends with a path separator.
# NOTE(review): torch.load may unpickle arbitrary objects; only load
# checkpoints that come from a trusted source.
checkpoint = torch.load(
    os.path.join(model_path, checkpoint_path),
    map_location=torch.device("cpu"),
)

# Derive the embedding dimension from the second axis of the stored
# embedding weight matrix.
embedding_weights = checkpoint["model"]["embedding.weight"]
embedding_dim = embedding_weights.shape[1]

print(f"Embedding-Dimension: {embedding_dim}")

In [None]:
# Load the click log.
# Data Source: https://www.kaggle.com/datasets/chadgostopp/recsys-challenge-2015
# The column names are supplied explicitly via `names=`, and dtypes are fixed
# up front via `dtype=`. os.path.join keeps the path valid whether or not
# PATH_TO_ORIGINAL_DATA ends with a path separator.
data = pd.read_csv(
    os.path.join(file_path, file_name),
    names=columns,
    dtype=dtype_mapping,
)

print(data.head())

In [10]:
# Convert the ISO-8601 `ts` strings to integer Unix timestamps (seconds).
# The strings end in "Z", i.e. they are UTC. Parsing them into naive datetimes
# and calling .timestamp() would interpret them in the machine's local
# timezone, making the result depend on where the notebook runs (and on DST).
# Parsing with utc=True pins them to UTC; the vectorised conversion is also
# far faster than a per-row Python strptime.
ts_utc = pd.to_datetime(data["ts"], format="%Y-%m-%dT%H:%M:%S.%fZ", utc=True)
if ts_utc.isna().any():
    raise ValueError("Column 'ts' contains missing timestamps.")

# Whole seconds since the Unix epoch, independent of the datetime resolution.
data["ts"] = (ts_utc - pd.Timestamp("1970-01-01", tz="UTC")) // pd.Timedelta(seconds=1)

# Order the click events chronologically.
data = data.sort_values(by="ts")

In [11]:
# Load the pickled model metadata (contains the vocabulary mapping).
# os.path.join keeps the path valid whether or not PATH_TO_MODELS ends with a
# path separator.
# NOTE(review): pickle.load can execute arbitrary code; only open pickle files
# that come from a trusted source.
with open(os.path.join(model_path, pkl_file_name), "rb") as f:
    pkl_model = pickle.load(f)

In [None]:
# Show which top-level entries the loaded pickle object provides.
print(pkl_model.keys())

In [13]:
# Extract the action mapping (item id -> embedding index) from the pickle.
# Fail immediately when it is missing: merely printing a message would leave
# `action_mapping` undefined (or stale from an earlier run), and the following
# cells would then fail with a far less helpful error.
if "actions_map" not in pkl_model:
    raise KeyError(
        f"Action Mapping not found. Available keys: {list(pkl_model.keys())}"
    )
action_mapping = pkl_model["actions_map"]

In [None]:

# Rebuild the SkipGram model from the checkpoint file.
# os.path.join keeps the path valid whether or not PATH_TO_MODELS ends with a
# path separator.
embedding = SkipGram.create_from_checkpoint(
    os.path.join(model_path, checkpoint_path),
    action_mapping,
    embedding_dim,
    context_size,
)

In [None]:
# Number of entries in the action mapping.
print(len(action_mapping))

In [None]:
# Preview the first 5 (key, value) pairs of action_mapping, in iteration order.
first_5_action_mapping = {
    key: value
    for position, (key, value) in enumerate(action_mapping.items())
    if position < 5
}
print(first_5_action_mapping)

In [22]:
def vectorize_item(item_id, model, actions_map):
    """Look up the embedding vector for a single item id.

    Args:
        item_id: Key to look up in ``actions_map``.
        model: Object exposing ``embed(tensor)``; it is called with a
            one-element ``torch.long`` tensor holding the mapped index.
        actions_map: Mapping from item id to embedding index.

    Returns:
        The squeezed embedding as a NumPy array, or ``None`` when
        ``item_id`` is not a key of ``actions_map`` (unknown ids have no
        vector).
    """
    if item_id in actions_map:
        lookup = torch.tensor([actions_map[item_id]], dtype=torch.long)
        # Gradient tracking is disabled for the lookup and the conversion.
        with torch.no_grad():
            return model.embed(lookup).squeeze().numpy()
    return None

In [27]:
# Draw a random 10% sample of the rows; the fixed random_state makes the
# sample reproducible across runs.
sampled_data = data.sample(frac=0.1, random_state=42)

In [28]:
# Generate one embedding vector per sampled row (None for item ids that are
# not in the action mapping) and store them in a new "vector" column.
vectors = [
    vectorize_item(item_id, embedding, action_mapping)
    for item_id in sampled_data["item_id"]
]

sampled_data["vector"] = vectors

In [None]:
# Show the first few entries of the new "vector" column.
print(sampled_data['vector'].head())

In [None]:
print(sampled_data['vector'].iloc[1])  # Display the second row's vector (iloc is zero-based)

In [None]:
# Keep an independent copy holding only the item id and its embedding vector.
new_df = sampled_data.loc[:, ['item_id', 'vector']].copy()
print(new_df.head())