In [24]:
import weaviate
import os
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
import pickle
from skipgram import SkipGram
import torch

In [25]:
file_name = "yoochoose-clicks.dat"
pkl_file_name= "yoochoose_trigrams.pkl"
checkpoint_path = "finished_embedding_YooChooseEmbedding.pt"

columns = ["session_id", "ts", "item_id", "category_id"]

dtype_mapping = {
    "session_id": "UInt32",
    "ts": "str",
    "item_id": "UInt32",
    "category_id": "category"
}

context_size = 2

In [26]:
# # Load environment variables from .env file
load_dotenv()

True

In [27]:
# # Get the file path from the environment variable
file_path = os.getenv("PATH_TO_ORIGINAL_DATA")
model_path = os.getenv("PATH_TO_MODELS")

In [28]:
# Load checkpoint
checkpoint = torch.load(model_path + checkpoint_path, map_location=torch.device("cpu"))

# Check embedding dim
embedding_weights = checkpoint["model"]["embedding.weight"]
embedding_dim = embedding_weights.shape[1]

print(f"Embedding-Dimension: {embedding_dim}")

Embedding-Dimension: 32


  checkpoint = torch.load(model_path + checkpoint_path, map_location=torch.device("cpu"))


In [30]:
# Load the data
# Data Source: https://www.kaggle.com/datasets/chadgostopp/recsys-challenge-2015
data = pd.read_csv(file_path + file_name, names=columns, dtype=dtype_mapping)

print(data.head())

   session_id                        ts    item_id category_id
0           1  2014-04-07T10:51:09.277Z  214536502           0
1           1  2014-04-07T10:54:09.868Z  214536500           0
2           1  2014-04-07T10:54:46.998Z  214536506           0
3           1  2014-04-07T10:57:00.306Z  214577561           0
4           2  2014-04-07T13:56:37.614Z  214662742           0


In [31]:
# transform to timestamp (in seconds)
data.ts = data.ts.apply(lambda x: int(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp()))
data.sort_values(by="ts", inplace=True)

In [32]:
#load vocabulary mapping from pickle file
with open(model_path + pkl_file_name, "rb") as f:
    pkl_model = pickle.load(f)

In [33]:
print(pkl_model.keys())

dict_keys(['ngrams', 'actions_map'])


In [34]:
# Extract the action mapping
if "actions_map" in pkl_model:
    action_mapping = pkl_model["actions_map"]
    #print("Action Mapping:", action_mapping)
else:
    print("Action Mapping not found.")

In [35]:

embedding = SkipGram.create_from_checkpoint(model_path + checkpoint_path, action_mapping, embedding_dim, context_size)

  checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))


In [52]:
import numpy as np

# Mapping of the item_ids from the Yoochoose data with the action_mapping
data["item_id_mapped"] = data["item_id"].apply(lambda x: action_mapping.get(np.int32(x), len(action_mapping)))

In [54]:
# Function to vectorize items
def vectorize_items(item_ids, model):
    item_ids_tensor = torch.tensor(item_ids, dtype=torch.long)
    with torch.no_grad():
        embeddings = model.embed(item_ids_tensor)
    return embeddings.numpy()

# Calculate the vectors for the first 10 items
item_ids = data["item_id_mapped"].tolist()
item_vectors = vectorize_items(item_ids, embedding)

print(item_vectors)

[[ 3.4318674   4.4825516  -2.544856   ...  0.7612928  -0.9312703
   0.6341148 ]
 [-0.9762234   5.6311107  -2.3641407  ...  0.6059743  -0.9499383
  -1.1148039 ]
 [-1.3192425   4.2692685  -3.398055   ...  0.59222704  1.0682908
  -0.70420265]
 ...
 [-3.8075075   3.0427346  -1.5741898  ...  2.2775836   0.70090294
  -1.9142159 ]
 [-3.8075075   3.0427346  -1.5741898  ...  2.2775836   0.70090294
  -1.9142159 ]
 [-4.7697096  -4.071556    0.22586069 ...  1.536745   -0.8430815
   2.5262337 ]]


In [57]:
from sklearn.metrics.pairwise import cosine_similarity

example_item_id = data["item_id_mapped"].iloc[11]  # Beispiel-Item (numerische ID)
example_vector = item_vectors[example_item_id]  # Vektor des Items

# Ähnlichkeit berechnen
similarities = cosine_similarity([example_vector], item_vectors)

# IDs der ähnlichsten Produkte (sortiert nach Ähnlichkeit)
similar_indices = np.argsort(similarities[0])[::-1][:5]  # Top 5 ähnliche Produkte

# Rückführung der numerischen IDs in die ursprünglichen item_id
reverse_action_mapping = {v: k for k, v in action_mapping.items()}  # Mapping umkehren
# Check if the key exists in the dictionary before accessing it
similar_items = []
for idx in similar_indices:
    if idx in reverse_action_mapping:
        similar_items.append(reverse_action_mapping[idx])
    else:
        print(f"Key {idx} not found in reverse_action_mapping")

print(f"Ähnliche Produkte für Item {example_item_id}: {similar_items}")

Key 232371 not found in reverse_action_mapping
Key 178656 not found in reverse_action_mapping
Key 3732864 not found in reverse_action_mapping
Ähnliche Produkte für Item 43438: [np.int32(214840567), np.int32(214827105)]
