# Hetero Graph Creation

## 1. Setup

In [None]:
!pip install torch_geometric



In [None]:
from torch_geometric.nn import GATv2Conv, to_hetero
from torch_geometric.transforms import ToUndirected
from torch_geometric.data import HeteroData
from torch.nn import Linear, Embedding
import pandas as pd
import numpy as np
import torch
import os

# Remove the column limit for DataFrame display so wide frames are shown in full.
pd.set_option('display.max_columns', None)

## 2. Data Prep

### 2.1 Read Data

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data paths used by this notebook are relative ("drive/MyDrive/..."),
# so they presumably rely on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder of the project inside the mounted Drive (relative to the working directory).
drive_path = "drive/MyDrive/SJ_PCD_24-2/"

# Cleaned customer table.
customers_path = drive_path + "data/cleaned/customers.parquet"
customers = pd.read_parquet(customers_path)

# Cleaned sales table.
sales_path = drive_path + "data/cleaned/sales.parquet"
sales = pd.read_parquet(sales_path)

# Cleaned product table.
products_path = drive_path + "data/cleaned/products.parquet"
products = pd.read_parquet(products_path)

# Product embeddings produced by an earlier transformation step.
products_embeddings_path = drive_path + "data/transformed/products_embeddings.parquet"
products_embeddings = pd.read_parquet(products_embeddings_path)

### 2.2 Update customers

In [None]:
# Map every raw customer_id to its position in the customers table; that position
# is used as the customer's node index in the graph.
customer_ids = customers['customer_id']
customer_id_mapping = dict(zip(customer_ids.values, range(len(customer_ids))))
customers['customer_id_for_graph'] = customer_ids.apply(customer_id_mapping.__getitem__)

# Keep the two identifier columns plus the columns that become node features.
customer_columns = ["customer_id_for_graph", "customer_id", "customer_age", "customer_gender", "purchases"]
for measure in ("total_gross", "total_discount", "total_net"):
    for statistic in ("sum", "mean", "max", "min"):
        customer_columns.append(measure + "_" + statistic)
customers = customers[customer_columns]

### 2.3 Update products and products embeddings

In [None]:
# Give every product a contiguous integer index. This index is both the value stored
# in the edge index and the row position expected in the product feature matrix.
products_id_mapping = {id_: idx for idx, id_ in enumerate(products['product_id'].values)}
products = products.assign(product_id_for_graph=products["product_id"].map(products_id_mapping))
products = products[["product_id_for_graph", "product_id", "product_price", "units_sold"]]

# Attach each product's embedding. `validate` rejects duplicated product_ids on either
# side, which would otherwise duplicate feature rows; sorting by the graph index and
# resetting the index keeps row position == product_id_for_graph.
final_products = (
    products
    .merge(products_embeddings, on="product_id", validate="one_to_one")
    .sort_values("product_id_for_graph")
    .reset_index(drop=True)
)

# The merge is an inner join: a product without an embedding would silently disappear
# and shift every following row, so edges would point at the wrong feature rows.
# Fail loudly instead of building a misaligned graph.
if not np.array_equal(final_products["product_id_for_graph"].to_numpy(), np.arange(len(products))):
    raise ValueError(
        "products and products_embeddings do not match one-to-one: "
        f"{len(products)} products but {len(final_products)} rows after the merge"
    )

# Expand the per-product embedding vector into one numeric column per dimension.
embedding_df = pd.DataFrame(final_products['embedding'].tolist(), index=final_products.index)
final_products = pd.concat([final_products.drop('embedding', axis=1), embedding_df], axis=1)

### 2.4 Update sales

In [None]:
STORE_EMBEDDING_DIM = 5   # width of the vector that encodes each store
STORE_EMBEDDING_SEED = 0  # fixed so that re-running the notebook rebuilds the same edge features

# Mapping sale, customer and product IDs to the integer indices used inside the graph
sales_id_mapping = {id_: idx for idx, id_ in enumerate(sales['sale_id'].values)}
sales = sales.assign(
    sale_id_for_graph=sales["sale_id"].map(sales_id_mapping),
    customer_id_for_graph=sales["customer_id"].map(customer_id_mapping),
    product_id_for_graph=sales["product_id"].map(products_id_mapping),
)

# .map() yields NaN for an ID that is absent from its mapping; raise instead of
# letting a NaN reach the edge index.
for graph_id_col in ("sale_id_for_graph", "customer_id_for_graph", "product_id_for_graph"):
    if sales[graph_id_col].isna().any():
        raise KeyError(f"{graph_id_col}: some sales reference an ID that is missing from its mapping")

# Keep the identifier columns plus the columns that become edge features
sales = sales.loc[:, [
    "sale_id_for_graph", "customer_id_for_graph", "product_id_for_graph", "sale_id", "customer_id",
    "product_id", "store_id", "week_of_year", "day_of_week", "hour", "units", "gross_total", "was_in_promotion",
    "total_discount", "net_total"]]

# Instead of one hot encoding, create embeddings to encode the store id
sales = sales.assign(store_id_code=sales['store_id'].astype('category').cat.codes)
if (sales['store_id_code'] < 0).any():
    # Categorical codes are -1 for missing values, which is not a valid embedding index.
    raise ValueError("store_id contains missing values, which cannot be embedded")

# Create the embeddings for store_id.
# NOTE(review): the embedding is never trained in this notebook, so each store gets a
# random but fixed vector. The weights are drawn from a dedicated seeded generator
# (standard normal, the same distribution torch.nn.Embedding uses by default) so the
# saved graph is reproducible and the global RNG state is left untouched.
num_stores = sales['store_id_code'].nunique()
init_generator = torch.Generator().manual_seed(STORE_EMBEDDING_SEED)
initial_weights = torch.randn(num_stores, STORE_EMBEDDING_DIM, generator=init_generator)
store_embedding = Embedding.from_pretrained(initial_weights, freeze=False)
store_ids_tensor = torch.tensor(sales['store_id_code'].values, dtype=torch.long)
store_embeddings = store_embedding(store_ids_tensor)

# Build the embedding columns on the SAME index as `sales`: pd.concat(axis=1) aligns
# on index labels, so a fresh RangeIndex would misalign rows whenever `sales` does
# not carry a default 0..n-1 index.
embedding_cols = [f'store_embedding_{i}' for i in range(STORE_EMBEDDING_DIM)]
embedding_df = pd.DataFrame(
    store_embeddings.detach().numpy(), columns=embedding_cols, index=sales.index
)

# Concatenate the embeddings with the original dataframe
sales = pd.concat([sales, embedding_df], axis=1)

## 3. Create Graph

In [None]:
# Start from an empty heterogeneous graph container; the following cells fill in
# its node features and edges.
data = HeteroData()

In [None]:
# Node features are every column except the identifiers. Row i of each matrix is
# treated as the node with graph index i.
customer_features = customers.drop(columns=["customer_id", "customer_id_for_graph"])
product_features = final_products.drop(columns=["product_id_for_graph", "product_id"])

data["customer"].x = torch.tensor(customer_features.values, dtype=torch.float)
data["product"].x = torch.tensor(product_features.values, dtype=torch.float)

In [None]:
# Show the shape and contents of each node-feature matrix, with a blank
# separator between the two node types.
for position, node_type in enumerate(("customer", "product")):
    if position > 0:
        print("\n\n")
    node_features = data[node_type].x
    print(node_features.shape)
    print(node_features)

torch.Size([388273, 15])
tensor([[5.0000e+01, 0.0000e+00, 2.2000e+01,  ..., 3.6447e+01, 1.0990e+02,
         5.9900e+00],
        [3.0000e+01, 0.0000e+00, 5.0000e+00,  ..., 2.2702e+01, 3.6240e+01,
         5.4900e+00],
        [3.4000e+01, 1.0000e+00, 7.0000e+00,  ..., 2.4440e+01, 7.9790e+01,
         1.0000e-02],
        ...,
        [2.0000e+01, 1.0000e+00, 6.0000e+00,  ..., 9.9383e+00, 1.6990e+01,
         4.4900e+00],
        [5.7000e+01, 1.0000e+00, 5.0000e+00,  ..., 9.9720e+00, 2.1900e+01,
         4.9900e+00],
        [2.1000e+01, 1.0000e+00, 6.0000e+00,  ..., 1.6642e+01, 4.4900e+01,
         5.9900e+00]])



torch.Size([12899, 770])
tensor([[ 1.9550e+01,  2.3000e+02,  5.5187e-03,  ...,  1.5744e-02,
         -9.0021e-02, -5.3182e-02],
        [ 1.5030e+01,  9.1000e+01, -8.3539e-02,  ..., -6.6962e-02,
         -4.8173e-02, -5.1496e-02],
        [ 2.5390e+01,  5.4700e+02,  8.0060e-02,  ..., -2.7141e-03,
         -8.6794e-02, -8.4612e-02],
        ...,
        [ 4.0870e+01,  6.8000

In [None]:
# One edge per sales row: source = customer node index, target = product node index.
source_nodes = torch.tensor(sales["customer_id_for_graph"].values, dtype=torch.long)
target_nodes = torch.tensor(sales["product_id_for_graph"].values, dtype=torch.long)

# Stack the two index vectors into a tensor with one row of sources and one of targets.
edge_index = torch.stack((source_nodes, target_nodes), dim=0)
data["customer", "bought", "product"].edge_index = edge_index

print(edge_index.shape)
print(edge_index)

torch.Size([2, 6001953])
tensor([[12941, 12941, 12941,  ..., 12940, 12940, 12940],
        [  206, 11318,  5085,  ...,  6878,  9892, 11227]])


In [None]:
# Columns of `sales` that describe each purchase and become the edge features.
edge_attr_features = [
    "week_of_year", "day_of_week", "hour", "units", "gross_total", "was_in_promotion",
    "total_discount", "net_total", "store_embedding_0", "store_embedding_1",
    "store_embedding_2", "store_embedding_3", "store_embedding_4"
]

# Cast to float with the default errors='raise': a column that cannot be converted
# is reported here, by pandas, instead of being silently left as-is and failing later
# inside torch.tensor with a less helpful message.
edge_attr = torch.tensor(sales[edge_attr_features].astype(float).values, dtype=torch.float)
data[("customer", "bought", "product")].edge_attr = edge_attr

print(edge_attr.shape)
print(edge_attr)

torch.Size([6001953, 13])
tensor([[15.0000,  2.0000, 15.0000,  ..., -0.8431, -1.2884,  0.7040],
        [15.0000,  2.0000, 15.0000,  ..., -0.8431, -1.2884,  0.7040],
        [15.0000,  2.0000, 15.0000,  ..., -0.8431, -1.2884,  0.7040],
        ...,
        [11.0000,  0.0000, 16.0000,  ..., -0.4445, -0.6737,  0.7682],
        [11.0000,  0.0000, 16.0000,  ..., -0.4445, -0.6737,  0.7682],
        [11.0000,  0.0000, 16.0000,  ..., -0.4445, -0.6737,  0.7682]])


In [None]:
# Add a reverse edge type for every existing edge type so that information can
# flow in both directions between customers and products.
make_undirected = ToUndirected()
data = make_undirected(data)

# Node types and edge types now present in the graph.
data.metadata()

(['customer', 'product'],
 [('customer', 'bought', 'product'), ('product', 'rev_bought', 'customer')])

In [None]:
# Print the structure of the heterogeneous graph: each node/edge type and the
# shapes of the tensors stored on it.
print(data)

HeteroData(
  customer={ x=[388273, 15] },
  product={ x=[12899, 770] },
  (customer, bought, product)={
    edge_index=[2, 6001953],
    edge_attr=[6001953, 13],
  },
  (product, rev_bought, customer)={
    edge_index=[2, 6001953],
    edge_attr=[6001953, 13],
  }
)


In [None]:
# Human-readable summary of the finished graph: node counts, node feature widths,
# edge counts for the forward and reverse edge types, and the edge feature width.
print(f"My graph has {data['customer'].num_nodes} customer nodes and {data['product'].num_nodes} product nodes.")
print(f"In my graph, customer nodes have {data['customer'].num_features} features, and product nodes have {data['product'].num_features} features.")
print(f"My graph has {data['customer', 'bought', 'product'].num_edges} edges, connecting customer nodes to product nodes.")
print(f"Similarly, there are {data['product', 'rev_bought', 'customer'].num_edges} reverse edges, connecting product nodes to customer nodes.")
print(f"In my graph, each edge has {data['customer', 'bought', 'product'].num_edge_features} features.")

My graph has 388273 customer nodes and 12899 product nodes.
In my graph, customer nodes have 15 features, and product nodes have 770 features.
My graph has 6001953 edges, connecting customer nodes to product nodes.
Similarly, there are 6001953 reverse edges, connecting product nodes to customer nodes.
In my graph, each edge has 13 features.


In [None]:
# Persist the finished graph next to the other transformed artifacts on Drive.
graph_path = drive_path + "data/transformed/graph.pth"
torch.save(data, graph_path)