In [20]:
import pandas as pd
import torch
from torch_frame import TensorFrame, stype
from torch_frame.nn import (
    StypeWiseFeatureEncoder,
    EmbeddingEncoder,
    LinearBucketEncoder,
)
from torch_frame.data import Dataset
from torch.nn import LayerNorm

Let's start by creating initial embeddings for our top 150 players using PyTorch Frame (`torch_frame`).

In [None]:
# Read the per-player feature table from its CSV file into a DataFrame.
players = pd.read_csv(filepath_or_buffer="../data/player_features.csv")

In [30]:
# Size of the embedding vector produced for every column of every player row.
channels = 128

# Semantic type (stype) assigned to each feature column of `players`.
# NOTE(review): player_id is typed as a numerical feature here, so the raw
# identifier is fed to the encoder like any other number — confirm that is intended.
col_to_stype = {
    **dict.fromkeys(
        ("player_id", "current_rank", "dob", "height"),
        stype.numerical,
    ),
    "country_num": stype.categorical,
}

# Wrap the DataFrame in a Dataset. Materializing it computes the per-column
# statistics (col_stats) and builds the TensorFrame.
ds = Dataset(df=players, col_to_stype=col_to_stype)
ds = ds.materialize()
tf_players, col_stats = ds.tensor_frame, ds.col_stats
col_names_dict = tf_players.col_names_dict

# One encoder per stype. The numerical encoder's output is passed through a
# LayerNorm over the channel dimension.
stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical: LinearBucketEncoder(post_module=LayerNorm(channels)),
}

# Combine the per-stype encoders, using the statistics computed during materialization.
encoder = StypeWiseFeatureEncoder(
    out_channels=channels,
    col_stats=col_stats,
    col_names_dict=col_names_dict,
    stype_encoder_dict=stype_encoder_dict,
)

# Encode every player row. The second return value is kept as `_meta` and is
# not used in this cell.
x, _meta = encoder(tf_players)  # x: [batch, num_cols, channels]

# Collapse the column axis with a plain mean to get one vector per player;
# a more elaborate pooling can replace this later.
player_emb = torch.mean(x, dim=1)

Now that we have our initial player embeddings, we can load the match edges (winner → loser) and build our graph.

In [32]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GATConv

In [None]:
# Load the edge list: one row per edge, with a winner/loser pair plus
# the surface code and the `days_ago` value used as edge attributes.
edges = pd.read_csv("../data/edges.csv")

# Number of distinct surface codes; fixes the width of the one-hot block below.
NUM_SURFACES = 3

# Build edge_index with shape [2, E] (row 0 = winner_idx, row 1 = loser_idx).
# Convert ONE 2-D NumPy array rather than a Python list of arrays: the list form
# makes torch.tensor copy element by element and emits PyTorch's
# "Creating a tensor from a list of numpy.ndarrays is extremely slow" warning.
edge_index = torch.tensor(
    edges[["winner_idx", "loser_idx"]].to_numpy().T,
    dtype=torch.long,
).contiguous()

# Every endpoint must address an existing row of player_emb; fail here with a
# clear message instead of with an indexing error deep inside a GNN layer.
assert edge_index.numel() == 0 or (
    int(edge_index.min()) >= 0 and int(edge_index.max()) < player_emb.size(0)
), "edge_index refers to a player row that does not exist in player_emb"

# Edge attributes, part 1: one-hot encoded surface code.
surface = torch.tensor(edges["surface"].to_numpy(), dtype=torch.long)
surface_oh = torch.nn.functional.one_hot(surface, num_classes=NUM_SURFACES).float()  # [E, NUM_SURFACES]

# Edge attributes, part 2: `days_ago`, standardised to mean 0 / std 1.
# The small epsilon keeps the division finite when all values are identical.
days_raw = torch.tensor(edges["days_ago"].to_numpy(), dtype=torch.float32).unsqueeze(1)  # [E, 1]
days = (days_raw - days_raw.mean()) / (days_raw.std() + 1e-6)

edge_attr = torch.cat([surface_oh, days], dim=1)  # [E, NUM_SURFACES + 1]

# Assemble the graph: player embeddings as node features, edges and their attributes.
g = Data(x=player_emb, edge_index=edge_index, edge_attr=edge_attr)

torch.Size([4137, 1]) torch.Size([4137, 3])
