In [1]:
import torch

In [2]:
torch.cuda.is_available()

False

In [3]:
torch.__version__

'2.6.0+cpu'

In [4]:
import torch_rechub

In [5]:
# Show the version string exposed by the imported package.
# NOTE(review): the recorded output is '0.1.0', while `pip list` in the next cell reports
# torch-rechub 0.3.0 — the module attribute appears to be stale. TODO confirm, e.g. with
# importlib.metadata.version("torch-rechub"), which reads the installed distribution metadata.
torch_rechub.__version__

'0.1.0'

In [6]:
!pip list|grep rechub

torch-rechub              0.3.0


In [7]:
torch_rechub.__version__

'0.1.0'

### 加载包

In [9]:

import random

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder

from torch_rechub.basic.features import SparseFeature, SequenceFeature
from torch_rechub.models.matching import DSSM
from torch_rechub.trainers import MatchTrainer
from torch_rechub.utils.data import df_to_dict, MatchDataGenerator
from torch_rechub.utils.match import generate_seq_feature_match, gen_model_input

# Seed every random number generator the notebook may draw from, not only PyTorch's.
# torch.manual_seed alone leaves NumPy and Python's `random` unseeded, so any sampling or
# shuffling done outside torch (presumably the negative sampling during data preparation —
# TODO confirm which RNG torch_rechub uses there) would differ from run to run.
SEED = 2022
random.seed(SEED)
np.random.seed(SEED)
# Kept as the last expression so the cell still displays the torch Generator object.
torch.manual_seed(SEED)



<torch._C.Generator at 0x7f12703e9770>

### 准备数据

In [10]:
# Read the sample ratings table and report how many records it contains.
ratings_path = '/data/ml-1m_sample.csv'
data = pd.read_csv(ratings_path)
print(f"数据集大小: {len(data)} 条记录")

数据集大小: 100 条记录


In [11]:
# Derive the category feature from `genres`: keep only the text before the first "|".
data["cate_id"] = data["genres"].map(lambda genre_field: genre_field.partition("|")[0])


In [12]:
# ========== 2. Feature encoding ==========
user_col = "user_id"
item_col = "movie_id"
sparse_features = [user_col, item_col, "gender", "age", "occupation", "zip", "cate_id"]

# Re-encode each sparse column as consecutive integers starting at 1 and remember,
# per column, the largest code plus one (used later as the vocabulary size).
feature_max_idx = {}
for feat in sparse_features:
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(data[feat])
    data[feat] = encoded + 1  # shift by one so that 0 stays reserved for padding
    feature_max_idx[feat] = data[feat].max() + 1

# ========== 3. Define user-tower and item-tower features ==========
user_cols = [user_col, "gender", "age", "occupation", "zip"]
item_cols = [item_col, "cate_id"]

# One profile row per user / per movie (the first occurrence is kept).
user_profile = data[user_cols].drop_duplicates(subset=user_col)
item_profile = data[item_cols].drop_duplicates(subset=item_col)

In [13]:
# ========== 4. Generate sequence features and training data ==========
# Keyword settings for the train/test sample generation, gathered in one place.
sample_options = {
    "time_col": "timestamp",
    "item_attribute_cols": [],
    "sample_method": 1,
    "mode": 0,  # point-wise
    "neg_ratio": 3,
    "min_item": 0,
}
df_train, df_test = generate_seq_feature_match(data, user_col, item_col, **sample_options)

# Build the model inputs for training, then split the label off from the feature inputs.
train_inputs = gen_model_input(df_train, user_profile, user_col, item_profile, item_col, seq_max_len=50)
y_train = train_inputs["label"]
x_train = {key: values for key, values in train_inputs.items() if key != "label"}

# The test inputs are built the same way and kept as returned.
x_test = gen_model_input(df_test, user_profile, user_col, item_profile, item_col, seq_max_len=50)


preprocess data


generate sequence features: 100%|██████████| 2/2 [00:00<00:00, 796.26it/s]

n_train: 384, n_test: 2
0 cold start user dropped 





In [14]:
# ========== 5. Define the feature types ==========
# All features below use the same embedding width.
embed_dim = 16

# User tower: one sparse feature per user column ...
user_features = [
    SparseFeature(col, vocab_size=feature_max_idx[col], embed_dim=embed_dim)
    for col in user_cols
]
# ... followed by the mean-pooled movie history, declared as shared with "movie_id".
user_features.append(
    SequenceFeature(
        "hist_movie_id",
        vocab_size=feature_max_idx["movie_id"],
        embed_dim=embed_dim,
        pooling="mean",
        shared_with="movie_id",
    )
)

# Item tower: one sparse feature per item column.
item_features = [
    SparseFeature(col, vocab_size=feature_max_idx[col], embed_dim=embed_dim)
    for col in item_cols
]


In [15]:
user_features[:10]

[<SparseFeature user_id with Embedding shape (3, 16)>,
 <SparseFeature gender with Embedding shape (3, 16)>,
 <SparseFeature age with Embedding shape (3, 16)>,
 <SparseFeature occupation with Embedding shape (3, 16)>,
 <SparseFeature zip with Embedding shape (3, 16)>,
 <SequenceFeature hist_movie_id with Embedding shape (94, 16)>]

In [16]:
item_features[:10]

[<SparseFeature movie_id with Embedding shape (94, 16)>,
 <SparseFeature cate_id with Embedding shape (11, 16)>]

In [17]:
# ========== 6. Create the DataLoaders ==========
# The generator wraps the training inputs and labels.
dg = MatchDataGenerator(x=x_train, y=y_train)

# Test-side user inputs and the dict form of the full item profile.
test_user, all_item = x_test, df_to_dict(item_profile)

# Three loaders come back: training samples, test users, and all items.
train_dl, test_dl, item_dl = dg.generate_dataloader(test_user, all_item, batch_size=256)


In [18]:
dg

<torch_rechub.utils.data.MatchDataGenerator at 0x7f1200683da0>

In [19]:
# ========== 7. Define the model ==========
def _tower_params():
    """Return a fresh tower configuration so the two towers never share a dict or list."""
    return {"dims": [128, 64], "activation": "prelu"}


# Both towers are configured identically.
model = DSSM(
    user_features,
    item_features,
    temperature=0.02,
    user_params=_tower_params(),
    item_params=_tower_params(),
)


In [20]:
model

DSSM(
  (embedding): EmbeddingLayer(
    (embed_dict): ModuleDict(
      (user_id): Embedding(3, 16)
      (gender): Embedding(3, 16)
      (age): Embedding(3, 16)
      (occupation): Embedding(3, 16)
      (zip): Embedding(3, 16)
      (movie_id): Embedding(94, 16)
      (cate_id): Embedding(11, 16)
    )
  )
  (user_mlp): MLP(
    (mlp): Sequential(
      (0): Linear(in_features=96, out_features=128, bias=True)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PReLU(num_parameters=1)
      (3): Dropout(p=0, inplace=False)
      (4): Linear(in_features=128, out_features=64, bias=True)
      (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): PReLU(num_parameters=1)
      (7): Dropout(p=0, inplace=False)
    )
  )
  (item_mlp): MLP(
    (mlp): Sequential(
      (0): Linear(in_features=32, out_features=128, bias=True)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_ru

In [21]:
# ========== 8. Train the model ==========
# Trainer settings gathered in one place before the trainer is built.
trainer_settings = {
    "mode": 0,  # point-wise
    "optimizer_params": {"lr": 1e-4, "weight_decay": 1e-6},
    "n_epoch": 2,
    "device": "cpu",
    "model_path": "./",
}
trainer = MatchTrainer(model, **trainer_settings)

# Run training over the training DataLoader.
trainer.fit(train_dl)

epoch: 0


train: 100%|██████████| 2/2 [00:00<00:00,  2.78it/s]


epoch: 1


train: 100%|██████████| 2/2 [00:00<00:00,  6.03it/s]


In [23]:
!ls -lrth

total 180K
-rw-r--r-- 1 root root 2.9K Feb 13 17:22 Day1-初始化.ipynb
-rw-r--r-- 1 root root 154K Feb 13 17:41 model.pth
-rw-r--r-- 1 root root  20K Feb 13 17:41 day2-RecallSys.ipynb


In [24]:
# ========== 9. Export the embedding vectors ==========
def _tower_embedding(tower, loader):
    """Run embedding inference for one tower ("user" or "item") over the given loader."""
    return trainer.inference_embedding(model, mode=tower, data_loader=loader, model_path="./")


# User tower first, then item tower (same order as the progress bars in the output).
user_embedding = _tower_embedding("user", test_dl)
item_embedding = _tower_embedding("item", item_dl)

print(f"用户嵌入维度: {user_embedding.shape}")
print(f"物品嵌入维度: {item_embedding.shape}")

user inference: 100%|██████████| 1/1 [00:00<00:00,  4.50it/s]
item inference: 100%|██████████| 1/1 [00:00<00:00,  5.97it/s]

用户嵌入维度: torch.Size([2, 64])
物品嵌入维度: torch.Size([93, 64])





In [25]:
user_embedding

tensor([[-0.0430,  0.0229,  0.2193, -0.0459, -0.0040,  0.1367,  0.1212, -0.0384,
         -0.0393, -0.0276, -0.0210,  0.3142, -0.0317, -0.0762, -0.0239, -0.0718,
         -0.0815, -0.0014,  0.0115,  0.0840,  0.0721, -0.0852,  0.2254, -0.0742,
          0.0406,  0.2351,  0.2641, -0.0063, -0.0227, -0.0332,  0.0466,  0.2733,
         -0.0046, -0.0526,  0.2190, -0.0483, -0.0031, -0.0054,  0.3504,  0.1895,
          0.1686, -0.0021, -0.0226,  0.1977, -0.0382, -0.0030,  0.2833, -0.0059,
         -0.0334, -0.0249, -0.0101,  0.0957,  0.0604,  0.0610, -0.0894,  0.0857,
         -0.0346,  0.1687, -0.0657,  0.0020,  0.1581,  0.2239,  0.0591, -0.0534],
        [-0.0431,  0.0232,  0.2189, -0.0463, -0.0035,  0.1365,  0.1219, -0.0385,
         -0.0394, -0.0277, -0.0207,  0.3126, -0.0319, -0.0761, -0.0238, -0.0718,
         -0.0816, -0.0016,  0.0097,  0.0832,  0.0729, -0.0855,  0.2258, -0.0743,
          0.0402,  0.2346,  0.2639, -0.0068, -0.0228, -0.0333,  0.0459,  0.2725,
         -0.0045, -0.0526, 

In [26]:
item_embedding

tensor([[-0.0197,  0.2466,  0.2210,  ...,  0.2887,  0.3514, -0.0856],
        [-0.0196,  0.2471,  0.2207,  ...,  0.2888,  0.3511, -0.0854],
        [-0.0197,  0.2466,  0.2207,  ...,  0.2885,  0.3513, -0.0854],
        ...,
        [-0.0198,  0.2470,  0.2212,  ...,  0.2885,  0.3512, -0.0855],
        [-0.0197,  0.2462,  0.2207,  ...,  0.2886,  0.3518, -0.0856],
        [-0.0197,  0.2465,  0.2209,  ...,  0.2883,  0.3514, -0.0855]])

In [27]:
!ls -lrth

total 184K
-rw-r--r-- 1 root root 2.9K Feb 13 17:22 Day1-初始化.ipynb
-rw-r--r-- 1 root root 154K Feb 13 17:41 model.pth
-rw-r--r-- 1 root root  23K Feb 13 17:42 day2-RecallSys.ipynb


In [30]:
# Save: write the model's state_dict to model.pth in the working directory.
weights = model.state_dict()
torch.save(weights, "model.pth")

In [31]:
model

DSSM(
  (embedding): EmbeddingLayer(
    (embed_dict): ModuleDict(
      (user_id): Embedding(3, 16)
      (gender): Embedding(3, 16)
      (age): Embedding(3, 16)
      (occupation): Embedding(3, 16)
      (zip): Embedding(3, 16)
      (movie_id): Embedding(94, 16)
      (cate_id): Embedding(11, 16)
    )
  )
  (user_mlp): MLP(
    (mlp): Sequential(
      (0): Linear(in_features=96, out_features=128, bias=True)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PReLU(num_parameters=1)
      (3): Dropout(p=0, inplace=False)
      (4): Linear(in_features=128, out_features=64, bias=True)
      (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): PReLU(num_parameters=1)
      (7): Dropout(p=0, inplace=False)
    )
  )
  (item_mlp): MLP(
    (mlp): Sequential(
      (0): Linear(in_features=32, out_features=128, bias=True)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_ru

In [32]:
# Load: read the saved state_dict back into the existing model instance.
# map_location="cpu" keeps the load working on hosts without CUDA even if the checkpoint
# was written from a GPU run; weights_only=True restricts unpickling to tensors and plain
# containers, so a tampered checkpoint cannot execute code (it is only the default on
# newer torch releases, hence made explicit here).
state_dict = torch.load("model.pth", map_location="cpu", weights_only=True)
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [33]:
model

DSSM(
  (embedding): EmbeddingLayer(
    (embed_dict): ModuleDict(
      (user_id): Embedding(3, 16)
      (gender): Embedding(3, 16)
      (age): Embedding(3, 16)
      (occupation): Embedding(3, 16)
      (zip): Embedding(3, 16)
      (movie_id): Embedding(94, 16)
      (cate_id): Embedding(11, 16)
    )
  )
  (user_mlp): MLP(
    (mlp): Sequential(
      (0): Linear(in_features=96, out_features=128, bias=True)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PReLU(num_parameters=1)
      (3): Dropout(p=0, inplace=False)
      (4): Linear(in_features=128, out_features=64, bias=True)
      (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): PReLU(num_parameters=1)
      (7): Dropout(p=0, inplace=False)
    )
  )
  (item_mlp): MLP(
    (mlp): Sequential(
      (0): Linear(in_features=32, out_features=128, bias=True)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_ru

In [34]:
!ls -lrth

total 188K
-rw-r--r-- 1 root root 2.9K Feb 13 17:22 Day1-初始化.ipynb
-rw-r--r-- 1 root root 154K Feb 13 17:44 model.pth
-rw-r--r-- 1 root root  26K Feb 13 17:44 day2-RecallSys.ipynb


In [35]:
# Export the trained model to an ONNX file in the working directory.
# NOTE(review): the recorded run of this cell failed with
# "RuntimeError: Failed to export ONNX model: Module onnx is not installed!" —
# the `onnx` package has to be installed before this cell can succeed.
trainer.export_onnx("model.onnx")

# The two towers of the model can also be exported separately.
trainer.export_onnx("user_tower.onnx", mode="user")
trainer.export_onnx("item_tower.onnx", mode="item")



RuntimeError: Failed to export ONNX model: Module onnx is not installed!

In [37]:
!pip install onnx --index-url https://mirrors.aliyun.com/pypi/simple/

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Collecting onnx
  Downloading https://mirrors.aliyun.com/pypi/packages/fb/71/d3fec0dcf9a7a99e7368112d9c765154e81da70fcba1e3121131a45c245b/onnx-1.20.1-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.5/17.5 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting protobuf>=4.25.1 (from onnx)
  Downloading https://mirrors.aliyun.com/pypi/packages/9b/53/a9443aa3ca9ba8724fdfa02dd1887c1bcd8e89556b715cfbacca6b63dbec/protobuf-6.33.5-cp39-abi3-manylinux2014_x86_64.whl (323 kB)
Collecting ml_dtypes>=0.5.0 (from onnx)
  Downloading https://mirrors.aliyun.com/pypi/packages/3a/cb/28ce52eb94390dda42599c98ea0204d74799e4d8047a0eb559b6fd648056/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m13.3 MB/s[0m eta [36m0:00:0

In [None]:
# 双塔模型（DSSM）—— 召回阶段的"向量引擎"
# 核心思想：用户塔与物品塔独立编码 → 生成低维向量 → 通过向量相似度（内积/余弦）实现大规模候选集快速检索。

In [38]:
# Export the trained model to an ONNX file in the working directory.
trainer.export_onnx("model.onnx")

# The two towers of the model can also be exported separately.
trainer.export_onnx("user_tower.onnx", mode="user")
# Last expression of the cell: its return value is what the cell displays
# (the recorded output shows True).
trainer.export_onnx("item_tower.onnx", mode="item")

True

In [39]:
!ls -lrth

total 432K
-rw-r--r-- 1 root root 2.9K Feb 13 17:22 Day1-初始化.ipynb
-rw-r--r-- 1 root root 154K Feb 13 17:44 model.pth
-rw-r--r-- 1 root root  44K Feb 13 17:47 day2-RecallSys.ipynb
-rw-r--r-- 1 root root  63K Feb 13 17:47 model.onnx
-rw-r--r-- 1 root root 100K Feb 13 17:47 user_tower.onnx
-rw-r--r-- 1 root root  63K Feb 13 17:47 item_tower.onnx


In [44]:
!pip install onnxruntime --index-url https://mirrors.aliyun.com/pypi/simple/

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Collecting onnxruntime
  Downloading https://mirrors.aliyun.com/pypi/packages/7d/a1/43ad01b806a1821d1d6f98725edffcdbad54856775643718e9124a09bfbe/onnxruntime-1.24.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m13.4 MB/s[0m  [33m0:00:01[0mm0:00:01[0m00:01[0m
[?25hCollecting flatbuffers (from onnxruntime)
  Downloading https://mirrors.aliyun.com/pypi/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl (26 kB)
Installing collected packages: flatbuffers, onnxruntime
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [onnxruntime][0m [onnxruntime]
[1A[2KSuccessfully installed flatbuffers-25.12.19 onnxruntime-1.24.1
[0m

In [42]:
!pip install -U pip --no-cache-dir --index-url https://mirrors.aliyun.com/pypi/simple/

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Collecting pip
  Downloading https://mirrors.aliyun.com/pypi/packages/de/f0/c81e05b613866b76d2d1066490adf1a3dbc4ee9d9c839961c3fc8a6997af/pip-26.0.1-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed pip-26.0.1
[0m