Commit 2bc1f92: Adjust the code structure

morningsky committed May 22, 2022
1 parent 1f9aaed commit 2bc1f92
Showing 34 changed files with 288 additions and 273 deletions.
17 changes: 17 additions & 0 deletions examples/matching/README.md
@@ -0,0 +1,17 @@
# Matching

## Movielens

Uses the ml-1m dataset with 7 raw sparse features `'user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', "cate_id"`; the user tower takes `'user_id', 'gender', 'age', 'occupation', 'zip'` plus the watch-history sequence, and the item tower takes `"movie_id", "cate_id"`.

- Builds the user watch-history feature `hist_movie_id` and pools its sequence embedding with `mean` pooling (see the preprocessing sketch after this list)
- Negative samples are generated by random negative sampling
- Each user's last watched record is held out as the test set
- Raw data: https://grouplens.org/datasets/movielens/1m/
- Preprocessed CSV: https://cowtransfer.com/s/5a3ab69ebd314e
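
A minimal sketch of this preprocessing, using the helpers that `run_movielens.py` imports; argument values are copied from that script, but treat the exact file path as a placeholder:

```python
import pandas as pd
from torch_rechub.utils.match import generate_seq_feature_match
from torch_rechub.utils.data import pad_sequences, df_to_dict

data = pd.read_csv("./data/ml-1m/ml-1m_sample.csv")  # hypothetical path to the preprocessed csv
# mode=0 builds point-wise samples, neg_ratio=3 draws three random negatives per
# positive, and each user's last interaction is routed to df_test (leave-one-out)
df_train, df_test = generate_seq_feature_match(
    data, user_col="user_id", item_col="movie_id", time_col="timestamp",
    item_attribute_cols=[], sample_method=0, mode=0, neg_ratio=3, min_item=0)
# pad/truncate every watch history to a fixed length so it can be batched
df_train["hist_movie_id"] = pad_sequences(df_train["hist_movie_id"], maxlen=50, value=0).tolist()
x_train = df_to_dict(df_train)  # {feature_name: np.array}, the model's input format
```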

| Model\Metrics | Hit@100 | Recall@100 | Precision@100 |
| ------------- | ------- | ---------- | ------------- |
| DSSM | 2.43% | 2.43% | 0.02% |
| | | | |
| | | | |
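
The numbers above come from approximate nearest-neighbour retrieval over the learned item embeddings. A condensed version of the evaluation loop in `run_movielens.py` (`Annoy` and `topk_metrics` are used exactly as imported there; the embedding and id-map variables are produced earlier in that script):

```python
import collections
from torch_rechub.utils.match import Annoy
from torch_rechub.basic.metric import topk_metrics

annoy = Annoy(n_trees=10)
annoy.fit(item_embedding)  # index every item-tower embedding

# retrieve top-100 candidates per test user, mapped back to raw movie ids
match_res = collections.defaultdict(dict)
for user_id, user_emb in zip(test_user_model_input["user_id"], user_embedding):
    items_idx, items_scores = annoy.query(v=user_emb, n=100)
    match_res[user_map[user_id]] = all_item_model_input["movie_id"][items_idx]

# ground_truth maps each raw user id to their held-out movie id(s)
out = topk_metrics(y_true=ground_truth, y_pred=match_res, topKs=[100])
print(out)  # Hit@100, Recall@100, Precision@100
```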
File renamed without changes.
File renamed without changes.
86 changes: 48 additions & 38 deletions examples/run_movielens.py → examples/matching/run_movielens.py
@@ -1,6 +1,6 @@
import sys

sys.path.append("../")
sys.path.append(".../")

import os
import random
@@ -16,24 +16,25 @@
from torch_rechub.models.matching import DSSM
from torch_rechub.trainers import MatchTrainer
from torch_rechub.basic.features import DenseFeature, SparseFeature, SequenceFeature
from torch_rechub.basic.match_utils import Annoy, generate_seq_feature_match
from torch_rechub.basic.utils import PredictDataset, pad_sequences, TorchDataset, df_to_input_dict
from torch_rechub.basic.metric import topk_metrics

from torch_rechub.utils.match import Annoy, generate_seq_feature_match
from torch_rechub.utils.data import PredictDataset, pad_sequences, TorchDataset, df_to_dict
from torch_rechub.basic.metric import topk_metrics


def gen_model_input(df, user_profile, user_col, item_profile, item_col, seq_max_len):
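# Joins user/item profile features onto the interaction rows, pads every hist_*
# sequence to seq_max_len, and returns a {feature_name: np.array} dict for the model.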
df = pd.merge(df, user_profile, on=user_col)
df = pd.merge(df, item_profile, on=item_col)
for col in df.columns.to_list():
if col.startswith("hist_"):
df[col] = pad_sequences(df[col], maxlen=seq_max_len, value=0).tolist()
input_dict = df_to_input_dict(df)
input_dict = df_to_dict(df)
return input_dict


def get_movielens_data(data_path, load_cache=False):
data = pd.read_csv(data_path)[:100000]
data["cate_id"] = data["genres"].apply(lambda x:x.split("|")[0])
sparse_features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip',"cate_id"]
data["cate_id"] = data["genres"].apply(lambda x: x.split("|")[0])
sparse_features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', "cate_id"]
user_col, item_col, label_col = "user_id", "movie_id", "label"

feature_max_idx = {}
@@ -42,67 +43,75 @@ def get_movielens_data(data_path, load_cache=False):
data[feature] = lbe.fit_transform(data[feature]) + 1
feature_max_idx[feature] = data[feature].max() + 1
if feature == user_col:
user_map = {encode_id+1: raw_id for encode_id, raw_id in enumerate(lbe.classes_)} #encode user id: raw user id
user_map = {encode_id + 1: raw_id for encode_id, raw_id in enumerate(lbe.classes_)} #encode user id: raw user id
if feature == item_col:
item_map = {encode_id+1: raw_id for encode_id, raw_id in enumerate(lbe.classes_)} #encode item id: raw item id
item_map = {encode_id + 1: raw_id for encode_id, raw_id in enumerate(lbe.classes_)} #encode item id: raw item id
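# persist (user_map, item_map) so evaluation can translate encoded ids back to raw ids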
np.save("./data/ml-1m/saved/raw_id_maps.npy", (user_map, item_map))

user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')
item_profile = data[["movie_id", "cate_id"]].drop_duplicates('movie_id')


if load_cache: #if you have run this script before and saved the preprocessed data
x_train, y_train, x_test, y_test = np.load("./data/ml-1m/saved/data_preprocess.npy",allow_pickle=True)

if load_cache: #if you have run this script before and saved the preprocessed data
x_train, y_train, x_test, y_test = np.load("./data/ml-1m/saved/data_preprocess.npy", allow_pickle=True)
else:
df_train, df_test = generate_seq_feature_match(data, user_col, item_col, time_col="timestamp", item_attribute_cols=[], sample_method=0, mode=0, neg_ratio=3, min_item=0)
x_train = gen_model_input(df_train, user_profile, user_col, item_profile, item_col, seq_max_len=50)
x_train = gen_model_input(df_train, user_profile, user_col, item_profile, item_col, seq_max_len=50)
y_train = df_train["label"].values
x_test = gen_model_input(df_test, user_profile, user_col, item_profile, item_col, seq_max_len=50)
x_test = gen_model_input(df_test, user_profile, user_col, item_profile, item_col, seq_max_len=50)
y_test = df_test["label"].values
np.save("./data/ml-1m/saved/data_preprocess.npy", (x_train, y_train, x_test, y_test))
np.save("./data/ml-1m/saved/data_preprocess.npy", (x_train, y_train, x_test, y_test))

user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip']
item_cols = ['movie_id',"cate_id"]
item_cols = ['movie_id', "cate_id"]

user_features = [SparseFeature(feature_name, vocab_size=feature_max_idx[feature_name], embed_dim=8) for feature_name in user_cols]
user_features += [SequenceFeature("hist_movie_id", vocab_size=feature_max_idx["movie_id"], embed_dim=8, pooling="mean", shared_with="movie_id")]

item_features = [SparseFeature(feature_name, vocab_size=feature_max_idx[feature_name], embed_dim=8) for feature_name in item_cols]

all_item_model_input = df_to_input_dict(item_profile)
all_item_model_input = df_to_dict(item_profile)
test_user_model_input = x_test
return user_features, item_features, x_train, y_train, all_item_model_input, test_user_model_input



def main(dataset_path, model_name, epoch, learning_rate, batch_size, weight_decay, device, save_dir, seed):
if not os.path.exists(save_dir):
os.makedirs(save_dir)
torch.manual_seed(seed)
user_features, item_features, x_train, y_train, all_item_model_input, test_user_model_input = get_movielens_data(dataset_path)
dataset = TorchDataset(x_train, y_train)
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8)

if model_name == "dssm":
model = DSSM(user_features, item_features, sim_func="cosine", temperature=0.01, user_params={"dims":[256, 128,64,16],"output_layer":False}, item_params={"dims":[256, 128,64,16],"output_layer":False})

trainer = MatchTrainer(model, optimizer_params={"lr": learning_rate, "weight_decay": weight_decay}, n_epoch=epoch,
earlystop_patience=5, device=device, model_path=save_dir,
scheduler_fn=torch.optim.lr_scheduler.StepLR,
scheduler_params={"step_size": 2,"gamma": 0.8})
if model_name == "dssm":
model = DSSM(user_features, item_features, sim_func="cosine", temperature=0.01, user_params={"dims": [256, 128, 64, 16], "output_layer": False}, item_params={"dims": [256, 128, 64, 16], "output_layer": False})

trainer = MatchTrainer(model,
optimizer_params={
"lr": learning_rate,
"weight_decay": weight_decay
},
n_epoch=epoch,
earlystop_patience=5,
device=device,
model_path=save_dir,
scheduler_fn=torch.optim.lr_scheduler.StepLR,
scheduler_params={
"step_size": 2,
"gamma": 0.8
})

trainer.fit(train_dataloader)

print("inference embedding")
test_dl = DataLoader(PredictDataset(test_user_model_input), batch_size=batch_size, shuffle=False, num_workers=8)  #keep order so embeddings align with test_user_model_input
user_embedding = trainer.inference_embedding(model=model, mode="user_tower", data_loader=test_dl, model_path=save_dir)

all_item_dl = DataLoader(PredictDataset(all_item_model_input), batch_size=batch_size, shuffle=False, num_workers=8)  #keep order so embeddings align with all_item_model_input
item_embedding = trainer.inference_embedding(model=model, mode="item_tower", data_loader=all_item_dl, model_path=save_dir)

torch.save(user_embedding.data.cpu(), save_dir + "user_embedding.pth")
torch.save(item_embedding.data.cpu(), save_dir + "item_embedding.pth")

print("evaluate embedding matching on test data")
annoy = Annoy(n_trees=10)
annoy.fit(item_embedding)
@@ -113,24 +122,25 @@ def main(dataset_path, model_name, epoch, learning_rate, batch_size, weight_decay, device, save_dir, seed):
match_res = collections.defaultdict(dict)
topk = 10
for user_id, user_emb in zip(test_user_model_input["user_id"], user_embedding):
items_idx, items_scores = annoy.query(v=user_emb, n=topk) #the index of topk match items
items_idx, items_scores = annoy.query(v=user_emb, n=topk) #the index of topk match items
match_res[user_map[user_id]] = all_item_model_input["movie_id"][items_idx]

#get ground truth
print("generate ground truth")
user_col = "user_id"
item_col = "movie_id"

data = pd.DataFrame({"user_id":test_user_model_input["user_id"],"movie_id":test_user_model_input["movie_id"]})
data = pd.DataFrame({"user_id": test_user_model_input["user_id"], "movie_id": test_user_model_input["movie_id"]})
data[user_col] = data[user_col].map(user_map)
data[item_col] = data[item_col].map(item_map)
user_pos_item = data.groupby(user_col).agg(list).reset_index()
ground_truth = dict(zip(user_pos_item[user_col], user_pos_item[item_col]))

print("compute topk metrics")
out = topk_metrics(y_true=ground_truth, y_pred=match_res, topKs=[topk])
print(out)


if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
78 changes: 34 additions & 44 deletions examples/README.md → examples/ranking/README.md
@@ -63,33 +63,7 @@



## Census-Income

US census income data, used to predict whether a person's income is below or above 50k. After preprocessing, the dataset has 41 columns: 7 dense features, 33 sparse features, and 1 label.

- Notes
  - Experiment group 1 of the MMOE paper, as well as PLE, treats income prediction as the main task and marital-status prediction as the auxiliary task.
  - To run all multi-task models under one unified setting, we follow ESMM and treat income prediction as the CTR task and marital-status prediction as the CVR task.
  - See `preprocess_census.py` for the raw-data preprocessing
- Raw data: http://archive.ics.uci.edu/ml/datasets/Census-Income+(KDD)
- Preprocessed data: https://cowtransfer.com/s/e8b67418ce044c
- Usage

```bash
python run_census.py --model_name SharedBottom
python run_census.py --model_name ESMM
python run_census.py --model_name MMOE
python run_census.py --model_name PLE
python run_census.py --model_name AITM
```

## AliExpress

AliExpress data with 16 sparse features and 63 dense features, plus three labels: exposure, click, and conversion. Dense features are already normalized in the raw release, which ships as 5 csv files; we only use the US-region data for testing.

- Raw data: https://tianchi.aliyun.com/dataset/dataDetail?dataId=74690&lang=en-us

- Preprocessed data: https://cowtransfer.com/s/7080e52e5f4f4a



@@ -126,6 +100,38 @@ TBD
| | | |
| | | |

# Multi-Task Ranking

## AliExpress

AliExpress data with 16 sparse features and 63 dense features, plus three labels: exposure, click, and conversion. Dense features are already normalized in the raw release, which ships as 5 csv files; we only use the US-region data for testing. A loading sketch follows the download links.

- Raw data: https://tianchi.aliyun.com/dataset/dataDetail?dataId=74690&lang=en-us

- Preprocessed data: https://cowtransfer.com/s/7080e52e5f4f4a
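
A minimal loading sketch for this data. The file name, the `categorical`/`numerical` column prefixes, and the `click`/`conversion` label names are assumptions for illustration; the real preprocessing lives in `run_aliexpress.py`:

```python
import pandas as pd
from torch_rechub.basic.features import DenseFeature, SparseFeature
from torch_rechub.utils.data import DataGenerator, df_to_dict

df = pd.read_csv("./data/aliexpress/AliExpress_US/train.csv")  # hypothetical file name
sparse_cols = [c for c in df.columns if c.startswith("categorical")]  # 16 sparse features
dense_cols = [c for c in df.columns if c.startswith("numerical")]     # 63 dense features
features = ([SparseFeature(c, vocab_size=df[c].max() + 1, embed_dim=8) for c in sparse_cols]
            + [DenseFeature(c) for c in dense_cols])
y = df[["click", "conversion"]].values  # CTR / CVR labels (assumed column names)
dg = DataGenerator(df_to_dict(df[sparse_cols + dense_cols]), y)
```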

## Census-Income

US census income data, used to predict whether a person's income is below or above 50k. After preprocessing, the dataset has 41 columns: 7 dense features, 33 sparse features, and 1 label.

- Notes
  - Experiment group 1 of the MMOE paper, as well as PLE, treats income prediction as the main task and marital-status prediction as the auxiliary task.
  - To run all multi-task models under one unified setting, we follow ESMM and treat income prediction as the CTR task and marital-status prediction as the CVR task.
  - See `preprocess_census.py` for the raw-data preprocessing
- Raw data: http://archive.ics.uci.edu/ml/datasets/Census-Income+(KDD)
- Preprocessed data: https://cowtransfer.com/s/e8b67418ce044c
- Usage

```bash
python run_census.py --model_name SharedBottom
python run_census.py --model_name ESMM
python run_census.py --model_name MMOE
python run_census.py --model_name PLE
python run_census.py --model_name AITM
```
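
For reference, the core of MMOE (one of the models listed above) mixes shared experts per task through learned softmax gates. A generic sketch of that layer, not Torch-RecHub's exact module:

```python
import torch
import torch.nn as nn

class MMOELayer(nn.Module):
    """n_expert shared experts, one softmax gate per task."""
    def __init__(self, input_dim, n_expert, n_task, expert_dim):
        super().__init__()
        self.experts = nn.ModuleList(
            [nn.Sequential(nn.Linear(input_dim, expert_dim), nn.ReLU()) for _ in range(n_expert)])
        self.gates = nn.ModuleList([nn.Linear(input_dim, n_expert) for _ in range(n_task)])

    def forward(self, x):
        expert_out = torch.stack([e(x) for e in self.experts], dim=1)  # (B, E, D)
        task_inputs = []
        for gate in self.gates:
            w = torch.softmax(gate(x), dim=-1).unsqueeze(-1)           # (B, E, 1)
            task_inputs.append((w * expert_out).sum(dim=1))            # (B, D) per task
        return task_inputs  # each entry feeds one task tower
```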



Multi-task model benchmark results:

| Model\Dataset | Census-Income(CVR) | Census-Income(CTR) | Taobao(CVR) | Taobao(CTR) | AliExpress-US(CVR) | AliExpress-US(CTR) |
@@ -139,6 +145,8 @@

> Note: a low CVR for ESMM is expected here, because we construct an artificial task dependency in order to produce the CTCVR label
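
The note above follows from ESMM's factorization over the whole exposure space, pCTCVR(x) = pCTR(x) × pCVR(x): the CVR tower receives no direct label and is trained only through the CTR and CTCVR losses. A minimal sketch of that loss composition (not the library's exact code):

```python
import torch.nn.functional as F

def esmm_loss(p_ctr, p_cvr, y_click, y_click_and_convert):
    """p_ctr/p_cvr: tower outputs in (0, 1); labels are float tensors."""
    p_ctcvr = p_ctr * p_cvr  # pCTCVR = pCTR * pCVR, defined over all exposures
    loss_ctr = F.binary_cross_entropy(p_ctr, y_click)
    loss_ctcvr = F.binary_cross_entropy(p_ctcvr, y_click_and_convert)
    return loss_ctr + loss_ctcvr  # no direct supervision on p_cvr
```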


Comparison of MetaBalance and Adam for multi-task models on the sampled AliExpress-US data (a sketch of the MetaBalance idea follows the table):

| Model\Dataset | CVR(Adam) | CTR(Adam) | CVR(MetaBalance) | CTR(MetaBalance) |
@@ -148,21 +156,3 @@
| PLE(num_level=1) | 0.7284 | 0.6351 | 0.7511 | 0.6373 |
| AITM | 0.5970 | 0.4839 | 0.7379 | 0.6093 |
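
MetaBalance replaces Adam's raw gradients on the shared parameters with auxiliary-task gradients rescaled toward the target task's gradient magnitude. A simplified sketch of the core idea under assumed inputs (the paper additionally tracks moving averages of the norms; this is not Torch-RecHub's implementation):

```python
import torch

def metabalance_grads(shared_params, losses, target_idx=0, relax=0.7):
    """shared_params: list of tensors shared by all tasks; losses: one loss per task."""
    grads = [list(torch.autograd.grad(loss, shared_params, retain_graph=True))
             for loss in losses]
    target_norms = [g.norm() for g in grads[target_idx]]
    for i, task_grads in enumerate(grads):
        if i == target_idx:
            continue
        for j, g in enumerate(task_grads):
            scale = target_norms[j] / (g.norm() + 1e-12)
            # relax=0 keeps the raw gradient, relax=1 fully matches the target norm
            task_grads[j] = g * (relax * scale + (1.0 - relax))
    # write the summed, balanced gradients into .grad for a standard optimizer step
    for p, *task_gs in zip(shared_params, *grads):
        p.grad = torch.stack(task_gs).sum(dim=0)
```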


# Matching

## Movielens

Uses the ml-1m dataset with 7 raw sparse features `'user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', "cate_id"`; the user tower takes `'user_id', 'gender', 'age', 'occupation', 'zip'` plus the watch-history sequence, and the item tower takes `"movie_id", "cate_id"`.

- Builds the user watch-history feature `hist_movie_id` and pools its sequence embedding with `mean` pooling
- Negative samples are generated by random negative sampling
- Each user's last watched record is held out as the test set
- Raw data: https://grouplens.org/datasets/movielens/1m/
- Preprocessed CSV: https://cowtransfer.com/s/5a3ab69ebd314e

| Model\Metrics | Hit@100 | Recall@100 | Precision@100 |
| ------------- | ------- | ---------- | ------------- |
| DSSM | 2.43% | 2.43% | 0.02% |
| | | | |
| | | | |
File renamed without changes.
examples/run_aliexpress.py → examples/ranking/run_aliexpress.py
@@ -1,13 +1,13 @@
import sys

sys.path.append("../")
sys.path.append(".../")

import pandas as pd
import torch
from torch_rechub.models.multi_task import SharedBottom, ESMM, MMOE, PLE, AITM
from torch_rechub.trainers import MTLTrainer
from torch_rechub.basic.features import DenseFeature, SparseFeature
from torch_rechub.basic.utils import DataGenerator
from torch_rechub.utils.data import DataGenerator


def get_aliexpress_data_dict(data_path='./data/aliexpress/AliExpress_US'):
examples/run_amazon.py → examples/ranking/run_amazon.py
@@ -1,13 +1,13 @@
import sys

sys.path.append("../")
sys.path.append(".../")

import pandas as pd
import torch
from torch_rechub.models.ranking import DIN
from torch_rechub.trainers import CTRTrainer
from torch_rechub.basic.features import DenseFeature, SparseFeature, SequenceFeature
from torch_rechub.basic.utils import DataGenerator, create_seq_features, df_to_input_dict
from torch_rechub.utils.data import DataGenerator, create_seq_features, df_to_dict


def get_amazon_data_dict(dataset_path):
@@ -32,9 +32,9 @@ def get_amazon_data_dict(dataset_path):
train, val, test = create_seq_features(data, seq_feature_col=['item_id', 'cate_id'], drop_short=3)

print('========== Generate input dict ==========')
train = df_to_input_dict(train)
val = df_to_input_dict(val)
test = df_to_input_dict(test)
train = df_to_dict(train)
val = df_to_dict(val)
test = df_to_dict(test)

train_y, val_y, test_y = train["label"], val["label"], test["label"]

6 changes: 3 additions & 3 deletions examples/run_census.py → examples/ranking/run_census.py
@@ -1,13 +1,13 @@
import sys

sys.path.append("../")
sys.path.append(".../")

import pandas as pd
import torch
from torch_rechub.models.multi_task import SharedBottom, ESMM, MMOE, PLE, AITM
from torch_rechub.trainers import MTLTrainer
from torch_rechub.basic.features import DenseFeature, SparseFeature
from torch_rechub.basic.utils import DataGenerator
from torch_rechub.utils.data import DataGenerator


def get_census_data_dict(model_name, data_path='./data/census-income'):
@@ -73,7 +73,7 @@ def main(model_name, epoch, learning_rate, batch_size, weight_decay, device, sav
elif model_name == "AITM":
task_types = ["classification", "classification"]
features, x_train, y_train, x_val, y_val, x_test, y_test = get_census_data_dict(model_name)
model = AITM(features, 2, bottom_params={"dims":[32,16]}, tower_params_list=[{"dims": [8]}, {"dims": [8]}])
model = AITM(features, 2, bottom_params={"dims": [32, 16]}, tower_params_list=[{"dims": [8]}, {"dims": [8]}])

dg = DataGenerator(x_train, y_train)
train_dataloader, val_dataloader, test_dataloader = dg.generate_dataloader(x_val=x_val, y_val=y_val, x_test=x_test, y_test=y_test, batch_size=batch_size)
4 changes: 2 additions & 2 deletions examples/run_criteo.py → examples/ranking/run_criteo.py
@@ -1,14 +1,14 @@
import sys

sys.path.append("../")
sys.path.append(".../")

import numpy as np
import pandas as pd
import torch
from torch_rechub.models.ranking import WideDeep, DeepFM, DCN, xDeepFM
from torch_rechub.trainers import CTRTrainer
from torch_rechub.basic.features import DenseFeature, SparseFeature
from torch_rechub.basic.utils import DataGenerator
from torch_rechub.utils.data import DataGenerator
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

