In [2]:
import os
import math
import datetime
import joblib
import logging

from tqdm import tqdm

import numpy as np
import pandas as pd

import paddle
from paddle.io import Dataset, IterableDataset, DataLoader
import paddle.nn.functional as F

In [2]:
paddle.enable_static()

In [3]:
class StaticModel:
    def __init__(self):
        self.cost = None
        self.infer_target_var = None
        self._init_hyper_parameters()

    def _init_hyper_parameters(self):        
        self.sparse_feature_number = 88000000
        self.sparse_feature_dim = 9
        self.sparse_inputs_slots = 28
        self.learning_rate = 0.001
        self.fc_sizes = [512, 256, 128, 32]
        
    def create_feeds(self, is_infer=False):
        sparse_input_ids = [
            paddle.static.data(
                name="C" + str(i), shape=[None, 1], lod_level=1, dtype="int64")
            for i in range(1, self.sparse_inputs_slots)
        ]

        label = paddle.static.data(
            name="label", shape=[None, 1], dtype="int64"
        )

        feeds_list = [label] + sparse_input_ids
        
        return feeds_list

    def net(self, _input, is_infer=False):
        self.log_key = _input[0]
        self.label_input = _input[1]
        self.sparse_inputs = _input[2:self.sparse_inputs_slots]
        sparse_number = self.sparse_inputs_slots - 2

        def embedding_layer(_input):
            emb = paddle.fluid.layers.embedding(
                input=_input,
                is_sparse=True,
                is_distributed=False,
                size=[
                    self.sparse_feature_number, self.sparse_feature_dim
                ],
                param_attr=paddle.fluid.ParamAttr(
                    name="SparseFeatFactors",
                    initializer=paddle.fluid.initializer.Uniform()
                )
            )
            
            emb_sum = paddle.fluid.layers.sequence_pool(
                input=emb, 
                pool_type='sum'
            )
            
            return emb_sum

        sparse_embs = list(map(embedding_layer, self.sparse_inputs))

        dnn_model = StaticDNNLayer(
            self.sparse_feature_number, 
            self.sparse_feature_dim,
            sparse_number, 
            self.fc_sizes
        )

        pred = dnn_model.forward(sparse_embs)

        predict_2d = paddle.concat(x=[1 - pred, pred], axis=1)

        auc, batch_auc_var, _ = paddle.static.auc(
            input=predict_2d,
            label=self.label_input,
            slide_steps=0
        )
        self.inference_target_var = auc
        
        if is_infer:
            fetch_dict = {'log_key': self.log_key,'pred': pred}
            return fetch_dict

        cost = paddle.nn.functional.log_loss(
            input=pred, 
            label=paddle.cast(
                self.label_input, 
                dtype="float32"
            )
        )
        
        avg_cost = paddle.mean(x=cost)
        self._cost = avg_cost
        fetch_dict = {'cost': avg_cost, 'auc': auc}
        
        return fetch_dict

    def create_optimizer(self, strategy=None):
        optimizer = paddle.optimizer.Adam(
            learning_rate=self.learning_rate, 
            lazy_mode=True
        )
        
        if strategy != None:
            import paddle.distributed.fleet as fleet
            optimizer = fleet.distributed_optimizer(optimizer, strategy)
            
        optimizer.minimize(self._cost)

    def infer_net(self, _input):
        return self.net(_input, is_infer=True)

In [4]:
class StaticDNNLayer(paddle.nn.Layer):
    def __init__(self, sparse_feature_number, sparse_feature_dim, num_field, layer_sizes):
        super().__init__()
        
        self.sparse_feature_number = sparse_feature_number
        self.sparse_feature_dim = sparse_feature_dim
        self.num_field = num_field
        self.layer_sizes = layer_sizes

        sizes = [sparse_feature_dim * num_field 
                 ] + self.layer_sizes + [1]
        acts = ["relu" for _ in range(len(self.layer_sizes))] + [None]
        
        self._mlp_layers = []
        
        for i in range(len(layer_sizes) + 1):
            linear = paddle.nn.Linear(
                in_features=sizes[i],
                out_features=sizes[i + 1],
                weight_attr=paddle.ParamAttr(
                    initializer=paddle.nn.initializer.Normal(
                        std=1.0 / math.sqrt(sizes[i])
                    )
                )
            )
            
            self.add_sublayer('linear_%d' % i, linear)
            self._mlp_layers.append(linear)
            
            if acts[i] == 'relu':
                act = paddle.nn.ReLU()
                self.add_sublayer('act_%d' % i, act)
                self._mlp_layers.append(act)

    def forward(self, sparse_embs):
        y_dnn = paddle.concat(x=sparse_embs, axis=1)
        
        for n_layer in self._mlp_layers:
            y_dnn = n_layer(y_dnn)

        predict = F.sigmoid(y_dnn)
        
        return predict

In [5]:
def get_filepath(dir_path, list_name):
    for file in os.listdir(dir_path):
        file_path = os.path.join(dir_path, file)
        
        if os.path.isdir(file_path):
            get_filepath(file_path, list_name)
        else:
            list_name.append(file_path)
    
    return list_name

In [6]:
def get_file_list(data_path):
    assert os.path.exists(data_path)
    list_name = []
    file_list = get_filepath(data_path, list_name)

    print("File list: {}".format(file_list))
    
    return file_list

In [7]:
def get_reader(input_var):
    reader_type = "QueueDataset"
    train_data_path = "../data/data205411/2023-cvr-contest-data/train_data"
    
    assert train_data_path != ""

    assert reader_type in [
        "QueueDataset"
    ]
    
    file_list = get_file_list(train_data_path)
    print("train file_list: {}".format(file_list))
    
    reader_instance = Queue(input_var, file_list)
    return reader_instance.get_reader(), file_list

In [8]:
class Queue:
    def __init__(self, input_var, file_list):
        assert isinstance(input_var, list)
        assert len(file_list) > 0

        self.input_var = input_var
        self.file_list = file_list

        self.pipe_command = "python3 queuedataset_reader.py"
        self.train_reader = "criteo_reader"
        
        assert self.pipe_command != None
                        
        print("pipe_command is: {}".format(self.pipe_command))
        
        self.batch_size = 128
        assert self.batch_size >= 1
        
        self.thread_num = 1
        print("dataset init thread_num:", self.thread_num)
        assert self.thread_num >= 1

    def get_reader(self):
        print("Get Train Dataset")
        dataset = paddle.distributed.QueueDataset()
        dataset.init(
            use_var=self.input_var,
            pipe_command=self.pipe_command,
            batch_size=self.batch_size,
            thread_num=self.thread_num
        )
        print("dataset get_reader thread_num:", self.thread_num)
        dataset.set_filelist(self.file_list)
        return dataset

    def get_infer_reader(self):
        print("Get Infer Dataset")
        dataset = paddle.distributed.QueueDataset()
        self.infer_batch_size = 128
        self.infer_thread_num = self.thread_num
        dataset.init(
            use_var=self.input_var,
            pipe_command=self.pipe_command,
            batch_size=self.infer_batch_size,
            thread_num=self.infer_thread_num
        )
        print("dataset get_infer_reader thread_num:", self.infer_thread_num)
        dataset.set_filelist(self.file_list)
        return dataset

In [9]:
def dataset_train(epoch_id, dataset, fetch_vars, exe):
    fetch_info = [
        "Epoch {} Var {}".format(epoch_id, var_name) for var_name in fetch_vars
    ]
    
    fetch_vars = [var for _, var in fetch_vars.items()]
    print_interval = 50
    
    exe.train_from_dataset(
        program=paddle.static.default_main_program(),
        dataset=dataset,
        fetch_list=fetch_vars,
        fetch_info=fetch_info,
        print_period=print_interval,
        debug=False
    )

In [10]:
static_model_class = StaticModel()

In [11]:
input_data = static_model_class.create_feeds()
input_data

[var label : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C1 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C2 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C3 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C4 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C5 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C6 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C7 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C8 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C9 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C10 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C11 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C12 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C13 : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True),
 var C14 : LOD_TENSOR.shape(-1, 1).dtype(

In [12]:
input_data_names = [data.name for data in input_data]
input_data_names

['label',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'C22',
 'C23',
 'C24',
 'C25',
 'C26',
 'C27']

In [13]:
fetch_vars = static_model_class.net(input_data)

In [14]:
fetch_vars

{'cost': var mean_0.tmp_0 : LOD_TENSOR.shape(1,).dtype(float32).stop_gradient(False),
 'auc': var auc_0.tmp_0 : LOD_TENSOR.shape(1,).dtype(int64).stop_gradient(False)}

In [15]:
place = paddle.set_device("cpu")

In [16]:
static_model_class.create_optimizer()

In [17]:
exe = paddle.static.Executor(place)

In [18]:
exe.run(paddle.static.default_startup_program())

[]

In [19]:
last_epoch_id = -1
step_num = 0

In [20]:
dataset, file_list = get_reader(input_data)

File list: ['../data/data205411/2023-cvr-contest-data/train_data/file_27.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_26.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_18.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_24.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_30.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_25.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_19.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_21.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_09.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_08.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_20.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_22.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_23.txt', '../data/data205411/2023-cvr-contest-data/train_data/file_12.txt', '../data/data205411/2023-cvr-contest-data/train_da

In [21]:
for epoch_id in range(last_epoch_id + 1, 1):
    fetch_batch_var = dataset_train(epoch_id, dataset, fetch_vars, exe)

device worker program id: 4875610480


In [22]:
fetch_batch_var

In [17]:
x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype="float32")

In [18]:
x0 = x.unsqueeze(-1)

In [19]:
xl = x0
xl

Tensor(shape=[2, 3, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[1.],
         [2.],
         [3.]],

        [[4.],
         [5.],
         [6.]]])

In [20]:
xl_T = paddle.transpose(xl, perm=(0, 2, 1))
xl_T

Tensor(shape=[2, 1, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[1., 2., 3.]],

        [[4., 5., 6.]]])

In [21]:
cross_network = paddle.nn.Linear(
    in_features=3, out_features=1
)

In [24]:
paddle.matmul(x0, xl_T)

Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[1. , 2. , 3. ],
         [2. , 4. , 6. ],
         [3. , 6. , 9. ]],

        [[16., 20., 24.],
         [20., 25., 30.],
         [24., 30., 36.]]])

In [27]:
(cross_network(paddle.matmul(x0, xl_T)) + xl).squeeze(-1)

Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[2.77003670 , 5.54007339 , 8.31011009 ],
        [27.47306442, 34.34133148, 41.20959473]])

In [46]:
w0 = paddle.create_parameter(shape=[1,1], dtype="float32")
w0

Parameter containing:
Tensor(shape=[1, 1], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[-0.15144837]])

In [39]:
w0.expand(shape=(2, 1))

Tensor(shape=[2, 1], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[-0.82814622],
        [-0.82814622]])

In [40]:
x

Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[1., 2., 3.],
        [4., 5., 6.]])

In [45]:
paddle.concat([w0.expand(shape=(2, 1)), x], axis=-1)

Tensor(shape=[2, 4], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[-0.82814622,  1.        ,  2.        ,  3.        ],
        [-0.82814622,  4.        ,  5.        ,  6.        ]])

In [18]:
x = paddle.rand(shape=[3, 5*3])
x

Tensor(shape=[3, 15], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[0.64861107, 0.33811730, 0.66795737, 0.55047876, 0.53967911, 0.96403658,
         0.40827477, 0.38332576, 0.45271787, 0.89102578, 0.22485089, 0.32955831,
         0.11219117, 0.54765588, 0.51901442],
        [0.67763549, 0.22279523, 0.23241772, 0.32972619, 0.25060549, 0.24740583,
         0.30024922, 0.39849868, 0.81412715, 0.31648001, 0.45057923, 0.28084281,
         0.50915974, 0.77315301, 0.97476274],
        [0.89788783, 0.50603282, 0.61585224, 0.04976138, 0.14215194, 0.92916489,
         0.93003160, 0.64014935, 0.73386419, 0.85679454, 0.07925825, 0.06251020,
         0.62031645, 0.18634364, 0.09363453]])

In [20]:
# field=5, embedding_dim=3
f = x.reshape(shape=[-1, 5, 3]).transpose(perm=(0, 2, 1))
f

Tensor(shape=[3, 3, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.64861107, 0.55047876, 0.40827477, 0.89102578, 0.11219117],
         [0.33811730, 0.53967911, 0.38332576, 0.22485089, 0.54765588],
         [0.66795737, 0.96403658, 0.45271787, 0.32955831, 0.51901442]],

        [[0.67763549, 0.32972619, 0.30024922, 0.31648001, 0.50915974],
         [0.22279523, 0.25060549, 0.39849868, 0.45057923, 0.77315301],
         [0.23241772, 0.24740583, 0.81412715, 0.28084281, 0.97476274]],

        [[0.89788783, 0.04976138, 0.93003160, 0.85679454, 0.62031645],
         [0.50603282, 0.14215194, 0.64014935, 0.07925825, 0.18634364],
         [0.61585224, 0.92916489, 0.73386419, 0.06251020, 0.09363453]]])

In [21]:
f_prime = f.transpose(perm=(0, 2, 1))
f_prime

Tensor(shape=[3, 5, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.64861107, 0.33811730, 0.66795737],
         [0.55047876, 0.53967911, 0.96403658],
         [0.40827477, 0.38332576, 0.45271787],
         [0.89102578, 0.22485089, 0.32955831],
         [0.11219117, 0.54765588, 0.51901442]],

        [[0.67763549, 0.22279523, 0.23241772],
         [0.32972619, 0.25060549, 0.24740583],
         [0.30024922, 0.39849868, 0.81412715],
         [0.31648001, 0.45057923, 0.28084281],
         [0.50915974, 0.77315301, 0.97476274]],

        [[0.89788783, 0.50603282, 0.61585224],
         [0.04976138, 0.14215194, 0.92916489],
         [0.93003160, 0.64014935, 0.73386419],
         [0.85679454, 0.07925825, 0.06251020],
         [0.62031645, 0.18634364, 0.09363453]]])

In [23]:
p = paddle.matmul(f_prime, f)
p

Tensor(shape=[3, 5, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.98118669, 1.18345678, 0.69681686, 0.87408608, 0.60461986],
         [1.18345678, 1.52364695, 0.86805606, 0.92954439, 0.85766619],
         [0.69681686, 0.86805606, 0.51858038, 0.59917146, 0.49070254],
         [0.87408608, 0.92954439, 0.59917146, 0.95309353, 0.39415166],
         [0.60461986, 0.85766619, 0.49070254, 0.39415166, 0.58188981]],

        [[0.56284559, 0.33676937, 0.48146072, 0.38011783, 0.74383163],
         [0.33676937, 0.23273212, 0.40028578, 0.28675154, 0.60280168],
         [0.48146072, 0.40028578, 0.91175383, 0.50321984, 1.25455606],
         [0.38011783, 0.28675154, 0.50321984, 0.38205391, 0.78326070],
         [0.74383163, 0.60280168, 1.25455606, 0.78326070, 1.80717158]],

        [[1.44154572, 0.68884194, 1.61095250, 0.84790969, 0.70893562],
         [0.68884194, 0.88603073, 0.81915897, 0.11198428, 0.14435883],
         [1.61095250, 0.81915897, 1.81330669, 0.89345711, 0.7649167

In [23]:
paddle.disable_static()

In [24]:
a = paddle.to_tensor([[1, 2], [3, 3], [5, 3]], dtype="float32")
a

Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[1., 2.],
        [3., 3.],
        [5., 3.]])

In [25]:
softmax_a = paddle.nn.functional.softmax(a)
softmax_a

Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[0.26894143, 0.73105860],
        [0.50000000, 0.50000000],
        [0.88079703, 0.11920291]])

In [26]:
paddle.slice(softmax_a, axes=[1], starts=[1], ends=[2])

Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[0.73105860],
        [0.50000000],
        [0.11920291]])

In [27]:
label = paddle.to_tensor([[1], [0], [0]], dtype="float32")
label

Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[1.],
        [0.],
        [0.]])

In [35]:
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)


class RecDataset(IterableDataset):
    def __init__(self, file_list, config):
        super().__init__()
        self.file_list = file_list
        self.max_len = config["runner"]["max_len"]
        self.init()

    def init(self):
        padding = 0
        sparse_slots = "log_key click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
        self.sparse_slots = sparse_slots.strip().split(" ")
        self.dense_slots = ["dense_feature"]
        self.dense_slots_shape = [13]
        self.slots = self.sparse_slots
        self.slot2index = {}
        self.visit = {}

        for i in range(len(self.slots)):
            self.slot2index[self.slots[i]] = i
            self.visit[self.slots[i]] = False

        self.padding = padding

    def __iter__(self):
        for file in self.file_list:
            with open(file, "r") as rf:
                for line in rf:
                    items = line.strip("\n").split("\t")
                    log_key = int(items[0])
                    conv1 = items[1]
                    conv2 = items[2]
                    conv3 = items[3]

                    if conv1 == "1" or conv2 == "1" or conv3 == "1":
                        conv = 1
                    else:
                        conv = 0

                    output = [(i, []) for i in self.slots]
                    feasigns = items[4].split(" ")

                    for i in feasigns:
                        slot_feasign = i.split(":")
                        slot = slot_feasign[1]
                        if slot not in self.slots:
                            continue
                        if slot in self.sparse_slots:
                            feasign = int(slot_feasign[0])
                        else:
                            feasign = float(slot_feasign[0])
                        output[self.slot2index[slot]][1].append(feasign)
                        self.visit[slot] = True

                    output[0][1].append(log_key)
                    self.visit['log_key'] = True
                    output[1][1].append(conv)
                    self.visit['click'] = True

                    for i in self.visit:
                        slot = i
                        if not self.visit[slot]:
                            if i in self.dense_slots:
                                output[self.slot2index[i]][1].extend(
                                    [self.padding] *
                                    self.dense_slots_shape[self.slot2index[i]]
                                )
                            else:
                                output[self.slot2index[i]][1].extend([self.padding])
                        else:
                            self.visit[slot] = False

                    res = []
                    for key, value in output:
                        if key == "log_key":
                            continue

                        if key == "click":
                            res.append(np.array(value).astype("float32").reshape([-1,]))
                            continue

                        padding = [0] * (self.max_len - len(value))
                        res.append(
                            np.array(value + padding).astype("int64").reshape([self.max_len,])
                        )

                    len_array = [len(value) for key, value in output][2:]
                    mask = np.array(
                        [[0] * x + [-1e9] * (self.max_len - x) for x in len_array]
                    ).reshape([-1, self.max_len])

                    res.append(mask)
                    yield res

In [25]:
data_dir = "../../data/data205411/2023-cvr-contest-data/train_data"
file_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]

config = {"runner": {"max_len": 20}}

In [36]:
dataset = RecDataset(file_list[:1], config)

In [37]:
loader = DataLoader(
    dataset,
    batch_size=4,
    places="cpu",
    drop_last=True
)

In [75]:
f1 = paddle.to_tensor([[1, 2, 0, 0], [5, 0, 0, 0]])
f2 = paddle.to_tensor([[3, 0, 0, 0], [4, 1, 0, 0]])
f3 = paddle.to_tensor([[3, 1, 2, 0], [5, 7, 8, 0]])

In [43]:
f1

Tensor(shape=[2, 4], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[1, 2, 0, 0],
        [5, 0, 0, 0]])

In [44]:
f2

Tensor(shape=[2, 4], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[3, 0, 0, 0],
        [4, 1, 0, 0]])

In [71]:
mask = paddle.to_tensor([[[0, 0, -1e9, -1e9], [0, -1e9, -1e9, -1e9]], [[0, -1e9, -1e9, -1e9], [0, 0, -1e9, -1e9]]], dtype="float32")

In [61]:
emb = paddle.nn.Embedding(
    num_embeddings=10,
    embedding_dim=5
)

In [72]:
mask[0]

Tensor(shape=[2, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[ 0.         ,  0.         , -1000000000., -1000000000.],
        [ 0.         , -1000000000., -1000000000., -1000000000.]])

In [73]:
emb(f1)

Tensor(shape=[2, 4, 5], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[[ 0.52326089, -0.39613602, -0.11836523,  0.05948782, -0.17124951],
         [ 0.22142595,  0.50726551, -0.32140428,  0.61215407, -0.55660534],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851]],

        [[ 0.53894895,  0.24171901,  0.61883634,  0.09842485, -0.40532160],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851]]])

In [63]:
paddle.exp(mask[0].unsqueeze(-1))

Tensor(shape=[2, 4, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[1.],
         [1.],
         [0.],
         [0.]],

        [[1.],
         [0.],
         [0.],
         [0.]]])

In [64]:
paddle.exp(mask[0].unsqueeze(-1)) * emb(f1)

Tensor(shape=[2, 4, 5], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[[ 0.52326089, -0.39613602, -0.11836523,  0.05948782, -0.17124951],
         [ 0.22142595,  0.50726551, -0.32140428,  0.61215407, -0.55660534],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ]],

        [[ 0.53894895,  0.24171901,  0.61883634,  0.09842485, -0.40532160],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ]]])

In [67]:
emb_1 = paddle.sum(
    paddle.exp(mask[0].unsqueeze(-1)) * emb(f1),
    axis=1
)

emb_1

Tensor(shape=[2, 5], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[ 0.74468684,  0.11112949, -0.43976951,  0.67164189, -0.72785485],
        [ 0.53894895,  0.24171901,  0.61883634,  0.09842485, -0.40532160]])

In [68]:
emb_2 = paddle.sum(
    paddle.exp(mask[1].unsqueeze(-1)) * emb(f2),
    axis=1
)

emb_2

Tensor(shape=[2, 5], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[-0.29778719, -0.16829005,  0.26603258,  0.48641241, -0.30877686],
        [ 0.28506643,  0.05156046, -0.13683656, -0.12907267,  0.46634430]])

In [70]:
paddle.concat([emb_1, emb_2], axis=1)

Tensor(shape=[2, 10], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[ 0.74468684,  0.11112949, -0.43976951,  0.67164189, -0.72785485,
         -0.29778719, -0.16829005,  0.26603258,  0.48641241, -0.30877686],
        [ 0.53894895,  0.24171901,  0.61883634,  0.09842485, -0.40532160,
          0.28506643,  0.05156046, -0.13683656, -0.12907267,  0.46634430]])

In [76]:
for batch_id, batch in enumerate(loader()):
    print(batch_id)
    mask = batch[-1]
    break

0


In [92]:
batch[1]

Tensor(shape=[4, 20], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[1140, 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ,
         0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ],
        [1   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ,
         0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ],
        [1457, 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ,
         0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ],
        [1   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ,
         0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ]])

In [93]:
batch[17]

Tensor(shape=[4, 20], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[0       , 0       , 0       , 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       ],
        [76971848, 76971849, 76971850, 76971851, 76971852, 54333065, 76971853,
         76971854, 37832108, 76971855, 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       ],
        [8340766 , 76971868, 4547522 , 76971869, 76971870, 76971871, 8323787 ,
         1098057 , 12330399, 1098055 , 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       ],
        [76971884, 76971885, 76263073, 76971886, 66572142, 76971887, 72082292,
         75392845, 66572136, 74450315, 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       ]])

In [95]:
f1 = paddle.to_tensor([
    [1, 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ],
    [2   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ],
    [3, 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ],
    [4   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   , 0   ]]
)

In [96]:
f1

Tensor(shape=[4, 20], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [102]:
f2 = paddle.to_tensor(
    [[2       , 0       , 0       , 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       ],
        [1, 1, 2, 3, 4, 5, 6,
         7, 8, 9, 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       ],
        [8 , 9, 2 , 3, 1, 5, 6 ,
         7 , 5, 6 , 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       ],
        [1, 2, 3, 4, 5, 6, 7,
         8, 9, 1, 0       , 0       , 0       , 0       ,
         0       , 0       , 0       , 0       , 0       , 0       ]]
)

In [103]:
f2

Tensor(shape=[4, 20], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [8, 9, 2, 3, 1, 5, 6, 7, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [104]:
emb(f2)

Tensor(shape=[4, 20, 5], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[[ 0.22142595,  0.50726551, -0.32140428,  0.61215407, -0.55660534],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         [-0.61016905,  0.16292155,  0.06961745, -0.06022114, -0.22813851],
         

In [87]:
mask[:, 0, :]

Tensor(shape=[4, 20], dtype=float64, place=Place(cpu), stop_gradient=True,
       [[ 0.         , -1000000000., -1000000000., -1000000000., -1000000000.,
         -1000000000., -1000000000., -1000000000., -1000000000., -1000000000.,
         -1000000000., -1000000000., -1000000000., -1000000000., -1000000000.,
         -1000000000., -1000000000., -1000000000., -1000000000., -1000000000.],
        [ 0.         , -1000000000., -1000000000., -1000000000., -1000000000.,
         -1000000000., -1000000000., -1000000000., -1000000000., -1000000000.,
         -1000000000., -1000000000., -1000000000., -1000000000., -1000000000.,
         -1000000000., -1000000000., -1000000000., -1000000000., -1000000000.],
        [ 0.         , -1000000000., -1000000000., -1000000000., -1000000000.,
         -1000000000., -1000000000., -1000000000., -1000000000., -1000000000.,
         -1000000000., -1000000000., -1000000000., -1000000000., -1000000000.,
         -1000000000., -1000000000., -1000000000., -10

In [124]:
lin = paddle.nn.Linear(5, 1)

In [128]:
lin(paddle.sum(paddle.exp(mask[:, 16, :].unsqueeze(-1)) * emb(f2), axis=1).astype("float32"))

Tensor(shape=[4, 1], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[-0.38755727],
        [-0.77973664],
        [-0.67935890],
        [-0.77973664]])

In [137]:
paddle.sum(paddle.exp(mask[:, 16, :].unsqueeze(-1)) * emb(f2), axis=1)

Tensor(shape=[4, 5], dtype=float64, place=Place(cpu), stop_gradient=False,
       [[ 0.22142595,  0.50726551, -0.32140428,  0.61215407, -0.55660534],
        [ 1.25040612, -0.15093717,  0.75631195,  0.71164760, -1.38679367],
        [ 0.36064115,  0.26468557,  2.22910503,  1.36571333, -2.37395239],
        [ 1.25040612, -0.15093717,  0.75631195,  0.71164760, -1.38679367]])

In [107]:
paddle.exp(mask[:, 16, :].unsqueeze(-1)) * emb(f2)

Tensor(shape=[4, 20, 5], dtype=float64, place=Place(cpu), stop_gradient=False,
       [[[ 0.22142595,  0.50726551, -0.32140428,  0.61215407, -0.55660534],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         [ -0.       ,  0.        ,  0.        ,  -0.       ,  -0.       ],
         

In [106]:
paddle.exp(mask[:, 16, :].unsqueeze(-1))

Tensor(shape=[4, 20, 1], dtype=float64, place=Place(cpu), stop_gradient=True,
       [[[1.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [1.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]],

        [[1.],
 

In [133]:
import paddle
import paddle.nn.functional as F


class DNN(paddle.nn.Layer):
    def __init__(self):
        super().__init__()

        self.sparse_feature_number = 10
        self.sparse_feature_dim = 5
        self.num_field = 2
        self.layer_sizes = [8]

        self.embedding = paddle.nn.Embedding(
            num_embeddings=self.sparse_feature_number,
            embedding_dim=self.sparse_feature_dim,
            weight_attr=paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.XavierUniform()
            )
        )

        sizes = [self.sparse_feature_dim * self.num_field] + self.layer_sizes + [1]

        self.layers = []

        for i in range(len(sizes) - 1):
            linear = paddle.nn.Linear(
                in_features=sizes[i],
                out_features=sizes[i+1],
                weight_attr=paddle.framework.ParamAttr(
                    initializer=paddle.nn.initializer.XavierUniform()),
                bias_attr=paddle.ParamAttr(
                    initializer=paddle.nn.initializer.Constant(value=0.0))
            )
            self.layers.append(linear)

    def forward(self, features, mask):
        feature_ls = []

        for idx, feature in enumerate(features):
            emb = paddle.sum(
                paddle.exp(mask[:, idx, :].unsqueeze(-1)) * self.embedding(feature),
                axis=1
            )

            feature_ls.append(emb)

        x = paddle.concat(feature_ls, axis=1).astype("float32")

        for idx, layer in enumerate(self.layers):
            x = layer(x)

            if idx != len(self.layers) - 1:
                x = F.relu(x)

        return F.sigmoid(x)


In [134]:
dnn = DNN()

In [131]:
features = [f1, f2]

In [135]:
dnn(features, mask)

Tensor(shape=[4, 1], dtype=float32, place=Place(cpu), stop_gradient=False,
       [[0.48766893],
        [0.54612899],
        [0.55225706],
        [0.44780943]])

In [68]:
b = paddle.randint(low=1, high=101, shape=[4, 3, 5])
b = paddle.cast(b, dtype="float32")
b

Tensor(shape=[4, 3, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[15., 19., 46., 63., 66.],
         [93., 41., 28., 45., 6. ],
         [45., 82., 53., 12., 37.]],

        [[28., 82., 41., 88., 57.],
         [6. , 92., 36., 12., 60.],
         [29., 1. , 43., 43., 11.]],

        [[16., 91., 46., 99., 94.],
         [23., 37., 56., 58., 3. ],
         [36., 14., 32., 38., 52.]],

        [[3. , 56., 7. , 47., 52.],
         [58., 21., 37., 73., 31.],
         [47., 80., 2. , 17., 10.]]])

In [69]:
paddle.sqrt(paddle.sum(b**2, axis=-1, keepdim=True))

Tensor(shape=[4, 3, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[105.00952911],
         [114.78239441],
         [114.32847595]],

        [[142.06335449],
         [116.36151123],
         [68.27151489 ]],

        [[171.14321899],
         [91.68968964 ],
         [81.63332367 ]],

        [[90.03887939 ],
         [107.07006836],
         [94.87886810 ]]])

In [70]:
normalized_b = b / paddle.sqrt(paddle.sum(b**2, axis=-1, keepdim=True))
normalized_b

Tensor(shape=[4, 3, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.14284419, 0.18093596, 0.43805549, 0.59994555, 0.62851441],
         [0.81022877, 0.35719764, 0.24393985, 0.39204618, 0.05227283],
         [0.39360273, 0.71723163, 0.46357656, 0.10496072, 0.32362890]],

        [[0.19709516, 0.57720727, 0.28860363, 0.61944193, 0.40122944],
         [0.05156344, 0.79063946, 0.30938065, 0.10312688, 0.51563442],
         [0.42477453, 0.01464740, 0.62983811, 0.62983811, 0.16112137]],

        [[0.09348895, 0.53171843, 0.26878074, 0.57846290, 0.54924756],
         [0.25084609, 0.40353501, 0.61075568, 0.63256842, 0.03271905],
         [0.44099638, 0.17149860, 0.39199677, 0.46549618, 0.63699478]],

        [[0.03331894, 0.62195355, 0.07774419, 0.52199674, 0.57752830],
         [0.54170132, 0.19613324, 0.34556809, 0.68179649, 0.28953004],
         [0.49536848, 0.84318036, 0.02107951, 0.17917582, 0.10539754]]])

In [71]:
normalized_b_transpose = normalized_b.transpose(perm=(0, 2, 1))
normalized_b_transpose

Tensor(shape=[4, 5, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.14284419, 0.81022877, 0.39360273],
         [0.18093596, 0.35719764, 0.71723163],
         [0.43805549, 0.24393985, 0.46357656],
         [0.59994555, 0.39204618, 0.10496072],
         [0.62851441, 0.05227283, 0.32362890]],

        [[0.19709516, 0.05156344, 0.42477453],
         [0.57720727, 0.79063946, 0.01464740],
         [0.28860363, 0.30938065, 0.62983811],
         [0.61944193, 0.10312688, 0.62983811],
         [0.40122944, 0.51563442, 0.16112137]],

        [[0.09348895, 0.25084609, 0.44099638],
         [0.53171843, 0.40353501, 0.17149860],
         [0.26878074, 0.61075568, 0.39199677],
         [0.57846290, 0.63256842, 0.46549618],
         [0.54924756, 0.03271905, 0.63699478]],

        [[0.03331894, 0.54170132, 0.49536848],
         [0.62195355, 0.19613324, 0.84318036],
         [0.07774419, 0.34556809, 0.02107951],
         [0.52199674, 0.68179649, 0.17917582],
         [0.57752830, 0.

In [73]:
a = paddle.ones(shape=[4, 3, 5]) # batch_size=4, non_zero_values=3, emb_dim=5
a

Tensor(shape=[4, 3, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]])

In [74]:
normalized_a = a / paddle.sqrt(paddle.sum(a**2, axis=-1, keepdim=True))
normalized_a

Tensor(shape=[4, 3, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359],
         [0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359],
         [0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359]],

        [[0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359],
         [0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359],
         [0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359]],

        [[0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359],
         [0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359],
         [0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359]],

        [[0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359],
         [0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359],
         [0.44721359, 0.44721359, 0.44721359, 0.44721359, 0.44721359]]])

In [82]:
(normalized_a @ normalized_b_transpose) / 5**0.5

Tensor(shape=[4, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.39805910, 0.37113702, 0.40060008],
         [0.39805910, 0.37113702, 0.40060008],
         [0.39805910, 0.37113702, 0.40060008]],

        [[0.41671550, 0.35406896, 0.37204388],
         [0.41671550, 0.35406896, 0.37204388],
         [0.41671550, 0.35406896, 0.37204388]],

        [[0.40433973, 0.38608482, 0.42139652],
         [0.40433973, 0.38608482, 0.42139652],
         [0.40433973, 0.38608482, 0.42139652]],

        [[0.36650831, 0.41094583, 0.32884037],
         [0.36650831, 0.41094583, 0.32884037],
         [0.36650831, 0.41094583, 0.32884037]]])

In [83]:
attention = paddle.nn.functional.softmax((normalized_a @ normalized_b_transpose) / 5**0.5)
attention

Tensor(shape=[4, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.33602360, 0.32709783, 0.33687851],
         [0.33602360, 0.32709783, 0.33687851],
         [0.33602360, 0.32709783, 0.33687851]],

        [[0.34535313, 0.32438171, 0.33026513],
         [0.34535313, 0.32438171, 0.33026513],
         [0.34535313, 0.32438171, 0.33026513]],

        [[0.33343184, 0.32740030, 0.33916789],
         [0.33343184, 0.32740030, 0.33916789],
         [0.33343184, 0.32740030, 0.33916789]],

        [[0.33239460, 0.34749851, 0.32010686],
         [0.33239460, 0.34749851, 0.32010686],
         [0.33239460, 0.34749851, 0.32010686]]])

In [62]:
paddle.sum(normalized_a @ normalized_b_transpose, axis=2, keepdim=True)

Tensor(shape=[4, 3, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[1.88326001],
         [1.88326001],
         [1.88326001]],

        [[1.59917104],
         [1.59917104],
         [1.59917104]],

        [[1.75510931],
         [1.75510931],
         [1.75510931]],

        [[1.89605808],
         [1.89605808],
         [1.89605808]]])

In [63]:
a * paddle.sum(normalized_a @ normalized_b_transpose, axis=2, keepdim=True)

Tensor(shape=[4, 3, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[1.88326001, 1.88326001, 1.88326001, 1.88326001, 1.88326001],
         [1.88326001, 1.88326001, 1.88326001, 1.88326001, 1.88326001],
         [1.88326001, 1.88326001, 1.88326001, 1.88326001, 1.88326001]],

        [[1.59917104, 1.59917104, 1.59917104, 1.59917104, 1.59917104],
         [1.59917104, 1.59917104, 1.59917104, 1.59917104, 1.59917104],
         [1.59917104, 1.59917104, 1.59917104, 1.59917104, 1.59917104]],

        [[1.75510931, 1.75510931, 1.75510931, 1.75510931, 1.75510931],
         [1.75510931, 1.75510931, 1.75510931, 1.75510931, 1.75510931],
         [1.75510931, 1.75510931, 1.75510931, 1.75510931, 1.75510931]],

        [[1.89605808, 1.89605808, 1.89605808, 1.89605808, 1.89605808],
         [1.89605808, 1.89605808, 1.89605808, 1.89605808, 1.89605808],
         [1.89605808, 1.89605808, 1.89605808, 1.89605808, 1.89605808]]])

In [64]:
paddle.sum(a * paddle.sum(normalized_a @ normalized_b_transpose, axis=2, keepdim=True), axis=1)

Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[5.64978027, 5.64978027, 5.64978027, 5.64978027, 5.64978027],
        [4.79751301, 4.79751301, 4.79751301, 4.79751301, 4.79751301],
        [5.26532793, 5.26532793, 5.26532793, 5.26532793, 5.26532793],
        [5.68817425, 5.68817425, 5.68817425, 5.68817425, 5.68817425]])

In [85]:
attention

Tensor(shape=[4, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.33602360, 0.32709783, 0.33687851],
         [0.33602360, 0.32709783, 0.33687851],
         [0.33602360, 0.32709783, 0.33687851]],

        [[0.34535313, 0.32438171, 0.33026513],
         [0.34535313, 0.32438171, 0.33026513],
         [0.34535313, 0.32438171, 0.33026513]],

        [[0.33343184, 0.32740030, 0.33916789],
         [0.33343184, 0.32740030, 0.33916789],
         [0.33343184, 0.32740030, 0.33916789]],

        [[0.33239460, 0.34749851, 0.32010686],
         [0.33239460, 0.34749851, 0.32010686],
         [0.33239460, 0.34749851, 0.32010686]]])

In [86]:
a

Tensor(shape=[4, 3, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]])

In [84]:
attention @ a

Tensor(shape=[4, 3, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]])

In [87]:
paddle.sum(attention @ a, axis=1)

Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.]])

In [183]:
WQ = paddle.randn(shape=[10,3])
WK = paddle.randn(shape=[10,3])
WV = paddle.randn(shape=[10,3])

In [184]:
WQ, WK, WV

(Tensor(shape=[10, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[-0.02911364, -0.47811183,  0.40014619],
         [-0.13354546,  0.10683342, -1.58571506],
         [-0.38840693, -0.95521319,  0.09084713],
         [ 1.75245047, -0.95999950,  0.46159360],
         [-1.33324468, -0.60996574,  0.49316347],
         [ 0.52573293,  0.57017356,  0.51194710],
         [ 0.21334632,  0.19572523,  0.19105043],
         [-0.99943167,  0.13115218, -0.67968792],
         [ 0.67731380,  1.28376007,  0.30227867],
         [-0.88605523, -1.04832232, -0.60421520]]),
 Tensor(shape=[10, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[-0.99107218, -1.02872157, -0.01977291],
         [ 0.27816173, -0.27780670, -0.61071074],
         [-0.28916654, -1.01661170,  1.64721584],
         [-0.86218989, -0.43323833, -1.11205029],
         [-0.05574733,  0.46919966, -0.27491590],
         [ 1.23608959, -0.48388645, -0.34201142],
         [ 0.61169046,  1.54798615,  0.6394313

In [185]:
f1 = paddle.rand(shape=[2, 3, 10])
f2 = paddle.rand(shape=[2, 3, 10])

f1, f2

(Tensor(shape=[2, 3, 10], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[0.22601740, 0.14923583, 0.34761751, 0.05315388, 0.13177596,
           0.83874351, 0.94392550, 0.74413365, 0.23427728, 0.95526809],
          [0.23121725, 0.99879581, 0.89045691, 0.56941539, 0.15396763,
           0.84482205, 0.29068530, 0.84894836, 0.87598592, 0.99289370],
          [0.68629593, 0.01225852, 0.69453418, 0.59595186, 0.79729217,
           0.85709256, 0.67777288, 0.46706274, 0.27853531, 0.85552394]],
 
         [[0.17263722, 0.06319062, 0.61965865, 0.16608801, 0.00190719,
           0.64621472, 0.80712038, 0.64940017, 0.34340176, 0.30346230],
          [0.03234544, 0.36421672, 0.72796488, 0.56136560, 0.88940006,
           0.63393033, 0.64482522, 0.02115875, 0.37731385, 0.59155494],
          [0.88369453, 0.02426497, 0.47848526, 0.15510882, 0.07602921,
           0.57549298, 0.90019351, 0.62567103, 0.41393134, 0.12018152]]]),
 Tensor(shape=[2, 3, 10], dtype=float32, place=Place(cpu)

In [186]:
Q1 = f1 @ WQ
K1 = f1 @ WK
V1 = f1 @ WV

Q2 = f2 @ WQ
K2 = f2 @ WK
V2 = f2 @ WV

In [187]:
att11 = Q1 @ K1.transpose(perm=(0,2,1))
att12 = Q1 @ K2.transpose(perm=(0,2,1))
att21 = Q2 @ K1.transpose(perm=(0,2,1))
att22 = Q2 @ K2.transpose(perm=(0,2,1))

In [188]:
v11 = att11 @ V1
v12 = att12 @ V1
v21 = att21 @ V2
v22 = att22 @ V2

In [189]:
v11, v12, v21, v22

(Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[ 13.11691284,  0.35256189 , -3.60694551 ],
          [ 6.54435873 ,  0.31446952 ,  2.97256708 ],
          [-17.44693565, -1.32715344 ,  14.64944267]],
 
         [[ 0.22234079 , -0.14395532 , -0.26214507 ],
          [-5.93603134 ,  0.44208062 ,  3.50922155 ],
          [-0.58429468 , -0.02209246 ,  0.43169656 ]]]),
 Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[14.70955753,  0.39050823, -6.59274006],
          [11.57777119, -0.04599017, -5.65099573],
          [ 2.36672425, -0.32883200, -3.55854321]],
 
         [[ 1.01845992, -0.29503533, -0.74935389],
          [-4.35839844,  0.62893754,  5.80981445],
          [-6.91150713,  1.56529462,  5.66818810]]]),
 Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[-12.58084202,  2.07626796 ,  1.89076865 ],
          [-17.91967964,  7.95994663 , -1.20150900 ],
          [ 24.4

In [190]:
f1, f2

(Tensor(shape=[2, 3, 10], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[0.22601740, 0.14923583, 0.34761751, 0.05315388, 0.13177596,
           0.83874351, 0.94392550, 0.74413365, 0.23427728, 0.95526809],
          [0.23121725, 0.99879581, 0.89045691, 0.56941539, 0.15396763,
           0.84482205, 0.29068530, 0.84894836, 0.87598592, 0.99289370],
          [0.68629593, 0.01225852, 0.69453418, 0.59595186, 0.79729217,
           0.85709256, 0.67777288, 0.46706274, 0.27853531, 0.85552394]],
 
         [[0.17263722, 0.06319062, 0.61965865, 0.16608801, 0.00190719,
           0.64621472, 0.80712038, 0.64940017, 0.34340176, 0.30346230],
          [0.03234544, 0.36421672, 0.72796488, 0.56136560, 0.88940006,
           0.63393033, 0.64482522, 0.02115875, 0.37731385, 0.59155494],
          [0.88369453, 0.02426497, 0.47848526, 0.15510882, 0.07602921,
           0.57549298, 0.90019351, 0.62567103, 0.41393134, 0.12018152]]]),
 Tensor(shape=[2, 3, 10], dtype=float32, place=Place(cpu)

In [191]:
paddle.concat([f1, f2], axis=1)

Tensor(shape=[2, 6, 10], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[0.22601740, 0.14923583, 0.34761751, 0.05315388, 0.13177596,
          0.83874351, 0.94392550, 0.74413365, 0.23427728, 0.95526809],
         [0.23121725, 0.99879581, 0.89045691, 0.56941539, 0.15396763,
          0.84482205, 0.29068530, 0.84894836, 0.87598592, 0.99289370],
         [0.68629593, 0.01225852, 0.69453418, 0.59595186, 0.79729217,
          0.85709256, 0.67777288, 0.46706274, 0.27853531, 0.85552394],
         [0.14204416, 0.52158093, 0.89627624, 0.30561233, 0.05896373,
          0.69075704, 0.16991428, 0.09038220, 0.31861377, 0.44900793],
         [0.68168646, 0.78342730, 0.50771642, 0.82231581, 0.82065779,
          0.20955226, 0.09865968, 0.50425911, 0.08482269, 0.67514211],
         [0.70994622, 0.99890590, 0.47064129, 0.04362010, 0.50890225,
          0.58002007, 0.63963532, 0.95849329, 0.41871896, 0.09995396]],

        [[0.17263722, 0.06319062, 0.61965865, 0.16608801, 0.00190719,
    

In [192]:
Q12 = paddle.concat([f1, f2], axis=1) @ WQ
K12 = paddle.concat([f1, f2], axis=1) @ WK
V12 = paddle.concat([f1, f2], axis=1) @ WV

In [193]:
V12, V1, V2

(Tensor(shape=[2, 6, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[ 1.25740039,  0.31568274, -1.62759519],
          [ 3.67519450,  0.33257985, -0.18418433],
          [ 1.48565042, -0.43640757, -2.59749222],
          [ 1.42461216, -1.52103603,  0.47856456],
          [ 1.77182758,  0.56325567, -1.39030528],
          [ 4.03681755,  0.85027134, -0.99889898]],
 
         [[ 1.78961289, -0.53770232, -0.79891741],
          [ 2.05336618, -0.12652537, -1.45887959],
          [ 1.72479475, -0.69683164, -1.90441430],
          [ 3.39968443, -0.27404481, -2.21407437],
          [ 2.85888791,  1.52924919, -2.08488464],
          [ 3.04307914,  1.94621503, -2.94300508]]]),
 Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[ 1.25740039,  0.31568274, -1.62759519],
          [ 3.67519450,  0.33257985, -0.18418433],
          [ 1.48565042, -0.43640757, -2.59749222]],
 
         [[ 1.78961289, -0.53770232, -0.79891741],
          [ 2.05336

In [194]:
K12

Tensor(shape=[2, 6, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[-1.16176343,  3.11848760, -0.10499684],
         [-3.01654911,  1.52219141, -1.78193367],
         [-2.03019595,  1.82683682, -0.38217762],
         [-0.73075640,  0.08651253,  0.04889122],
         [-2.62417698,  0.48311013, -1.23787141],
         [-1.29695380, -0.41091371, -0.81524318]],

        [[-0.79375899,  0.86780554,  0.27666277],
         [-0.86689240,  1.93604314, -0.41889307],
         [-1.32102883,  0.01635419,  0.06326835],
         [-1.23096108, -1.74858344, -1.27353597],
         [-3.09010053,  1.65498805, -2.60844707],
         [-4.05528164,  2.02209878, -2.41626263]]])

In [195]:
K12.transpose(perm=(0,2,1))

Tensor(shape=[2, 3, 6], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[-1.16176343, -3.01654911, -2.03019595, -0.73075640, -2.62417698,
          -1.29695380],
         [ 3.11848760,  1.52219141,  1.82683682,  0.08651253,  0.48311013,
          -0.41091371],
         [-0.10499684, -1.78193367, -0.38217762,  0.04889122, -1.23787141,
          -0.81524318]],

        [[-0.79375899, -0.86689240, -1.32102883, -1.23096108, -3.09010053,
          -4.05528164],
         [ 0.86780554,  1.93604314,  0.01635419, -1.74858344,  1.65498805,
           2.02209878],
         [ 0.27666277, -0.41889307,  0.06326835, -1.27353597, -2.60844707,
          -2.41626263]]])

In [196]:
(Q12 @ K12.transpose(perm=(0,2,1)))

Tensor(shape=[2, 6, 6], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[-0.30054840,  3.12394857,  1.35543597,  0.69121939,  3.00100255,
           1.89220095],
         [-1.84301293,  2.47804785, -0.16527693,  0.09639604,  2.32900023,
           1.95001042],
         [-5.15389776, -2.06883192, -2.26369762,  0.42327836,  0.05411994,
           1.10092700],
         [-2.70795441, -1.08019197, -1.68676019, -0.20745376, -0.36489257,
           0.42772758],
         [-6.40970278,  0.32431003, -2.31939125,  0.40442321,  2.07964325,
           2.74445200],
         [ 1.52228880,  5.82451773,  2.96234751,  0.86298549,  4.72409058,
           2.58233571]],

        [[ 0.04728505, -0.08945351,  0.18634082,  0.25031447,  0.09962809,
           0.21215336],
         [-0.82568324, -2.56469464,  0.46839160,  2.38476038, -2.23856521,
          -2.33627939],
         [ 0.08647355, -0.38998175,  0.03578772, -0.57296872, -1.59423816,
          -1.51470804],
         [-0.42908779, -1.2518

In [199]:
att12

Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[ 0.69121939,  3.00100255,  1.89220095],
         [ 0.09639604,  2.32900023,  1.95001042],
         [ 0.42327836,  0.05411994,  1.10092700]],

        [[ 0.25031447,  0.09962809,  0.21215336],
         [ 2.38476038, -2.23856521, -2.33627939],
         [-0.57296872, -1.59423816, -1.51470804]]])

In [200]:
paddle.split(Q12 @ K12.transpose(perm=(0,2,1)), num_or_sections=2, axis=1)

[Tensor(shape=[2, 3, 6], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[-0.30054840,  3.12394857,  1.35543597,  0.69121939,  3.00100255,
            1.89220095],
          [-1.84301293,  2.47804785, -0.16527693,  0.09639604,  2.32900023,
            1.95001042],
          [-5.15389776, -2.06883192, -2.26369762,  0.42327836,  0.05411994,
            1.10092700]],
 
         [[ 0.04728505, -0.08945351,  0.18634082,  0.25031447,  0.09962809,
            0.21215336],
          [-0.82568324, -2.56469464,  0.46839160,  2.38476038, -2.23856521,
           -2.33627939],
          [ 0.08647355, -0.38998175,  0.03578772, -0.57296872, -1.59423816,
           -1.51470804]]]),
 Tensor(shape=[2, 3, 6], dtype=float32, place=Place(cpu), stop_gradient=True,
        [[[-2.70795441, -1.08019197, -1.68676019, -0.20745376, -0.36489257,
            0.42772758],
          [-6.40970278,  0.32431003, -2.31939125,  0.40442321,  2.07964325,
            2.74445200],
          [ 1.52228880,  5.824

In [204]:
paddle.split(
    paddle.split(Q12 @ K12.transpose(perm=(0,2,1)), num_or_sections=2, axis=1)[0],
    num_or_sections=2,
    axis=-1
)[1] @ V1

Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[14.70955753,  0.39050823, -6.59274006],
         [11.57777119, -0.04599017, -5.65099573],
         [ 2.36672425, -0.32883200, -3.55854321]],

        [[ 1.01845992, -0.29503533, -0.74935389],
         [-4.35839844,  0.62893754,  5.80981445],
         [-6.91150713,  1.56529462,  5.66818810]]])

In [205]:
v12

Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[14.70955753,  0.39050823, -6.59274006],
         [11.57777119, -0.04599017, -5.65099573],
         [ 2.36672425, -0.32883200, -3.55854321]],

        [[ 1.01845992, -0.29503533, -0.74935389],
         [-4.35839844,  0.62893754,  5.80981445],
         [-6.91150713,  1.56529462,  5.66818810]]])

In [203]:
paddle.sum(v11, axis=1)

Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[ 2.21433640, -0.66012204, 14.01506424],
        [-6.29798555,  0.27603284,  3.67877316]])

In [208]:
T = paddle.randn(shape=[2, 3, 4]) # batch_size, seq_len, emb_dim
T

Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[-0.43987498, -0.28947183, -0.54199237,  0.05782108],
         [-2.25413513, -1.09585834, -0.83873957, -0.33688429],
         [-1.20188248,  0.57606184, -0.12936078, -2.04433656]],

        [[-0.75086045, -1.43526423, -1.65541399, -0.55382407],
         [ 1.38693011, -1.06155503,  0.35375360, -0.57037371],
         [-1.11362147,  0.63797563, -0.87924236,  0.69093412]]])

In [211]:
Z = paddle.bmm(T, paddle.transpose(T, perm=[0, 2, 1]))
Z

Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[ 0.57438296,  1.74386919,  0.31383112],
         [ 1.74386919,  7.09900570,  2.87512851],
         [ 0.31383112,  2.87512851,  5.97241497]],

        [[ 5.67089128,  0.21249898,  0.99336487],
         [ 0.21249898,  3.50094199, -2.92688727],
         [ 0.99336487, -2.92688727,  2.89762306]]])

In [212]:
MIN_FLOAT = np.finfo(np.float32).min / 100.0
MIN_FLOAT

-3.4028234663852885e+36

In [225]:
Zflat = paddle.triu(Z, 1) + paddle.tril(paddle.ones_like(Z) * MIN_FLOAT, 0)
Zflat

Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[[-3402823593150348620939981984515489792.,
           1.74386919                            ,
           0.31383112                            ],
         [-3402823593150348620939981984515489792.,
          -3402823593150348620939981984515489792.,
           2.87512851                            ],
         [-3402823593150348620939981984515489792.,
          -3402823593150348620939981984515489792.,
          -3402823593150348620939981984515489792.]],

        [[-3402823593150348620939981984515489792.,
           0.21249898                            ,
           0.99336487                            ],
         [-3402823593150348620939981984515489792.,
          -3402823593150348620939981984515489792.,
          -2.92688727                            ],
         [-3402823593150348620939981984515489792.,
          -3402823593150348620939981984515489792.,
          -3402823593150348620939981984515489792.

In [226]:
paddle.masked_select(Zflat, paddle.greater_than(Zflat, paddle.ones_like(Zflat) * MIN_FLOAT))

Tensor(shape=[6], dtype=float32, place=Place(cpu), stop_gradient=True,
       [ 1.74386919,  0.31383112,  2.87512851,  0.21249898,  0.99336487,
        -2.92688727])

In [227]:
paddle.masked_select(Zflat, paddle.greater_than(Zflat, paddle.ones_like(Zflat) * MIN_FLOAT)).reshape(shape=[2, -1])

Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[ 1.74386919,  0.31383112,  2.87512851],
        [ 0.21249898,  0.99336487, -2.92688727]])

In [18]:
padding = 0
sparse_slots = "log_key click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
sparse_slots = sparse_slots.strip().split(" ")
dense_slots = ["dense_feature"]
dense_slots_shape = [13]
slots = sparse_slots
slot2index = {}
visit = {}

for i in range(len(slots)):
    slot2index[slots[i]] = i
    visit[slots[i]] = False

In [19]:
max_len = 20

In [13]:
train_folder = "../../data/data205411/2023-cvr-contest-data/train_data"
file_list = sorted(os.listdir(train_folder))

file_list = [os.path.join(train_folder, _file) for _file in file_list]

In [20]:
all_res = []

for file in file_list[:1]:
    with open(file, "r") as rf:
        for line in rf:
            items = line.strip("\n").split("\t")
            log_key = int(items[0])
            conv1 = items[1]
            conv2 = items[2]
            conv3 = items[3]

            if conv1 == "1" or conv2 == "1" or conv3 == "1":
                conv = 1
            else:
                conv = 0

            output = [(i, []) for i in slots]
            feasigns = items[4].split(" ")

            for i in feasigns:
                slot_feasign = i.split(":")
                slot = slot_feasign[1]
                if slot not in slots:
                    continue
                if slot in sparse_slots:
                    feasign = int(slot_feasign[0])
                else:
                    feasign = float(slot_feasign[0])
                output[slot2index[slot]][1].append(feasign)
                visit[slot] = True

            output[0][1].append(log_key)
            visit['log_key'] = True
            output[1][1].append(conv)
            visit['click'] = True

            for i in visit:
                slot = i
                if not visit[slot]:
                    if i in dense_slots:
                        output[slot2index[i]][1].extend(
                            [padding] *
                            dense_slots_shape[slot2index[i]]
                        )
                    else:
                        output[slot2index[i]][1].extend([padding])
                else:
                    visit[slot] = False

            res = []
            for key, value in output:
                if key == "log_key":
                    continue

                if key == "click":
                    res.append(np.array(value).astype("float32").reshape([-1,]))
                    continue

                padding_zeros = [0] * (max_len - len(value))
                res.append(
                    np.array(value + padding_zeros).astype("int64").reshape([max_len,])
                )

            len_array = [len(value) for key, value in output][2:]
            mask = np.array(
                [[0] * x + [-1e9] * (max_len - x) for x in len_array]
            ).reshape([-1, max_len])

            res.append(mask)
            all_res.append(res)

In [26]:
class CVRDataset(Dataset):
    def __init__(self, features):
        super().__init__()
        self.features = features
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx]

In [27]:
dataset = CVRDataset(all_res)

In [33]:
dataloader = DataLoader(
    dataset,
    batch_size=2,
    places="cpu",
    shuffle=False,
    drop_last=False
)

In [34]:
for batch_id, batch in enumerate(dataloader):
    print(batch)
    break

[Tensor(shape=[2, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[0.],
        [0.]]), Tensor(shape=[2, 20], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[159, 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  ,
         0  , 0  , 0  , 0  , 0  , 0  ],
        [704, 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  ,
         0  , 0  , 0  , 0  , 0  , 0  ]]), Tensor(shape=[2, 20], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[259, 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  ,
         0  , 0  , 0  , 0  , 0  , 0  ],
        [2  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  , 0  ,
         0  , 0  , 0  , 0  , 0  , 0  ]]), Tensor(shape=[2, 20], dtype=int64, place=Place(cpu), stop_gradient=True,
       [[4630199, 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
         0      , 0      , 0      , 0      , 0      , 0      , 0      , 0      ,
         0      , 0      , 0      

In [7]:
cel = paddle.nn.CrossEntropyLoss(soft_label=True)

In [13]:
pred = paddle.to_tensor([[0.8, 0.15, 0.05], [0.3, 0.6, 0.1]], dtype="float32")
label = paddle.to_tensor([[1, 0, 0], [0, 1, 0]], dtype="float32")

In [14]:
cel(pred, label)

Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [0.77181804])

In [18]:
paddle.slice(pred, axes=[1], starts=[2], ends=[3])

Tensor(shape=[2, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[0.05000000],
        [0.10000000]])

In [41]:
pred = paddle.to_tensor([[0.05, 0.75, 0.05, 0.05, 0.05, 0.05], [0.05, 0.05, 0.75, 0.05, 0.05, 0.05]])
label = paddle.to_tensor([[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]], dtype="float32")


F.cross_entropy(pred, label, soft_label=True)

Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
       [1.24787295])