In [1]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np
import random

from tqdm import tqdm

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *


  from .autonotebook import tqdm as notebook_tqdm
2024-11-12 05:12:12.945439: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-12 05:12:13.929908: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64
2024-11-12 05:12:13.930023: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/n

In [3]:
seed = 6
torch.manual_seed(seed)  # 为CPU设置随机种子
torch.cuda.manual_seed(seed)  # 为当前GPU设置随机种子
torch.cuda.manual_seed_all(seed)  # 为所有GPU设置随机种子
np.random.seed(seed)
random.seed(seed)

sparse_features = ['C' + str(i) for i in range(1, 27)]   #C代表类别特征 class
dense_features =  ['I' + str(i) for i in range(1, 14)]   #I代表数值特征 int
col_names = ['label'] + dense_features + sparse_features
test_col_names = dense_features + sparse_features

train_data_fraction = 0.05
train_data_chunks = pd.read_csv('./oridata/train.txt', names=col_names, sep='\t', chunksize=30000)
data = pd.concat(chunk.sample(frac=train_data_fraction, random_state=42) for chunk in tqdm(train_data_chunks, desc="Loading train data"))

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])


Loading train data: 1529it [04:45,  5.35it/s]


In [4]:
 fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
                          for feat in sparse_features] + [DenseFeat(feat, 1, )
                                                          for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model

train, test = train_test_split(data, test_size=0.2, random_state=2020)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}


In [5]:
 # 4.Define Model,train,predict and evaluate

device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'
else:
    print("cpu")
# model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
#                task='binary',
#                l2_reg_embedding=1e-5, device=device)
model = xDeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
               task='binary',
               l2_reg_embedding=1e-5, device=device)

model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )

history = model.fit(train_model_input, train[target].values, batch_size=4096, epochs=10, verbose=1,
                    validation_split=0.2)
pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

cuda ready...
cuda:0
Train on 1466899 samples, validate on 366725 samples, 359 steps per epoch


359it [00:38,  9.24it/s]


Epoch 1/10
42s - loss:  0.4860 - binary_crossentropy:  0.4859 - auc:  0.7553 - val_binary_crossentropy:  0.4748 - val_auc:  0.7713


359it [00:31, 11.29it/s]


Epoch 2/10
35s - loss:  0.4024 - binary_crossentropy:  0.4023 - auc:  0.8489 - val_binary_crossentropy:  0.5124 - val_auc:  0.7488


359it [00:31, 11.24it/s]


Epoch 3/10
35s - loss:  0.3305 - binary_crossentropy:  0.3306 - auc:  0.8972 - val_binary_crossentropy:  0.5278 - val_auc:  0.7433


359it [00:31, 11.24it/s]


Epoch 4/10
35s - loss:  0.3045 - binary_crossentropy:  0.3045 - auc:  0.9125 - val_binary_crossentropy:  0.5424 - val_auc:  0.7413


359it [00:31, 11.30it/s]


Epoch 5/10
35s - loss:  0.2915 - binary_crossentropy:  0.2916 - auc:  0.9197 - val_binary_crossentropy:  0.5573 - val_auc:  0.7370


359it [00:31, 11.42it/s]


Epoch 6/10
34s - loss:  0.2834 - binary_crossentropy:  0.2833 - auc:  0.9244 - val_binary_crossentropy:  0.5690 - val_auc:  0.7373


359it [00:30, 11.89it/s]


Epoch 7/10
33s - loss:  0.2767 - binary_crossentropy:  0.2767 - auc:  0.9281 - val_binary_crossentropy:  0.5848 - val_auc:  0.7330


359it [00:31, 11.46it/s]


Epoch 8/10
34s - loss:  0.2703 - binary_crossentropy:  0.2703 - auc:  0.9317 - val_binary_crossentropy:  0.5994 - val_auc:  0.7303


359it [00:31, 11.29it/s]


Epoch 9/10
35s - loss:  0.2641 - binary_crossentropy:  0.2642 - auc:  0.9352 - val_binary_crossentropy:  0.6202 - val_auc:  0.7258


359it [00:31, 11.51it/s]


Epoch 10/10
34s - loss:  0.2579 - binary_crossentropy:  0.2578 - auc:  0.9386 - val_binary_crossentropy:  0.6396 - val_auc:  0.7235

test LogLoss 0.6411
test AUC 0.7229
