In [1]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np
import random

from tqdm import tqdm

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *


2024-11-19 10:47:28.991781: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731984449.442744   48728 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731984449.568030   48728 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-19 10:47:30.621484: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
seed = 6
# Seed every random number generator used in this notebook so runs are repeatable.
random.seed(seed)                 # Python's built-in RNG
np.random.seed(seed)              # NumPy's global RNG
torch.manual_seed(seed)           # PyTorch CPU RNG
torch.cuda.manual_seed(seed)      # PyTorch RNG of the current GPU
torch.cuda.manual_seed_all(seed)  # PyTorch RNG of every GPU

# Column layout of the tab-separated data files:
# label, then 13 numeric columns I1..I13, then 26 categorical columns C1..C26.
sparse_features = [f'C{i}' for i in range(1, 27)]   # C = categorical ("class") features
dense_features = [f'I{i}' for i in range(1, 14)]    # I = numeric ("int") features
col_names = ['label', *dense_features, *sparse_features]
test_col_names = [*dense_features, *sparse_features]  # same layout without the label column

# Stream train.txt in 10,000-row chunks, draw `train_data_fraction` of every chunk
# (fixed random_state, so the draw is repeatable) and stack the draws into one frame.
train_data_fraction = 1
train_data_chunks = pd.read_csv(
    './train.txt',
    names=col_names,
    sep='\t',
    chunksize=10000,
)
train_progress = tqdm(train_data_chunks, desc="Loading train data")
train_data = pd.concat(
    [chunk.sample(frac=train_data_fraction, random_state=42) for chunk in train_progress]
)

# Missing values: categorical columns get the placeholder string '-1', numeric columns get 0.
train_data[sparse_features] = train_data[sparse_features].fillna('-1')
train_data[dense_features] = train_data[dense_features].fillna(0)
target = ['label']

Loading train data: 4585it [04:09, 18.40it/s]


In [3]:
# Stream test.txt in 10,000-row chunks and draw `test_data_fraction` of every chunk.
# Note: DataFrame.sample returns the drawn rows in shuffled order; each row keeps its
# original index label, so the file order can be recovered later with sort_index().
test_data_fraction = 1
test_data_chunks = pd.read_csv(
    './test.txt',
    names=test_col_names,
    sep='\t',
    chunksize=10000,
)
test_progress = tqdm(test_data_chunks, desc="Loading test data")
test_data = pd.concat(
    [chunk.sample(frac=test_data_fraction, random_state=42) for chunk in test_progress]
)

Loading test data: 605it [00:31, 19.05it/s]


In [4]:
# Fill missing test values the same way as for the training data:
# categorical columns get the placeholder string '-1', numeric columns get 0.
for columns, fill_value in ((sparse_features, '-1'), (dense_features, 0)):
    test_data[columns] = test_data[columns].fillna(fill_value)

In [5]:
# 1. Label-encode the sparse features and apply a simple transformation to the dense ones.
# Each categorical column gets its own LabelEncoder, which assigns an integer id to every
# distinct value. The encoder is fitted on train and test values together so that both
# frames share one id space and transform() never meets an unseen value.
for feat in tqdm(sparse_features, desc="Label Encoding Sparse Features"):
    encoder = LabelEncoder().fit(np.concatenate([train_data[feat], test_data[feat]]))
    train_data[feat] = encoder.transform(train_data[feat])
    test_data[feat] = encoder.transform(test_data[feat])

print("Scaling dense features...")
# The scaler learns each column's min/max from the training data only; the test data is
# transformed with those same training statistics.
mms = MinMaxScaler(feature_range=(0, 1)).fit(train_data[dense_features])
train_data[dense_features] = mms.transform(train_data[dense_features])
test_data[dense_features] = mms.transform(test_data[dense_features])

Label Encoding Sparse Features: 100%|██████████| 26/26 [10:24<00:00, 24.04s/it]


Scaling dense features...


In [6]:

# Vocabulary size (corrected definition): the largest encoded id found in either the
# train or the test frame, plus one, so every id is a valid embedding index.
sparse_columns = [
    SparseFeat(
        feat,
        vocabulary_size=max(train_data[feat].max(), test_data[feat].max()) + 1,
        embedding_dim=4,
    )
    for feat in sparse_features
]
dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

# The linear part and the DNN part of the model consume the same feature columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3. Generate input data for the model:
# hold out 20% of the training frame as `val`, then build one {feature name: column}
# mapping per split.
train, val = train_test_split(train_data, test_size=0.2, random_state=2020)
train_model_input = {name: train[name] for name in feature_names}
val_model_input = {name: val[name] for name in feature_names}


In [7]:
# 4. Define the model, then train, predict and evaluate

# Run on the first GPU when CUDA is requested and available, otherwise on the CPU.
use_cuda = True
cuda_usable = use_cuda and torch.cuda.is_available()
device = 'cuda:0' if cuda_usable else 'cpu'
print('cuda ready...' if cuda_usable else "cpu")

model = xDeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)
model.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)

# validation_split=0.2 makes fit() set aside a further 20% of `train` for the per-epoch
# validation metrics; the separate `val` frame is only scored after training.
train_labels = train[target].values
history = model.fit(
    train_model_input,
    train_labels,
    batch_size=4096,
    epochs=1,
    verbose=1,
    validation_split=0.2,
)

# Score the held-out `val` frame (the printed labels call these metrics "test").
val_labels = val[target].values
pred_ans = model.predict(val_model_input, 256)
print("")
print("test LogLoss", round(log_loss(val_labels, pred_ans), 4))
print("test AUC", round(roc_auc_score(val_labels, pred_ans), 4))

cuda ready...
cuda:0
Train on 29337994 samples, validate on 7334499 samples, 7163 steps per epoch


7163it [16:58,  7.03it/s]


Epoch 1/1
1091s - loss:  0.4635 - binary_crossentropy:  0.4635 - auc:  0.7854 - val_binary_crossentropy:  0.4572 - val_auc:  0.7931

test LogLoss 0.4575
test AUC 0.7928


In [8]:
# 7. Predict on the test data.
test_model_input = {name: test_data[name] for name in feature_names}
test_pred = model.predict(test_model_input, batch_size=256)

# 8. Save the test predictions.
test_data['predicted_label'] = test_pred
# `test_data` was assembled from DataFrame.sample draws, which return rows in shuffled
# order while keeping their original index labels. Because the CSV is written without
# the index, the rows must be put back into index order first; otherwise line k of
# predictions.csv would not correspond to row k of the test file and the mapping would
# be unrecoverable.
# NOTE(review): assumes the index labels are the row positions from read_csv (no index
# column was requested when loading) - confirm if the loading step changes.
predictions_in_file_order = test_data[['predicted_label']].sort_index()
predictions_in_file_order.to_csv('./predictions.csv', index=False)

print("Test predictions saved to './predictions.csv'.")

Test predictions saved to './predictions.csv'.
