In [1]:
import os
import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from os.path import isfile, join
import time
import math
import logging
from tqdm import tqdm, tqdm_pandas 

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat,DenseFeat, get_feature_names, VarLenSparseFeat

In [2]:
# GPU selection: enumerate devices in PCI bus order and expose only device 0.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Create a TF1-compat session whose GPU options request on-demand memory growth.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

# tensorflow & keras version check
print('tensorflow version : ' , tf.__version__)
print('keras version : ' , tf.keras.__version__)

# tensorflow gpu available check
# tf.test.is_gpu_available is deprecated; TensorFlow itself points to
# tf.config.list_physical_devices('GPU'), which returns the visible GPU devices.
print('GPU available ? : ', len(tf.config.list_physical_devices('GPU')) > 0)


tensorflow version :  2.4.4
keras version :  2.4.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU available ? :  True


In [3]:
# --- Paths -------------------------------------------------------------------
basedir = '/home/lms/ms/poc'
rawdata_path = join(basedir, 'dataset')
procdata_path = join(basedir, 'deepfm', 'dataset', 'preprocess', 'rating')

# One output directory per model variant.
# NOTE(review): '_5' / '_r' appear to match the two target columns below and
# '_f' / '_no_f' presumably mean with / without side features -- confirm.
model_path_5_no_f, model_path_r_no_f, model_path_5_f, model_path_r_f = (
    join(basedir, 'deepfm', 'model', variant)
    for variant in ('model_5_no_f', 'model_r_no_f', 'model_5_f', 'model_r_f')
)

# --- Target columns ----------------------------------------------------------
target_5, target_r = 'rating_5', 'rating_r'

# --- Training hyper-parameters -----------------------------------------------
epochs, batch_size = 20, 2**15
learner = 'adam'          # optimizer name, resolved when the model is compiled
lr, lr_decay = 0.001, 0.0001

# --- Model hyper-parameters --------------------------------------------------
task = 'regression'
num_hidden = 256          # width of each DNN hidden layer
dropout = 0.6
embedding_dim = 5         # embedding size used for every sparse feature

# --- Input file --------------------------------------------------------------
data_filename = 'hist.pkl'

In [4]:
# Load the preprocessed interaction history and preview the first rows.
history_file = join(procdata_path, data_filename)
data = pd.read_pickle(history_file)
data.head()

Unnamed: 0,sa_id_CODE,category_id_CODE,rating_r,rating_5,seg_1,seg_2,seg_3,seg_4,pr_info_CODE,price_CODE,release_date,run_time_ss
0,0,0,4.5,5,1.0,0.0,0.0,0.0,0,0,0.990018,0.17365
1,1,0,5.0,5,0.97,0.0,0.03,0.0,0,0,0.990018,0.17365
2,2,0,4.5,5,0.97,0.0,0.03,0.0,0,0,0.990018,0.17365
3,3,0,5.0,5,0.98,0.0,0.01,0.01,0,0,0.990018,0.17365
4,4,0,4.5,5,0.86,0.0,0.01,0.13,0,0,0.990018,0.17365


In [5]:
# Nominal (integer-coded) columns: each one gets its own embedding table.
sparse_features = [
    'sa_id_CODE',
    'category_id_CODE',
    'price_CODE',
    'pr_info_CODE',
]

# Continuous columns, fed to the model as scalar values.
dense_features = ['seg_1', 'seg_2', 'seg_3', 'seg_4'] + ['release_date', 'run_time_ss']

In [6]:
# Describe every model input as a DeepCTR feature column.
#
# Sparse features: the embedding table must cover every index that can be
# looked up, i.e. indices 0..max_code. Sizing it with max() + 1 is identical to
# nunique() when the codes are contiguous from 0, and stays valid when the codes
# have gaps (where nunique() would leave the largest codes out of range).
sparse_feature_columns = [
    SparseFeat(feat, int(data[feat].max()) + 1, embedding_dim=embedding_dim)
    for feat in sparse_features
]
# Dense features: one scalar per row.
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

fixlen_feature_columns = sparse_feature_columns + dense_feature_columns

# The linear (FM) part and the deep part share the same set of columns.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

# Ordered input names, used later to build the model-input dictionaries.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [7]:
# Display the constructed feature columns to check vocabulary sizes and embedding dims.
fixlen_feature_columns

[SparseFeat(name='sa_id_CODE', vocabulary_size=1973461, embedding_dim=5, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fe0502c3ba8>, embedding_name='sa_id_CODE', group_name='default_group', trainable=True),
 SparseFeat(name='category_id_CODE', vocabulary_size=17221, embedding_dim=5, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fe0502c3b38>, embedding_name='category_id_CODE', group_name='default_group', trainable=True),
 SparseFeat(name='price_CODE', vocabulary_size=53, embedding_dim=5, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7fe0502c3cc0>, embedding_name='price_CODE', group_name='default_group', trainable=True),
 SparseFeat(name='pr_info_CODE', vocabulary_

In [8]:
# Keep only the ids, the 'rating_5' target and the model features
# (the index-like column and 'rating_r' are dropped), then preview.
data = data.loc[:, [
    'sa_id_CODE', 'category_id_CODE', 'rating_5',
    'seg_1', 'seg_2', 'seg_3', 'seg_4',
    'pr_info_CODE', 'price_CODE',
    'release_date', 'run_time_ss',
]]
data.head()

Unnamed: 0,sa_id_CODE,category_id_CODE,rating_5,seg_1,seg_2,seg_3,seg_4,pr_info_CODE,price_CODE,release_date,run_time_ss
0,0,0,5,1.0,0.0,0.0,0.0,0,0,0.990018,0.17365
1,1,0,5,0.97,0.0,0.03,0.0,0,0,0.990018,0.17365
2,2,0,5,0.97,0.0,0.03,0.0,0,0,0.990018,0.17365
3,3,0,5,0.98,0.0,0.01,0.01,0,0,0.990018,0.17365
4,4,0,5,0.86,0.0,0.01,0.13,0,0,0.990018,0.17365


In [9]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; the fixed random_state keeps
# the split identical across runs.
train, test = train_test_split(data, random_state=2020, test_size=0.2)

In [10]:
# Negative random sampling, step 1: find the largest item code in the data.
# It is used as the bound when random item codes are drawn.
max_code = data['category_id_CODE'].tolist()
max_num = max(max_code)

In [11]:
# Seed NumPy's global RNG. np.random.seed must be *called*: assigning a value to
# it would replace the function and leave the generator unseeded.
np.random.seed(2020)

# Draw five independent arrays of random item codes, one value per training row
# (one array per negative-sample copy of the training users).
# randint's upper bound is exclusive, so max_num + 1 makes the largest item code
# eligible as well.
n_train_rows = len(train)
rand1, rand2, rand3, rand4, rand5 = (
    np.random.randint(max_num + 1, size=n_train_rows) for _ in range(5)
)

In [12]:
# Template frame for the negative samples: it takes the user ids (and the index)
# of the training rows, while 'category_id_CODE' is declared but left unfilled
# here -- it is populated with random item codes in a later cell.
train0 = pd.DataFrame(columns=['sa_id_CODE', 'category_id_CODE'])
train0['sa_id_CODE'] = train['sa_id_CODE']
# Preview the template.
train0

Unnamed: 0,sa_id_CODE,category_id_CODE
11999337,278987,
8141526,386253,
12344380,1880334,
9598807,597462,
2416674,460703,
...,...,...
8302275,1123885,
1948278,839798,
9870659,649940,
1765768,796509,


In [13]:
# Five independent copies of the template, one per set of random item codes.
train1, train2, train3, train4, train5 = (train0.copy() for _ in range(5))

In [14]:
# Turn each template copy into a block of negative samples: assign the randomly
# drawn item codes and label every row with a rating of 0.
for neg_frame, sampled_items in zip(
    (train1, train2, train3, train4, train5),
    (rand1, rand2, rand3, rand4, rand5),
):
    neg_frame['category_id_CODE'] = sampled_items
    neg_frame['rating_5'] = 0

# Drop the loop variables so they do not keep the last frame/array alive.
del neg_frame, sampled_items

In [15]:
# Reduce the positive rows to the same three columns as the negative samples;
# the feature columns are re-attached later via merges.
train = train.loc[:, ['sa_id_CODE', 'category_id_CODE', 'rating_5']]

In [16]:
# Stack the positive rows and the five negative-sample blocks row-wise.
# Author's note on this line: 77011698 (presumably the resulting row count).
train = pd.concat(
    objs=[train, train1, train2, train3, train4, train5],
    axis=0,
)

In [17]:
# User-side features: user id plus the four segment columns.
user_data = pd.read_pickle(join(procdata_path, 'user.pkl'))
user_data = user_data.loc[:, ['sa_id_CODE', 'seg_1', 'seg_2', 'seg_3', 'seg_4']]

# Item-side features: item code plus its categorical and numeric attributes.
item_data = pd.read_pickle(join(procdata_path, 'item.pkl'))
item_data = item_data.loc[:, [
    'category_id_CODE', 'pr_info_CODE', 'price_CODE', 'release_date', 'run_time_ss',
]]

In [18]:
# Attach the user features; an inner join keeps only users present in user_data.
train = pd.merge(train, user_data, how='inner', on='sa_id_CODE')

In [19]:
# Attach the item features; an inner join drops rows whose item code is not
# present in item_data.
train = pd.merge(train, item_data, how='inner', on='category_id_CODE')

In [20]:
# Release the intermediate frames that are no longer needed after the merges.
# user_data and item_data are intentionally not deleted here.
del train0, train1, train2, train3, train4, train5, data

In [21]:
# Shuffle the training rows and rebuild a clean 0..n-1 index. The fixed
# random_state makes the row order (and therefore the saved train.pkl)
# reproducible, matching the seed used for the train/test split.
train = train.sample(frac=1, random_state=2020).reset_index(drop=True)

In [23]:
# Persist the assembled training frame so it can be reloaded without rebuilding it.
train_pickle = join(procdata_path, 'train.pkl')
train.to_pickle(train_pickle)

In [24]:
# Persist the held-out test frame next to the training frame.
test_pickle = join(procdata_path, 'test.pkl')
test.to_pickle(test_pickle)

In [12]:
# Free the in-memory frames now that both are persisted; they are reloaded from
# the pickles in the next cell.
# `data` is not deleted here: the cleanup cell that follows the merges already
# removed it, and deleting it a second time raises NameError on a top-to-bottom run.
del train, test

In [13]:
# Reload the persisted train / test frames from the preprocessing directory.
train, test = (
    pd.read_pickle(join(procdata_path, pickle_name))
    for pickle_name in ('train.pkl', 'test.pkl')
)

In [15]:
# DeepCTR models take a dict mapping each feature name to a 1-D array of values.
train_model_input = dict((col, train[col].values) for col in feature_names)
test_model_input = dict((col, test[col].values) for col in feature_names)

In [16]:
# Inspect the training input dict (NumPy abbreviates the large arrays when displayed).
train_model_input

{'sa_id_CODE': array([ 826711,  282766,  312530, ...,  593020,  429594, 1640081]),
 'category_id_CODE': array([11179,  9326,  2711, ...,  4587, 10396,  9774]),
 'price_CODE': array([3, 3, 3, ..., 0, 2, 9]),
 'pr_info_CODE': array([0, 1, 0, ..., 0, 0, 0]),
 'seg_1': array([1.        , 1.        , 0.89999998, ..., 1.        , 1.        ,
        1.        ]),
 'seg_2': array([0., 0., 0., ..., 0., 0., 0.]),
 'seg_3': array([0.  , 0.  , 0.07, ..., 0.  , 0.  , 0.  ]),
 'seg_4': array([0.  , 0.  , 0.03, ..., 0.  , 0.  , 0.  ]),
 'release_date': array([0.95861112, 0.98945863, 0.97958747, ..., 0.94756976, 0.8406508 ,
        0.86255161]),
 'run_time_ss': array([0.22659587, 0.19204279, 0.2464889 , ..., 0.14960175, 0.23713879,
        0.24652738])}

In [17]:
# Build the DeepFM network: the linear and deep parts use the feature columns
# defined earlier, with two hidden layers of `num_hidden` units each.
model = DeepFM(linear_feature_columns,
               dnn_feature_columns,
               dnn_hidden_units=(num_hidden, num_hidden),
               dnn_dropout=dropout,
               task=task)

# Resolve the configured optimizer name (case-insensitive) to its Keras class;
# any unrecognised name falls back to plain SGD.
optimizer_classes = {
    "adagrad": tf.keras.optimizers.Adagrad,
    "rmsprop": tf.keras.optimizers.RMSprop,
    "adam": tf.keras.optimizers.Adam,
}
optimizer_cls = optimizer_classes.get(learner.lower(), tf.keras.optimizers.SGD)

# `learning_rate` is the supported keyword; `lr` is only a deprecated alias for it.
opt = optimizer_cls(learning_rate=lr, decay=lr_decay)

# Optimise mean squared error; additionally report MSE and RMSE as metrics.
model.compile(opt, 'mse', metrics=['mse',tf.keras.metrics.RootMeanSquaredError()])

In [18]:
# NOTE: this overrides the `epochs` value set in the configuration cell.
epochs = 4

# Track the lowest validation loss seen so far. Start from +inf so the first
# epoch always writes a checkpoint; seeding this with the epoch count would
# silently skip checkpointing whenever the validation loss never drops below it.
best_loss = float('inf')

# Make sure the checkpoint directory exists before the first save.
os.makedirs(model_path_5_f, exist_ok=True)
best_weights_path = os.path.join(model_path_5_f, 'best.h5')

# NOTE(review): the held-out test split doubles as validation data here, so the
# checkpoint is selected on the same rows that would be used for final evaluation.
for epoch in range(epochs):

    # Train exactly one epoch per fit() call so the loss can be checked between epochs.
    hist = model.fit(train_model_input,  # input
                     train[target_5].values,  # labels
                     validation_data=(test_model_input, test[target_5].values),
                     batch_size=batch_size,
                     initial_epoch=epoch, epochs=epoch + 1,
                     verbose=1,
                     shuffle=True)

    # Each history list holds a single entry because only one epoch was run.
    train_loss = hist.history['loss'][0]
    train_rmse = hist.history['root_mean_squared_error'][0]
    val_loss = hist.history['val_loss'][0]
    val_rmse = hist.history['val_root_mean_squared_error'][0]

    # Checkpoint whenever the validation loss improves.
    if val_loss < best_loss:
        best_loss = val_loss
        model.save_weights(best_weights_path)

Epoch 2/2
Epoch 3/3
Epoch 4/4


In [19]:
# Save the final-epoch weights under their own name. Writing them to 'best.h5'
# unconditionally would overwrite the best-validation checkpoint produced during
# training with whatever the last epoch happened to be.
model.save_weights(os.path.join(model_path_5_f, 'last.h5'))

# Fallback: if training never wrote a checkpoint, use the final weights as
# 'best.h5' so the following load step still has a file to read.
best_weights_path = os.path.join(model_path_5_f, 'best.h5')
if not os.path.isfile(best_weights_path):
    model.save_weights(best_weights_path)

In [20]:
# Restore the checkpointed weights, then export the complete model
# (architecture + weights) to the 'bestmodel' directory.
checkpoint_file = join(model_path_5_f, 'best.h5')
export_dir = join(model_path_5_f, 'bestmodel')

model.load_weights(checkpoint_file)
tf.keras.models.save_model(model=model, filepath=export_dir)



INFO:tensorflow:Assets written to: /home/lms/ms/poc/deepfm/model/model_5_f/bestmodel/assets


INFO:tensorflow:Assets written to: /home/lms/ms/poc/deepfm/model/model_5_f/bestmodel/assets
