In [1]:
import os
import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from os.path import isfile, join
import time
import math
import logging
from tqdm import tqdm, tqdm_pandas

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat,DenseFeat, get_feature_names, VarLenSparseFeat

In [2]:
# GPU selection: order devices by PCI bus id and expose only device '0' to this process.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# TF1-compat session whose GPU options request on-demand memory growth.
# `config` and `session` are kept as notebook-level names for any later cell that uses them.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

# tensorflow & keras version check
print('tensorflow version : ' , tf.__version__)
print('keras version : ' , tf.keras.__version__)

# tensorflow gpu available check
# tf.test.is_gpu_available() is deprecated (its own warning points to
# tf.config.list_physical_devices('GPU')), so the supported API is used instead.
# The printed value is still a bool.
print('GPU available ? : ', len(tf.config.list_physical_devices('GPU')) > 0)


tensorflow version :  2.4.4
keras version :  2.4.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU available ? :  True


In [3]:
# ---- Filesystem layout ----
basedir = '/home/lms/ms/poc'
rawdata_path = join(basedir, 'dataset')
procdata_path = join(basedir, 'deepfm', 'dataset', 'preprocess', 'rating')

# One output directory per model variant, all under <basedir>/deepfm/model.
model_path_5_no_f, model_path_r_no_f, model_path_5_f, model_path_r_f = (
    join(basedir, 'deepfm', 'model', variant)
    for variant in ('model_5_no_f', 'model_r_no_f', 'model_5_f', 'model_r_f')
)

# ---- Target column names ----
target_5, target_r = 'rating_5', 'rating_r'

# ---- Training hyperparameters ----
epochs = 20            # NOTE: the training cell reassigns this before looping
batch_size = 2 ** 15
learner = 'adam'       # optimizer name, matched case-insensitively when the optimizer is built
num_hidden = 256       # width used for both DNN hidden layers
task = 'regression'
dropout = 0.6          # passed to the model as dnn_dropout
lr = 0.001
lr_decay = 0.0001      # passed to the optimizer as decay
embedding_dim = 5      # embedding size of every sparse feature column

# ---- Input file (read from procdata_path) ----
data_filename = 'hist.pkl'

In [4]:
# Read the pickled DataFrame stored at <procdata_path>/<data_filename>.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
hist_path = join(procdata_path, data_filename)
data = pd.read_pickle(hist_path)
data.head()

Unnamed: 0,sa_id_CODE,category_id_CODE,rating_r,rating_5,seg_1,seg_2,seg_3,seg_4,pr_info_CODE,price_CODE,release_date,run_time_ss
0,0,0,4.5,5,1.0,0.0,0.0,0.0,0,0,0.990018,0.17365
1,1,0,5.0,5,0.97,0.0,0.03,0.0,0,0,0.990018,0.17365
2,2,0,4.5,5,0.97,0.0,0.03,0.0,0,0,0.990018,0.17365
3,3,0,5.0,5,0.98,0.0,0.01,0.01,0,0,0.990018,0.17365
4,4,0,4.5,5,0.86,0.0,0.01,0.13,0,0,0.990018,0.17365


In [5]:
# Nominal (categorical) columns that get one embedding feature column each.
sparse_features = [
    'sa_id_CODE',
    'category_id_CODE',
]

In [6]:
# One embedding feature column per categorical feature.
# vocabulary_size has to exceed the largest code that is ever looked up, so it is derived
# from max() + 1 instead of nunique(). The two agree only when the codes are exactly
# 0..n-1 with no gaps; with gaps, nunique() would undersize the embedding table.
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=int(data[feat].max()) + 1, embedding_dim=embedding_dim)
    for feat in sparse_features
]

# The linear part and the DNN part of the model are given the same feature columns.
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns

# Names of the model inputs, used later to build the input dictionaries.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [7]:
# Display the feature column definitions (name, vocabulary size, embedding dim, ...) for inspection.
fixlen_feature_columns

[SparseFeat(name='sa_id_CODE', vocabulary_size=1973461, embedding_dim=5, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f7cb5ba3cc0>, embedding_name='sa_id_CODE', group_name='default_group', trainable=True),
 SparseFeat(name='category_id_CODE', vocabulary_size=17221, embedding_dim=5, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x7f7cb642db00>, embedding_name='category_id_CODE', group_name='default_group', trainable=True)]

In [8]:
# Narrow the frame to the two id columns and the 'rating_r' target used for training.
kept_columns = ['sa_id_CODE', 'category_id_CODE', 'rating_r']
data = data.loc[:, kept_columns]
data.head()

Unnamed: 0,sa_id_CODE,category_id_CODE,rating_r
0,0,0,4.5
1,1,0,5.0
2,2,0,4.5
3,3,0,5.0
4,4,0,4.5


In [9]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=2020,
)

In [10]:
# negative random sampling
# Largest category code present in the data; it bounds the random codes drawn for negatives.
# The column is reduced with a vectorised max() on its NumPy values instead of first
# converting every row to a Python list, which is slow and memory-hungry on a large frame.
max_code = data['category_id_CODE'].values
max_num = int(max_code.max())

In [11]:
# Seed NumPy's global RNG so the sampled negatives are reproducible.
# np.random.seed is a function and must be *called*: `np.random.seed = 2020` seeds nothing
# and rebinds the name to an int, which breaks any later np.random.seed(...) call
# (restart the kernel if that assignment has already been executed).
np.random.seed(2020)

# Five independent draws of one random category code per training row.
# randint's upper bound is exclusive, so max_num + 1 is required for max_num itself
# to be eligible.
# original note on the first draw: 12511 (meaning unclear - TODO confirm)
rand1, rand2, rand3, rand4, rand5 = (
    np.random.randint(max_num + 1, size=len(train)) for _ in range(5)
)

In [12]:
# Five independent copies of the training frame, one per batch of negative samples.
train1, train2, train3, train4, train5 = (train.copy() for _ in range(5))

In [13]:
# Turn each copy into a batch of negatives: swap in the randomly drawn category codes
# and set the rating of every row to 0.
negative_batches = (
    (train1, rand1),
    (train2, rand2),
    (train3, rand3),
    (train4, rand4),
    (train5, rand5),
)
for negative_frame, sampled_codes in negative_batches:
    negative_frame['category_id_CODE'] = sampled_codes
    negative_frame['rating_r'] = 0

In [14]:
# Stack the original training rows on top of the five negative batches.
# `train` is rebound here, so re-running this cell without re-running the split grows it again.
# original note: 77011698 (presumably the resulting row count - TODO confirm)
negative_frames = [train1, train2, train3, train4, train5]
train = pd.concat([train, *negative_frames])
train

Unnamed: 0,sa_id_CODE,category_id_CODE,rating_r
11999337,278987,1752,4.50000
8141526,386253,523,4.50000
12344380,1880334,1900,4.50000
9598807,597462,820,4.40675
2416674,460703,92,4.88550
...,...,...,...
8302275,1123885,8932,0.00000
1948278,839798,1561,0.00000
9870659,649940,6709,0.00000
1765768,796509,2349,0.00000


In [15]:
# Shuffle the rows so positives and negatives are interleaved, then renumber the index.
# A fixed random_state makes the row order reproducible across kernel restarts.
train = train.sample(frac=1, random_state=2020).reset_index(drop=True)

In [16]:
def _as_model_input(frame):
    """Map every name in `feature_names` to that column of `frame` as a NumPy array."""
    return {name: frame[name].values for name in feature_names}


train_model_input = _as_model_input(train)
test_model_input = _as_model_input(test)

In [17]:
# Display the training input dict (feature name -> array of codes) as a quick sanity check.
train_model_input

{'sa_id_CODE': array([104567, 823509,  71099, ..., 788536, 765919, 466828]),
 'category_id_CODE': array([6879, 4786, 6275, ..., 5580, 7254, 1659])}

In [18]:
# Preview the held-out rows; the test frame is not modified by the negative-sampling cells above.
test.head()

Unnamed: 0,sa_id_CODE,category_id_CODE,rating_r
10838611,503022,1188,4.5
15979174,652914,11948,4.5
4541125,502835,166,5.0
14053222,352951,3104,4.5
8806112,643387,669,2.61375


In [19]:
# DeepFM model: both DNN hidden layers are `num_hidden` wide, with dropout on the DNN part.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    dnn_hidden_units=(num_hidden, num_hidden),
    dnn_dropout=dropout,
    task=task,
)

# Optimizer selected by the `learner` setting (case-insensitive).
# Any unrecognised name falls back to SGD.
optimizer_classes = {
    "adagrad": tf.keras.optimizers.Adagrad,
    "rmsprop": tf.keras.optimizers.RMSprop,
    "adam": tf.keras.optimizers.Adam,
}
optimizer_cls = optimizer_classes.get(learner.lower(), tf.keras.optimizers.SGD)
# `learning_rate` is the documented argument name; `lr` is only a legacy alias for it.
opt = optimizer_cls(learning_rate=lr, decay=lr_decay)

# Train on mean squared error; also report MSE and RMSE as metrics.
model.compile(opt, 'mse', metrics=['mse',tf.keras.metrics.RootMeanSquaredError()])

In [22]:
# NOTE: this overrides the `epochs` value set in the configuration cell.
epochs = 7
# Start from +inf so the first epoch always becomes the best-so-far checkpoint.
# Initialising this with the epoch count meant no checkpoint was ever written
# whenever the validation loss stayed >= that number.
best_loss = float('inf')

# Make sure the checkpoint directory exists before anything is written into it.
os.makedirs(model_path_r_no_f, exist_ok=True)

for epoch in range(epochs):

    # Training: exactly one epoch per fit() call (initial_epoch=epoch, epochs=epoch + 1),
    # so the validation loss can be checked after every epoch.
    hist = model.fit(train_model_input, #input
                          train[target_r].values, # labels
                          validation_data=(test_model_input, test[target_r].values),
                          batch_size=batch_size,
                          initial_epoch=epoch, epochs=epoch + 1, 
                          verbose=1, 
                          shuffle=True)

    # Each history list holds a single entry because only one epoch was run.
    train_loss = hist.history['loss'][0]
    train_rmse = hist.history['root_mean_squared_error'][0]
    val_loss = hist.history['val_loss'][0]
    val_rmse = hist.history['val_root_mean_squared_error'][0]


    # Keep only the weights with the lowest validation loss seen so far.
    if val_loss < best_loss :
        best_loss = val_loss
        model.save_weights(os.path.join(model_path_r_no_f, 'best.h5'))

Epoch 2/2
Epoch 3/3
Epoch 4/4
Epoch 5/5
Epoch 6/6
Epoch 7/7


In [23]:
# Save the final-epoch weights under their own file name.
# Writing them to 'best.h5' would overwrite the lowest-validation-loss checkpoint kept by
# the training loop, so the "best" weights loaded afterwards would just be the last ones.
os.makedirs(model_path_r_no_f, exist_ok=True)
model.save_weights(os.path.join(model_path_r_no_f, 'last.h5'))

In [24]:
# Restore the checkpointed weights, then export the complete model to the 'bestmodel'
# directory (the log output shows an assets/ folder being written there).
best_weights_file = os.path.join(model_path_r_no_f, 'best.h5')
export_dir = os.path.join(model_path_r_no_f, 'bestmodel')

model.load_weights(best_weights_file)
tf.keras.models.save_model(model, export_dir)



INFO:tensorflow:Assets written to: /home/lms/ms/poc/deepfm/model/model_r_no_f/bestmodel/assets


INFO:tensorflow:Assets written to: /home/lms/ms/poc/deepfm/model/model_r_no_f/bestmodel/assets
