In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from tqdm import tqdm
from deepctr.models import DeepFM
from deepctr.layers import custom_objects
from tensorflow.data import Dataset as tfds

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
#os.environ["CUDA_VISIBLE_DEVICES"] = '3'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Gpu growth setting
import tensorflow as tf
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)


# tensorflow & keras version check
print('tensorflow version : ' , tf.__version__)
print('keras version : ' , tf.keras.__version__)

# tensorflow gpu available check 
print('GPU available ? : ', tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))

tensorflow version :  2.4.4
keras version :  2.4.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU available ? :  True


In [14]:
from os.path import isfile, join
basedir = '/home/lms/ms/poc'
rawdata_path = join(basedir, 'dataset')
procdata_path = join(basedir, 'deepfm', 'dataset', 'preprocess', 'rating')
result_path_5_no_f = join(basedir, 'deepfm', 'result', 'result_5_no_f') # path 설정 필요
result_path_r_no_f = join(basedir, 'deepfm', 'result', 'result_r_no_f') # path 설정 필요
result_path_5_f = join(basedir, 'deepfm', 'result', 'result_5_f') # path 설정 필요
result_path_r_f = join(basedir, 'deepfm', 'result', 'result_r_f') # path 설정 필요

model_path_5_no_f= join(basedir, 'deepfm', 'model', 'model_5_no_f')
model_path_r_no_f= join(basedir, 'deepfm', 'model', 'model_r_no_f')
model_path_5_f= join(basedir, 'deepfm', 'model', 'model_5_f')
model_path_r_f= join(basedir, 'deepfm', 'model', 'model_r_f')

rawdata_path = rawdata_path
procdata_path = procdata_path
model_path = model_path
batch_size = 50
topk = 25

In [3]:
user_file='user.pkl'
item_file='item.pkl'

users = pd.read_pickle(os.path.join(procdata_path, user_file))
items = pd.read_pickle(os.path.join(procdata_path, item_file))

In [4]:
item_columns = ['category_id','gubun','rep_album_id','category_name']
category_data = pd.read_pickle(os.path.join(rawdata_path, 'item.pkl'))
category_data = category_data.astype({'category_id':'str', 'category_name':'str', 'album_id':'str'})
category_data = category_data.drop_duplicates(subset=item_columns)

n_users = len(users)
n_items = len(items)

category_data = items.merge(category_data, on='category_id')

In [5]:
unique_user = users[users.columns[1:]]
unique_item = items[items.columns[1:]]

In [6]:
model = tf.keras.models.load_model(os.path.join(model_path_r_no_f, 'bestmodel'), 
                                   custom_objects=custom_objects)

In [8]:
def save_reclist(pred_mat, start_idx, end_idx, out_file):

    values, indices = tf.math.top_k(pred_mat, topk)

    user_codes = np.repeat(np.arange(start_idx, end_idx), topk)
    item_codes = tf.reshape(indices, [-1])

    gubun = category_data.iloc[item_codes]['gubun'].values
    sa_id = users.iloc[user_codes]['sa_id_CODE'].values
    category_id = category_data.iloc[item_codes]['category_id_CODE'].values
    category_name = category_data.iloc[item_codes]['category_name'].values
    score = tf.reshape(values, [-1])
    rank = np.tile(np.arange(1, topk+1), end_idx-start_idx)

    np.savetxt(out_file, 
               np.vstack((gubun, sa_id, category_id, category_name, score, rank)).T, 
               delimiter='\001', 
               fmt=['%s','%s','%s','%s','%1.5f','%i'],
               encoding='utf-8')

In [9]:
def batch_input_data():
    start = 0
    end = start + batch_size
    finished = False

    while not finished:
        yielded_users = np.repeat(unique_user.iloc[start:end]['sa_id_CODE'].values, n_items)
        yielded_albums = np.tile(unique_item['category_id_CODE'].values, batch_size)

        yield yielded_users, yielded_albums

        start = end
        end = start + batch_size

        if end > n_users:
            finished = True

In [10]:
def gen_data_loader():

    return tfds.from_generator(batch_input_data,
                              output_types=(tf.int32,
                                            tf.int32)).prefetch(32)

In [12]:
def file_open(i):
    return open(os.path.join(result_path_r_no_f, 'reclist_' + str(i) + '.csv'), 'wb')


def file_close(file):
    file.close()

In [13]:
with tqdm(total=n_users//batch_size) as pbar:
    i = 0

    f = file_open(i)
    for said, category in gen_data_loader():
        pred = model.predict_on_batch({'sa_id_CODE':said,
                                           'category_id_CODE':category}).reshape(-1, n_items)

        if f.tell() > 1024**3:  # split file by size
            break

        save_reclist(pred, said[0], said[-1]+1, f)

        pbar.update(1)

    file_close(f)

 43%|████▎     | 17099/39469 [16:33<21:40, 17.20it/s]
