In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from tqdm import tqdm
from deepctr.models import DeepFM
from deepctr.layers import custom_objects
from tensorflow.data import Dataset as tfds

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
#os.environ["CUDA_VISIBLE_DEVICES"] = '3'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Gpu growth setting
import tensorflow as tf
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)


# tensorflow & keras version check
print('tensorflow version : ' , tf.__version__)
print('keras version : ' , tf.keras.__version__)

# tensorflow gpu available check 
print('GPU available ? : ', tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))

tensorflow version :  2.4.4
keras version :  2.4.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU available ? :  True


In [2]:
from os.path import isfile, join
basedir = '/home/lms/ms/poc'
rawdata_path = join(basedir, 'dataset')
procdata_path = join(basedir, 'deepfm', 'dataset', 'preprocess', 'rating')
procdata_path_5 = join(basedir, 'deepfm', 'dataset', 'preprocess', '5_1')
model_path = join(basedir, 'deepfm', 'model', 'model_base')
result_path = join(basedir, 'deepfm', 'result', 'base')

rawdata_path = rawdata_path
procdata_path = procdata_path
model_path = model_path
batch_size = 50
topk = 25

In [3]:
user_file='user.pkl'
item_file='item.pkl'

users = pd.read_pickle(os.path.join(procdata_path, user_file))
items = pd.read_pickle(os.path.join(procdata_path, item_file))

In [4]:

item_columns = ['category_id','gubun','rep_album_id','category_name']
category_data = pd.read_pickle(os.path.join(rawdata_path, 'item.pkl'))
category_data = category_data.astype({'category_id':'str', 'category_name':'str', 'album_id':'str'})
category_data = category_data.drop_duplicates(subset=item_columns)

n_users = len(users)
n_items = len(items)

category_data = items.merge(category_data, on='category_id')


In [5]:
#unique_user = users[users.columns[1]]
#unique_item = items[items.columns[1]]

In [6]:
unique_user = users[users.columns[1:]]
unique_item = items[items.columns[1:]]

In [7]:
category_data.head(2)

Unnamed: 0,category_id,category_id_CODE,price_CODE,pr_info_CODE,release_date_x,run_time_ss_x,gubun,depth0_id,dpeth0_nm,genre,...,close_yn,series_yn,series_no,rank_no,run_time_ss_y,pr_info,price,release_date_y,point_watcha,summary
0,B3054,0,0,0,0.990018,0.17365,VARIETY,3,예능시사교양,방송,...,Y,Y,18회,18.0,4707,4,2000,20210708.0,0.0,아웅산 폭탄 테러. 처참하고 가슴 아픈 그날의 현장. 유례없는 역사상 최악의 테러 ...
1,B503V,1,0,0,0.990313,0.131286,VARIETY,3,예능시사교양,방송,...,N,Y,11회,11.0,3546,4,2000,20210929.0,0.0,만삭 부인 살해 사건과 치과의사 모녀 살해 사건. 비슷한 두 사건이지만 판결은 정반...


In [8]:
users.head()

Unnamed: 0,sa_id,sa_id_CODE,seg_1,seg_2,seg_3,seg_4
0,205090929633,0,1.0,0.0,0.0,0.0
1,206059946955,1,0.97,0.0,0.03,0.0
2,207044266005,2,0.97,0.0,0.03,0.0
3,207044353341,3,0.98,0.0,0.01,0.01
4,208036007235,4,0.86,0.0,0.01,0.13


In [9]:
model = tf.keras.models.load_model(os.path.join(model_path, 'bestmodel'), 
                                   custom_objects=custom_objects)

In [10]:
def save_reclist(pred_mat, start_idx, end_idx, out_file):

    values, indices = tf.math.top_k(pred_mat, topk)

    user_codes = np.repeat(np.arange(start_idx, end_idx), topk)
    item_codes = tf.reshape(indices, [-1])

    gubun = category_data.iloc[item_codes]['gubun'].values
    sa_id = users.iloc[user_codes]['sa_id_CODE'].values
    category_id = category_data.iloc[item_codes]['category_id_CODE'].values
    category_name = category_data.iloc[item_codes]['category_name'].values
    score = tf.reshape(values, [-1])
    rank = np.tile(np.arange(1, topk+1), end_idx-start_idx)

    np.savetxt(out_file, 
               np.vstack((gubun, sa_id, category_id, category_name, score, rank)).T, 
               delimiter='\001', 
               fmt=['%s','%s','%s','%s','%1.5f','%i'],
               encoding='utf-8')

In [11]:
def batch_input_data():
    start = 0
    end = start + batch_size
    finished = False

    while not finished:
        yielded_users = np.repeat(unique_user.iloc[start:end]['sa_id_CODE'].values, n_items)
        yielded_albums = np.tile(unique_item['category_id_CODE'].values, batch_size)

        yield yielded_users, yielded_albums

        start = end
        end = start + batch_size

        if end > n_users:
            finished = True

In [12]:
def gen_data_loader():

    return tfds.from_generator(batch_input_data,
                              output_types=(tf.int32,
                                            tf.int32)).prefetch(32)

In [13]:
def file_open(i):
    return open(os.path.join(result_path, 'reclist_' + str(i) + '.csv'), 'wb')


def file_close(file):
    file.close()

In [14]:
with tqdm(total=n_users//batch_size) as pbar:
    i = 0

    f = file_open(i)
    for said, category in gen_data_loader():
        pred = model.predict_on_batch({'sa_id_CODE':said,
                                           'category_id_CODE':category}).reshape(-1, n_items)

        if f.tell() > 1024**3:  # split file by size
            break

        save_reclist(pred, said[0], said[-1]+1, f)

        pbar.update(1)

    file_close(f)

 42%|████▏     | 16632/39469 [16:09<22:11, 17.16it/s]
