In [1]:
####################################################################
# Implementation of BCQ.
#     State: (User Portraits, whether the item was clicked)
#     Action: itemID (each sample is split into 9 steps)
#     Rewards: item_purchase * price
# 0. split train data into training set and validation set
# 1. prepare data for DQN from training set
# 2. prepare data for DQN from validation set
# 3. train DQN
# 4. make suggestions for validation set
# 5. Calculate Metrics 1 for our suggestions
# 6. Generate suggestions for provided testset for true scoring
#####################################################################

Number of Multiprocessing threads: 31


In [1]:
from DataPrep import splitTrainSet2
import pandas as pd

userPortraitsTrain, clickTrain, recItemsTrain, purchaseLabelTrain, userPortraitsVal, clickVal, recItemsVal, purchaseLabelVal = splitTrainSet2()

N_ITEMS = 381
# load item price
itemInfo = pd.read_csv('/tf/shared/item_info.csv', ' ')
itemInfo = itemInfo.sort_values(by = 'item_id')
itemPrice = itemInfo.price

Number of Multiprocessing threads: 31


In [9]:
# 1. prepare data for BCQ from training set
import numpy as np
from d3rlpy.dataset import MDPDataset
from tqdm import tqdm

statesTrain = []
actionsTrain = []
rewardsTrain = []
terminalTrain = []  # terminal flag: 0 = game continue, 1 = game stop

for i in tqdm(range(len(userPortraitsTrain))):
    itemList = recItemsTrain[i]       # list of 9
    purchase = purchaseLabelTrain[i]  # list of 9
    userPortraits = userPortraitsTrain[i]  # this is list of 10
    clickedItems = clickTrain[i]      # list of variable length
    for step in range(9):
        # action: itemID
        itemID = itemList[step]
        actionsTrain.append(itemID)
        # state: user portrait + whether item was clicked + step one-hot-encoded
        if itemID in clickedItems:
            click = 1
        else:
            click = 0
        step_encoded = [0]*step + [1] + [0]*(8-step)
        state = list(userPortraits) + step_encoded + [click]
        statesTrain.append(state)
        # calculate reward
        if purchase[step]==1:
            rewardsTrain.append(itemPrice[itemID-1]) # itemID-1 becuase itemPrice is a 0-based array
        else:
            rewardsTrain.append(0)
        # terminal flag: determine by looking at previous purchase flags
        if step<2:
            terminalTrain.append(0) # game continue
        elif step==2 and purchase[0]*purchase[1]*purchase[2]==0:
            terminalTrain.append(1) # game stop
        elif step<5:
            terminalTrain.append(0) # game continue
        elif step==5 and purchase[3]*purchase[4]*purchase[5]:
            terminalTrain.append(1) # game stop
        elif step<8:
            terminalTrain.append(0) # game continue
        else:
            terminalTrain.append(1) # game stop

statesTrain = np.array(statesTrain)
actionsTrain = np.array(actionsTrain)
rewardsTrain = np.array(rewardsTrain)
terminalTrain = np.array(terminalTrain)
datasetTrain = MDPDataset(statesTrain, actionsTrain, rewardsTrain, terminalTrain, discrete_action = True)

100%|██████████| 208069/208069 [00:20<00:00, 9989.62it/s] 


In [10]:
# 1. prepare data for BCQ from validation set
import numpy as np
from d3rlpy.dataset import MDPDataset
from tqdm import tqdm

statesVal = []
actionsVal = []
rewardsVal = []
terminalVal = []  # terminal flag: 0 = game continue, 1 = game stop

for i in tqdm(range(len(userPortraitsVal))):
    itemList = recItemsVal[i]       # list of 9
    purchase = purchaseLabelVal[i]  # list of 9
    userPortraits = userPortraitsVal[i]  # this is list of 10
    clickedItems = clickVal[i]      # list of variable length
    for step in range(9):
        # action: itemID
        itemID = itemList[step]
        actionsVal.append(itemID)
        # state: user portrait + whether item was clicked + step one-hot-encoded
        if itemID in clickedItems:
            click = 1
        else:
            click = 0
        step_encoded = [0]*step + [1] + [0]*(8-step)
        state = list(userPortraits) + step_encoded + [click] 
        statesVal.append(state)
        # calculate reward
        if purchase[step]==1:
            rewardsVal.append(itemPrice[itemID-1]) # itemID-1 becuase itemPrice is a 0-based array
        else:
            rewardsVal.append(0)
        # terminal flag: determine by looking at previous purchase flags
        if step<2:
            terminalVal.append(0) # game continue
        elif step==2 and purchase[0]*purchase[1]*purchase[2]==0:
            terminalVal.append(1) # game stop
        elif step<5:
            terminalVal.append(0) # game continue
        elif step==5 and purchase[3]*purchase[4]*purchase[5]:
            terminalVal.append(1) # game stop
        elif step<8:
            terminalVal.append(0) # game continue
        else:
            terminalVal.append(1) # game stop

statesVal = np.array(statesVal)
actionsVal = np.array(actionsVal)
rewardsVal = np.array(rewardsVal)
terminalVal = np.array(terminalVal)
datasetVal = MDPDataset(statesVal, actionsVal, rewardsVal, terminalVal, discrete_action = True)

100%|██████████| 52018/52018 [00:04<00:00, 11358.77it/s]


In [11]:
# prepare BCQ Model
from d3rlpy.algos import DiscreteBCQ
BCQModel = DiscreteBCQ(use_gpu = True)
BCQModel.fit(datasetTrain, n_epochs=25, verbose=False, show_progress=False)


2021-08-21 23:31.18 [debug    ] RoundIterator is selected.
2021-08-21 23:31.18 [info     ] Directory is created at d3rlpy_logs/DiscreteBCQ_20210821233118
2021-08-21 23:31.18 [debug    ] Building models...
2021-08-21 23:31.18 [debug    ] Models have been built.
2021-08-21 23:35.34 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210821233118/model_46532.pt
2021-08-21 23:39.53 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210821233118/model_93064.pt
2021-08-21 23:44.04 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210821233118/model_139596.pt
2021-08-21 23:48.22 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210821233118/model_186128.pt
2021-08-21 23:52.42 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210821233118/model_232660.pt
2021-08-21 23:56.36 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210821233118/model_279192.pt
2021-08-22 00:00.55 [info     ] Model p

[(1,
  {'time_sample_batch': 0.00010749809639293822,
   'time_algorithm_update': 0.004675005991317682,
   'loss': 9160.539016765606,
   'time_step': 0.005499218389049889}),
 (2,
  {'time_sample_batch': 0.00010812649423131465,
   'time_algorithm_update': 0.00470144230881039,
   'loss': 1790.8441349574873,
   'time_step': 0.005538534156886543}),
 (3,
  {'time_sample_batch': 0.00010710878322184532,
   'time_algorithm_update': 0.004574579040950689,
   'loss': 2597.5195110422374,
   'time_step': 0.005398291688091233}),
 (4,
  {'time_sample_batch': 0.00010851423440954707,
   'time_algorithm_update': 0.0046932787627947566,
   'loss': 3438.688518540238,
   'time_step': 0.005525618179291448}),
 (5,
  {'time_sample_batch': 0.00010919761526546709,
   'time_algorithm_update': 0.00473179443063099,
   'loss': 4152.637403018741,
   'time_step': 0.005565003686560755}),
 (6,
  {'time_sample_batch': 0.00010344425297496673,
   'time_algorithm_update': 0.004234154467466701,
   'loss': 4920.146909250886,
 

In [15]:
# ---------------------------- Make prediction for test set ---------------------------
###### prepare test set to make prediction
###### expand each row of test samples to 9 rows, each correspond to a step
from DataPrep import getUserFeaturesTestSet, getClickedItems, getUserPortraits
userIDs, _ = getUserFeaturesTestSet()
_, clickedItemsTest = getClickedItems()
_, userPortraitsTest = getUserPortraits()

statesTest = []  # user portrait + step one-hot-encoded + whether item was clicked
for i in tqdm(range(userPortraitsTest.shape[0])):
# loop through samples
    userPortraits = list(userPortraitsTest[i])  # this is list of 10
    for step in range(9):
        step_encoded = [0]*step + [1] + [0]*(8-step)
        statesTest.append(userPortraits + step_encoded)


100%|██████████| 206096/206096 [00:03<00:00, 58590.35it/s]


In [22]:
# predict value for each action
# result: 2D matrix, D1: samples, D2: N_ITEMS
values_allActions = []
for itemID in tqdm(range(1, N_ITEMS+1)):
    statesTest_wClick = []
    # prepare states with click column
    for i in range(len(statesTest)):
        clickedItems = clickedItemsTest[i//9]
        if itemID in clickedItems:
            click = [1]
        else:
            click = [0]
        statesTest_wClick.append(statesTest[i] + click)
    # predict value for this itemID
    statesTest_wClick = np.array(statesTest_wClick)
    actions = np.array([itemID]*statesTest_wClick.shape[0])
    values_allStates = BCQModel.predict_value(statesTest_wClick, actions)
    values_allActions.append(values_allStates)

100%|██████████| 381/381 [1:27:58<00:00, 13.85s/it]


In [30]:
# finalize best itemIDs
items_out = []
for i in tqdm(range(len(statesTest))):
# loop thru expanded samples *9
    if i%9==0:
        items = []
        for step in range(9):
            rowID = i + step # row index on statesTest
            values = [ v[rowID] for v in values_allActions ] # length N_ITEMS, value for each action in this step
            negValues = [-v for v in values]
            order = np.argsort(negValues)
            for j in order:
                itemID = j + 1
                if itemID not in items:
                    items.append(itemID)
                    break
        # done predicting for all 9 steps
        items_out.append(items)


100%|██████████| 1854864/1854864 [02:36<00:00, 11871.62it/s]


In [31]:
# calculate % of recommended items in the clicked items
count = 0
for items in tqdm(items_out):

[[124, 68, 31, 140, 64, 23, 120, 118, 139], [31, 68, 120, 9, 21, 23, 32, 119, 20], [124, 68, 31, 140, 64, 23, 120, 118, 139], [31, 139, 68, 120, 332, 23, 21, 118, 119], [31, 9, 23, 68, 21, 32, 20, 119, 120], [31, 120, 68, 23, 9, 21, 32, 119, 124], [118, 31, 23, 2, 30, 124, 32, 3, 39], [68, 31, 9, 120, 32, 23, 21, 20, 30], [332, 139, 31, 68, 9, 32, 23, 4, 120], [31, 68, 9, 23, 120, 139, 64, 124, 21], [68, 31, 9, 120, 23, 21, 64, 119, 32], [31, 68, 23, 21, 120, 30, 9, 32, 8], [68, 9, 31, 139, 120, 21, 23, 332, 64], [9, 68, 31, 21, 120, 32, 23, 30, 20], [31, 68, 120, 9, 21, 23, 32, 119, 20], [68, 31, 120, 9, 23, 64, 119, 32, 21], [31, 68, 9, 32, 23, 20, 21, 120, 30], [23, 31, 68, 9, 124, 21, 139, 64, 120], [31, 9, 21, 23, 68, 120, 32, 20, 30], [31, 68, 120, 9, 23, 64, 21, 139, 30], [68, 31, 9, 23, 32, 20, 21, 120, 8], [68, 23, 9, 31, 21, 118, 4, 119, 57], [23, 21, 31, 9, 32, 120, 68, 119, 30], [9, 31, 68, 23, 32, 21, 20, 30, 8], [31, 68, 9, 21, 23, 32, 20, 139, 119], [31, 23, 9, 21, 119, 

In [None]:
# write recommended items to output csv file
from classes.output import writeOutput
writeOutput(items_out, 'BCQ-QLearning_v3.csv', userIDs)