In [1]:
###############################################################################
# BCQ to predict first 3D item set
# Q Learning to predict the other 2 item sets 
#     Q Table: state = itemSetID
#              action = itemSetID
#
# 0. split train data into training set and validation set
# ----- Train BCQ -----------------
# 1.1 prepare data for training set:
#       step: 0, 1, 2, ..., 8
#       state: userFeatures20D + [step]
#       action: itemID
# 1.2 Train BCQ
#
# ----- Train Q Learning ----------
# 2.1 prepare data for training set:
#     state: itemSetID
#     action itemSetID
# 2.2 Train Q Learning
# 
# ----- Prediction ----------------
# 3.1 transform userFeaturesTest to 20D by using PCA
# 3.2 Make prediction of the first itemSet by using BCQModel, name it itemSet1
# 3.3 use itemSet1 as state for QLModel to predict best itemSet2
# 3.4 use itemSet2 as state for QLModel to predict best itemSet3
###############################################################################

In [8]:
# 1. Split the labelled data into a training part and a validation part.
# NOTE(review): the star import is load-bearing -- later cells call names such as
# getUserFeaturesTestSet without importing them, so they presumably arrive through
# this line. Confirm before replacing it with explicit imports.
from DataPrep import *
from tqdm import tqdm
# splitTrainSet() yields six objects: features, recommended items and purchase labels,
# first for the training part and then for the validation part.
userFeaturesTrain, recItemsTrain, purchaseLabelTrain, userFeaturesVal, recItemsVal, purchaseLabelVal = splitTrainSet()
# when training, userFeaturesTrain represents the state
# NOTE(review): presumably the number of distinct items; no other visible cell reads it.
N_ITEMS = 381
# load item info (used later to look up item prices for the rewards)
from classes.Items import Items
itemInfo = Items()




Number of Multiprocessing threads: 31


  userFeaturesTrain, recItemsTrain, purchaseLabelTrain, userFeaturesVal, recItemsVal, purchaseLabelVal = splitTrainSet()
  itemInfo = Items()


In [9]:
# Dimension reduction with PCA.
# Comment this part out to use the original user features.
# NOTE(review): the prediction section also calls PCAtransformer, so that cell must be
# disabled as well if this one is commented out.
import pandas as pd

# cluster model of 20D
# NOTE(review): ClusterModel and clusterLabels are not read by any other visible cell.
from DataPrep import getClusterModel200_20D
ClusterModel, clusterLabels = getClusterModel200_20D()

from DataPrep import getPCATransformer
PCAtransformer = getPCATransformer()
# The two names below are rebound to their transformed versions, so this cell is not
# idempotent: running it twice feeds already-transformed features to the transformer.
userFeaturesTrain = pd.DataFrame(PCAtransformer.transform(userFeaturesTrain))
userFeaturesVal = pd.DataFrame(PCAtransformer.transform(userFeaturesVal))


In [3]:
# 1.1 prepare data for training set
# Every sample (one user, 9 recommended items) is unrolled into up to 9 transitions:
#   state    = user features + [step]
#   action   = itemID recommended at that step
#   reward   = item price if the item was purchased, otherwise 0
#   terminal = 1 on the last transition of the game, otherwise 0
# The game only continues past a set of three items when all three were purchased.
import numpy as np
from d3rlpy.dataset import MDPDataset

statesTrain = []
actionsTrain = []
rewardsTrain = []
terminalTrain = []  # terminal flag: 0 = game continue, 1 = game stop
for i in tqdm(range(userFeaturesTrain.shape[0])):
    # loop through samples
    state = list(userFeaturesTrain.iloc[i])
    itemList = recItemsTrain[i]
    purchase = purchaseLabelTrain[i]
    for step in range(9):
        # stop adding records once the game has stopped
        if step > 2 and purchase[0]*purchase[1]*purchase[2] == 0:
            break
        if step > 5 and purchase[3]*purchase[4]*purchase[5] == 0:
            break
        # after passing the checks we can add a new record to the train set
        statesTrain.append(state + [step])
        # action = itemID
        itemID = itemList[step]
        actionsTrain.append(itemID)
        # reward: price of the item when it was purchased
        if purchase[step] == 1:
            rewardsTrain.append(itemInfo.getItemPrice(itemID))
        else:
            rewardsTrain.append(0)
        # terminal flag: a transition is terminal exactly when no further record follows it.
        # The step-5 test previously lacked "== 0", which marked fully purchased second
        # sets as terminal and left the truly final step-5 records unflagged.
        if step == 2:
            gameStops = purchase[0]*purchase[1]*purchase[2] == 0
        elif step == 5:
            gameStops = purchase[3]*purchase[4]*purchase[5] == 0
        else:
            gameStops = step == 8
        terminalTrain.append(int(gameStops))

statesTrain = np.array(statesTrain)
actionsTrain = np.array(actionsTrain)
rewardsTrain = np.array(rewardsTrain)
terminalTrain = np.array(terminalTrain)
datasetTrain = MDPDataset(statesTrain, actionsTrain, rewardsTrain, terminalTrain, discrete_action = True)

100% 208069/208069 [00:42<00:00, 4932.98it/s]


In [4]:
# 1.1 prepare data for validation set
# Every sample (one user, 9 recommended items) is unrolled into up to 9 transitions:
#   state    = user features + [step]
#   action   = itemID recommended at that step
#   reward   = item price if the item was purchased, otherwise 0
#   terminal = 1 on the last transition of the game, otherwise 0
# The game only continues past a set of three items when all three were purchased.
statesVal = []
actionsVal = []
rewardsVal = []
terminalVal = []  # terminal flag: 0 = game continue, 1 = game stop
for i in tqdm(range(userFeaturesVal.shape[0])):
    # loop through samples
    state = list(userFeaturesVal.iloc[i])
    itemList = recItemsVal[i]
    purchase = purchaseLabelVal[i]
    for step in range(9):
        # stop adding records once the game has stopped
        if step > 2 and purchase[0]*purchase[1]*purchase[2] == 0:
            break
        if step > 5 and purchase[3]*purchase[4]*purchase[5] == 0:
            break
        # after passing the checks we can add a new record to the val set
        statesVal.append(state + [step])
        # action = itemID
        itemID = itemList[step]
        actionsVal.append(itemID)
        # reward: price of the item when it was purchased
        if purchase[step] == 1:
            rewardsVal.append(itemInfo.getItemPrice(itemID))
        else:
            rewardsVal.append(0)
        # terminal flag: a transition is terminal exactly when no further record follows it.
        # The step-5 test previously lacked "== 0", which marked fully purchased second
        # sets as terminal and left the truly final step-5 records unflagged.
        if step == 2:
            gameStops = purchase[0]*purchase[1]*purchase[2] == 0
        elif step == 5:
            gameStops = purchase[3]*purchase[4]*purchase[5] == 0
        else:
            gameStops = step == 8
        terminalVal.append(int(gameStops))

statesVal = np.array(statesVal)
actionsVal = np.array(actionsVal)
rewardsVal = np.array(rewardsVal)
terminalVal = np.array(terminalVal)
datasetVal = MDPDataset(statesVal, actionsVal, rewardsVal, terminalVal, discrete_action = True)

100% 52018/52018 [00:10<00:00, 5112.48it/s]


In [5]:
# save checkpoint: the eight arrays that make up the train and validation datasets
# pickle is imported here so the cell does not rely on another cell (or a star import)
# having brought the module into the namespace.
import pickle
with open('/tf/shared/checkpoints/data-3D-20DFeatures2_v2.pkl', 'wb') as file:
    pickle.dump((statesTrain, actionsTrain, rewardsTrain, terminalTrain, statesVal, actionsVal, rewardsVal, terminalVal),
                file, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
# reload checkpoint: restore the eight arrays and rebuild both datasets from them
# NOTE: pickle.load can execute arbitrary code -- only load checkpoint files you wrote yourself.
import pickle
from d3rlpy.dataset import MDPDataset

with open('/tf/shared/checkpoints/data-3D-20DFeatures2_v2.pkl', 'rb') as file:
    checkpointArrays = pickle.load(file)

# unpack in the same order the arrays were saved: train first, then validation
(statesTrain, actionsTrain, rewardsTrain, terminalTrain,
 statesVal, actionsVal, rewardsVal, terminalVal) = checkpointArrays

datasetTrain = MDPDataset(
    statesTrain, actionsTrain, rewardsTrain, terminalTrain,
    discrete_action=True,
)
datasetVal = MDPDataset(
    statesVal, actionsVal, rewardsVal, terminalVal,
    discrete_action=True,
)

In [6]:
# 1.2 Train BCQ
# Fit a discrete BCQ model on the training transitions (GPU requested via use_gpu).
from d3rlpy.algos import DiscreteBCQ
BCQModel = DiscreteBCQ(use_gpu = True)
BCQModel.build_with_dataset(datasetTrain)
# fit() is the last expression of the cell, so its return value (per-epoch metrics
# such as the loss) is displayed as the cell output.
BCQModel.fit(datasetTrain, 
    n_epochs = 25, verbose = False, show_progress = False)

2021-08-17 21:20.16 [debug    ] RoundIterator is selected.
2021-08-17 21:20.16 [info     ] Directory is created at d3rlpy_logs/DiscreteBCQ_20210817212016
2021-08-17 21:27.20 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210817212016/model_35276.pt
2021-08-17 21:34.26 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210817212016/model_70552.pt
2021-08-17 21:41.33 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210817212016/model_105828.pt
2021-08-17 21:48.41 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210817212016/model_141104.pt
2021-08-17 21:55.43 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210817212016/model_176380.pt
2021-08-17 22:02.50 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210817212016/model_211656.pt
2021-08-17 22:09.55 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210817212016/model_246932.pt
2021-08-17 22:17.01 [info    

[(1,
  {'time_sample_batch': 0.00023121772432613838,
   'time_algorithm_update': 0.011458939571393592,
   'loss': 1101.7750068670607,
   'time_step': 0.011967472653497657}),
 (2,
  {'time_sample_batch': 0.00022424331177982806,
   'time_algorithm_update': 0.011576907091879333,
   'loss': 1377.555936580424,
   'time_step': 0.012082560429538996}),
 (3,
  {'time_sample_batch': 0.00025576197623882125,
   'time_algorithm_update': 0.01153325472901043,
   'loss': 1402.503883558196,
   'time_step': 0.0120786996517123}),
 (4,
  {'time_sample_batch': 0.00024646318530614634,
   'time_algorithm_update': 0.011596293747594046,
   'loss': 1328.6241755876456,
   'time_step': 0.012126999831899844}),
 (5,
  {'time_sample_batch': 0.00023319069957203274,
   'time_algorithm_update': 0.011424832082359762,
   'loss': 1229.9802922479068,
   'time_step': 0.011935575754007983}),
 (6,
  {'time_sample_batch': 0.00022762471119121218,
   'time_algorithm_update': 0.011577660006884207,
   'loss': 1177.5188194324214,
 

In [19]:
# ----------------------------- Train Q Learning --------------------------------------------


In [15]:
# 2.1 Prepare train data set
#### state: index of the item set within the game (0, 1 or 2)
#### action: the itemSetID recommended
#### reward: sum over the three items of (item is purchased) * price
#### nextState: state + 1, or -1 after the last item set
#### to feed a set of (state, action, reward, nextState) to a Q table
from tqdm import tqdm
from classes.ItemSet import ItemSet3
itemSet3 = ItemSet3()
N_ACTIONS = itemSet3.getNSets()
N_STATES = 3

trainSetQL = []
for i in tqdm(range(userFeaturesTrain.shape[0])):
    # loop through samples
    recItems = recItemsTrain[i]
    purLabel = purchaseLabelTrain[i]
    for j in [0, 3, 6]: # process each Set3 at once
        # don't train on item sets that were never shown because the game stopped
        if j > 2 and purLabel[0]*purLabel[1]*purLabel[2] == 0:
            break
        if j > 5 and purLabel[3]*purLabel[4]*purLabel[5] == 0:
            break
        # state: integer division keeps the state an int (0, 1, 2); "/" produced floats,
        # which did not match the integer state used when querying the model.
        state = j // 3
        # action: itemSetID
        itemSet = [recItems[j], recItems[j+1], recItems[j+2]]
        action = itemSet3.getSetID(itemSet)
        # reward: total price of the items in this set that were purchased
        labels = [purLabel[j], purLabel[j+1], purLabel[j+2]]
        reward = sum(itemInfo.getItemPrice(item) * label for item, label in zip(itemSet, labels))
        # next state: -1 marks the end of the game
        nextState = -1 if j == 6 else state + 1
        # append to train data set
        trainSetQL.append((state, action, reward, nextState))



100% 208069/208069 [00:21<00:00, 9736.83it/s]


In [None]:
# 2.2 Train QL model
# initialize
from classes import QLearning2
from importlib import reload  
# reload the module so that edits made to QLearning2 since the first import are picked up
QLearning2 = reload(QLearning2)

print('N_ACTIONS: ' + str(N_ACTIONS) + ' N_STATES: ' + str(N_STATES))
QLModel = QLearning2.QLearning(n_states = N_STATES, n_actions = N_ACTIONS)
# train in parallel (alternative, currently disabled)
# QLModel.trainParallel(trainSetQL)
# train on the (state, action, reward, nextState) tuples
QLModel.train(trainSetQL)


 20% 95869/473522 [28:20<1:57:31, 53.56it/s]

In [None]:
# save checkpoint: persist both trained models together as one pickled tuple
import pickle

trainedModels = (BCQModel, QLModel)
with open('/tf/shared/checkpoints/model-BCQ-QL_v2.pkl', 'wb') as file:
    pickle.dump(trainedModels, file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# reload checkpoint: restore both trained models from the pickled tuple
# NOTE: pickle.load can execute arbitrary code -- only load checkpoint files you wrote yourself.
import pickle

with open('/tf/shared/checkpoints/model-BCQ-QL_v2.pkl', 'rb') as file:
    restoredModels = pickle.load(file)
BCQModel, QLModel = restoredModels

In [10]:
######################## PREDICTION ##########################################
# Load the test-set user IDs together with their user features.
# NOTE(review): getUserFeaturesTestSet is never imported explicitly; presumably it is
# supplied by `from DataPrep import *` -- confirm.
userIDs, userFeaturesTest = getUserFeaturesTestSet()


In [11]:
# 3.1 transform userFeaturesTest to 20D by using PCA
# Requires PCAtransformer and pd from the PCA cell. The name is rebound to the transformed
# frame, so re-running this cell would transform the features a second time.
userFeaturesTest = pd.DataFrame(PCAtransformer.transform(userFeaturesTest))

In [None]:
# predict first 6 items with BCQ
from classes.d3rlpy_wrapper import predictBestK, finalizeItemSetsTestSet
statesTest = []  # one row per (user, step): userFeaturesTest row with the step (0 to 5) appended
for i in tqdm(range(userFeaturesTest.shape[0])):
# loop through samples
    state = list(userFeaturesTest.iloc[i])
    for step in range(6):
        # append step to state
        statesTest.append(state + [step])
statesTest = np.array(statesTest)
bestItems_6xSamples = predictBestK(BCQModel, statesTest, 6)
# NOTE(review): a later cell calls finalizeItemSetsTestSet with three arguments
# (states, predictions, 6) -- confirm which signature is current.
bestFirst6tems = finalizeItemSetsTestSet(bestItems_6xSamples)


# QLModel.initPredCache()
# recItems_test = []
# for i in tqdm(range(userFeaturesTest.shape[0])):
# # loop thru samples
#     recItems = []  # recommended list for this sample
#     # 3.2 Make prediction of the first itemSet by using BCQModel, name it itemSet1
#     state = userFeaturesTest.iloc[i]  # first step of the game
#     itemSetID1 = BCQModel.predict([np.array(state)])[0]
#     recItems.extend(list(itemSet3.getItemSet(itemSetID1)))
#     # 3.3 use itemSet1 as state for QLModel to predict best itemSet2
#     # now stateID = itemSetID1
#     candidateSetIDs = QLModel.predictBestK(itemSetID1, 20)
#     for setID in candidateSetIDs:
#         items = itemSet3.getItemSet(setID)
#         if (items[0] not in recItems) and (items[1] not in recItems) and (items[2] not in recItems):
#             # we have found a suitable solution for step 2
#             itemSetID2 = setID
#             recItems.extend(list(items))
#             break
#     # 3.4 use itemSet2 as state for QLModel to predict best itemSet3
#     # now stateID = itemSetID2
#     candidateSetIDs = QLModel.predictBestK(itemSetID2, 20)
#     for setID in candidateSetIDs:
#         items = itemSet3.getItemSet(setID)
#         if (items[0] not in recItems) and (items[1] not in recItems) and (items[2] not in recItems):
#             # we have found a suitable solution for step 2
#             itemSetID3 = setID
#             recItems.extend(list(items))
#             break
#     recItems_test.append(recItems)


100% 206096/206096 [00:22<00:00, 9229.85it/s] 
100% 381/381 [57:54<00:00,  9.12s/it]
100% 1236576/1236576 [05:27<00:00, 3775.36it/s]


In [13]:
# Turn the flat BCQ predictions into one 6-item list per test user.
# The predictions are stored as bestItems_6xSamples; the name bestItems_3xSamples used here
# before is never defined in this notebook and only worked with leftover kernel state.
bestFirst6tems = finalizeItemSetsTestSet(statesTest, bestItems_6xSamples, 6)
# one recommendation list per test user
assert len(bestFirst6tems)==len(userIDs)


100% 1236576/1236576 [00:11<00:00, 104933.51it/s]


In [None]:
# predict last 3 items with QL
# Ask the Q table for the 100 best item sets in state 2 (the third item set of the game),
# then give every user the best-ranked set that shares no item with their first 6 items.
bestItemSetIDs = QLModel.predictBestK(2, 100)
# the candidate sets are the same for every user, so look them up once
candidateSets = [list(itemSet3.getItemSet(setID)) for setID in bestItemSetIDs]
finalItems = []
for i in tqdm(range(len(bestFirst6tems))):
    first6 = bestFirst6tems[i]
    for items in candidateSets:
        if (items[0] not in first6) and (items[1] not in first6) and (items[2] not in first6):
            finalItems.append(first6 + items)
            break
    else:
        # Skipping the sample silently would leave finalItems shorter than, and
        # misaligned with, the list of user IDs it is written out against.
        raise RuntimeError('no candidate item set is disjoint from the first 6 items of sample ' + str(i))



In [14]:
# sanity check: show the 6-item recommendation lists of the first 50 test users
print(bestFirst6tems[:50])

[[220, 196, 221, 95, 97, 48], [220, 196, 221, 95, 97, 42], [220, 196, 221, 95, 97, 48], [220, 196, 221, 98, 97, 29], [220, 196, 218, 240, 238, 29], [220, 196, 221, 218, 97, 29], [220, 196, 218, 221, 98, 29], [220, 196, 221, 95, 97, 48], [220, 196, 221, 95, 97, 42], [220, 196, 221, 218, 97, 29], [220, 196, 218, 221, 98, 29], [220, 196, 218, 240, 238, 29], [220, 196, 221, 95, 97, 42], [220, 196, 221, 95, 97, 48], [220, 196, 221, 95, 97, 42], [220, 196, 221, 95, 97, 42], [220, 196, 221, 95, 97, 42], [220, 196, 218, 240, 98, 29], [220, 196, 221, 95, 97, 48], [220, 196, 221, 218, 97, 29], [220, 196, 218, 240, 238, 29], [220, 196, 218, 240, 238, 29], [220, 196, 218, 221, 98, 29], [220, 196, 218, 240, 238, 29], [220, 196, 218, 221, 98, 29], [220, 196, 218, 240, 221, 29], [220, 196, 221, 95, 97, 48], [220, 196, 221, 95, 97, 42], [220, 196, 221, 95, 97, 48], [220, 196, 221, 95, 97, 42], [220, 196, 221, 95, 97, 42], [220, 196, 221, 218, 97, 29], [220, 196, 221, 95, 97, 42], [220, 196, 221, 95, 9

In [None]:
# write recommended items to output csv file
# finalItems holds, per test user, the 6 BCQ items followed by the 3 Q-learning items.
# recItems_test, used here before, is only assigned inside commented-out code.
from classes.output import writeOutput
writeOutput(finalItems, 'BCQ-QLearning_v1.csv', userIDs)