In [1]:
###############################################################################
# BCQ to predict first 3D item set
# Q Learning to predict the other 2 item sets 
#     Q Table: state = itemSetID
#              action = itemSetID
#
# 0. split train data into training set and validation set
# ----- Train BCQ -----------------
# 1.1 prepare data for training set:
#       state: 0, 1, 2
#       action: itemSetID
# 1.2 Train BCQ
#
# ----- Train Q Learning ----------
# 2.1 prepare data for training set:
#     state: itemSetID
#     action itemSetID
# 2.2 Train Q Learning
# 
# ----- Prediction ----------------
# 3.1 transform userFeaturesTest to 20D by using PCA
# 3.2 Make prediction of the first itemSet by using BCQModel, name it itemSet1
# 3.3 use itemSet1 as state for QLModel to predict best itemSet2
# 3.4 use itemSet2 as state for QLModel to predict best itemSet3
###############################################################################

In [14]:
# 1. Split Train 
from DataPrep import *
userFeaturesTrain, recItemsTrain, purchaseLabelTrain, userFeaturesVal, recItemsVal, purchaseLabelVal = splitTrainSet()
userFeaturesTrain = pd.concat((userFeaturesTrain, userFeaturesVal), ignore_index=True)
recItemsTrain = np.vstack((recItemsTrain, recItemsVal))
purchaseLabelTrain = np.vstack((purchaseLabelTrain, purchaseLabelVal))

# load item info
from classes.Items import Items
itemInfo = Items()
# translator from (ID1, ID2, ID3) to setID
from classes.ItemSet import ItemSet3
itemSet3 = ItemSet3()

  userFeaturesTrain, recItemsTrain, purchaseLabelTrain, userFeaturesVal, recItemsVal, purchaseLabelVal = splitTrainSet()
  itemInfo = Items()


In [15]:
# dimension reduction with PCA
# comment this part out to use original user features 
import pandas as pd

# cluster model of 20D
from DataPrep import getClusterModel200_20D
ClusterModel, clusterLabels = getClusterModel200_20D()

from DataPrep import getPCATransformer
PCAtransformer = getPCATransformer()
userFeaturesTrain = pd.DataFrame(PCAtransformer.transform(userFeaturesTrain))
userFeaturesVal = pd.DataFrame(PCAtransformer.transform(userFeaturesVal))


In [16]:
# 1.1 prepare data for training set
import numpy as np
from d3rlpy.dataset import MDPDataset

statesTrain = []
actionsTrain = []
rewardsTrain = []
terminalTrain = []  # terminal flag: 0 = game continue, 1 = game stop

for i in tqdm(range(userFeaturesTrain.shape[0])):
# loop through samples
    state = list(userFeaturesTrain.iloc[i])
    itemList = recItemsTrain[i]
    purchase = purchaseLabelTrain[i]
    for step in range(3):
        # check if game is still running
        if step>0 and purchase[0]*purchase[1]*purchase[2]==0:
            # stop adding to data set if game stopped
            break
        if step>1 and purchase[3]*purchase[4]*purchase[5]==0:
            # stop adding to data set if game stopped
            break
        # after passing check, we can add new record to train set
        # append step to state
        if step==0:
            step_OneHot = [1, 0, 0]
        elif step==1:
            step_OneHot = [0, 1, 0]
        else:
            step_OneHot = [0, 0, 1]
        statesTrain.append(state + step_OneHot)
        # action = itemSetID
        itemIDs = (itemList[step*3], itemList[step*3+1], itemList[step*3+2])
        itemSetID = itemSet3.getSetID(itemIDs)
        actionsTrain.append(itemSetID)
        # calculate reward
        price0 = itemInfo.getItemPrice(itemIDs[0])
        price1 = itemInfo.getItemPrice(itemIDs[1])
        price2 = itemInfo.getItemPrice(itemIDs[2])
        purch0 = purchase[step*3]
        purch1 = purchase[step*3+1]
        purch2 = purchase[step*3+2]
        reward = price0*purch0 + price1*purch1 + price2*purch2
        rewardsTrain.append(reward)
        # terminal flag: determine by looking at previous purchase flags
        if step!=2:
            if purch0*purch1*purch2 == 1: # game continue if all is 1
                terminalTrain.append(0)
            else:
                terminalTrain.append(1) # game stop
        else: # game stop at step 2
            terminalTrain.append(1)

# ### terminal flags: all 1
statesTrain = np.array(statesTrain)
actionsTrain = np.array(actionsTrain)
rewardsTrain = np.array(rewardsTrain)
terminalTrain = np.array(terminalTrain)
datasetTrain = MDPDataset(statesTrain, actionsTrain, rewardsTrain, terminalTrain, discrete_action = True)

100% 260087/260087 [00:53<00:00, 4832.45it/s]


In [17]:
# 1.1 prepare data for validation set
statesVal = []
actionsVal = []
rewardsVal = []
terminalVal = []  # terminal flag: 0 = game continue, 1 = game stop

for i in tqdm(range(userFeaturesTrain.shape[0])):
# loop through samples
    state = list(userFeaturesTrain.iloc[i])
    itemList = recItemsTrain[i]
    purchase = purchaseLabelTrain[i]
    for step in range(3):
        # check if game is still running
        if step>0 and purchase[0]*purchase[1]*purchase[2]==0:
            # stop adding to data set if game stopped
            break
        if step>1 and purchase[3]*purchase[4]*purchase[5]==0:
            # stop adding to data set if game stopped
            break
        # after passing check, we can add new record to train set
        # append step to state
        if step==0:
            step_OneHot = [1, 0, 0]
        elif step==1:
            step_OneHot = [0, 1, 0]
        else:
            step_OneHot = [0, 0, 1]
        statesVal.append(state + step_OneHot)
        # action = itemSetID
        itemIDs = (itemList[step*3], itemList[step*3+1], itemList[step*3+2])
        itemSetID = itemSet3.getSetID(itemIDs)
        actionsVal.append(itemSetID)
        # calculate reward
        price0 = itemInfo.getItemPrice(itemIDs[0])
        price1 = itemInfo.getItemPrice(itemIDs[1])
        price2 = itemInfo.getItemPrice(itemIDs[2])
        purch0 = purchase[step*3]
        purch1 = purchase[step*3+1]
        purch2 = purchase[step*3+2]
        reward = price0*purch0 + price1*purch1 + price2*purch2
        rewardsVal.append(reward)
        # terminal flag: determine by looking at previous purchase flags
        if step!=2:
            if purch0*purch1*purch2 == 1: # game continue if all is 1
                terminalVal.append(0)
            else:
                terminalVal.append(1) # game stop
        else: # game stop at step 2
            terminalVal.append(1)

# ### terminal flags: all 1
statesVal = np.array(statesVal)
actionsVal = np.array(actionsVal)
rewardsVal = np.array(rewardsVal)
terminalVal = np.array(terminalVal)
datasetVal = MDPDataset(statesVal, actionsVal, rewardsVal, terminalVal, discrete_action = True)

100% 260087/260087 [00:54<00:00, 4789.80it/s]


In [6]:
# save checkpoint
with open('/tf/shared/checkpoints/data-3D-20DFeatures2.pkl', 'wb') as file:
    pickle.dump((statesTrain, actionsTrain, rewardsTrain, terminalTrain, statesVal, actionsVal, rewardsVal, terminalVal),
                file, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# reload checkpoint
import pickle
with open('/tf/shared/checkpoints/data-3D-20DFeatures2.pkl', 'rb') as file:
    statesTrain, actionsTrain, rewardsTrain, terminalTrain, statesVal, actionsVal, rewardsVal, terminalVal = pickle.load(file)

from d3rlpy.dataset import MDPDataset
datasetTrain = MDPDataset(statesTrain, actionsTrain, rewardsTrain, terminalTrain, discrete_action = True)
datasetVal = MDPDataset(statesVal, actionsVal, rewardsVal, terminalVal, discrete_action = True)

In [18]:
# 1.2 Train BCQ
from d3rlpy.algos import DiscreteBCQ
BCQModel = DiscreteBCQ(use_gpu = True)
BCQModel.build_with_dataset(datasetTrain)
BCQModel.fit(datasetTrain, 
    eval_episodes = datasetVal,
    n_epochs = 10, verbose = False, show_progress = False)

2021-08-14 22:48.02 [debug    ] RoundIterator is selected.
2021-08-14 22:48.02 [info     ] Directory is created at d3rlpy_logs/DiscreteBCQ_20210814224802
2021-08-14 22:51.14 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210814224802/model_10367.pt
2021-08-14 22:54.26 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210814224802/model_20734.pt
2021-08-14 22:57.39 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210814224802/model_31101.pt
2021-08-14 23:00.52 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210814224802/model_41468.pt
2021-08-14 23:04.05 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210814224802/model_51835.pt
2021-08-14 23:07.19 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210814224802/model_62202.pt
2021-08-14 23:10.32 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteBCQ_20210814224802/model_72569.pt
2021-08-14 23:13.47 [info     ] Mo

[(1,
  {'time_sample_batch': 0.0001515564836185809,
   'time_algorithm_update': 0.01805083475934398,
   'loss': 4282.665614854753,
   'time_step': 0.018301082138064466}),
 (2,
  {'time_sample_batch': 0.00015867410727679656,
   'time_algorithm_update': 0.018053972952228117,
   'loss': 5393.518721679028,
   'time_step': 0.018312125837514265}),
 (3,
  {'time_sample_batch': 0.00015568935596300113,
   'time_algorithm_update': 0.0180574414400513,
   'loss': 6499.4272664878645,
   'time_step': 0.018311882497397985}),
 (4,
  {'time_sample_batch': 0.00015911578074179563,
   'time_algorithm_update': 0.018094284320344482,
   'loss': 5442.25285130204,
   'time_step': 0.018352510545733718}),
 (5,
  {'time_sample_batch': 0.0001569997267356498,
   'time_algorithm_update': 0.0181235435718026,
   'loss': 4624.050513665564,
   'time_step': 0.018379950277299606}),
 (6,
  {'time_sample_batch': 0.00015844944140438956,
   'time_algorithm_update': 0.018198276009959335,
   'loss': 4637.572388401806,
   'time_

In [19]:
# ----------------------------- Train Q Learning --------------------------------------------


In [20]:
# 2.1 Prepare train data set
#### state: =N_STATES-1 if first set, =previous recommended itemSetID otherwise
#### action: the itemSetID recommended
#### reward: (item is purchased) * price
#### nextState: -1 if we are at 3rd set (game terminated), =itemSetID otherwise
#### to feed a set of (state, action, reward, nextState) to a Q table
from tqdm import tqdm
from classes.ItemSet import ItemSet3
itemSet3 = ItemSet3()
N_ACTIONS = itemSet3.getNSets()
N_STATES = N_ACTIONS + 1 # 1 for initial state

trainSetQL = []
for i in tqdm(range(userFeaturesTrain.shape[0])):
# loop through samples
    state = list(userFeaturesTrain.iloc[i])
    recItems = recItemsTrain[i]
    purLabel = purchaseLabelTrain[i]
    for j in [0, 3, 6]: # process each Set3 at once
        if j>2 and purLabel[0]*purLabel[1]*purLabel[2]==0:
            # don't train if game stopped
            break
        if j>5 and purLabel[3]*purLabel[4]*purLabel[5]==0:
            # don't train if game stopped
            break
        # calculate state:
        if j==0:
            state = N_STATES-1
        else:
            state = itemSetID # previous recommened itemSet
        # action: itemSetID
        itemSet = [recItems[j], recItems[j+1], recItems[j+2]]
        itemSetID = itemSet3.getSetID(itemSet)
        action = itemSetID
        # reward:
        prices = [itemInfo.getItemPrice(itemSet[0]), itemInfo.getItemPrice(itemSet[1]), itemInfo.getItemPrice(itemSet[2])]
        labels = [purLabel[j], purLabel[j+1], purLabel[j+2]]
        reward = sum([prices[t]*labels[t] for t in range(3)])
        # next state:
        if j==6:
            nextState = -1
        else:
            nextState = itemSetID
        # append to train data set 
        trainSetQL.append((state, action, reward, nextState))



100% 260087/260087 [00:52<00:00, 4924.20it/s]


In [21]:
# 2.2 Train QL model
# initialize
from classes import QLearning2
from importlib import reload  
QLearning2 = reload(QLearning2)

print('N_ACTIONS: ' + str(N_ACTIONS) + ' N_STATES: ' + str(N_STATES))
QLModel = QLearning2.QLearning(n_states = N_STATES, n_actions = N_ACTIONS)
# train in parallel
# QLModel.trainParallel(trainSetQL)
QLModel.train(trainSetQL)


N_ACTIONS: 112368 N_STATES: 112369


100% 591837/591837 [4:54:06<00:00, 33.54it/s]  


In [22]:
# save checkpoint
import pickle
with open('/tf/shared/checkpoints/model-BCQ-QL.pkl', 'wb') as file:
    pickle.dump((BCQModel, QLModel),
                file, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
# reload checkpoint
import pickle
with open('/tf/shared/checkpoints/model-BCQ-QL.pkl', 'rb') as file:
    BCQModel, QLModel = pickle.load(file)

In [26]:
######################## PREDICTION ##########################################
userIDs, userFeaturesTest = getUserFeaturesTestSet()


In [27]:
# 3.1 transform userFeaturesTest to 20D by using PCA
userFeaturesTest = pd.DataFrame(PCAtransformer.transform(userFeaturesTest))

In [None]:
QLModel.initPredCache()
recItems_test = []
for i in tqdm(range(userFeaturesTest.shape[0])):
# loop thru samples
    recItems = []  # recommended list for this sample
    # 3.2 Make prediction of the first itemSet by using BCQModel, name it itemSet1
    state = list(userFeaturesTest.iloc[i]) + [1, 0, 0]  # first step of the game
    itemSetID1 = BCQModel.predict([np.array(state)])[0]
    recItems.extend(list(itemSet3.getItemSet(itemSetID1)))
    # 3.3 use itemSet1 as state for QLModel to predict best itemSet2
    # now stateID = itemSetID1
    candidateSetIDs = QLModel.predictBestK(itemSetID1, 20)
    for setID in candidateSetIDs:
        items = itemSet3.getItemSet(setID)
        if (items[0] not in recItems) and (items[1] not in recItems) and (items[2] not in recItems):
            # we have found a suitable solution for step 2
            itemSetID2 = setID
            recItems.extend(list(items))
            break
    # 3.4 use itemSet2 as state for QLModel to predict best itemSet3
    # now stateID = itemSetID2
    candidateSetIDs = QLModel.predictBestK(itemSetID2, 20)
    for setID in candidateSetIDs:
        items = itemSet3.getItemSet(setID)
        if (items[0] not in recItems) and (items[1] not in recItems) and (items[2] not in recItems):
            # we have found a suitable solution for step 2
            itemSetID3 = setID
            recItems.extend(list(items))
            break
    recItems_test.append(recItems)


In [None]:
# write recommended items to output csv file
from classes.output import writeOutput
writeOutput(recItems_test, 'BCQ-QLearning_v1.csv', userIDs)

In [None]:
print(1)