In [1]:
from environment import ContextualEnvironment
from policies import KLUCBSegmentPolicy, RandomPolicy, ExploreThenCommitSegmentPolicy, EpsilonGreedySegmentPolicy, TSSegmentPolicy, LinearTSPolicy
import argparse
import json
import logging
import numpy as np
import pandas as pd
import time

In [2]:
# Logger used for this notebook's own progress messages, named after the
# running module.
logger = logging.getLogger(__name__)

# Configure root logging at INFO level so the progress messages are emitted.
logging.basicConfig(level=logging.INFO)

In [3]:
def set_policies(policies_name, user_segment, user_features, n_playlists):
    """Instantiate the bandit policies requested by name.

    Only the policies listed in ``policies_name`` are constructed; the other
    entries of the settings table are never instantiated.

    Args:
        policies_name: Iterable of policy identifiers (keys of the table below).
        user_segment: Forwarded to the segment-based policy constructors.
        user_features: Forwarded to the ``LinearTSPolicy`` constructors.
        n_playlists: Forwarded to every policy constructor.

    Returns:
        A list of policy objects in the same order as ``policies_name``.
        A name that appears several times maps to one shared instance.

    Raises:
        KeyError: If any requested name is not a known policy identifier.
    """
    # Please see section 3.3 of RecSys paper for a description of policies
    # Each entry is a zero-argument factory so that construction is deferred
    # until the policy is actually requested.
    policy_factories = {
        'random' : lambda: RandomPolicy(n_playlists),
        'etc-seg-explore' : lambda: ExploreThenCommitSegmentPolicy(user_segment, n_playlists, min_n = 100, cascade_model = True),
        'etc-seg-exploit' : lambda: ExploreThenCommitSegmentPolicy(user_segment, n_playlists, min_n = 20, cascade_model = True),
        'epsilon-greedy-explore' : lambda: EpsilonGreedySegmentPolicy(user_segment, n_playlists, epsilon = 0.1, cascade_model = True),
        'epsilon-greedy-exploit' : lambda: EpsilonGreedySegmentPolicy(user_segment, n_playlists, epsilon = 0.01, cascade_model = True),
        'kl-ucb-seg' : lambda: KLUCBSegmentPolicy(user_segment, n_playlists, cascade_model = True),
        'ts-seg-naive' : lambda: TSSegmentPolicy(user_segment, n_playlists, alpha_zero = 1, beta_zero = 1, cascade_model = True),
        'ts-seg-pessimistic' : lambda: TSSegmentPolicy(user_segment, n_playlists, alpha_zero = 1, beta_zero = 99, cascade_model = True),
        'ts-lin-naive' : lambda: LinearTSPolicy(user_features, n_playlists, bias = 0.0, cascade_model = True),
        'ts-lin-pessimistic' : lambda: LinearTSPolicy(user_features, n_playlists, bias = -5.0, cascade_model = True),
        # Versions of epsilon-greedy-explore and ts-seg-pessimistic WITHOUT cascade model
        'epsilon-greedy-explore-no-cascade' : lambda: EpsilonGreedySegmentPolicy(user_segment, n_playlists, epsilon = 0.1, cascade_model = False),
        'ts-seg-pessimistic-no-cascade' : lambda: TSSegmentPolicy(user_segment, n_playlists, alpha_zero = 1, beta_zero = 99, cascade_model = False)
    }

    # Materialise once so a generator argument can be traversed several times.
    requested = list(policies_name)

    # Validate every name before building anything, so a typo fails fast.
    unknown = [name for name in requested if name not in policy_factories]
    if unknown:
        raise KeyError("Unknown policy name(s) %s; available policies: %s"
                       % (unknown, sorted(policy_factories)))

    # Build each distinct policy once; repeated names reuse the same object.
    built = {}
    for name in requested:
        if name not in built:
            built[name] = policy_factories[name]()

    return [built[name] for name in requested]



In [10]:
# Simulation options, declared as (flag, type, default, help) rows and
# registered on the parser in a single pass.
_CLI_OPTIONS = (
    ("--users_path", str, "data/user_features.csv",
     "Path to user features file"),
    ("--playlists_path", str, "data/playlist_features.csv",
     "Path to playlist features file"),
    ("--output_path", str, "results.json",
     "Path to json file to save regret values"),
    ("--policies", str, "ts-lin-pessimistic",
     "Bandit algorithms to evaluate, separated by commas"),
    ("--n_recos", int, 12,
     "Number of slots L in the carousel i.e. number of recommendations to provide"),
    ("--l_init", int, 3,
     "Number of slots L_init initially visible in the carousel"),
    ("--n_users_per_round", int, 20,
     "Number of users randomly selected (with replacement) per round"),
    ("--n_rounds", int, 100,
     "Number of simulated rounds"),
    ("--print_every", int, 10,
     "Print cumulative regrets every 'print_every' round"),
)

parser = argparse.ArgumentParser()
for flag, value_type, default_value, help_text in _CLI_OPTIONS:
    parser.add_argument(flag, type=value_type, default=default_value,
                        required=False, help=help_text)

# Parse an explicit empty argument list instead of sys.argv, so every option
# takes the default declared above.
args = parser.parse_args(args=[])

In [5]:
# Read the playlist and user feature tables from CSV.
# NOTE(review): both paths are hardcoded here rather than taken from
# args.playlists_path / args.users_path, and the user file differs from the
# --users_path default ("data/user_features.csv") -- confirm this is intended.
playlists_df = pd.read_csv('data/playlist_features.csv')
users_df = pd.read_csv('data/user_features_small.csv')

# Row counts of the two tables.
n_users, n_playlists = users_df.shape[0], playlists_df.shape[0]

In [13]:
# Carousel length and logging cadence, taken from the parsed options.
n_recos, print_every = args.n_recos, args.print_every

# Segment labels are kept separately from the numeric user features.
user_segment = np.array(users_df["segment"])

# User feature matrix: every column except "segment", with a constant column
# of ones appended as the last feature.
ones_column = np.ones((n_users, 1))
user_features = np.hstack([np.array(users_df.drop(columns=["segment"])), ones_column])

# Playlist feature matrix: all columns of the playlist table.
playlist_features = np.array(playlists_df)

In [14]:
# Environment object used by the simulation cells (they read `th_rewards` and
# call `simulate_batch_users_reward` on it).
cont_env = ContextualEnvironment(user_features, playlist_features, user_segment, n_recos)

# --policies is one comma-separated string. Strip whitespace around each name
# so that "a, b" resolves to the same policy keys as "a,b".
policies_name = [name.strip() for name in args.policies.split(",")]
# Constructs (initialises) one policy object per requested name.
policies = set_policies(policies_name, user_segment, user_features, n_playlists)
n_policies = len(policies)
n_users_per_round = args.n_users_per_round
n_rounds = args.n_rounds

# Per-round reward totals: one row per policy, one column per round.
overall_rewards = np.zeros((n_policies, n_rounds))
# Per-round sum of cont_env.th_rewards over the sampled users.
overall_optimal_reward = np.zeros(n_rounds)

In [15]:
# Dry run of one simulation round (round index 0); handy for inspecting
# `recos` and `rewards` interactively afterwards.
# NOTE(review): this cell calls update_policy, so the policies are no longer
# in their freshly constructed state once it has run.
logger.info("STARTING SIMULATIONS")
logger.info("for %d rounds, with %d users per round (randomly drawn with replacement)\n \n",
            n_rounds, n_users_per_round)
start_time = time.time()

i = 0
# Sample this round's batch of user ids from all users, with replacement
# (the same user can appear more than once in a batch).
user_ids = np.random.choice(range(n_users), n_users_per_round)
# Reference value for the round: sum of cont_env.th_rewards over the batch.
overall_optimal_reward[i] = np.take(cont_env.th_rewards, user_ids).sum()

# Evaluate every policy on the same batch of users.
for j, policy in enumerate(policies):
    # Recommendations for the batch -- presumably one row of args.n_recos
    # items per sampled user; confirm against the policy implementation.
    recos = policy.recommend_to_users_batch(user_ids, args.n_recos, args.l_init)
    # Simulated feedback from the environment for those recommendations.
    rewards = cont_env.simulate_batch_users_reward(batch_user_ids=user_ids, batch_recos=recos)
    # Feed the observed rewards back into the policy.
    policy.update_policy(user_ids, recos, rewards, args.l_init)
    overall_rewards[j, i] = rewards.sum()

# Progress report: first round, every `print_every`-th round, and last round.
round_number = i + 1
if i == 0 or round_number % print_every == 0 or round_number == n_rounds:
    logger.info("Round: %d/%d. Elapsed time: %f sec.",
                round_number, n_rounds, time.time() - start_time)
    regret_lines = [
        "\t%s : %s" % (policies_name[j], str(np.sum(overall_optimal_reward - overall_rewards[j])))
        for j in range(n_policies)
    ]
    logger.info("Cumulative regrets: \n%s \n", "\n".join(regret_lines))


INFO:__main__:STARTING SIMULATIONS
INFO:__main__:for 100 rounds, with 20 users per round (randomly drawn with replacement)
 

INFO:__main__:Round: 1/100. Elapsed time: 0.164642 sec.
INFO:__main__:Cumulative regrets: 
	ts-lin-pessimistic : 10.635216074036354 



In [20]:
# Display the reward matrix left in `rewards` by the most recently executed
# simulation cell (i.e. for the last policy it evaluated); the captured output
# shows 12 entries per row.
rewards

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.,

In [12]:
# Full simulation: n_rounds rounds, each on a freshly sampled batch of users.
#
# Start from newly constructed policies and zeroed reward buffers. Without
# this, the loop would continue from policies already updated by the
# single-round dry-run cell (or by an earlier execution of this cell) while
# the regret buffers restart at round 0, so the reported cumulative regret
# would not correspond to a cold-start run.
policies = set_policies(policies_name, user_segment, user_features, n_playlists)
overall_rewards = np.zeros((n_policies, n_rounds))
overall_optimal_reward = np.zeros(n_rounds)

logger.info("STARTING SIMULATIONS")
logger.info("for %d rounds, with %d users per round (randomly drawn with replacement)\n \n",
            n_rounds, n_users_per_round)
start_time = time.time()

for i in range(n_rounds):
    # Sample this round's batch of user ids from all users, with replacement
    # (the same user can appear more than once in a batch). Passing the int
    # n_users samples from np.arange(n_users) without materialising a range.
    user_ids = np.random.choice(n_users, size=n_users_per_round)
    # Reference value for the round: sum of cont_env.th_rewards over the batch.
    overall_optimal_reward[i] = np.take(cont_env.th_rewards, user_ids).sum()

    # Evaluate every policy on the same batch of users.
    for j, policy in enumerate(policies):
        # Recommendations for the batch -- presumably one row of args.n_recos
        # items per sampled user; confirm against the policy implementation.
        recos = policy.recommend_to_users_batch(user_ids, args.n_recos, args.l_init)
        # Simulated feedback from the environment for those recommendations.
        rewards = cont_env.simulate_batch_users_reward(batch_user_ids=user_ids, batch_recos=recos)
        # Feed the observed rewards back into the policy.
        policy.update_policy(user_ids, recos, rewards, args.l_init)
        overall_rewards[j, i] = rewards.sum()

    # Progress report: first round, every `print_every`-th round, last round.
    # Rounds not simulated yet are still zero in both buffers, so the full-row
    # sums below equal the regret accumulated so far.
    if i == 0 or (i + 1) % print_every == 0 or i + 1 == n_rounds:
        logger.info("Round: %d/%d. Elapsed time: %f sec.",
                    i + 1, n_rounds, time.time() - start_time)
        regret_lines = [
            "\t%s : %s" % (policies_name[k], str(np.sum(overall_optimal_reward - overall_rewards[k])))
            for k in range(n_policies)
        ]
        logger.info("Cumulative regrets: \n%s \n", "\n".join(regret_lines))


INFO:__main__:STARTING SIMULATIONS
INFO:__main__:for 100 rounds, with 20 users per round (randomly drawn with replacement)
 

INFO:__main__:Round: 1/100. Elapsed time: 0.169040 sec.
INFO:__main__:Cumulative regrets: 
	ts-lin-pessimistic : 10.352586198915116 

INFO:__main__:Round: 10/100. Elapsed time: 1.408333 sec.
INFO:__main__:Cumulative regrets: 
	ts-lin-pessimistic : 82.71741827346239 

INFO:__main__:Round: 20/100. Elapsed time: 2.176514 sec.
INFO:__main__:Cumulative regrets: 
	ts-lin-pessimistic : 174.9784613134419 

INFO:__main__:Round: 30/100. Elapsed time: 2.813665 sec.
INFO:__main__:Cumulative regrets: 
	ts-lin-pessimistic : 260.5503801048198 

INFO:__main__:Round: 40/100. Elapsed time: 3.455817 sec.
INFO:__main__:Cumulative regrets: 
	ts-lin-pessimistic : 342.276439639027 

INFO:__main__:Round: 50/100. Elapsed time: 4.086966 sec.
INFO:__main__:Cumulative regrets: 
	ts-lin-pessimistic : 429.88134390256096 

INFO:__main__:Round: 60/100. Elapsed time: 4.716115 sec.
INFO:__main__