In [117]:
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = 'python'
import rs_datasets
import numpy as np 
import pandas as pd
from d3rlpy.dataset import MDPDataset

In [2]:
import d3rlpy

In [4]:
ml = rs_datasets.MovieLens()
raitings = ml.ratings
raitings['date'] = pd.to_datetime(raitings["timestamp"].astype(int), unit='s')

991kB [00:05, 188kB/s]                                                                                                                                                                        


In [22]:
raitings.keys()

Index(['user_id', 'item_id', 'rating', 'timestamp', 'date'], dtype='object')

### Items encoding

In [23]:
EMBEDDING_SIZE = 64

In [20]:
items_mapping = dict()
items = list(set(raitings['item_id']))
for item in items:
    new_vector = np.random.uniform(0,1, size = EMBEDDING_SIZE)
    items_mapping[item] = new_vector
    

### MDP

#### Make mdp for user

In [167]:
def calc_states_one_episode(episode_logs, mapping, framestack = 5):
    no_op_framestack = np.zeros((framestack,EMBEDDING_SIZE))
    chooses = np.asarray([mapping[item_idx] for item_idx in episode_logs['item_id']])
    chooses = np.append(no_op_framestack, chooses, axis = 0)
    states = []
    for i in range(len(episode_logs)):
        state = chooses[i: i+framestack]
        states.append(state)
    return states
    
def user_mdps(user_df, mapping, window_size = 20, framestack = 5):
    no_op_framestack = np.zeros((5,EMBEDDING_SIZE))
    states = []
    rewars = []
    actions = []
    termations = []
    for i in range(len(user_df)-window_size):
        logs = user_df[i:i+window_size]
        states_one_episode = calc_states_one_episode(logs, mapping, framestack)
        rewards_one_episode = logs['rating']
        actions_one_episode = logs['item_id']
        
        ### remove_no_op
        states_one_episode = states_one_episode[framestack:]
        rewards_one_episode = rewards_one_episode[framestack:]
        actions_one_episode = actions_one_episode[framestack:]
        ### 
        termations_one_episode = np.zeros_like(actions_one_episode)
        termations_one_episode[-1] = 1
        
        rewars.append(rewards_one_episode)
        states.append(states_one_episode)
        actions.append(actions_one_episode)
        termations.append(termations_one_episode)
    return np.asarray(states), np.asarray(rewars), np.asarray(actions), np.asarray(termations)

def make_mdp(df, items_mapping, window_size = 20, framestack = 5):
    users = list(set(df['user_id']))
    full_states = []
    full_rewards = []
    full_actions = []
    full_termates = []
    for user in users:
        user_df = df[df['user_id'] == user].sort_values('timestamp')
        states, rewards, actions, termates = user_mdps(user_df, items_mapping, window_size, framestack)
        full_states += states.tolist()
        full_rewards += rewards.tolist()
        full_actions += actions.tolist()
        full_termates += termates.tolist()      
    return full_states, full_rewards, full_actions, full_termates
        
        
def to_d3rlpy_form(full_states, full_rewards, full_actions, full_termates):
     users_pesodes, steps_in_episode, framestack, emb_size = np.asarray(full_states).shape 
     states = np.asarray(full_states)
     states = states.reshape(-1, framestack*emb_size)     
    # print(states.shape)
     rewards = np.asarray(full_rewards).reshape(-1,1)    
     actions = np.asarray(full_actions).reshape(-1,1)        
     termates = np.asarray(full_termates).reshape(-1,1)
     
     
     dataset = MDPDataset(
            observations=states[:5000],
            actions=actions[:5000],
            rewards=rewards[:5000],
            terminals=termates[:5000]
        )
    
     return dataset
        

In [88]:
states, rewards, actions = user_mdps(raitings[raitings['user_id']<5], items_mapping)

  rewards_one_episode = rewards_one_episode[framestack:]
  actions_one_episode = actions_one_episode[framestack:]


### Train CQL

In [162]:
states, rewards, actions, termates = make_mdp(raitings[raitings['user_id']<1000], items_mapping, window_size = 40, framestack = 5)

  rewards_one_episode = rewards_one_episode[framestack:]
  actions_one_episode = actions_one_episode[framestack:]


In [168]:
dataset = to_d3rlpy_form(states, rewards, actions, termates)


In [169]:
from d3rlpy.algos import DiscreteCQL
from d3rlpy.metrics.scorer import evaluate_on_environment


In [None]:
import sys
sys.path.append("../")
from rl_experiments.utils.fake_recommender_env import FakeRecomenderEnv
import wandb
wandb.init(project="RecommendationsSDAC", group = "MovieLens_SDAC")

env = FakeRecomenderEnv(wandb, raitings[raitings['user_id']<5][:1000], 10)
evaluate_scorer = evaluate_on_environment(env)

In [170]:
algo = DiscreteCQL(use_gpu=True)

In [171]:
algo.fit(dataset, n_epochs=10)

2022-11-08 13:20.46 [debug    ] RoundIterator is selected.
2022-11-08 13:20.46 [info     ] Directory is created at d3rlpy_logs/DiscreteCQL_20221108132046
2022-11-08 13:20.46 [debug    ] Building models...
2022-11-08 13:20.46 [debug    ] Models have been built.
2022-11-08 13:20.46 [info     ] Parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/params.json params={'action_scaler': None, 'alpha': 1.0, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_update_interval': 8000, 'use_gpu': 0, 'algorithm': 'DiscreteCQL', 'observation_sh

Epoch 1/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.47 [info     ] DiscreteCQL_20221108132046: epoch=1 step=155 epoch=1 metrics={'time_sample_batch': 0.00013720143225885207, 'time_algorithm_update': 0.007556327696769468, 'loss': 11.207797191989037, 'time_step': 0.007765654594667496} step=155
2022-11-08 13:20.47 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_155.pt


Epoch 2/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.48 [info     ] DiscreteCQL_20221108132046: epoch=2 step=310 epoch=2 metrics={'time_sample_batch': 0.00015003142818327872, 'time_algorithm_update': 0.008050481734737273, 'loss': 6.248589804864699, 'time_step': 0.008291052233788275} step=310
2022-11-08 13:20.48 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_310.pt


Epoch 3/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.50 [info     ] DiscreteCQL_20221108132046: epoch=3 step=465 epoch=3 metrics={'time_sample_batch': 0.00014419863300938761, 'time_algorithm_update': 0.008009650630335654, 'loss': 5.097815369021508, 'time_step': 0.008235423795638545} step=465
2022-11-08 13:20.50 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_465.pt


Epoch 4/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.51 [info     ] DiscreteCQL_20221108132046: epoch=4 step=620 epoch=4 metrics={'time_sample_batch': 0.00014169754520539316, 'time_algorithm_update': 0.007737321238363942, 'loss': 4.942206299689508, 'time_step': 0.0079546328513853} step=620
2022-11-08 13:20.51 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_620.pt


Epoch 5/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.52 [info     ] DiscreteCQL_20221108132046: epoch=5 step=775 epoch=5 metrics={'time_sample_batch': 0.00014964995845671622, 'time_algorithm_update': 0.00814522312533471, 'loss': 4.810022729442966, 'time_step': 0.008375298592352098} step=775
2022-11-08 13:20.52 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_775.pt


Epoch 6/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.54 [info     ] DiscreteCQL_20221108132046: epoch=6 step=930 epoch=6 metrics={'time_sample_batch': 0.00014400789814610635, 'time_algorithm_update': 0.008138984249484154, 'loss': 4.664327178462859, 'time_step': 0.008352679591025076} step=930
2022-11-08 13:20.54 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_930.pt


Epoch 7/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.55 [info     ] DiscreteCQL_20221108132046: epoch=7 step=1085 epoch=7 metrics={'time_sample_batch': 0.0001445970227641444, 'time_algorithm_update': 0.007526057766329857, 'loss': 4.500921049425679, 'time_step': 0.007747734746625347} step=1085
2022-11-08 13:20.55 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_1085.pt


Epoch 8/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.56 [info     ] DiscreteCQL_20221108132046: epoch=8 step=1240 epoch=8 metrics={'time_sample_batch': 0.00014451396080755418, 'time_algorithm_update': 0.007696676254272461, 'loss': 4.313734223765712, 'time_step': 0.007917684124362084} step=1240
2022-11-08 13:20.56 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_1240.pt


Epoch 9/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.57 [info     ] DiscreteCQL_20221108132046: epoch=9 step=1395 epoch=9 metrics={'time_sample_batch': 0.0001403331756591797, 'time_algorithm_update': 0.008188439953711725, 'loss': 4.093966231807586, 'time_step': 0.008408592593285346} step=1395
2022-11-08 13:20.58 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_1395.pt


Epoch 10/10:   0%|          | 0/155 [00:00<?, ?it/s]

2022-11-08 13:20.59 [info     ] DiscreteCQL_20221108132046: epoch=10 step=1550 epoch=10 metrics={'time_sample_batch': 0.00014486774321525328, 'time_algorithm_update': 0.008166379313315115, 'loss': 3.8470312041621053, 'time_step': 0.008394378231417749} step=1550
2022-11-08 13:20.59 [info     ] Model parameters are saved to d3rlpy_logs/DiscreteCQL_20221108132046/model_1550.pt


[(1,
  {'time_sample_batch': 0.00013720143225885207,
   'time_algorithm_update': 0.007556327696769468,
   'loss': 11.207797191989037,
   'time_step': 0.007765654594667496}),
 (2,
  {'time_sample_batch': 0.00015003142818327872,
   'time_algorithm_update': 0.008050481734737273,
   'loss': 6.248589804864699,
   'time_step': 0.008291052233788275}),
 (3,
  {'time_sample_batch': 0.00014419863300938761,
   'time_algorithm_update': 0.008009650630335654,
   'loss': 5.097815369021508,
   'time_step': 0.008235423795638545}),
 (4,
  {'time_sample_batch': 0.00014169754520539316,
   'time_algorithm_update': 0.007737321238363942,
   'loss': 4.942206299689508,
   'time_step': 0.0079546328513853}),
 (5,
  {'time_sample_batch': 0.00014964995845671622,
   'time_algorithm_update': 0.00814522312533471,
   'loss': 4.810022729442966,
   'time_step': 0.008375298592352098}),
 (6,
  {'time_sample_batch': 0.00014400789814610635,
   'time_algorithm_update': 0.008138984249484154,
   'loss': 4.664327178462859,
   '

In [195]:
import math
def ndcg(k, pred, ground_truth) -> float:
        pred_len = min(k, len(pred))
        ground_truth_len = min(k, len(ground_truth))
        denom = [1 / math.log2(i + 2) for i in range(k)]
        dcg = sum(denom[i] for i in range(pred_len) if pred[i] in ground_truth)
        idcg = sum(denom[:ground_truth_len])

        return dcg / idcg

In [206]:
preds = algo.predict(dataset.episodes[0].observations[:20])
print(preds)

[ 596 2948 1136 2427 1031 1136 1031 3671 2948 1196 1031 1136 1031 2700
 2395  596 3253 1517 2141 1732]


In [207]:
true_acts = dataset.episodes[0].actions[:20]
print(true_acts)

[3578 3617 3744 2858  101  441 1473 2997  235 1060  356  223 1500 2700
 2395 3243 3253 1517 1580 1732]


In [205]:
ndcg(20, preds, true_acts)

0.2445923242397208

In [208]:
def mean_ndcg():
    mean_ndcg_v = []
    for i in range(100):
        preds = algo.predict(dataset.episodes[i].observations[:20])
        true_acts = dataset.episodes[i].actions[:20]
        mean_ndcg_v.append(ndcg(20, preds, true_acts))
    return np.mean(mean_ndcg_v)

In [210]:
mean_ndcg()

0.8509331578215731