In [1]:
from rs_datasets import MovieLens
from d3rlpy.base import LearnableBase
from d3rlpy.dataset import MDPDataset
from d3rlpy.models.optimizers import OptimizerFactory, AdamFactory
from pyspark.sql import functions as sf, DataFrame
import numpy as np
from typing import Optional, Callable


In [2]:
ds = MovieLens(version="1m")

In [3]:
ds.ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


### Data to MDP

In [4]:
def _prepare_data(log: DataFrame) -> MDPDataset:
        use_negative_events = True #False
        rating_based_reward = False #False
        reward_top_k = True
        k = 10
        
        test_size = 0.3
        action_randomization_scale = 0.01
        raw_rating_to_reward_rescale = {
        1.0: -1.0,
        2.0: -0.3,
        3.0: 0.25,
        4.0: 0.7,
        5.0: 1.0,
    }
        binary_rating_to_reward_rescale = {
            1.0: -1.0,
            2.0: -1.0,
            3.0: 1.0,
            4.0: 1.0,
            5.0: 1.0,
        }
        if not use_negative_events:
            # remove negative events
            log = log[log['rating'] >= 3]

        # TODO: consider making calculations in Spark before converting to pandas
        user_logs = log.sort_values(['user_id', 'timestamp'], ascending=True)

        if rating_based_reward:
            rescale = raw_rating_to_reward_rescale
        else:
            rescale = binary_rating_to_reward_rescale
        rewards = user_logs['rating'].map(rescale).to_numpy()

        if reward_top_k:
            # additionally reward top-K watched movies
            user_top_k_idxs = (
                
                user_logs
                .sort_values(['rating', 'timestamp'], ascending=[False, True])
                .groupby('user_id')
                .head(k)
                .index
            )
            # rescale positives and additionally reward top-K watched movies
            rewards[rewards > 0] /= 2
            rewards[user_top_k_idxs] += 0.5

        user_logs['rewards'] = rewards

        # every user has his own episode (the latest item is defined as terminal)
        user_terminal_idxs = (
            user_logs[::-1]
            .groupby('user_id')
            .head(1)
            .index
        )
        terminals = np.zeros(len(user_logs))
        terminals[user_terminal_idxs] = 1
        user_logs['terminals'] = terminals

        # cannot set zero scale as d3rlpy will treat transitions as discrete :/
        
        
        #разбиение на трейн тест
        user_id_list = list(set(user_logs['user_id']))
        count_of_test = int(test_size*len(user_id_list))
        test_idx = int(user_id_list[-count_of_test])
        
        user_logs_train = user_logs[user_logs['user_id'].astype(int) < test_idx]
        user_logs_test = user_logs[user_logs['user_id'].astype(int) >= test_idx]
        
        action_randomization_scale = action_randomization_scale + 1e-4
        action_randomization = np.random.randn(len(user_logs_train)) * action_randomization_scale

        train_dataset = MDPDataset(
            observations=np.array(user_logs_train[['user_id', 'item_id']]),
            actions=np.array(
                user_logs_train['rating']
            )[:, None] ,
            rewards=user_logs_train['rewards'],
            terminals=user_logs_train['terminals']
        )
      #  print( user_logs_test['rating'])
        test_dataset = MDPDataset(
            observations=np.array(user_logs_test[['user_id', 'item_id']]),
            actions=np.array(
                user_logs_test['rating'] 
            )[:, None],
            rewards=user_logs_test['rewards'],
            terminals=user_logs_test['terminals']
        )
        return train_dataset, user_logs_train

In [5]:
import pandas as pd

def _predict(
    model,
    log: DataFrame = None,
    k: int = 10,
    users: DataFrame = None,
    items: DataFrame = None,
    user_features: Optional[DataFrame] = None,
    item_features: Optional[DataFrame] = None,
    filter_seen_items: bool = True,
) -> DataFrame:
    if user_features or item_features:
        message = f'RL recommender does not support user/item features'
      #  self.logger.debug(message)

    users = np.array(users).flatten()
    items = np.array(items).flatten()

    # TODO: consider size-dependent batch prediction instead of by user
    user_predictions = []
    for user in users:
        user_item_pairs = pd.DataFrame({
            'user_idx': np.repeat(user, len(items)),
            'item_idx': items
        })
        user_item_pairs['relevance'] = model.predict(user_item_pairs.to_numpy())
        user_predictions.append(user_item_pairs)

    prediction = pd.concat(user_predictions)
    prediction = prediction.sort_values(['relevance'])[::-1][:k]
    # it doesn't explicitly filter seen items and doesn't return top k items
    # instead, it keeps all predictions as is to be filtered further by base methods
    return prediction



### Metrics

In [6]:
import math  
def ndcg(k, pred, ground_truth) -> float:
        pred_len = min(k, len(pred))
        ground_truth_len = min(k, len(ground_truth))
        denom = [1 / math.log2(i + 2) for i in range(k)]
        dcg = sum(denom[i] for i in range(pred_len) if pred[i] in ground_truth)
        idcg = sum(denom[:ground_truth_len])

        return dcg / idcg
    
def mape(k, pred, ground_truth) -> float:
        length = min(k, len(pred))
        max_good = min(k, len(ground_truth))
        if len(ground_truth) == 0 or len(pred) == 0:
            return 0
        tp_cum = 0
        result = 0
        for i in range(length):
            if pred[i] in ground_truth:
                tp_cum += 1
                result += tp_cum / ((i + 1) * max_good)
        return result

In [7]:
def original_for_user(df, target, k = 10):
    mask = df['user_id'] == target
    user_relevance = df[mask]
    return user_relevance.sort_values(['rating'])[::-1][:k]
    


### Fake env for evaluation

In [8]:
import gym
from gym.spaces import Discrete, Box, Tuple
import wandb

exp = wandb.init(project="RecommendationsSDAC", group = "MovieLens_SDAC")

class FakeRecomenderEnv(gym.Env):
    def __init__(self, test_data, top_k):
        self.action_space = gym.spaces.Discrete(5)
        self.observation_space = Box(0,100000, (2,))
        self.log_data = test_data
        self.top_k = top_k
        self.steps = 0
        self.episode_num = 0
        self.episodes = list(set(self.log_data['user_id']))
        self.total_episodes = 0
        #mask = self.log_data['user_id'] == episodes[episode_num]
        self.current_episode = None

    def step(self, action): 
        #print(action)
        self.relevance_hist.append(action)
        done = False
        reward = 0
        ob = (self.current_episode['user_id'].values[self.steps], 
                self.current_episode['item_id'].values[self.steps])
        self.steps += 1
        if len(self.current_episode['user_id']) == self.steps:
            done = True
          #  print(len(self.user_hist), len(self.item_hist), len(self.relevance_hist))
            pred_df = pd.DataFrame({'user_id': self.user_hist, 'item_hist': self.item_hist,
                                    'relevance': self.relevance_hist})
            pred_top_k = pred_df.sort_values(['relevance'])[::-1][:self.top_k]
            reward = ndcg( self.top_k, pred_top_k['relevance'].values, self.original['rating'].values)
            mape_ = mape( self.top_k, pred_top_k['relevance'].values, self.original['rating'].values)
            exp.log({"episode": self.total_episodes, "NDCG": reward, "MAP": mape_})
            ob = []
        else:
            self.user_hist.append(self.current_episode['user_id'].values[self.steps])
            self.item_hist.append(self.current_episode['item_id'].values[self.steps])
        
        return np.asarray(ob), reward, done, {}
    
    def reset(self):
        self.user_hist = []
        self.item_hist = []
        self.relevance_hist = []
        self.total_episodes += 1
        self.episode_num += 1
        if self.episode_num == len(self.episodes):
            self.episode_num = 0
        self.steps = 0 
        mask = self.log_data['user_id'] == self.episodes[self.episode_num]
        self.current_episode = self.log_data[mask]
       # print(self.current_episode['user_id'])
        self.user_hist.append(self.current_episode['user_id'].values[0])
        self.item_hist.append( self.current_episode['item_id'].values[0])
        self.original = original_for_user(self.log_data, self.current_episode['user_id'].values[0], k = self.top_k)
        obs = self.current_episode['user_id'].values[0], \
                       self.current_episode['item_id'].values[0]
      #  print( np.asarray(obs))
        return np.asarray(obs)



[34m[1mwandb[0m: Currently logged in as: [33mbabycar27[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Train

In [9]:
train_dataset,user_logs_train = _prepare_data(ds.ratings)

In [10]:
env = FakeRecomenderEnv(user_logs_train[:1000], 10)

from d3rlpy.metrics.scorer import evaluate_on_environment
evaluate_scorer = evaluate_on_environment(env)

In [11]:
env.reset()

array([   2, 1198], dtype=int32)

In [12]:
from d3rlpy.algos import SDAC
from d3rlpy.algos import CQL

In [13]:
sdac = SDAC(use_gpu=True)

In [14]:
sdac.fit(train_dataset,
        eval_episodes=train_dataset,
        n_epochs=10,
#         n_steps = 100000,
#         n_steps_per_epoch=3000,
        scorers={'environment': evaluate_scorer})

2022-10-24 23:38.17 [debug    ] RoundIterator is selected.
2022-10-24 23:38.17 [info     ] Directory is created at d3rlpy_logs/SDAC_20221024233817
2022-10-24 23:38.17 [debug    ] Building models...
2022-10-24 23:38.20 [debug    ] Models have been built.
2022-10-24 23:38.20 [info     ] Parameters are saved to d3rlpy_logs/SDAC_20221024233817/params.json params={'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'actor_learning_rate': 0.0003, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'batch_size': 256, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_learning_rate': 0.0003, 'critic_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'gamma': 0.99, 'generated_maxlen': 10

Epoch 1/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:39.07 [info     ] SDAC_20221024233817: epoch=1 step=2757 epoch=1 metrics={'time_sample_batch': 0.00031592208577274714, 'time_algorithm_update': 0.016395952116115654, 'temp_loss': 4.688700980538253, 'temp': 0.7025570158492321, 'critic_loss': 497.7889070926605, 'actor_loss': 124.23836702188375, 'time_step': 0.01686050929061563, 'environment': 0.866073276015265} step=2757
2022-10-24 23:39.07 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_2757.pt


Epoch 2/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:39.55 [info     ] SDAC_20221024233817: epoch=2 step=5514 epoch=2 metrics={'time_sample_batch': 0.0003165322712983694, 'time_algorithm_update': 0.016503631528149404, 'temp_loss': 2.3955926939677537, 'temp': 0.35900406214608893, 'critic_loss': 217.07717646398447, 'actor_loss': 111.70893010584652, 'time_step': 0.01697999790262216, 'environment': 0.850383833502282} step=5514
2022-10-24 23:39.55 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_5514.pt


Epoch 3/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:40.42 [info     ] SDAC_20221024233817: epoch=3 step=8271 epoch=3 metrics={'time_sample_batch': 0.00031721423318258957, 'time_algorithm_update': 0.01634811784289737, 'temp_loss': 1.2818467911274745, 'temp': 0.19205819038991095, 'critic_loss': 141.74411496157919, 'actor_loss': 90.76071189407166, 'time_step': 0.016822678479847374, 'environment': 0.8282613992113355} step=8271
2022-10-24 23:40.42 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_8271.pt


Epoch 4/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:41.29 [info     ] SDAC_20221024233817: epoch=4 step=11028 epoch=4 metrics={'time_sample_batch': 0.00031318377445346856, 'time_algorithm_update': 0.016273709367399938, 'temp_loss': 0.6910703219513898, 'temp': 0.10355228539130633, 'critic_loss': 93.05168088913872, 'actor_loss': 73.15194988008595, 'time_step': 0.016747509693814052, 'environment': 0.9038113127357011} step=11028
2022-10-24 23:41.29 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_11028.pt


Epoch 5/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:42.16 [info     ] SDAC_20221024233817: epoch=5 step=13785 epoch=5 metrics={'time_sample_batch': 0.0003087724686360766, 'time_algorithm_update': 0.016028912464345242, 'temp_loss': 0.3730291300573166, 'temp': 0.0558976375639417, 'critic_loss': 58.286689132830354, 'actor_loss': 57.782562399758, 'time_step': 0.016496270819336425, 'environment': 0.7845485335723721} step=13785
2022-10-24 23:42.16 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_13785.pt


Epoch 6/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:43.02 [info     ] SDAC_20221024233817: epoch=6 step=16542 epoch=6 metrics={'time_sample_batch': 0.00030817257393777134, 'time_algorithm_update': 0.015980883011318094, 'temp_loss': 0.20138110813343918, 'temp': 0.030178222490429316, 'critic_loss': 37.342537481639106, 'actor_loss': 46.25992627121376, 'time_step': 0.016444091568574293, 'environment': 0.8883173937796588} step=16542
2022-10-24 23:43.02 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_16542.pt


Epoch 7/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:43.49 [info     ] SDAC_20221024233817: epoch=7 step=19299 epoch=7 metrics={'time_sample_batch': 0.0003127688552142487, 'time_algorithm_update': 0.01641291157248922, 'temp_loss': 0.10872782119048173, 'temp': 0.016292811226468612, 'critic_loss': 24.15405641736697, 'actor_loss': 37.14584957727454, 'time_step': 0.0168795941919791, 'environment': 0.8654287632144317} step=19299
2022-10-24 23:43.49 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_19299.pt


Epoch 8/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:44.36 [info     ] SDAC_20221024233817: epoch=8 step=22056 epoch=8 metrics={'time_sample_batch': 0.00031517751414922, 'time_algorithm_update': 0.016278085736290977, 'temp_loss': 0.058697698515558484, 'temp': 0.008795916823191264, 'critic_loss': 15.487082586162627, 'actor_loss': 29.34837512188214, 'time_step': 0.01675619835173863, 'environment': 0.8605340618160662} step=22056
2022-10-24 23:44.36 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_22056.pt


Epoch 9/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:45.23 [info     ] SDAC_20221024233817: epoch=9 step=24813 epoch=9 metrics={'time_sample_batch': 0.0003087400395583968, 'time_algorithm_update': 0.016004327850839908, 'temp_loss': 0.03168714083839271, 'temp': 0.004748719202041745, 'critic_loss': 9.621754662213137, 'actor_loss': 22.798708118387527, 'time_step': 0.01646675698602377, 'environment': 0.8385267756808069} step=24813
2022-10-24 23:45.23 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_24813.pt


Epoch 10/10:   0%|          | 0/2757 [00:00<?, ?it/s]

2022-10-24 23:46.10 [info     ] SDAC_20221024233817: epoch=10 step=27570 epoch=10 metrics={'time_sample_batch': 0.00031466677779514894, 'time_algorithm_update': 0.01632879832796658, 'temp_loss': 0.017108451642263593, 'temp': 0.0025636908337038104, 'critic_loss': 6.698706114447805, 'actor_loss': 18.350095682140704, 'time_step': 0.01680376246912046, 'environment': 0.8753925810756311} step=27570
2022-10-24 23:46.10 [info     ] Model parameters are saved to d3rlpy_logs/SDAC_20221024233817/model_27570.pt


[(1,
  {'time_sample_batch': 0.00031592208577274714,
   'time_algorithm_update': 0.016395952116115654,
   'temp_loss': 4.688700980538253,
   'temp': 0.7025570158492321,
   'critic_loss': 497.7889070926605,
   'actor_loss': 124.23836702188375,
   'time_step': 0.01686050929061563,
   'environment': 0.866073276015265}),
 (2,
  {'time_sample_batch': 0.0003165322712983694,
   'time_algorithm_update': 0.016503631528149404,
   'temp_loss': 2.3955926939677537,
   'temp': 0.35900406214608893,
   'critic_loss': 217.07717646398447,
   'actor_loss': 111.70893010584652,
   'time_step': 0.01697999790262216,
   'environment': 0.850383833502282}),
 (3,
  {'time_sample_batch': 0.00031721423318258957,
   'time_algorithm_update': 0.01634811784289737,
   'temp_loss': 1.2818467911274745,
   'temp': 0.19205819038991095,
   'critic_loss': 141.74411496157919,
   'actor_loss': 90.76071189407166,
   'time_step': 0.016822678479847374,
   'environment': 0.8282613992113355}),
 (4,
  {'time_sample_batch': 0.0003131

In [15]:
sdac.predict(np.array([(90, 390)]))

array([5])

In [16]:
user_logs_train[:1000]

Unnamed: 0,user_id,item_id,rating,timestamp,rewards,terminals
31,1,3186,4,978300019,1.0,0.0
22,1,1270,5,978300055,0.5,0.0
27,1,1721,4,978300055,0.5,0.0
37,1,1022,5,978300055,0.5,0.0
24,1,2340,3,978300103,0.5,0.0
...,...,...,...,...,...,...
1023,10,1031,4,978228546,0.5,0.0
1195,10,2045,3,978228575,0.5,0.0
909,10,3608,3,978228601,0.5,0.0
1039,10,1042,5,978228601,0.5,0.0
