In [1]:
from rs_datasets import MovieLens
from d3rlpy.base import LearnableBase
from d3rlpy.dataset import MDPDataset
from d3rlpy.models.optimizers import OptimizerFactory, AdamFactory
from pyspark.sql import functions as sf, DataFrame
import numpy as np
from typing import Optional, Callable


In [2]:
# Load the MovieLens dataset (version "1m") through rs_datasets.
ds = MovieLens(version="1m")

In [3]:
# Display the ratings table; the recorded output shows the columns
# user_id, item_id, rating and timestamp.
ds.ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


### Data to MDP

In [4]:
def _prepare_data(log: DataFrame) -> MDPDataset:
        use_negative_events = True #False
        rating_based_reward = True #False
        reward_top_k = True
        k = 10
        
        test_size = 0.3
        action_randomization_scale = 0.3
        raw_rating_to_reward_rescale = {
        1.0: -1.0,
        2.0: -0.3,
        3.0: 0.25,
        4.0: 0.7,
        5.0: 1.0,
    }
        binary_rating_to_reward_rescale = {
            1.0: -1.0,
            2.0: -1.0,
            3.0: 1.0,
            4.0: 1.0,
            5.0: 1.0,
        }
        if not use_negative_events:
            # remove negative events
            log = log.filter(sf.col('rating') >= sf.lit(3.0))

        # TODO: consider making calculations in Spark before converting to pandas
        user_logs = log.sort_values(['user_id', 'timestamp'], ascending=True)

        if rating_based_reward:
            rescale = raw_rating_to_reward_rescale
        else:
            rescale = binary_rating_to_reward_rescale
        rewards = user_logs['rating'].map(rescale).to_numpy()

        if reward_top_k:
            # additionally reward top-K watched movies
            user_top_k_idxs = (
                
                user_logs
                .sort_values(['rating', 'timestamp'], ascending=[False, True])
                .groupby('user_id')
                .head(k)
                .index
            )
            # rescale positives and additionally reward top-K watched movies
            rewards[rewards > 0] /= 2
            rewards[user_top_k_idxs] += 0.5

        user_logs['rewards'] = rewards

        # every user has his own episode (the latest item is defined as terminal)
        user_terminal_idxs = (
            user_logs[::-1]
            .groupby('user_id')
            .head(1)
            .index
        )
        terminals = np.zeros(len(user_logs))
        terminals[user_terminal_idxs] = 1
        user_logs['terminals'] = terminals

        # cannot set zero scale as d3rlpy will treat transitions as discrete :/
        
        
        #разбиение на трейн тест
        user_id_list = list(set(user_logs['user_id']))
        count_of_test = int(test_size*len(user_id_list))
        test_idx = int(user_id_list[-count_of_test])
        
        user_logs_train = user_logs[user_logs['user_id'].astype(int) < test_idx]
        user_logs_test = user_logs[user_logs['user_id'].astype(int) >= test_idx]
        
        action_randomization_scale = action_randomization_scale + 1e-4
        action_randomization = np.random.randn(len(user_logs_train)) * action_randomization_scale

        train_dataset = MDPDataset(
            observations=np.array(user_logs_train[['user_id', 'item_id']]),
            actions=np.array(
                user_logs_train['rating'] + action_randomization
            )[:, None],
            rewards=user_logs_train['rewards'],
            terminals=user_logs_train['terminals']
        )
        test_dataset = MDPDataset(
            observations=np.array(user_logs_test[['user_id', 'item_id']]),
            actions=np.array(
                user_logs_test['rating']
            )[:, None],
            rewards=user_logs_test['rewards'],
            terminals=user_logs_test['terminals']
        )
        return train_dataset, user_logs_train

In [5]:
import pandas as pd

def _predict(
    model,
    log: DataFrame = None,
    k: int = 10,
    users: DataFrame = None,
    items: DataFrame = None,
    user_features: Optional[DataFrame] = None,
    item_features: Optional[DataFrame] = None,
    filter_seen_items: bool = True,
) -> DataFrame:
    if user_features or item_features:
        message = f'RL recommender does not support user/item features'
      #  self.logger.debug(message)

    users = np.array(users).flatten()
    items = np.array(items).flatten()

    # TODO: consider size-dependent batch prediction instead of by user
    user_predictions = []
    for user in users:
        user_item_pairs = pd.DataFrame({
            'user_idx': np.repeat(user, len(items)),
            'item_idx': items
        })
        user_item_pairs['relevance'] = model.predict(user_item_pairs.to_numpy())
        user_predictions.append(user_item_pairs)

    prediction = pd.concat(user_predictions)
    prediction = prediction.sort_values(['relevance'])[::-1][:k]
    # it doesn't explicitly filter seen items and doesn't return top k items
    # instead, it keeps all predictions as is to be filtered further by base methods
    return prediction



### Metrics

In [6]:
import math  
def ndcg(k, pred, ground_truth) -> float:
    """Normalized discounted cumulative gain at ``k`` with binary relevance.

    Args:
        k: cutoff; only the first ``k`` predictions are scored.
        pred: indexable sequence of predicted items, best first.
        ground_truth: container of relevant items (membership is tested with ``in``).

    Returns:
        DCG of the first ``k`` predictions divided by the ideal DCG for
        ``min(k, len(ground_truth))`` hits; ``0.0`` when there is nothing to
        score (empty predictions, empty ground truth or non-positive ``k``).
    """
    pred_len = min(k, len(pred))
    ground_truth_len = min(k, len(ground_truth))
    if pred_len <= 0 or ground_truth_len <= 0:
        # the ideal DCG would be zero: avoid dividing by it
        return 0.0

    # positional discounts 1 / log2(rank + 1) for ranks 1..k
    denom = [1 / math.log2(i + 2) for i in range(k)]
    dcg = sum(denom[i] for i in range(pred_len) if pred[i] in ground_truth)
    idcg = sum(denom[:ground_truth_len])

    return dcg / idcg
    
def mape(k, pred, ground_truth) -> float:
    """Average precision at ``k`` for a single ranked list.

    Walks over the first ``min(k, len(pred))`` predictions; each prediction
    found in ``ground_truth`` contributes the precision at its rank, divided
    by ``min(k, len(ground_truth))``. Returns 0 when either input is empty.
    """
    cutoff = min(k, len(pred))
    best_possible_hits = min(k, len(ground_truth))
    if len(ground_truth) == 0 or len(pred) == 0:
        return 0

    hits_so_far = 0
    score = 0
    for position in range(cutoff):
        if pred[position] not in ground_truth:
            continue
        hits_so_far += 1
        rank = position + 1
        score += hits_so_far / (rank * best_possible_hits)
    return score

In [7]:
def original_for_user(df, target, k = 10):
    """Return the ``k`` highest-rated rows of ``df`` whose ``user_id`` equals ``target``."""
    rows_of_user = df.loc[df['user_id'] == target]
    # ascending sort followed by a reversal puts the best ratings first
    best_first = rows_of_user.sort_values(by=['rating']).iloc[::-1]
    return best_first.iloc[:k]
    


### Fake env for evaluation

In [8]:
import gym
from gym.spaces import Discrete, Box, Tuple
import wandb

# Start a Weights & Biases run. FakeRecomenderEnv.step logs its per-episode
# NDCG / MAP values through this module-level `exp` object.
exp = wandb.init(project="RecommendationsSDAC", group = "MovieLens")

class FakeRecomenderEnv(gym.Env):
    """Replay environment that scores an agent's ratings against logged data.

    Each episode replays the logged interactions of one user. Observations are
    ``(user_id, item_id)`` pairs; the action given for an observation is stored
    as the predicted relevance of that item. When the user's log is exhausted,
    the ``top_k`` items with the highest predicted relevance are compared with
    the user's ``top_k`` best-rated logged items: NDCG is returned as the
    episode reward, and NDCG / MAP are logged to the module-level ``exp`` run.

    NOTE(review): ``action_space`` is declared as ``Discrete(5)`` but actions
    are recorded as relevance scores whatever their type.
    """

    def __init__(self, test_data, top_k):
        """
        Args:
            test_data: pandas frame with ``user_id``, ``item_id`` and ``rating``
                columns; a user's rows are replayed in the order they appear.
            top_k: cutoff used for the ground truth, the prediction and the metrics.

        Raises:
            ValueError: if ``test_data`` contains no users.
        """
        self.action_space = gym.spaces.Discrete(5)
        self.observation_space = Box(0,100000, (2,))
        self.log_data = test_data
        self.top_k = top_k
        self.steps = 0
        self.episode_num = 0
        # sorted for a deterministic episode order
        self.episodes = sorted(set(self.log_data['user_id']))
        if not self.episodes:
            raise ValueError('test_data contains no users to replay')
        self.total_episodes = 0
        self.current_episode = None
        self.original = None
        # per-episode histories, (re)filled by reset() and step()
        self.user_hist = []
        self.item_hist = []
        self.relevance_hist = []

    def step(self, action):
        """Record ``action`` as the relevance of the current item and advance.

        Returns:
            ``(observation, reward, done, info)``. ``reward`` is 0 until the
            episode ends and the episode NDCG on the final step. On the final
            step the last observation is returned again so that its shape
            stays ``(2,)``.

        Raises:
            RuntimeError: if called before ``reset()``.
        """
        if self.current_episode is None:
            raise RuntimeError('reset() must be called before step()')

        # the action answers the observation handed out last, i.e. the most
        # recent entry of user_hist / item_hist; store it as a plain float so
        # that array-valued actions sort correctly
        self.relevance_hist.append(
            float(np.asarray(action, dtype=float).reshape(-1)[0])
        )

        user_ids = self.current_episode['user_id'].values
        item_ids = self.current_episode['item_id'].values
        self.steps += 1
        done = self.steps >= len(user_ids)
        reward = 0

        if done:
            pred_df = pd.DataFrame({'user_id': self.user_hist, 'item_hist': self.item_hist,
                                    'relevance': self.relevance_hist})
            pred_top_k = pred_df.sort_values(['relevance'])[::-1][:self.top_k]
            # ranking metrics compare item ids, not relevance/rating values
            predicted_items = pred_top_k['item_hist'].values
            relevant_items = self.original['item_id'].values
            reward = ndcg(self.top_k, predicted_items, relevant_items)
            mape_ = mape(self.top_k, predicted_items, relevant_items)
            exp.log({"episode": self.total_episodes, "NDCG": reward, "MAP": mape_})
            ob = (user_ids[-1], item_ids[-1])
        else:
            # hand out the NEXT logged interaction and remember it so that the
            # next action is attributed to it
            ob = (user_ids[self.steps], item_ids[self.steps])
            self.user_hist.append(ob[0])
            self.item_hist.append(ob[1])

        return np.asarray(ob), reward, done, {}

    def reset(self):
        """Switch to the next user (cyclically) and return the first observation."""
        self.user_hist = []
        self.item_hist = []
        self.relevance_hist = []
        self.total_episodes += 1
        self.episode_num = (self.episode_num + 1) % len(self.episodes)
        self.steps = 0
        mask = self.log_data['user_id'] == self.episodes[self.episode_num]
        self.current_episode = self.log_data[mask]

        first_user = self.current_episode['user_id'].values[0]
        first_item = self.current_episode['item_id'].values[0]
        self.user_hist.append(first_user)
        self.item_hist.append(first_item)
        # ground truth: the user's top_k best-rated logged items
        self.original = original_for_user(self.log_data, first_user, k = self.top_k)
        return np.asarray((first_user, first_item))



[34m[1mwandb[0m: Currently logged in as: [33mbabycar27[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Train

In [9]:
# Build the training MDPDataset and keep the training rows it was made from
# (they are fed to the replay environment in the next cell).
train_dataset,user_logs_train = _prepare_data(ds.ratings)

In [10]:
# NOTE: the environment replays the *training* rows returned by _prepare_data,
# so the scorer below is computed on the same users the model is fitted on.
# The second argument is top_k.
env = FakeRecomenderEnv(user_logs_train, 10)

from d3rlpy.metrics.scorer import evaluate_on_environment
# Wrap the environment into a scorer; it is passed to fit(scorers=...) below.
evaluate_scorer = evaluate_on_environment(env)

In [11]:
# Sanity check: start an episode and show its first (user_id, item_id) observation.
env.reset()

array([   2, 1198], dtype=int32)

In [12]:
# FIXME: `SDAC` is never imported or defined in this notebook, so this cell
# raises NameError (see the recorded output). Import it from the module that
# provides it before running - TODO confirm the import path.
sdac = SDAC(use_gpu=True)

NameError: name 'SDAC' is not defined

In [13]:
# Fit on the offline dataset for n_epochs=3, passing the environment scorer
# under the 'environment' key.
# NOTE: eval_episodes is the training dataset itself, not a held-out split.
# NOTE: the recorded output is a NameError because `sdac` was never created
# (the previous cell failed).
sdac.fit(train_dataset,
        eval_episodes=train_dataset,
        n_epochs=3,
#         n_steps = 100000,
#         n_steps_per_epoch=3000,
        scorers={'environment': evaluate_scorer})

NameError: name 'sdac' is not defined