In [106]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import itertools
import pandas as pd
import numpy as np
import random
import csv
import time

In [108]:
def read_file(data_path):
    ''' Load data from train.csv or test.csv. '''

    data = pd.read_csv(data_path, sep=';')
    for col in ['state', 'n_state', 'action_reward']:
        data[col] = [np.array([[np.int(k) for k in ee.split('&')] for ee in e.split('|')]) for e in data[col]]
    for col in ['state', 'n_state']:
        data[col] = [np.array([e[0] for e in l]) for l in data[col]]

    data['action'] = [[e[0] for e in l] for l in data['action_reward']]
    data['reward'] = [tuple(e[1] for e in l) for l in data['action_reward']]
    data.drop(columns=['action_reward'], inplace=True)

    return data

def read_embeddings(embeddings_path):
    ''' Load embeddings (a vector for each item). '''

    embeddings = pd.read_csv(embeddings_path, sep=';')

    return np.array([[np.float64(k) for k in e.split('|')]
                   for e in embeddings['vectors']])

In [109]:
data = read_file('train.csv')
# data.head()

In [110]:
class Embeddings:
    def __init__(self, item_embeddings):
        self.item_embeddings = item_embeddings

    def size(self):
        return self.item_embeddings.shape[1]

    def get_embedding_vector(self):
        return self.item_embeddings

    def get_embedding(self, item_index):
        return self.item_embeddings[item_index]

    def embed(self, item_list):
        return np.array([self.get_embedding(item) for item in item_list])

In [111]:
embeddings = Embeddings(read_embeddings('embeddings.csv'))

In [288]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import itertools
import pandas as pd
import numpy as np
import random
import csv
import time

class Fairrec(gym.Env):
    def __init__(self, data, embeddings, alpha, gamma, fixed_length, state_size, trajectory_length):

        self.embedding_len = embeddings.size()
        self.embeddings = embeddings
        self.embedded_data = pd.DataFrame()
        self.embedded_data['state'] = [row['state'] for _, row in data.iterrows()]
        self.embedded_data['action'] = [row['action'] for _, row in data.iterrows()]
        self.embedded_data['reward'] = data['reward']
        self.embedded_data['state_embed'] = [np.array([embeddings.get_embedding(item_id) 
			for item_id in row['state']]) for _, row in data.iterrows()]
        self.embedded_data['action_embed'] = [np.array([embeddings.get_embedding(item_id) 
			for item_id in row['action']]) for _, row in data.iterrows()]

        self.alpha = alpha # α (alpha) in Equation (1)
        self.gamma = gamma # Γ (Gamma) in Equation (4)
        self.fixed_length = True
        self.current_state = self.reset()
        # print(self.current_state)
        self.groups = self.get_groups()

        self.action_space = spaces.Box(low=-10, high=10, shape=(self.embedding_len,))
        self.observation_space = spaces.Box(low=-10, high=10, shape=(state_size*self.embedding_len,))
        self.counter = 1
        self.trajectory_length = trajectory_length
        self.done = False

        print('Fair-Rec Environment initialized')

    # 	def set_observation_space(self, state_size):
    # 		self.observation_space = spaces.Box(low=-1, high=1, shape=(state_size*fixed_length,))

    # 	def set_action_space(self, action_size):
    # 		self.action_space = spaces.Box(low=-1, high=1, shape=(action_size*fixed_length,))

    def reset(self):
        self.init_state = self.embedded_data['state'].sample(1).values[0]
        return self.init_state

    def step(self, actions):
        '''
        Compute reward and update state.
        Args:
          actions: embedded chosen items.
        Returns:
          cumulated_reward: overall reward.
          current_state: updated state.
        '''
        self.counter += 1
        
        current_state_embeds = np.array(self.embeddings.embed(self.current_state))
        action_embeds = np.array(self.embeddings.embed(actions))

        # '18: Compute overall reward r_t according to Equation (4)'
        simulated_rewards, cumulated_reward = self.simulate_rewards(current_state_embeds.reshape((1, -1)), action_embeds.reshape((1, -1)))

        # '11: Set s_t+1 = s_t' <=> self.current_state = self.current_state

        for k in range(len(simulated_rewards)): # '12: for k = 1, K do'
            if simulated_rewards[k] > 0: # '13: if r_t^k > 0 then'
                # print(simulated_rewards[k])
            # '14: Add a_t^k to the end of s_t+1'
                self.current_state = np.append(self.current_state, [actions[k]], axis=0)
                if self.fixed_length: # '15: Remove the first item of s_t+1'
                    self.current_state = np.delete(self.current_state, 0, axis=0)

        if self.counter > self.trajectory_length:
        	self.done = True
        
        return cumulated_reward, self.current_state, self.done

    def get_groups(self):
        ''' Calculate average state/action value for each group. Equation (3). '''

        groups = []
        for rewards, group in self.embedded_data.groupby(['reward']):
            size = group.shape[0]
            states = np.array(list(group['state_embed'].values))
            actions = np.array(list(group['action_embed'].values))
            groups.append({
            'size': size, # N_x in article
            'rewards': rewards, # U_x in article (combination of rewards)
            'average state': (np.sum(states / np.linalg.norm(states, 2, axis=1)[:, np.newaxis], axis=0) / size).reshape((1, -1)), # s_x^-
            'average action': (np.sum(actions / np.linalg.norm(actions, 2, axis=1)[:, np.newaxis], axis=0) / size).reshape((1, -1)) # a_x^-
            })
        return groups

    def simulate_rewards(self, current_state, chosen_actions, reward_type='grouped cosine'):
        '''
        Calculate simulated rewards.
        Args:
          current_state: history, list of embedded items.
          chosen_actions: embedded chosen items.
          reward_type: from ['normal', 'grouped average', 'grouped cosine'].
        Returns:
          returned_rewards: most probable rewards.
          cumulated_reward: probability weighted rewards.
        '''
        
        # Equation (1)
        def cosine_state_action(s_t, a_t, s_i, a_i):
            cosine_state = np.dot(s_t, s_i.T) / (np.linalg.norm(s_t, 2) * np.linalg.norm(s_i, 2))
            cosine_action = np.dot(a_t, a_i.T) / (np.linalg.norm(a_t, 2) * np.linalg.norm(a_i, 2))
            return (self.alpha * cosine_state + (1 - self.alpha) * cosine_action).reshape((1,))

        if reward_type == 'normal':
            # Calculate simulated reward in normal way: Equation (2)
            probabilities = [cosine_state_action(current_state, chosen_actions, row['state'], row['action'])
            for _, row in self.embedded_data.iterrows()]
        elif reward_type == 'grouped average':
            # Calculate simulated reward by grouped average: Equation (3)
            probabilities = np.array([g['size'] for g in self.groups]) *\
            [(self.alpha * (np.dot(current_state, g['average state'].T) / np.linalg.norm(current_state, 2))\
            + (1 - self.alpha) * (np.dot(chosen_actions, g['average action'].T) / np.linalg.norm(chosen_actions, 2)))
            for g in self.groups]
        elif reward_type == 'grouped cosine':
            # Calculate simulated reward by grouped cosine: Equations (1) and (3)
            probabilities = [cosine_state_action(current_state, chosen_actions, g['average state'], g['average action'])
            for g in self.groups]

        # Normalize (sum to 1)
        probabilities = np.array(probabilities) / sum(probabilities)

        # Get most probable rewards
        if reward_type == 'normal':
            returned_rewards = self.embedded_data.iloc[np.argmax(probabilities)]['reward']
        elif reward_type in ['grouped average', 'grouped cosine']:
            returned_rewards = self.groups[np.argmax(probabilities)]['rewards']

        # Equation (4)
        def overall_reward(rewards, gamma):
            return np.sum([gamma**k * reward for k, reward in enumerate(rewards)])

        if reward_type in ['normal', 'grouped average']:
            # Get cumulated reward: Equation (4)
            cumulated_reward = overall_reward(returned_rewards, self.gamma)
        elif reward_type == 'grouped cosine':
            # Get probability weighted cumulated reward
            cumulated_reward = np.sum([p * overall_reward(g['rewards'], self.gamma)
            for p, g in zip(probabilities, self.groups)])

        return returned_rewards, cumulated_reward

In [289]:
env = Fairrec(data=data, embeddings=embeddings, alpha=0.5, gamma=0.9, fixed_length=False, state_size=12, trajectory_length=5)

Fair-Rec Environment initialized


In [295]:
env.step([1,2,3,1223])

(10.850312981840485,
 array([   1,    2,    3, 1223,    1,    2,    3, 1223,    1,    2,    3,
        1223]),
 True)

In [222]:
# env.preprocess(data, embeddings, 0.5, 0.9, True)
# env = gym.make('fairrec-v0',data=data, embeddings=embeddings, alpha=0.5, gamma=0.9, fixed_length=True)

In [223]:
current_state = env.reset()

In [224]:
current_state

array([ 249,  455,  301,  295,  117,  286, 1013,  274,  291,  748,  282,
        815])

In [225]:
actions = embeddings.get_embedding([1,2,3,4])
# actions = embeddings.embed([1,2,3,4])
# actions

In [297]:
env.embedded_data['state'][0]

array([1121,  686,  135,  492, 1203,  481,  216,  524,   23,  705,  217,
       1050])

In [35]:
import torch

In [85]:
item_embeddings = torch.from_numpy(embeddings.get_embedding_vector()).expand(4,-1,100)
actions = torch.from_numpy(embeddings.get_embedding([1,2,3,4])).view(4,1,100)#.view(4,100,1)

In [86]:
actions.shape,item_embeddings.shape

(torch.Size([4, 1, 100]), torch.Size([4, 1682, 100]))

In [87]:
res = torch.mul(item_embeddings,actions)
res.shape

torch.Size([4, 1682, 100])

In [88]:
item_embeddings = torch.from_numpy(embeddings.get_embedding_vector())
actions = torch.from_numpy(embeddings.get_embedding([1,2,3,4])).view(100,-1)
print(actions.shape,item_embeddings.shape)
res = torch.mm(item_embeddings,actions)
res.shape

torch.Size([100, 4]) torch.Size([1682, 100])


torch.Size([1682, 4])

In [298]:
M = torch.Tensor([[[1, 2, 3], [4, 5, 6]],[[10, 20, 30], [40, 50, 60]]])
M.expand(4,2,2,3)

tensor([[[[ 1.,  2.,  3.],
          [ 4.,  5.,  6.]],

         [[10., 20., 30.],
          [40., 50., 60.]]],


        [[[ 1.,  2.,  3.],
          [ 4.,  5.,  6.]],

         [[10., 20., 30.],
          [40., 50., 60.]]],


        [[[ 1.,  2.,  3.],
          [ 4.,  5.,  6.]],

         [[10., 20., 30.],
          [40., 50., 60.]]],


        [[[ 1.,  2.,  3.],
          [ 4.,  5.,  6.]],

         [[10., 20., 30.],
          [40., 50., 60.]]]])

In [318]:
x = torch.tensor([[1, 2, 3],[4, 5, 6]])
x[:,0:2]

tensor([[1, 2],
        [4, 5]])

In [96]:
torch.transpose(M,2,1)

tensor([[[ 1.,  4.],
         [ 2.,  5.],
         [ 3.,  6.]],

        [[10., 40.],
         [20., 50.],
         [30., 60.]]])

In [315]:
for i in range(0,400,100):
    print(i)

0
100
200
300


In [98]:
values, indices = torch.max(res, 0)

In [99]:
indices

tensor([ 287, 1293,  741,  160])

In [102]:
torch.topk(res, k=4, dim=0)

torch.return_types.topk(
values=tensor([[ 0.6172, -0.0315,  0.2512,  0.1234],
        [ 0.5960, -0.0460,  0.2373,  0.1164],
        [ 0.5311, -0.0733,  0.2235,  0.1156],
        [ 0.5309, -0.0995,  0.2215,  0.1144]], dtype=torch.float64),
indices=tensor([[ 287, 1293,  741,  160],
        [ 244, 1463,  463, 1581],
        [1366,  568, 1292, 1627],
        [1296,  316,  475, 1490]]))

In [264]:
class Fairrec(gym.Env):
    def __init__(self, data, embeddings, alpha, gamma, fixed_length):
        self.action_space = spaces.Box(low=-1, high=1, shape=(100,))
        self.observation_space = spaces.Box(low=-1, high=1, shape=(12*100,))

        self.embeddings = embeddings
        self.embedded_data = pd.DataFrame()
        self.embedded_data['state'] = [np.array([embeddings.get_embedding(item_id) 
            for item_id in row['state']]) for _, row in data.iterrows()]
        self.embedded_data['action'] = [np.array([embeddings.get_embedding(item_id) 
            for item_id in row['action']]) for _, row in data.iterrows()]
        self.embedded_data['reward'] = data['reward']

        self.alpha = alpha # α (alpha) in Equation (1)
        self.gamma = gamma # Γ (Gamma) in Equation (4)
        self.fixed_length = True
        self.current_state = self.reset()
        self.groups = self.get_groups()

        print('Fair-Rec Environment initialized')

    def set_observation_space(self, state_size):
        self.observation_space = spaces.Box(low=-1, high=1, shape=(state_size*100,))

    def set_action_space(self, action_size):
        self.action_space = spaces.Box(low=-1, high=1, shape=(action_size*100,))

    def reset(self):
        self.init_state = self.embedded_data['state'].sample(1).values[0]
        return self.init_state

    def step(self, actions):
        '''
        Compute reward and update state.
        Args:
          actions: embedded chosen items.
        Returns:
          cumulated_reward: overall reward.
          current_state: updated state.
        '''

        # '18: Compute overall reward r_t according to Equation (4)'
        simulated_rewards, cumulated_reward = self.simulate_rewards(self.current_state.reshape((1, -1)), actions.reshape((1, -1)))

        # '11: Set s_t+1 = s_t' <=> self.current_state = self.current_state

        for k in range(len(simulated_rewards)): # '12: for k = 1, K do'
            if simulated_rewards[k] > 0: # '13: if r_t^k > 0 then'
                # '14: Add a_t^k to the end of s_t+1'
                print(simulated_rewards[k])
                self.current_state = np.append(self.current_state, [actions[k]], axis=0)
                if self.fixed_length: # '15: Remove the first item of s_t+1'
                    self.current_state = np.delete(self.current_state, 0, axis=0)

        return cumulated_reward, self.current_state

    def get_groups(self):
        ''' Calculate average state/action value for each group. Equation (3). '''

        groups = []
        for rewards, group in self.embedded_data.groupby(['reward']):
            size = group.shape[0]
            states = np.array(list(group['state'].values))
            actions = np.array(list(group['action'].values))
            groups.append({
            'size': size, # N_x in article
            'rewards': rewards, # U_x in article (combination of rewards)
            'average state': (np.sum(states / np.linalg.norm(states, 2, axis=1)[:, np.newaxis], axis=0) / size).reshape((1, -1)), # s_x^-
            'average action': (np.sum(actions / np.linalg.norm(actions, 2, axis=1)[:, np.newaxis], axis=0) / size).reshape((1, -1)) # a_x^-
            })
        return groups

    def simulate_rewards(self, current_state, chosen_actions, reward_type='grouped cosine'):
        '''
        Calculate simulated rewards.
        Args:
          current_state: history, list of embedded items.
          chosen_actions: embedded chosen items.
          reward_type: from ['normal', 'grouped average', 'grouped cosine'].
        Returns:
          returned_rewards: most probable rewards.
          cumulated_reward: probability weighted rewards.
        '''

        # Equation (1)
        def cosine_state_action(s_t, a_t, s_i, a_i):
            cosine_state = np.dot(s_t, s_i.T) / (np.linalg.norm(s_t, 2) * np.linalg.norm(s_i, 2))
            cosine_action = np.dot(a_t, a_i.T) / (np.linalg.norm(a_t, 2) * np.linalg.norm(a_i, 2))
            return (self.alpha * cosine_state + (1 - self.alpha) * cosine_action).reshape((1,))

        if reward_type == 'normal':
            # Calculate simulated reward in normal way: Equation (2)
            probabilities = [cosine_state_action(current_state, chosen_actions, row['state'], row['action'])
            for _, row in self.embedded_data.iterrows()]
        elif reward_type == 'grouped average':
            # Calculate simulated reward by grouped average: Equation (3)
            probabilities = np.array([g['size'] for g in self.groups]) *\
            [(self.alpha * (np.dot(current_state, g['average state'].T) / np.linalg.norm(current_state, 2))\
            + (1 - self.alpha) * (np.dot(chosen_actions, g['average action'].T) / np.linalg.norm(chosen_actions, 2)))
            for g in self.groups]
        elif reward_type == 'grouped cosine':
            # Calculate simulated reward by grouped cosine: Equations (1) and (3)
            probabilities = [cosine_state_action(current_state, chosen_actions, g['average state'], g['average action'])
            for g in self.groups]

        # Normalize (sum to 1)
        probabilities = np.array(probabilities) / sum(probabilities)

        # Get most probable rewards
        if reward_type == 'normal':
            returned_rewards = self.embedded_data.iloc[np.argmax(probabilities)]['reward']
        elif reward_type in ['grouped average', 'grouped cosine']:
            returned_rewards = self.groups[np.argmax(probabilities)]['rewards']

        # Equation (4)
        def overall_reward(rewards, gamma):
            return np.sum([gamma**k * reward for k, reward in enumerate(rewards)])

        if reward_type in ['normal', 'grouped average']:
            # Get cumulated reward: Equation (4)
            cumulated_reward = overall_reward(returned_rewards, self.gamma)
        elif reward_type == 'grouped cosine':
            # Get probability weighted cumulated reward
            cumulated_reward = np.sum([p * overall_reward(g['rewards'], self.gamma)
            for p, g in zip(probabilities, self.groups)])

        return returned_rewards, cumulated_reward


Fair-Rec Environment initialized


In [268]:
env = Fairrec(data=data, embeddings=embeddings, alpha=0.5, gamma=0.9, fixed_length=True)
actions = embeddings.get_embedding([1,2,3,4])
env.step(actions)

Fair-Rec Environment initialized
5
5
5
5


(10.850470800900226,
 array([[-0.18108723,  0.04148261, -0.11469268, ...,  0.146907  ,
         -0.10624914,  0.15694171],
        [-0.24884087,  0.1280875 , -0.10616239, ...,  0.1545012 ,
         -0.11269389,  0.18657199],
        [-0.15192594,  0.00835354, -0.18226244, ...,  0.20323972,
         -0.12260215,  0.15912019],
        ...,
        [-0.20948793,  0.05360457, -0.10730472, ...,  0.07926252,
         -0.07703452,  0.03268938],
        [-0.20113458,  0.06713491, -0.14161727, ...,  0.20752539,
         -0.12113315,  0.14045045],
        [-0.15918495,  0.0930948 , -0.1697659 , ...,  0.13854662,
         -0.15439157,  0.06976631]]))

In [273]:
len(data['state'][0]) - len(data['n_state'][0])

-4

In [323]:
data['reward'][0]

(2, 4, 3, 4)