# Environment 3

In [1]:
import pandas as pd
import numpy as np
import requests
import random
from time import sleep
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Lambda, Input, Flatten, Dropout, Embedding, Concatenate

Using TensorFlow backend.


## Preparation

In [2]:
# Connection settings for the remote recommendation environment.
user_id = '5PZFMN4UIV66SBO6A1KE'
base_url = 'http://35.180.178.243'

# /reset returns the logged history and the first state;
# /predict receives a recommendation and returns its reward and the next state.
url_reset = '{}/reset'.format(base_url)
url_predict = '{}/predict'.format(base_url)

# Query-string parameters sent with the requests.
params = dict(user_id=user_id)

In [3]:
# Call the /reset endpoint and decode its JSON payload.
r = requests.get(url=url_reset, params=params)
data = r.json()

# Catalogue sizes (later used to size the embedding tables).
nb_items, nb_users = data['nb_items'], data['nb_users']

# Logged interactions returned by the server.
action_history, rewards_history, state_history = (
    data['action_history'],
    data['rewards_history'],
    data['state_history'],
)

# Candidates for the first live recommendation.
next_state = data['next_state']

In [4]:
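# The history returned by /reset holds 200 logged steps.
# action_history[t]  : position (index into the state) of the item that was recommended
# rewards_history[t] : 0, or the price of the item
# state_history[t]   : the list of candidates at step t; for candidate j of a state:
# state[j][0] = user
# state[j][1] = item
# state[j][2] = price
# state[j][3:] = variables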

# data.keys()
# len(action_history), len(rewards_history), len(state_history)

## Model 0

In [5]:
# baseline, random
def model_zero(state):
    """Baseline agent: return a uniformly random position in ``state``."""
    return np.random.randint(len(state))

## Model 1

In [6]:
# most expensive
def model_one(state):
    """Greedy agent: return the position of the most expensive candidate.

    The price is read from index 2 of each candidate. Ties go to the earliest
    candidate, and position 0 is returned when no price is above zero
    (including when ``state`` is empty).
    """
    best_position, best_price = 0, 0
    for position, candidate in enumerate(state):
        candidate_price = candidate[2]
        if candidate_price > best_price:
            best_position, best_price = position, candidate_price
    return best_position

## Model 2
Siamese Network

### Define triplet loss

In [7]:
def identity_loss(y_true, y_pred):
    """Loss equal to the mean of the model output.

    ``y_true`` only enters multiplied by zero, so for finite values it
    contributes nothing; minimising this loss minimises ``y_pred`` itself.
    """
    passthrough = y_pred + 0 * y_true
    return tf.reduce_mean(passthrough)

def triplet_loss(inputs, alpha=0.2):
    """Hinge on the gap between a negative and a positive score.

    ``inputs`` is a pair ``(positive_score, negative_score)``. The result is
    zero once the positive score exceeds the negative one by at least
    ``alpha``, and grows linearly otherwise.
    """
    positive_score, negative_score = inputs
    margin_violation = negative_score - positive_score + alpha
    return tf.maximum(margin_violation, 0)

### Agent model, siamese network

In [47]:
class Model_two:
    """Siamese ranking agent trained with a triplet loss.

    Two Keras models share one set of weights:

    * ``pred_model`` maps (user id, item id, 5 extra variables) to a single
      sigmoid score.
    * ``triplet_model`` runs ``pred_model`` on a positive and on a negative
      example and outputs ``triplet_loss`` of the two scores. It is compiled
      with ``identity_loss`` so that training minimises that output directly.
    """

    def __init__(self, nb_users, nb_items):
        """Build and compile the networks.

        Args:
            nb_users: number of users; the user embedding has ``nb_users + 1`` rows.
            nb_items: number of items; the item embedding has ``nb_items + 1`` rows.
        """
        self.nb_users = nb_users
        self.nb_items = nb_items

        # Epsilon-greedy schedule. This class's own predict() does not read it.
        self.epsilon = 0.99  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.95

        self.pred_model, self.triplet_model = self._build_model()
        self.triplet_model.compile(loss=identity_loss, optimizer='adam')

    def _build_model(self):
        """Create the scoring network and its siamese (triplet) wrapper.

        Returns:
            ``(pred_model, triplet_model)``; both use the same layers/weights.
        """
        user_id_input = Input(shape=[1], name='user')
        item_id_input = Input(shape=[1], name='item')
        variable_input = Input(shape=[5], name='variable')

        user_embedding = Embedding(output_dim=10, input_dim=self.nb_users+1,
                                   input_length=1, name='user_embedding')(user_id_input)

        item_embedding = Embedding(output_dim=10, input_dim=self.nb_items+1,
                                   input_length=1, name='item_embedding')(item_id_input)

        user_vecs = Flatten()(user_embedding)
        item_vecs = Flatten()(item_embedding)

        # Both embeddings and the raw variables feed one dense stack.
        cc = Concatenate()([user_vecs, item_vecs, variable_input])

        y1 = Dense(50, activation='relu')(cc)
        y2 = Dense(30, activation='relu')(y1)
        d = Dropout(0.5)(y2)
        y3 = Dense(1, activation='sigmoid')(d)

        model = Model(inputs=[user_id_input, item_id_input, variable_input], outputs=y3)

        # siamese network: the same `model` is applied to both branches
        input1_user = Input(shape=[1])  # 1: positive
        input2_user = Input(shape=[1])  # 2: negative

        input1_item = Input(shape=[1])
        input2_item = Input(shape=[1])

        input1_variable = Input(shape=[5])
        input2_variable = Input(shape=[5])

        prob1 = model([input1_user, input1_item, input1_variable])
        prob2 = model([input2_user, input2_item, input2_variable])

        tri_loss = Lambda(triplet_loss, output_shape=(1,))([prob1, prob2])

        triplet_model = Model(inputs=[input1_user, input1_item, input1_variable,
                                      input2_user, input2_item, input2_variable],
                              outputs=tri_loss)

        return model, triplet_model

    def predict(self, state):
        """Return the position of the candidate with the highest score.

        Args:
            state: non-empty sequence of candidates; for each candidate,
                index 0 is the user id, index 1 the item id and indices 3
                onwards the extra variables.

        Returns:
            Index into ``state`` of the best-scoring candidate.

        Raises:
            ValueError: if ``state`` is empty.
        """
        if len(state) == 0:
            raise ValueError('state must contain at least one candidate item')

        # Score the candidates of the state that was passed in (not a notebook
        # global), all in one batched call instead of one call per candidate.
        users = np.array([[candidate[0]] for candidate in state])
        items = np.array([[candidate[1]] for candidate in state])
        variables = np.array([candidate[3:] for candidate in state])

        scores = self.pred_model.predict([users, items, variables])
        return np.argmax(np.asarray(scores).reshape(-1))

### Training

In [48]:
def reset():
    """Call the /reset endpoint and split its logged history by outcome.

    The request uses the module-level ``url_reset`` and ``params``. Every
    logged step contributes the candidate that was actually recommended
    (``state[action]``): its user id (index 0), item id (index 1) and extra
    variables (index 3 onwards). A step counts as positive when its reward is
    greater than zero and as negative otherwise.

    Returns:
        ``(next_state, pos_user, pos_item, pos_variable,
        neg_user, neg_item, neg_variable)`` where the six lists are aligned
        per class and ``next_state`` is the first state to act on.
    """
    r = requests.get(url=url_reset, params=params)
    data = r.json()

    action_history = data['action_history']
    rewards_history = data['rewards_history']
    state_history = data['state_history']

    next_state = data['next_state']

    # split positive and negative examples
    pos_user, pos_item, pos_variable = [], [], []
    neg_user, neg_item, neg_variable = [], [], []

    # Walk over whatever history the server returned rather than assuming a
    # fixed number of steps.
    for action, reward, state in zip(action_history, rewards_history, state_history):
        shown = state[action]
        if reward > 0:
            users, items, variables = pos_user, pos_item, pos_variable
        else:
            users, items, variables = neg_user, neg_item, neg_variable
        users.append(shown[0])
        items.append(shown[1])
        variables.append(shown[3:])

    return next_state, pos_user, pos_item, pos_variable, neg_user, neg_item, neg_variable


def sample_triplets(pos_user, pos_item, pos_variable, neg_user, neg_item, neg_variable):
    """Pair every negative example with a randomly drawn positive example.

    One positive index is drawn with ``np.random.randint`` per negative, in
    order, so the positive lists in the result have the length of the
    negative lists.

    Returns:
        ``(inputs, fake_y)`` where ``inputs`` is
        ``[pos users, pos items, pos variables, neg_user, neg_item, neg_variable]``
        and ``fake_y`` is an array of ones with one entry per negative (a
        placeholder target).
    """
    n_positives = len(pos_user)
    n_negatives = len(neg_user)

    # One independent draw per negative example.
    picks = [np.random.randint(n_positives) for _ in range(n_negatives)]

    paired_users = [pos_user[k] for k in picks]
    paired_items = [pos_item[k] for k in picks]
    paired_variables = [pos_variable[k] for k in picks]

    placeholder_targets = np.ones(n_negatives)
    inputs = [paired_users, paired_items, paired_variables,
              neg_user, neg_item, neg_variable]
    return inputs, placeholder_targets

In [49]:
# Fetch the history again and split it into positive / negative examples.
(next_state,
 pos_user, pos_item, pos_variable,
 neg_user, neg_item, neg_variable) = reset()

In [50]:
# Build the agent, then run `epochs` single-epoch fits. A new random positive
# partner is drawn for every negative example before each fit.
model_two = Model_two(nb_users, nb_items)

epochs = 50
for i in range(epochs):
    triplet_inputs, fake_y = sample_triplets(
        pos_user, pos_item, pos_variable,
        neg_user, neg_item, neg_variable,
    )

    model_two.triplet_model.fit(
        triplet_inputs,
        fake_y,
        batch_size=32,
        shuffle=True,
        validation_split=0.1,
    )

Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 119 samples, validate on 14 samples
Epoch 1/1
Train on 1

## Model 3
Change the prediction rule: recommend the item with the highest expected reward (predicted purchase probability × price).  
Add exploration.  
Add online training.

In [40]:
class Model_three(Model_two):
    """Model_two with an expected-reward rule, exploration and online updates.

    * ``predict`` multiplies each candidate's score by its price and picks the
      largest product; with probability ``epsilon`` it instead recommends the
      most expensive candidate (``model_one``).
    * ``retrain`` nudges the score of the last recommended candidate up or
      down depending on the reward that recommendation obtained.
    """

    def __init__(self, nb_users, nb_items):
        """Build the networks and initialise the online-learning state."""
        Model_two.__init__(self, nb_users, nb_items)
        # retrain() calls fit() on pred_model directly, which Keras only
        # allows on a compiled model. The base class compiles triplet_model only.
        self.pred_model.compile(loss='mse', optimizer='adam')
        self.last_state = None
        self.last_action = None

    def predict(self, state):
        """Choose a position in ``state`` and remember it for ``retrain``.

        Args:
            state: non-empty sequence of candidates; index 0 is the user id,
                index 1 the item id, index 2 the price and indices 3 onwards
                the extra variables.

        Returns:
            Index into ``state`` of the recommended candidate.

        Raises:
            ValueError: if ``state`` is empty.
        """
        if len(state) == 0:
            raise ValueError('state must contain at least one candidate item')

        self.last_state = state
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        if np.random.rand() <= self.epsilon:
            # exploration: fall back to the most expensive candidate of the
            # state that was passed in
            action = model_one(state)
        else:
            # exploitation: maximise score * price over the passed-in state,
            # scoring all candidates in one batched call
            users = np.array([[candidate[0]] for candidate in state])
            items = np.array([[candidate[1]] for candidate in state])
            variables = np.array([candidate[3:] for candidate in state])
            price = np.array([candidate[2] for candidate in state], dtype=float)

            decision = np.asarray(
                self.pred_model.predict([users, items, variables])).reshape(-1)
            action = np.argmax(decision * price)

        self.last_action = action
        return action

    def retrain(self, reward):
        """Nudge the score of the last recommendation towards its outcome.

        Args:
            reward: reward obtained by the most recent recommendation; a value
                above zero raises the target score by 2 %, anything else
                lowers it by 2 %.

        Raises:
            RuntimeError: if called before any ``predict``.
        """
        if self.last_state is None or self.last_action is None:
            raise RuntimeError('retrain() called before predict()')

        chosen = self.last_state[self.last_action]
        user_id = np.array([chosen[0]])
        item_id = np.array([chosen[1]])
        variable = np.array([chosen[3:]])

        prob = self.pred_model.predict([user_id, item_id, variable])
        if reward > 0:
            new_prob = prob * 1.02
        else:
            new_prob = prob * 0.98
        # The output unit is a sigmoid, so keep the target inside [0, 1].
        new_prob = np.clip(new_prob, 0.0, 1.0)
        self.pred_model.fit([user_id, item_id, variable], new_prob, verbose=0)

In [44]:
# Fetch the history again and split it into positive / negative examples.
(next_state,
 pos_user, pos_item, pos_variable,
 neg_user, neg_item, neg_variable) = reset()

In [45]:
# Build the agent, then run `epochs` single-epoch fits. A new random positive
# partner is drawn for every negative example before each fit.
model_three = Model_three(nb_users, nb_items)

epochs = 50
for i in range(epochs):
    triplet_inputs, fake_y = sample_triplets(
        pos_user, pos_item, pos_variable,
        neg_user, neg_item, neg_variable,
    )

    model_three.triplet_model.fit(
        triplet_inputs,
        fake_y,
        batch_size=32,
        shuffle=True,
        validation_split=0.1,
    )

Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 129 samples, validate on 15 samples
Epoch 1/1
Train on 1

## Evaluation

In [9]:
# Evaluate the random baseline: play `episodes` rounds against the server and
# report how often a reward was earned and the mean reward per round.
episodes = 1000
reward = rate = 0  # running reward total / number of rewarded rounds
params = dict(user_id=user_id, recommended_item=0)

for i in range(episodes):
    sleep(0.02)  # short pause between requests

    # pick the position to recommend among the current candidates
    prediction = model_zero(next_state)

    # submit the recommendation and decode the server's answer
    params.update(recommended_item=prediction)
    r = requests.get(url=url_predict, params=params)
    d = r.json()

    # a positive reward counts as one conversion and adds to the total
    if d['reward'] > 0:
        reward += d['reward']
        rate += 1

    # candidates for the following round
    next_state = d['state']

print('Conversion rate is:', rate/episodes)
print('Average reward is:', reward/episodes)

Conversion rate is: 0.286
Average reward is: 141.67441928488992


In [11]:
# Evaluate the most-expensive-item rule: play `episodes` rounds against the
# server and report the conversion rate and the mean reward per round.
episodes = 1000
reward = rate = 0  # running reward total / number of rewarded rounds
params = dict(user_id=user_id, recommended_item=0)

for i in range(episodes):
    sleep(0.02)  # short pause between requests

    # pick the position to recommend among the current candidates
    prediction = model_one(next_state)

    # submit the recommendation and decode the server's answer
    params.update(recommended_item=prediction)
    r = requests.get(url=url_predict, params=params)
    d = r.json()

    # a positive reward counts as one conversion and adds to the total
    if d['reward'] > 0:
        reward += d['reward']
        rate += 1

    # candidates for the following round
    next_state = d['state']

print('Conversion rate is:', rate/episodes)
print('Average reward is:', reward/episodes)

Conversion rate is: 0.218
Average reward is: 201.12549890726584


In [19]:
# Evaluate the siamese agent: play `episodes` rounds against the server and
# report the conversion rate and the mean reward per round.
episodes = 1000
reward = rate = 0  # running reward total / number of rewarded rounds
params = dict(user_id=user_id, recommended_item=0)

for i in range(episodes):
    sleep(0.02)  # short pause between requests

    # pick the position to recommend among the current candidates
    prediction = model_two.predict(next_state)

    # submit the recommendation and decode the server's answer
    params.update(recommended_item=prediction)
    r = requests.get(url=url_predict, params=params)
    d = r.json()

    # a positive reward counts as one conversion and adds to the total
    if d['reward'] > 0:
        reward += d['reward']
        rate += 1

    # candidates for the following round
    next_state = d['state']

print('Conversion rate is:', rate/episodes)
print('Average reward is:', reward/episodes)

Conversion rate is: 0.365
Average reward is: 162.72465993892374


In [46]:
# Evaluate the expected-reward agent: play `episodes` rounds against the
# server and report the conversion rate and the mean reward per round.
episodes = 1000
reward = rate = 0  # running reward total / number of rewarded rounds
params = dict(user_id=user_id, recommended_item=0)

for i in range(episodes):
    sleep(0.02)  # short pause between requests

    # pick the position to recommend among the current candidates
    prediction = model_three.predict(next_state)

    # submit the recommendation and decode the server's answer
    params.update(recommended_item=prediction)
    r = requests.get(url=url_predict, params=params)
    d = r.json()

    # a positive reward counts as one conversion and adds to the total
    if d['reward'] > 0:
        reward += d['reward']
        rate += 1

    # Online training is switched off for this run. To enable it, call
    # model_three.retrain(d['reward']) here, i.e. with this step's reward and
    # not the running total kept in `reward`.

    # candidates for the following round
    next_state = d['state']

print('Conversion rate is:', rate/episodes)
print('Average reward is:', reward/episodes)

Conversion rate is: 0.296
Average reward is: 267.35459797169693
