In [None]:

from __future__ import print_function

import numpy as np
from importlib import reload
from keras import optimizers
from keras import backend as be
from keras.models import Sequential, load_model
from keras.layers import RNN, GRU, LSTM, Dense, Activation, Bidirectional, Masking, Embedding, Dropout,Input

from os import environ
from keras.callbacks import Callback

import numpy as np
import pandas as pd
import random

In [None]:
# Hyperparameters shared by the data pipeline and by both networks defined below.
n_movie = 3706  # width of every one-hot item vector and of the softmax output layer
hidden_units = 20  # number of units in each LSTM layer
batch_size = 16  # examples per mini-batch; also fixed into the models' batch_input_shape
max_length = 40  # maximum number of time steps kept per input sequence
lr = 0.3  # learning rate handed to the Adagrad optimizer

In [None]:
# Read the train / validation / test splits, in that order, as header-less CSV.
# Downstream code only reads column 0 of each frame and splits it on spaces,
# so each row is presumably one space-separated record -- verify against the files.
train_set_sequences, val_set_sequences, test_set_sequences = (
    pd.read_csv(split_path, header=None)
    for split_path in (
        "./train_set_sequences",
        "./val_set_sequences",
        "./test_set_sequences",
    )
)

In [None]:
def seq_generator(df):
    """Yield the values of column ``0`` of ``df`` forever, reshuffled on every pass.

    Each pass draws a fresh random permutation of the row positions and yields
    every row exactly once in that order before starting the next pass.

    Rows are addressed by position (``.iloc``). Looking values up with
    ``series[i]`` on a reindexed frame is label-based, which silently restores
    the original row order and fails outright when the index is not 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``0`` holds the records to stream.

    Yields
    ------
    object
        One entry of ``df[0]`` per iteration.

    Raises
    ------
    ValueError
        If ``df`` has no rows (the generator could otherwise never yield).
    """
    while True:
        column = df[0]
        n_rows = len(column)
        if n_rows == 0:
            raise ValueError("seq_generator needs at least one row; got an empty frame")
        for pos in np.random.permutation(n_rows):
            yield column.iloc[pos]

In [None]:
def target_selection(sequence, test=False,shuffle=False,bias = 0,n_targets=1):
    """Pick up to ``n_targets`` leading elements of ``sequence`` as targets.

    Parameters
    ----------
    sequence : list
        Candidate items, in order.
    test : bool
        When true, shuffling is disabled regardless of ``shuffle``.
    shuffle : bool
        When true (and ``test`` is false) the candidates are randomly reordered
        before the first ``n_targets`` are taken. The shuffle is applied to a
        copy, so the caller's ``sequence`` is never reordered.
    bias : int
        Accepted for interface compatibility; not used.
    n_targets : int
        Maximum number of targets to return.

    Returns
    -------
    list
        At most ``n_targets`` items; empty when ``sequence`` is empty.
    """
    candidates = sequence
    if shuffle and not test:
        # Shuffle a private copy: random.shuffle works in place and would
        # otherwise scramble the list owned by the caller.
        candidates = list(sequence)
        random.shuffle(candidates)
    # Slicing already clamps to the available length.
    return candidates[:n_targets]

In [None]:
def mini_batch_generator(seq_gen,batch_size=batch_size,max_length=max_length):
    """Group records from ``seq_gen`` into batches of (user, history, target) examples.

    Every record pulled from ``seq_gen`` is a space-separated string. Its first
    token is kept as the user identifier and the remaining tokens are grouped
    two by two into ``[first, second]`` pairs (a trailing unpaired token is
    dropped). For a record with ``n`` pairs, cut points ``l`` are sampled
    without replacement from ``2 .. n-1``; each cut point produces one example
    whose history is the (at most ``max_length``) pairs just before ``l`` and
    whose target comes from ``target_selection`` applied to the pairs from
    ``l`` onward.

    Records with two pairs or fewer have no valid cut point and are skipped.
    Without that guard a record with fewer than two pairs would ask
    ``random.sample`` for a negative number of items and raise ``ValueError``.
    If ``seq_gen`` only ever produces such short records, this generator never
    yields.

    Parameters
    ----------
    seq_gen : iterator of str
        Source of raw records.
    batch_size : int
        Number of examples in every yielded batch.
    max_length : int
        Maximum history length kept per example.

    Yields
    ------
    list
        ``batch_size`` entries of the form ``[user, history, target]``.
    """
    while True:
        sequences = []
        j = 0  # examples collected so far for the current batch
        while j < batch_size:
            g = next(seq_gen).split(" ")
            u = g[0]
            s = g[1:]
            sequence = [[s[2 * x], s[2 * x + 1]] for x in range(len(s) // 2)]

            if len(sequence) <= 2:
                # No cut point leaves both a 2-item history and a target.
                continue

            # Never sample more cut points than the batch still has room for.
            n_cuts = min(batch_size - j, len(sequence) - 2)
            seq_lengths = sorted(random.sample(range(2, len(sequence)), n_cuts))
            skipped_seq = 0

            for l in seq_lengths:
                target = target_selection(sequence[l:])
                if len(target) == 0:
                    skipped_seq += 1
                    continue
                start = max(0, l - max_length)  # histories cannot be longer than max_length
                sequences.append([u, sequence[start:l], target])
            j += len(seq_lengths) - skipped_seq
        yield sequences

In [None]:
def ohencode(x,n_movie):
    """Return a ``(1, n_movie)`` float array of zeros with a single 1 at column ``int(x)``.

    ``x`` may be anything ``int()`` accepts (for example a numeric string).
    An index outside the array raises ``IndexError``.
    """
    one_hot = np.zeros((1, n_movie))
    one_hot[0, int(x)] = 1
    return one_hot

In [None]:
def prepare_input(sequences,max_length=max_length,n_movies=n_movie,input_size=n_movie):
    """Turn a batch of (user, history, target) examples into one-hot arrays.

    Parameters
    ----------
    sequences : list
        Entries of the form ``[user_id, in_seq, target]``. ``in_seq`` and
        ``target`` are lists of pairs whose first element is an item index
        (anything ``int()`` accepts).
    max_length : int
        Number of time steps in ``X``. Histories shorter than this are
        zero-padded at the end; longer ones keep only their last
        ``max_length`` steps.
    n_movies : int
        Accepted for interface compatibility; the one-hot width is taken from
        ``input_size`` so that it always matches the shape of ``X`` and ``Y``.
    input_size : int
        Width of the one-hot vectors.

    Returns
    -------
    (X, Y)
        ``X`` is a float array of shape ``(len(sequences), max_length, input_size)``
        with a 1 at ``[i, t, item]`` for the t-th history item of example ``i``.
        ``Y`` is an int32 array of shape ``(len(sequences), input_size)`` with a
        1 at the index of the first target of each example.
    """
    batch_size = len(sequences)

    X = np.zeros((batch_size, max_length, input_size))
    Y = np.zeros((batch_size, input_size), dtype='int32')

    for i, (user_id, in_seq, target) in enumerate(sequences):
        # Keep only the most recent max_length steps if the history is too long.
        overflow = max(0, len(in_seq) - max_length)
        # Write the ones in place. Stacking per-item (1, width) rows gives a
        # (len, 1, width) block that cannot be assigned into the
        # (len, width) slice of X.
        for step, item in enumerate(in_seq[overflow:]):
            X[i, step, int(item[0])] = 1
        Y[i, int(target[0][0])] = 1

    return X,Y

In [None]:
def input_generator(batch_generator,max_length=max_length,n_movie=n_movie,input_size=n_movie):
    """Endlessly convert batches from ``batch_generator`` into ``(X, Y)`` arrays.

    Each step pulls one batch with ``next`` and passes it, together with
    ``max_length``, ``n_movie`` and ``input_size``, to ``prepare_input``.
    """
    while True:
        yield prepare_input(next(batch_generator), max_length, n_movie, input_size)

In [None]:
# Unidirectional network: one LSTM layer over one-hot item sequences,
# followed by a dense layer and a softmax over all n_movie items.
model_unidirection = Sequential([
    LSTM(hidden_units, dropout=0.2, batch_input_shape=(batch_size, max_length, n_movie)),
    Dense(n_movie),
    Activation('softmax'),
])

# optimizer in paper
optimizer = optimizers.Adagrad(lr=lr)
model_unidirection.compile(loss='categorical_crossentropy', optimizer=optimizer)
model_unidirection.summary()

In [None]:
# Bidirectional network: same stack as the unidirectional one, but the LSTM
# is wrapped in Bidirectional. The fixed batch shape is given to the wrapper.
model = Sequential([
    Bidirectional(LSTM(hidden_units, dropout=0.2), batch_input_shape=(batch_size, max_length, n_movie)),
    Dense(n_movie),
    Activation('softmax'),
])

optimizer = optimizers.Adagrad(lr=lr)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

model.summary()

In [None]:
# Endless (X, Y) streams: raw records -> mini-batches of examples -> one-hot arrays.
train_gen = input_generator(
    mini_batch_generator(
        seq_generator(train_set_sequences)
    )
)
val_gen = input_generator(
    mini_batch_generator(
        seq_generator(val_set_sequences)
    )
)

In [None]:
# History of the running top-10 hit rate, appended to by the training loops
# below (one value every 100 iterations) and plotted afterwards.
spsList = []

In [None]:
# Train `model` one mini-batch at a time. After every step, score one
# validation batch: a prediction counts as a hit when the true item is among
# the 10 highest-scoring items. `sps / total` is the hit rate accumulated over
# all validation batches seen so far in this loop.
sps = 0
total = 0
for i in range(5000):
    model.fit_generator(train_gen, verbose=1, epochs=1, steps_per_epoch=1)
    val_x, val_y = next(val_gen)
    pred = model.predict(val_x)
    for p, t in zip(pred, val_y):
        top10pred = np.argpartition(np.array(p), -10)[-10:]  # indices of the 10 largest scores
        truepred = np.argpartition(np.array(t), -1)[-1]  # index of the largest entry of the one-hot target
        total += 1
        sps += int(truepred in top10pred)
    if i % 10:
        continue
    print("epochs",i,"sp rate:",sps/total)
    if not i % 100:
        spsList.append(sps/total)

In [None]:
# unidirection model train
# NOTE(review): this rebinds the notebook-wide name `model`, so every later
# cell that uses `model` operates on the unidirectional network.
model = model_unidirection

# Start the hit counters from zero: they would otherwise still hold the
# validation hits of whichever model was evaluated before, and the reported
# rate would mix two different networks.
sps = 0
total = 0
for i in range(5000):
    # One optimisation step on one training mini-batch.
    model.fit_generator(train_gen,verbose=1,epochs=1, steps_per_epoch=1)
    val_x,val_y = next(val_gen)
    pred = model.predict(val_x)
    for p,t in zip(pred,val_y):
        total += 1
        top10pred = np.argpartition(np.array(p),-10)[-10:]  # indices of the 10 largest scores
        truepred = np.argpartition(np.array(t),-1)[-1]  # index of the largest entry of the one-hot target
        if truepred in top10pred:
            sps += 1
    if i % 10 == 0:
        # Running hit rate over all validation batches scored in this loop.
        print("epochs",i,"sp rate:",sps/total)
        if i % 100 == 0:
            spsList.append(sps/total)

In [None]:
#bidirection model train
# NOTE(review): this loop trains whatever `model` currently refers to. If the
# cell that assigns `model = model_unidirection` ran before this one, that is
# the unidirectional network, not the bidirectional one -- confirm the intended
# run order. `sps` and `total` are deliberately not reset here, so the
# reported rate continues from the previous loop.
for i in range(5000):
    model.fit_generator(train_gen, verbose=1, epochs=1, steps_per_epoch=1)
    val_x, val_y = next(val_gen)
    pred = model.predict(val_x)
    for p, t in zip(pred, val_y):
        top10pred = np.argpartition(np.array(p), -10)[-10:]  # indices of the 10 largest scores
        truepred = np.argpartition(np.array(t), -1)[-1]  # index of the largest entry of the one-hot target
        total += 1
        sps += int(truepred in top10pred)
    if i % 10:
        continue
    print("epochs",i,"sp rate:",sps/total)
    if not i % 100:
        spsList.append(sps/total)

In [None]:
import matplotlib.pyplot as plt
# Show how many hit-rate checkpoints have been recorded so far (cell output).
len(spsList)

In [None]:
# Plot the recorded running hit rate; one point per 100 training iterations.
fig, ax = plt.subplots()
ax.plot(spsList)
ax.set_xlabel("100 epochs")
ax.set_ylabel("accuracy")
plt.show()

In [None]:
#test
# Build the test stream here: no other cell defines `test_gen`, so the loop
# below would otherwise stop with a NameError.
test_gen = input_generator(mini_batch_generator(seq_generator(test_set_sequences)))

total = 0
sps = 0

# Score 5000 test mini-batches with whatever `model` currently refers to.
# A prediction is a hit when the true item is among the 10 highest scores.
for i in range(5000):
    test_x,test_y = next(test_gen)
    pred = model.predict(test_x)
    for p,t in zip(pred,test_y):
        total += 1
        top10pred = np.argpartition(np.array(p),-10)[-10:]  # indices of the 10 largest scores
        truepred = np.argpartition(np.array(t),-1)[-1]  # index of the largest entry of the one-hot target
        if truepred in top10pred:
            sps += 1

# Report the real number of scored examples (5000 batches, not 5000 examples).
print(total,"test examples","sp rate:",sps/total)