<h1><u>RNN with cross-entropy loss</u></h1>
<h4>Omer Nivron</h4>


<h2>Package loading</h2>

In [1]:
import numpy as np
import sys as sys
import random as rd
import tensorflow as tf
import gzip
from __future__ import division
import pandas as pd
import os
import math
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import csv

# Data creation

In [2]:
def range_bet_col_t_col_n_append(col_1 ,col_2):
    app_ranges=[]
    for i in range(col_1.shape[0]):
        single_range = range((col_1[i]).astype(int), (col_2[i]).astype(int))
        app_ranges = np.append(app_ranges, single_range)
    return(app_ranges)

In [3]:
def sort_pd_df_by_ext_vec(df, ext_sor_vec, cols):
    df_s = df[((df[cols[0]]).astype(int)).isin(ext_sor_vec)] #
    df_s['sort_cat'] = pd.Categorical(df_s[cols[0]],categories = ext_sor_vec,ordered = True)
    if len(cols) > 1:
        df_s.sort_values(['sort_cat',cols[1]] ,inplace = True)
    
    else:
        df_s.sort_values(['sort_cat'],inplace = True) 
    
    df_s.reset_index(inplace = True)
    df_ = df_s.drop(['sort_cat','index'] ,axis = 1)
    
    return(df_)

In [4]:
def rnn_model(model,n_hidden,layers):
        # Define a lstm/GRU cell with tensorflow
    if (layers==1):
        if(model=='lstm'):
            cell=tf.nn.rnn_cell.LSTMCell(n_hidden,state_is_tuple=True)
        else:
            cell=tf.nn.rnn_cell.GRUCell(n_hidden)   
    else:
        if(model=='lstm'):
            lstm=tf.nn.rnn_cell.LSTMCell(n_hidden,state_is_tuple=True)
            cell=tf.nn.rnn_cell.MultiRNNCell([lstm]*layers)

        else:
            gru=tf.nn.rnn_cell.GRUCell(n_hidden)
            cell=tf.nn.rnn_cell.MultiRNNCell([gru]*layers)
    return(cell)

In [5]:
def algeb_geom_series(mode ,start ,jump ,length):
    u = np.empty((length,))
    u[0] = start
    u[1:] = jump
    if (mode == 0):
        series=np.cumsum(u)
    if (mode == 1):
        series=np.cumprod(u)
    return(series)

In [6]:
def np_pad_tr_x(x_tr, batch_size, str_idx, zero_array_x, length_vec):
    start = 0
    for i in range(batch_size):
        if i > 0:
            end = end + length_vec[i] 
        else:
            end = length_vec[0]
        zero_array_x[ str_idx[i] : (str_idx[i] + length_vec[i]) ] = x_tr[ start : end ] 
        start = end

    return(zero_array_x)

####################################################

Loading Fake data

In [7]:
def trainSamples(viewers,videos,probab,viewerFeat,videoFeat,contxFeat):
    trData = {} # trData = Dictionary with training data. This is histrory of viewer and video iteraction
    X = {} # X  = Dictionary with viewer features as arrays
    Y ={} # Y  = Dictionary with video features as arrays
    for i in range(viewers):
        X[i] = np.random.rand(viewerFeat)
        a = 0 # timing of the video for a particular user,... 
                #to give the order in which the videos have been watched
        for j in range(videos):
            if int(np.random.binomial(1,probab ,1)[0]):
                trData[(i,j,a)] = np.random.rand(contxFeat)
                a+=1 # when a video is watched, we increase the value of a by 1 
            if i==0:
                Y[j] = np.random.rand(videoFeat)
    return X,Y,trData

In [8]:
viewers = 1000  #number of viewers
videos = 1000  #number of videos
probab = 0.3  #probability of a viewer watching any one video
viewerFeat = 310  #number of features describing a veiwer
videoFeat = 300   #number of features describing a video
contxFeat = 15 # number of contextual features
# X  = Dictionary with viewer features as arrays
# Y  = Dictionary with video features as arrays
# trData = Dictionary with training data. This is histrory of viewer and video iteraction
X, Y, trData = trainSamples(viewers, videos, probab, viewerFeat, videoFeat, contxFeat)

### Uncomment only when real data is available 

In [9]:
#userFiltNum = 1000
#user_feat_inp_w_key_df = user_feat_inp_w_key_df2[user_feat_inp_w_key_df2['user_id'] < userFiltNum]
#user_vid_time_df  = user_vid_time_df2[user_vid_time_df2['user_id'] < userFiltNum]
#num_user_feat, num_video_feat = user_feat_inp_w_key_df.shape[1] -1 , vid_feat_inp_w_key_df.shape[1] -1
#num_users, contex_feat = user_feat_inp_w_key_df.shape[0], len(user_vid_time_df.columns[3:])

#user_vid_time_df_sort=user_vid_time_df[["user_id",'movie_id',"rank"]].sort_values(["user_id","rank"])
#user_vid_time_df_sort = user_vid_time_df_sort.reset_index(drop=True)

#h = pd.get_dummies(user_vid_time_df_sort["movie_id"],sparse=True,prefix='vid_')

#y_tr_p_w=pd.concat([user_vid_time_df_sort.reset_index(drop=True), h], axis=1)
#y_tr_p_w['desired'] = np.argmax(np.array(y_tr_p_w.iloc[:,3:]) ,1)

## transform a dict to numpy array

In [10]:
user_feat_inp = np.array([X[key] for key in sorted(X.keys())]) # dictionary to numpy array

vid_feat_inp = np.array([Y[key] for key in sorted(Y.keys())])

## Add a column key specifying user

In [11]:
key_user = np.asarray(range(user_feat_inp.shape[0])).reshape(user_feat_inp.shape[0],1)
key_vid = np.asarray(range(vid_feat_inp.shape[0])).reshape(vid_feat_inp.shape[0],1)
user_feat_inp_w_key = np.concatenate((user_feat_inp,key_user),axis=1)
vid_feat_inp_w_key = np.concatenate((vid_feat_inp,key_vid),axis=1)
user_vid_time = trData.keys()

## transform numpy to pandas df in order to use easy merging

In [12]:
user_feat_inp_w_key_df = pd.DataFrame(user_feat_inp_w_key) # numpy array to Data frame
vid_feat_inp_w_key_df = pd.DataFrame(vid_feat_inp_w_key)#d
user_vid_time_df = pd.DataFrame(user_vid_time)# b

# Construct true_y

In [13]:
rr = user_vid_time_df.sort_values([0,2])

In [14]:
h = pd.get_dummies(rr[1],prefix='vid_')
y_tr = pd.concat([rr.reset_index(drop = True), h], axis=1)
y_tr.rename(columns = {0: 'user'}, inplace = True)
y_tr.rename(columns = {1: 'mov'}, inplace = True)
y_tr.rename(columns = {2: 'rank'}, inplace = True)
y_tr['desired'] = np.argmax(np.array(y_tr.iloc[:,4:]) ,1)

## Merge two pandas df 

In [15]:
max_watch = (y_tr.groupby('user',axis = 0).sum().iloc[:
                                ,int(np.array(np.where(y_tr.columns=='rank'))):-1].sum(axis=1)).reset_index()

In [16]:
vid_feat_inp_w_key_df.rename(columns = {300: 'mov_id'}, inplace = True)
user_feat_inp_w_key_df.rename(columns = {310: 'user_id'}, inplace = True)
user_vid_time_df.rename(columns = {0: 'user_id'}, inplace = True)
user_vid_time_df.rename(columns = {1: 'mov_id'}, inplace = True)
user_w_vid_tim_and_feat=user_vid_time_df.merge(user_feat_inp_w_key_df, how='inner',on = 'user_id', sort = False)
user_vid_time_vidfeat_usefit=user_w_vid_tim_and_feat.merge(vid_feat_inp_w_key_df,how = 'inner'
                                                           ,on='mov_id',sort = False)
user_vid_time_vidfeat_usefit.rename(columns = {'2_x': 'rank'}, inplace = True)
user_vid_time_vidfeat_usefit_sorted = user_vid_time_vidfeat_usefit.sort_values(['user_id', 'rank']) 
cols_to_del = [user_vid_time_vidfeat_usefit_sorted.columns.get_loc("user_id")
             ,user_vid_time_vidfeat_usefit_sorted.columns.get_loc("mov_id")
             ,user_vid_time_vidfeat_usefit_sorted.columns.get_loc("rank")]

# Split training and testing data

In [17]:
tr_users = viewers*0.8
tr_y = y_tr[y_tr['user'] < tr_users]

##################################################

########################################

<h1>Configurations</h1>

In [18]:
tf.reset_default_graph()

In [19]:
# Network Parameters
n_opt_epoch=1000

<h3>LSTM with X units</h3>

In [20]:
n_opt_epoch = 1000
model = 'lstm'
layers = 1
n_samples = 500
top_k = 15
top_l = 10
top_r = 5

n_users = viewers #num_users
te_users = int(np.ceil(viewers*0.2))
n_feature = viewerFeat + videoFeat + contxFeat #623
lr_rat = 0.001
num_video = h.shape[1]
beta = 0.01

if tr_users > 500:
    batch_size = 500
else:
    batch_size  = tr_users
        
n_hidden = 64 # hidden layer num of features

Tensorboard saving paths

In [21]:
#! mkdir -p ./reco_rnn/tensor_plot/xent_genre/
#! mkdir -p ./reco_rnn/xent_cost_genre/
#logs_path = "/Users/onivron/Desktop/reco_rnn/tensor_plot/p_w/"
logs_path = "./reco_rnn/tensor_plot/xent_genre/"

<h3>Var creation</h3>
We build data placeholders & variables 

In [22]:
with tf.name_scope('input'):
    dynam_input=tf.placeholder("float32", [batch_size, None, n_feature], name = 'dynam_input') 
    y_true = tf.placeholder("int32", [None, 1], name = 'Input_y')
    tr_rw = tf.placeholder("int32" ,[None, 1] ,name = '_rw')
    max_batch_length = tf.placeholder("float32", [batch_size], name = 'max_leng')

with tf.name_scope("weights"):
    W = {'out_2':tf.Variable(tf.random_normal([n_hidden, num_video]), name = 'w_2')}  
with tf.name_scope("biases"):
    b = {'out_2':tf.Variable(tf.random_normal([num_video]), name='b_2')} 

<h3>Define model</h3>

In [23]:
def RNN(dynam_input, max_batch_length, W, b, model):
    lstm_cell = rnn_model(model, n_hidden, layers)
    outputs, states = tf.nn.dynamic_rnn(lstm_cell,inputs = dynam_input
                                     ,dtype = tf.float32, sequence_length = max_batch_length)
    out_shaped = tf.reshape(outputs,[-1, n_hidden]) 
    lay_2 = tf.matmul(out_shaped, W['out_2']) + b['out_2'] 
    lay_3 = tf.sigmoid(lay_2)
    
    return (lay_2, lay_3) 

<h3>Define loss, accuracy & optimizer</h3>

In [24]:
y_pred, sig_y_pred = RNN(dynam_input, max_batch_length, W, b, model)

relevant_y_pred = tf.reshape(tf.gather(params = tf.reshape(y_pred
                                        ,[-1 ,num_video]), indices = tr_rw), [-1, num_video])

with tf.name_scope('cross_entropy'):
    xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = relevant_y_pred, 
                            labels = tf.reshape(y_true, [-1])))   
    regularizer = tf.nn.l2_loss(tf.abs(W['out_2']))
    xent_regu = tf.reduce_mean(xent + beta * regularizer)

top_vals, top_indice = tf.nn.top_k(relevant_y_pred, n_samples)
    
with tf.name_scope('Accuracy_top_r'):
    top_15_indx_r = tf.slice(top_indice,[0,0],[-1, top_r])
    to_bool_r = tf.reduce_sum(tf.cast(tf.equal(y_true, top_15_indx_r), tf.float32), 1)
    accuracy_r = tf.reduce_mean(to_bool_r)   
    
with tf.name_scope('Accuracy_top_l'):
    top_15_indx_l = tf.slice(top_indice,[0,0],[-1, top_l])
    to_bool_l = tf.reduce_sum(tf.cast(tf.equal(y_true, top_15_indx_l), tf.float32), 1)
    accuracy_l = tf.reduce_mean(to_bool_l)
    
with tf.name_scope('Accuracy_top_k'):
    top_15_indx_k = tf.slice(top_indice,[0,0],[-1, top_k])
    to_bool_k = tf.reduce_sum(tf.cast(tf.equal(y_true, top_15_indx_k), tf.float32), 1)
    accuracy_k = tf.reduce_mean(to_bool_k)

In [25]:
with tf.name_scope('train'):    
    optimizer = tf.train.AdamOptimizer(learning_rate = lr_rat).minimize(xent_regu)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


<h3>Tensorboard set-up</h3>

In [26]:
tf.summary.scalar("cost", xent_regu)
tf.summary.scalar("Accuracy_top_r", accuracy_r)
tf.summary.scalar("Accuracy_top_k", accuracy_k)
tf.summary.scalar("Accuracy_top_l", accuracy_l)

summary_op = tf.summary.merge_all()
writer_opt = tf.summary.FileWriter(logs_path,graph = tf.get_default_graph())
saver = tf.train.Saver(write_version = tf.train.SaverDef.V2)

<h1>Training - optimization</h1>

In [27]:
init = tf.global_variables_initializer()
sess = tf.Session() 
sess.run(init)

### uncomment only if weigths are to be re-used.

In [29]:
# save_MDir = './reco_rnn/xent_cost_genre/'
# save_model = os.path.join(save_MDir,'accu')
# sess=tf.Session() 
# sess.run(tf.global_variables_initializer())
# saver.restore(sess = sess, save_path= save_model)

In [30]:
with open('xent_genre_accuracy.csv', 'wb') as csvfile:
    wr = csv.writer(csvfile, delimiter='\t', lineterminator='\n')
    for epoch in range(int(n_opt_epoch)):
        if(epoch != 0):
            writer_opt.add_summary(summary,epoch)
        for batch_n in range(int(n_users / batch_size)):   
            rw_to_chose = np.random.choice((tr_likes["user_id"].unique()), batch_size,replace = False) 
            x_tr = sort_pd_df_by_ext_vec(tr_likes, rw_to_chose, cols = ['user_id', 'rank'])
            y_batch = sort_pd_df_by_ext_vec(tr_y, rw_to_chose,cols = ['user_id', 'rank'])
            length_vec = sort_pd_df_by_ext_vec(df = max_watch, ext_sor_vec = rw_to_chose, cols = ['user_id'])      
            length_max = max(length_vec.iloc[:, 1])
            trial_size = (int(batch_size * length_max), n_feature)
            str_idx = algeb_geom_series(0 ,start = 0 ,jump = length_max ,length = batch_size)
            end_idx = np.append(length_vec.iloc[0, 1], length_vec.iloc[1:, 1:] + str_idx[1:].reshape(batch_size-1, -1))
            app_range = range_bet_col_t_col_n_append(str_idx, end_idx)
            zero_array_x = np.zeros(trial_size)
            _x_tr = np_pad_tr_x(x_tr.iloc[:,3:], batch_size, str_idx.astype(int),
                                            zero_array_x, length_vec.iloc[:,1].astype(int))      
            acc_r, acc_l, acc_k, xent, _, summary = sess.run([accuracy_r, accuracy_l, accuracy_k
                                                              , xent_regu, optimizer, summary_op]
                                         ,feed_dict = {dynam_input: _x_tr.reshape(batch_size, -1, n_feature)
                                         ,tr_rw: app_range.reshape(-1, 1)
                                         ,y_true: y_batch['desired'].reshape(-1, 1)
                                         ,max_batch_length: length_vec.iloc[:, 1]
                                         })
            wr.writerow([acc_r])
            wr.writerow([acc_l])
            wr.writerow([acc_k])
            if (epoch%10 == 0):
                print('epoch number:', epoch, 'loss:', xent, 'accuracy 15:', acc_k)
                print('epoch number:', epoch, 'accuracy 10:', acc_l)
                print('epoch number:', epoch, 'accuracy 5:', acc_r)

        folder='./reco_rnn/xent_cost_genre/'
        save_path=saver.save(sess,folder+'accu')

NameError: name 'tr_likes' is not defined

<h3>End session</h3>

In [None]:
sess.close()