In [1]:
import gc
import logging
import os
import sys
import time

import numpy as np
import pandas as pd
import tensorflow as tf
#sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
sys.path.append('../models')

from data_frame import DataFrame
from tf_base_model import TFBaseModel # for building our customized

# Session configuration shared by every tf.Session in this notebook:
# - allow_soft_placement lets TensorFlow fall back to another device when the
#   requested one is unavailable;
# - allow_growth makes the GPU allocator claim memory on demand.
config = tf.ConfigProto(
    allow_soft_placement=True,
    gpu_options=tf.GPUOptions(allow_growth=True),
)

class DataReader(object):
    '''Load the train/val/test column arrays and serve batches from them.

    Every column of every split is read from
    ``<data_dir>/<split>/<column>.npy`` as a read-only memory map and the
    columns of one split are wrapped in the project's ``DataFrame`` helper.
    '''

    def __init__(self, data_dir, trace_code=None):
        '''
        data_dir: directory holding the ``train``, ``val`` and ``test`` sub-folders.
        trace_code: when truthy, read ``<column>_0.npy`` instead of ``<column>.npy``.
            ``None`` (default) falls back to a module-level ``TRACE_CODE`` flag
            if one is defined and to ``False`` otherwise, so building a reader
            no longer raises NameError when the flag was never set.
        '''
        data_cols = [
            'item_id',
            'word_id',
            'history_length',
            'label'
        ]
        if trace_code is None:
            # Same truth test as the historical ``TRACE_CODE == True`` check.
            trace_code = globals().get('TRACE_CODE', False) == True  # noqa: E712
        suffix = '_0' if trace_code else ''

        def load_split(split):
            # One memory-mapped array per column, in data_cols order.
            return [
                np.load(
                    os.path.join(data_dir, '{}/{}{}.npy'.format(split, col, suffix)),
                    mmap_mode='r',
                )
                for col in data_cols
            ]

        #-----------------
        # loading data
        #-----------------
        data_train = load_split('train')
        data_val = load_split('val')
        data_test = load_split('test')

        #------------------
        # For Testing-phase
        #------------------
        self.test_df = DataFrame(columns=data_cols, data=data_test)
        print ('loaded data')
        #------------------
        # For Training-phase
        #------------------
        self.train_df = DataFrame(columns=data_cols, data=data_train)
        self.val_df = DataFrame(columns=data_cols, data=data_val)

        print ('number of training example: {}'.format(len(self.train_df)))
        print ('number of validating example: {}'.format(len(self.val_df)))
        print ('number of testing example: {}'.format(len(self.test_df)))

    def train_batch_generator(self, batch_size, num_epochs=100000, shuffle = True, is_test = False):
        '''Yield batches drawn from the training split.'''
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=shuffle,
            num_epochs=num_epochs,
            is_test=is_test
        )

    def val_batch_generator(self, batch_size, num_epochs=100000, shuffle = True, is_test = False):
        '''Yield batches drawn from the validation split.'''
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=shuffle,
            num_epochs=num_epochs,
            is_test=is_test
        )

    def test_batch_generator(self, batch_size):
        '''Yield the test split once, in order, keeping the smaller final batch.

        Every row has to be predicted because the predictions are the input of
        the second-level model.
        '''
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        '''Yield batches produced by ``df.batch_generator``.

        df: customized DataFrame object exposing ``batch_generator``.
        is_test: forwarded as ``allow_smaller_final_batch`` so that a final
            batch shorter than ``batch_size`` is not dropped at test time.
        Each yielded batch is whatever ``df.batch_generator`` yields.
        '''
        batch_gen = df.batch_generator(
            batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=is_test,
        )
        for batch in batch_gen:
            yield batch


  return f(*args, **kwds)


In [2]:
# TRACE_CODE = False
# dr = DataReader(data_dir ='../models/data/')

# Read the raw training columns straight from disk, one .npy file per column.
word_id, history_length, label, item_id, eval_set = (
    np.load(os.path.join('../models/data/train/', '{}.npy'.format(column)))
    for column in ('word_id', 'history_length', 'label', 'item_id', 'eval_set')
)

In [3]:
# Sanity-check that every training column has the same number of rows.
for loaded_column in (label, word_id, history_length, item_id, eval_set):
    print (loaded_column.shape)

(180311, 36)
(180311, 36)
(180311,)
(180311,)
(180311,)


In [4]:
# Keep the first two rows of word_id as a tiny sample input for debugging.
sentence = word_id[:2]
sentence.shape

(2, 36)

In [5]:
# Labels for the same two sample rows.
y_true = label[:2]
y_true.shape


(2, 36)

In [6]:
# history_length values for the same two sample rows.
length = history_length[:2]
length.shape

(2,)

In [7]:
# Display the two sample word-id rows.
sentence

array([[ 3969, 12161, 23290,  9553,  3353,  3969, 21633,  3882,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858],
       [ 3969,  7651,  6606, 25149,  8289, 21633, 12640, 23290, 24435,
        17761, 21855,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,
         9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858,  9858]],
      dtype=int32)

In [8]:
# Display the labels of the two sample rows.
y_true

array([[0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

In [9]:
# Display the item_id column loaded from the training folder.
# NOTE(review): the graph cell later rebinds `item_id` to a placeholder, so
# this cell only shows the array when run before that cell.
item_id

array([   266,    267,    268, ..., 212782, 212783, 212784], dtype=int32)

In [10]:
def get_optimizer(learning_rate, optimizer='adam'):
    '''
    Build the TensorFlow optimizer selected by name.

    learning_rate: learning rate handed to the optimizer constructor.
    optimizer: one of 'adam', 'adagrad', 'sgd' or 'rms'.
        'rms' uses RMSProp with decay=0.95 and momentum=0.9.

    Returns the constructed ``tf.train`` optimizer.
    Raises ValueError for any other name. (Previously a typo in the 'rms'
    branch made both 'rms' and unknown names fail with NameError.)
    '''
    if optimizer == 'adam':
        return tf.train.AdamOptimizer(learning_rate)
    elif optimizer == 'adagrad':
        return tf.train.AdagradOptimizer(learning_rate)
    elif optimizer == 'sgd':
        return tf.train.GradientDescentOptimizer(learning_rate)
    elif optimizer == 'rms':
        return tf.train.RMSPropOptimizer(learning_rate, decay=0.95, momentum=0.9)
    else:
        # Explicit exception rather than `assert False`, so the check still
        # fires when Python runs with assertions disabled (-O).
        raise ValueError('optimizer must be adam, adagrad, sgd, or rms')

def update_parameters(loss, optimizer = 'adam'):
    '''
    Add regularization, gradient clipping and the optimizer update to the graph.

    loss: scalar loss tensor to minimize.
    optimizer: optimizer name understood by ``get_optimizer``.

    Returns the (non-trainable) learning-rate variable. It is created with the
    value 0.0, so it has to be assigned or fed a real learning rate before the
    update op does anything useful.

    What is built:
    1. An L2-norm penalty (sum of the per-variable L2 norms of every trainable
       variable, scaled by ``regularization_constant``) is added to the loss.
    2. Every gradient is clipped element-wise to [-grad_clip, grad_clip].
       Clipping guards against exploding gradients, where repeated
       multiplication by factors larger than 1 makes them grow exponentially.
       It does not address vanishing gradients; architectures such as
       LSTM/GRU cells or residual connections are the usual remedy for those.
    3. The clipped gradients are applied, incrementing ``global_step``.

    NOTE(review): the update op (``step``) is not returned; callers that want
    to train need another way to reach it — confirm how it is meant to be
    retrieved.
    '''
    #---------------
    # setting
    #---------------
    grad_clip = 5 # Each gradient element is clipped to the range [-grad_clip, grad_clip].
    regularization_constant = 0.1 # Regularization constant applied to all trainable parameters.
    enable_parameter_averaging = False # If true, also maintain exponential moving averages of the trainable parameters.
    global_step = tf.Variable(0, trainable = False) # Incremented by one each time the gradients are applied.
    learning_rate_var = tf.Variable(0.0, trainable = False)

    #-----------------
    # regularization
    #-----------------
    if regularization_constant != 0:
        # l2_norm is a 0-D tensor: the sum over all trainable variables of
        # each variable's own L2 norm.
        # NOTE(review): the gradient of sqrt at an all-zero variable (e.g. a
        # zero-initialised bias) may be undefined — confirm no NaNs appear.
        l2_norm = tf.reduce_sum([
            tf.sqrt(tf.reduce_sum(tf.square(param)))
            for param in tf.trainable_variables()
        ])
        loss = loss + regularization_constant*l2_norm

    #-----------------
    # optimizing
    #-----------------
    opt = get_optimizer(learning_rate_var, optimizer=optimizer)
    # compute_gradients returns (gradient, variable) pairs; the gradient is
    # None for a variable the loss does not depend on.
    grads = opt.compute_gradients(loss)
    # Skip None gradients: tf.clip_by_value cannot take None, and
    # apply_gradients has nothing to do for such variables anyway.
    clipped = [
        (tf.clip_by_value(g, -grad_clip, grad_clip), v_)
        for g, v_ in grads
        if g is not None
    ]
    step = opt.apply_gradients(clipped, global_step = global_step)
    print ('step - whtat optimizer.apply_gradients returns', step)

    #-----------------
    # if using moving average techniques
    #-----------------
    if enable_parameter_averaging:
        ema = tf.train.ExponentialMovingAverage(decay=0.995)
        maintain_averages_op = ema.apply(tf.trainable_variables())
        # The grouped op runs the parameter update before it completes.
        with tf.control_dependencies([step]):
            step = tf.group(maintain_averages_op)
#     #--------------
#     # logging
#     #--------------
#     logging.info('all parameters:')
#     logging.info(pp.pformat([(var.name, shape_of_tensor(var)) for var in tf.global_variables()]))

#     logging.info('trainable parameters:')
#     logging.info(pp.pformat([(var.name, shape_of_tensor(var)) for var in tf.trainable_variables()]))

#     logging.info('trainable parameter count:')
#     logging.info(str(np.sum(np.prod(shape_of_tensor(var)) for var in tf.trainable_variables())))
    return learning_rate_var


In [11]:
from tf_utils import temporal_convolution_layer
from tf_utils import time_distributed_dense_layer
from tf_utils import sequence_softmax_loss
from tf_utils import sequence_evaluation_metric
from data_utils import get_glove_vectors
from data_utils import load_vocab_and_return_word_to_id_dict
from tf_utils import shape_of_tensor


# Word-embedding configuration.
dim_word = 300                # width of a word vector
trainable_embedding = False   # whether the embedding matrix is updated during training
USE_PRETRAINED = True         # load pretrained vectors instead of random initialisation
# The path has no placeholder, so the former `.format(dim_word)` call was a no-op.
filename_words_vec = "../models/data/wordvec/word2vec.npz"
filename_words_voc = "../models/data/wordvec/words_vocab.txt"

nwords = len(load_vocab_and_return_word_to_id_dict(filename_words_voc))
if USE_PRETRAINED:
    embeddings = get_glove_vectors(filename_words_vec).astype(np.float32)
else:
    # Keep None (instead of calling .astype on it) so that the graph cell can
    # take its random-initialisation branch.
    embeddings = None
enable_parameter_averaging = False


In [12]:
#------------------
# customized evaluation metric
#------------------

def sequence_evaluation_metric(y, y_hat, sequence_lengths):
    """
    Sentence-level evaluation tensors (debug variant returning intermediates).

    NOTE: a second function with the same name is defined right after this one
    in the same cell and replaces it once the cell has run.

    Args:
        y: Label tensor of shape [batch size, max_sequence_length] holding label indices (y_true).
        y_hat: Prediction tensor of shape [batch size, max_sequence_length] holding predicted label indices (y_pred).
        sequence_lengths: Sequence lengths. Tensor of shape [batch_size].
    Returns:
        tuple: (f1, p, r, y_, correct_preds, total_correct, total_preds, acc, sequence_mask, y)
    """
    #---------------
    # calculate
    #---------------
    # y_ is True for rows whose label sum is non-zero, False otherwise.
    label_row_sum = tf.reduce_sum(y, axis = 1)
    y_ = tf.not_equal(label_row_sum, tf.zeros(tf.shape(label_row_sum), dtype = tf.int32)) # (batch_size,)

    # 1.0 for the first sequence_lengths[i] positions of row i, 0.0 afterwards.
    # Only returned for inspection; the counts below do not use it.
    sequence_mask = tf.cast(
        tf.sequence_mask(lengths = sequence_lengths, maxlen=max_seq_length),
        tf.float32,
    ) # (?, max_sequence_length)

    # A row is a hit when its label sum is non-zero and every position matches.
    row_fully_equal = tf.reduce_all(tf.equal(y, y_hat), axis = 1)
    row_hits = tf.cast(tf.logical_and(y_, row_fully_equal), tf.float32)

    # Scalar counts.
    correct_preds = tf.cast(tf.count_nonzero(row_hits, axis = None), tf.float32)
    nonzero_labels_per_row = tf.count_nonzero(y, axis = 1)
    total_correct = tf.cast(tf.count_nonzero(nonzero_labels_per_row, axis = None), tf.float32)
    nonzero_preds_per_row = tf.count_nonzero(y_hat, axis = 1)
    total_preds = tf.cast(tf.count_nonzero(nonzero_preds_per_row, axis = None), tf.float32) # rows with at least one non-zero prediction

    #---------------
    # output
    #---------------
    # Each ratio is only evaluated when there is at least one hit.
    p = tf.cond(
        tf.greater(correct_preds, tf.zeros(shape=[])),
        lambda : correct_preds/total_preds,
        lambda: tf.zeros(shape=[]),
    )
    r = tf.cond(
        tf.greater(correct_preds, tf.zeros(shape=[])),
        lambda : correct_preds/total_correct,
        lambda: tf.zeros(shape=[]),
    )
    f1 = tf.cond(
        tf.greater(correct_preds, tf.zeros(shape=[])),
        lambda : 2 * p * r / (p + r),
        lambda: tf.zeros(shape=[]),
    )
    batch_rows = tf.cast(tf.shape(y_)[0], dtype = tf.float32)
    acc = correct_preds/ batch_rows # correct_preds / batch_size

    #return { "prediction": p, "recall": r, "f1": f1}
    return f1,p,r,y_,correct_preds,total_correct,total_preds,acc, sequence_mask,y
    #return {"acc": 100*acc, "prediction": p, "recall": r, "f1": f1}

def sequence_evaluation_metric(y, y_hat, sequence_lengths):
    """
    Sentence-level evaluation metrics on variable-length sequences.

    Only the first ``sequence_lengths[i]`` positions of row i are considered;
    padded positions are ignored for labels and predictions alike. Tag index 0
    is treated as "no tag".

    Definitions (all counted per sentence):
        exact match    : every valid position of y_hat equals y.
        true positive  : exact match on a sentence with at least one non-zero label.
        total_correct  : sentences with at least one non-zero label.
        total_preds    : sentences with at least one non-zero prediction.

    Changes from the previous version of this function:
        * true positives exclude all-zero sentences, so precision and recall
          stay within [0, 1] and the denominators are non-zero whenever the
          numerator is (previously inf/NaN was possible);
        * non-zero predictions in padded positions are no longer counted;
        * the padded length is read from ``y`` instead of a notebook global.

    Args:
        y: Label tensor of shape [batch size, max_sequence_length] holding label indices (y_true).
        y_hat: Prediction tensor of shape [batch size, max_sequence_length] holding predicted label indices (y_pred).
        sequence_lengths: Sequence lengths. Tensor of shape [batch_size].
    Returns:
        dict of scalar tensors:
            "acc": percentage of sentences that are exact matches,
            "prediction": precision (key name kept for existing callers),
            "recall": recall,
            "f1": harmonic mean of precision and recall (0 when there is no true positive).
    """
    #---------------
    # calculate
    #---------------
    # True for the valid (non-padded) positions of each row.
    valid = tf.sequence_mask(lengths = sequence_lengths, maxlen=tf.shape(y)[1]) # (?, max_sequence_length)

    # Zero-out padded positions so they can never count as tags.
    y_valid = tf.where(valid, y, tf.zeros_like(y))
    y_hat_valid = tf.where(valid, y_hat, tf.zeros_like(y_hat))

    # A sentence is an exact match when every valid position agrees.
    position_ok = tf.logical_or(tf.equal(y, y_hat), tf.logical_not(valid))
    exact_match = tf.reduce_all(position_ok, axis = 1) # (batch_size,)

    has_gold_tag = tf.reduce_any(tf.not_equal(y_valid, tf.zeros_like(y_valid)), axis = 1)
    has_pred_tag = tf.reduce_any(tf.not_equal(y_hat_valid, tf.zeros_like(y_hat_valid)), axis = 1)

    # Scalar counts.
    n_exact = tf.reduce_sum(tf.cast(exact_match, tf.float32))
    correct_preds = tf.reduce_sum(tf.cast(tf.logical_and(exact_match, has_gold_tag), tf.float32))
    total_correct = tf.reduce_sum(tf.cast(has_gold_tag, tf.float32))
    total_preds = tf.reduce_sum(tf.cast(has_pred_tag, tf.float32))

    #---------------
    # output
    #---------------
    # A true positive implies has_gold_tag and has_pred_tag for that sentence,
    # so both denominators are >= 1 inside the guarded branches.
    any_hit = tf.greater(correct_preds, tf.zeros(shape=[]))
    p = tf.cond(any_hit, lambda : correct_preds/total_preds, lambda: tf.zeros(shape=[]))
    r = tf.cond(any_hit, lambda : correct_preds/total_correct, lambda: tf.zeros(shape=[]))
    f1 = tf.cond(any_hit, lambda : 2 * p * r / (p + r), lambda: tf.zeros(shape=[]))
    acc = n_exact/ tf.cast(tf.shape(y)[0], dtype = tf.float32) # exact matches / batch_size
    return {"acc": 100*acc, "prediction": p, "recall": r, "f1": f1}

    #return y, y_hat, correct_preds, sequence_mask, p,r,f1,acc

In [13]:
#----------
# for debugging
#-------------
max_seq_length = 36    # padded sequence length; matches the second dimension of the loaded word_id/label arrays
num_layers = 2         # number of temporal convolution layers stacked in the graph cell
hidden_size_cnn = 300  # output_units of every convolution layer
k = 3                  # convolution_width of every convolution layer
ntags = 3              # size of the per-timestep output layer (number of tag classes)

In [14]:
with tf.Graph().as_default() as g:
    ####################################
    # Step1: get input_sequences
    ####################################
    # NOTE(review): item_id, history_length, word_id and label are rebound
    # here from the numpy arrays loaded earlier to placeholders; the session
    # cell relies on these names, so they are kept.

    #------------
    # 1-D: one value per example
    #------------
    item_id = tf.placeholder(tf.int32, [None])
    history_length = tf.placeholder(tf.int32, [None]) # number of valid timesteps per example
    #------------
    # 2-D: one value per timestep
    #------------
    word_id = tf.placeholder(tf.int32, [None, max_seq_length])
    label = tf.placeholder(tf.int32, [None, max_seq_length]) # [batch_size, max_seq_length], tag index per timestep

    #------------
    # run-time switches (declared but not used by any op in this cell)
    #------------
    keep_prob = tf.placeholder(tf.float32)
    is_training = tf.placeholder(tf.bool)

    #------------
    # word_embedding: build the word-embedding matrix
    #------------
    if embeddings is None:
        logging.warning('WARNING: randomly initializing word vectors')
        word_embeddings = tf.get_variable(
            shape = [nwords, dim_word],
            name = 'word_embeddings',
            dtype = tf.float32,
        )
    else:
        word_embeddings = tf.get_variable(
            initializer = embeddings, # pretrained matrix; its shape defines the variable's shape
            trainable = trainable_embedding,
            name = 'word_embeddings',
            dtype = tf.float32
        )
    word_representation = tf.nn.embedding_lookup(params = word_embeddings, ids = word_id)
    # Single-input concat kept as the place to add further features.
    # NOTE(review): axis=1 is the time axis; extra per-timestep features would
    # presumably need axis=2 — confirm before adding any.
    x_word = tf.concat([
        word_representation,
        # tf_idf:for product_name
    ], axis=1) # (?, max_seq_length, dim_word)

    ####################################
    # Step2: calculate_outputs
    ####################################

    #-------------------------
    # NN architecture - simple CNN
    #-------------------------
    print ('Original features : {}'.format(x_word.shape))
    # Layer i reads the output of layer i-1; the first layer reads the embeddings.
    # NOTE(review): activation=None on every layer means no non-linearity is
    # requested between the stacked convolutions — confirm this is intended.
    conv = x_word
    for i in range(num_layers):
        conv = temporal_convolution_layer(conv,
                                          output_units = hidden_size_cnn,
                                          convolution_width = k,
                                          dilated = False,
                                          causal = False,
                                          bias=True,
                                          activation=None,
                                          dropout=None,
                                          scope='cnn-{}'.format(i),
                                          reuse = False,
                                         )
        print ('CNN-{} layer : {}'.format(i, conv.shape))
    # output layer (linear)
    y_hat = time_distributed_dense_layer(conv, ntags, activation=None, scope='output-layer') # (?, max_seq_length, ntags)
    print ('Output layer : {}'.format(y_hat.shape))
    print ('y_true : {}'.format(label.shape))
    #--------------
    # for second-level model
    #--------------
    prediction_tensors = {
        'item_id':item_id,
        'word_id':word_id,
        'final_states':conv, # TODO: store only the first history_length steps instead of all max_seq_length (saves memory)
        'final_predictions':y_hat,
    }

    ####################################
    # Step3: calculate_loss + evaluation score + optimizing
    ####################################
    loss = sequence_softmax_loss(y = label, y_hat = y_hat, sequence_lengths = history_length, max_sequence_length = max_seq_length)
    learning_rate_var  = update_parameters(loss)

    # Predicted tag per timestep = index of the largest output value.
    labels_pred = tf.cast(tf.argmax(y_hat, axis= 2),tf.int32) # (?, max_seq_length)
    score = sequence_evaluation_metric(y = label, y_hat = labels_pred, sequence_lengths = history_length)['f1']
    #score,p,r,y_,correct_preds,total_correct,total_preds,acc,sequence_mask,y = sequence_evaluation_metric(y = label, y_hat = labels_pred, sequence_lengths = history_length)
    #y_ture_input, y_pred_input, correct_preds,sequence_mask, p,r,f1,acc = sequence_evaluation_metric_1(y = label, y_hat = labels_pred, sequence_lengths = history_length)

#     ####################################
#     # Step4: saving the model
#     ####################################
#     # create saver object
#     # max_to_keep: indicates the maximum number of recent checkpoint files to keep.
#     saver = tf.train.Saver(max_to_keep = 1)
#     if enable_parameter_averaging:
#         saver_averaged = tf.train.Saver(ema.variables_to_restore(), max_to_keep=1)

    #-------------------------
    # standard
    #-------------------------
    init = tf.global_variables_initializer()
    

Original features : (?, 36, 300)
CNN-0 layer : (?, 36, 300)
CNN-1 layer : (?, 36, 300)
Output layer : (?, 36, 3)
y_true : (?, 36)
step - whtat optimizer.apply_gradients returns name: "Adam"
op: "AssignAdd"
input: "Variable"
input: "Adam/value"
attr {
  key: "T"
  value {
    type: DT_INT32
  }
}
attr {
  key: "_class"
  value {
    list {
      s: "loc:@Variable"
    }
  }
}
attr {
  key: "use_locking"
  value {
    b: false
  }
}



In [20]:
import os
from collections import deque # for computing Train/validation losses are averaged over the last loss_averaging_window
import tensorflow as tf
# Training-loop settings. Only warm_start_init_step and checkpoint_dir's inputs
# are read in this cell; the rest are declared for a fuller training loop.
warm_start_init_step = 0   # non-zero: resume a restored model at this step
batch_size = 128
loss_averaging_window = 10
num_validation_batches = 1
num_training_steps = 10
learning_rate = 0.001
log_interval = 1
min_steps_to_checkpoint = 1
early_stopping_steps = 10


base_dir = './'
checkpoint_dir = os.path.join(base_dir, 'checkpoints')

with tf.Session(graph=g, config=config) as sess:
    ####################################
    # 1. fit
    ####################################
    if not warm_start_init_step:
        # Fresh start: initialise every variable.
        sess.run(init)
        step = 0
    else:
        # Resume from a recent checkpoint instead of restarting the optimization.
        # NOTE(review): `restore` is not defined anywhere in the visible cells.
        restore(warm_start_init_step)
        step = warm_start_init_step

    # Evaluate the F1 tensor once on the two sample rows.
    score_ = sess.run(
        fetches=score,
        feed_dict={
            word_id: sentence,
            label: y_true,
            history_length: length,
        },
    )
#     y_ture_input_, y_pred_input_,correct_preds_,sequence_mask_, history_length_, p_,r_,f1_,acc_ = sess.run(fetches = [y_ture_input, y_pred_input,correct_preds,sequence_mask,history_length, p,r,f1,acc],
#                                                  feed_dict = {word_id:sentence,
#                                                  label: y_true,
#                                                  history_length: length})
#     label_, labels_pred_, score,p_,r_,y__,correct_preds_,total_correct_,total_preds_,acc_,history_length,sequence_mask_,y_ = sess.run(fetches = [label,
#                                                                                                            labels_pred,
#                                                                                                            score,
#                                                                                                            p,
#                                                                                                            r,
#                                                                                                            y_,
#                                                                                                            correct_preds,
#                                                                                                            total_correct,
#                                                                                                            total_preds,
#                                                                                                            acc,
#                                                                                                            history_length,
#                                                                                                            sequence_mask,y
#                                                                                                                                              ],
#                                     feed_dict = {word_id:sentence,
#                                                  label: y_true,
#                                                  history_length: length})

In [None]:
# 1. Consider changing ntags to 2: the more classes there are, the harder the task is to begin with.
# 2.

In [21]:
# F1 value fetched by the session cell for the two sample rows.
score_

0.0

# input

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
y_ture_input_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
y_pred_input_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
correct_preds_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
sequence_mask_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
history_length_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
p_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
r_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
f1_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
acc_

In [None]:
# NOTE(review): repeats an earlier inspection cell; the name is only assigned by a commented-out sess.run.
sequence_mask_

In [None]:
# NOTE(review): repeats an earlier inspection cell; the name is only assigned by a commented-out sess.run.
p_

In [None]:
# NOTE(review): repeats an earlier inspection cell; the name is only assigned by a commented-out sess.run.
r_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
y__

In [None]:
# NOTE(review): repeats an earlier inspection cell; the name is only assigned by a commented-out sess.run.
correct_preds_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
total_correct_

In [None]:
# NOTE(review): only assigned by a commented-out sess.run in the session cell; NameError on a fresh kernel.
total_preds_

In [None]:
# NOTE(review): repeats an earlier inspection cell; the name is only assigned by a commented-out sess.run.
acc_