In [1]:
import gc
import logging
import os
import sys
import time

import numpy as np
import pandas as pd
import tensorflow as tf
#sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
sys.path.append('../models')

from data_frame import DataFrame
from tf_base_model import TFBaseModel # for building our customized

# Session configuration: allow ops to be placed on another device when the
# requested one is unavailable, and let GPU memory be claimed on demand
# instead of being reserved up front.
config = tf.ConfigProto(
    allow_soft_placement=True,
    gpu_options=tf.GPUOptions(allow_growth=True),
)

class DataReader(object):
    '''
    Loads the train / val / test column arrays from disk and serves them as batches.

    Every split directory under `data_dir` (train/, val/, test/) must contain one
    .npy file per column listed in `__init__`. Each split is wrapped in the
    project's DataFrame object, whose own `batch_generator` produces the batches.
    '''

    def __init__(self, data_dir, trace_code=None):
        '''
        data_dir: directory that contains the train/, val/ and test/ sub-directories.
        trace_code: when True, the `<column>_0.npy` files are loaded instead of the
            `<column>.npy` files. When left as None, the module-level TRACE_CODE flag
            is used if some cell has defined it, otherwise it defaults to False.
        '''
        data_cols = [
            'item_id',
            'word_id',
            'history_length',
            'label'
        ]
        if trace_code is None:
            # The flag used to be read as a bare global, which raised NameError
            # whenever no cell had defined TRACE_CODE before instantiation.
            trace_code = globals().get('TRACE_CODE', False)
        # Kept as an equality test so non-bool flag values behave exactly as before.
        file_pattern = '{}/{}_0.npy' if trace_code == True else '{}/{}.npy'

        def load_split(split):
            # mmap_mode='r' memory-maps each file read-only rather than reading it fully.
            return [
                np.load(os.path.join(data_dir, file_pattern.format(split, col)), mmap_mode='r')
                for col in data_cols
            ]

        #-----------------
        # loading data
        #-----------------
        data_train = load_split('train')
        data_val = load_split('val')
        data_test = load_split('test')

        #------------------
        # For Testing-phase
        #------------------
        self.test_df = DataFrame(columns=data_cols, data=data_test)
        print ('loaded data')
        #------------------
        # For Training-phase
        #------------------
        self.train_df = DataFrame(columns=data_cols, data=data_train)
        self.val_df = DataFrame(columns=data_cols, data=data_val)

        print ('number of training example: {}'.format(len(self.train_df)))
        print ('number of validating example: {}'.format(len(self.val_df)))
        print ('number of testing example: {}'.format(len(self.test_df)))

    def train_batch_generator(self, batch_size, num_epochs=100000, shuffle = True, is_test = False):
        '''Yield batches drawn from the training split.'''
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=shuffle,
            num_epochs=num_epochs,
            is_test=is_test
        )

    def val_batch_generator(self, batch_size, num_epochs=100000, shuffle = True, is_test = False):
        '''Yield batches drawn from the validation split.'''
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=shuffle,
            num_epochs=num_epochs,
            is_test=is_test
        )

    def test_batch_generator(self, batch_size):
        '''
        Yield the test split once, unshuffled, allowing a smaller final batch.

        All rows in our dataframe need to be predicted as input of the second-level model.
        '''
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        '''
        Yield the batches produced by `df.batch_generator`.

        df: customized DataFrame object.
        is_test: forwarded as `allow_smaller_final_batch`.
        '''
        # Delegate to the customized DataFrame object; each yielded batch is
        # whatever that generator produces.
        batch_gen = df.batch_generator(
            batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=is_test
        )
        for batch in batch_gen:
            yield batch


  return f(*args, **kwds)


In [2]:
# Load each training column in full from ../models/data/train/ (no memory-mapping).
# Note that `word_lengths` is read from the file named word_length.npy.
word_id, history_length, label, item_id, char_id, word_lengths = (
    np.load(os.path.join('../models/data/train/', '{}.npy'.format(column)))
    for column in ('word_id', 'history_length', 'label', 'item_id', 'char_id', 'word_length')
)

# for debugging: keep only the first two examples of each array
sentence, y_true, length = word_id[:2], label[:2], history_length[:2]
char_id_____, word_lengths______ = char_id[:2], word_lengths[:2]

# Display the shapes of the debugging samples.
tuple(sample.shape for sample in (sentence, y_true, length, char_id_____, word_lengths______))

((2, 36), (2, 36), (2,), (2, 36, 54), (2, 36))

In [3]:
def get_optimizer(learning_rate, optimizer='adam'):
    '''
    Build the TensorFlow optimizer selected by name.

    learning_rate: learning rate handed to the optimizer constructor.
    optimizer: one of 'adam', 'adagrad', 'sgd' or 'rms'.

    Returns the matching tf.train optimizer instance.
    Raises ValueError when `optimizer` is not one of the supported names.
    '''
    if optimizer == 'adam':
        return tf.train.AdamOptimizer(learning_rate)
    elif optimizer == 'adagrad':
        return tf.train.AdagradOptimizer(learning_rate)
    elif optimizer == 'sgd':
        return tf.train.GradientDescentOptimizer(learning_rate)
    elif optimizer == 'rms':
        return tf.train.RMSPropOptimizer(learning_rate, decay=0.95, momentum=0.9)
    # An explicit raise (rather than `assert False`) still fires when Python runs
    # with -O, and tells the caller how the function is meant to be used.
    raise ValueError('optimizer must be adam, adagrad, sgd, or rms')

def update_parameters(loss, optimizer = 'adam', return_step = False):
    '''
    Build the training op for `loss` in the current default graph.

    The loss is augmented with an L2-norm penalty over all trainable variables,
    gradients are clipped element-wise, and the clipped gradients are applied
    with the optimizer selected by name (see get_optimizer).

    loss: scalar loss tensor to minimize.
    optimizer: optimizer name forwarded to get_optimizer.
    return_step: when True, return (learning_rate_var, step) so the caller can
        actually run the training op; the default keeps the original return value.

    Returns the non-trainable learning-rate variable (initialized to 0.0, so the
    caller has to assign a real learning rate), optionally together with the
    training op.

    Why gradient clipping: it is most common in recurrent neural networks.
    When gradients are propagated back in time they can vanish because they are
    continuously multiplied by numbers less than one (the vanishing gradient
    problem, addressed by LSTMs/GRUs and, in deep feedforward networks, by
    residual connections). They can also explode when repeatedly multiplied by
    numbers larger than 1. Clipping bounds the gradients between two numbers to
    prevent them from getting too large.
    '''
    #---------------
    # setting
    #---------------
    grad_clip = 5 # Each gradient element is clipped to the range [-grad_clip, grad_clip].
    regularization_constant = 0.1 # Regularization constant applied to all trainable parameters.
    enable_parameter_averaging = False # If true, exponential moving averages of the parameters are maintained.
    global_step = tf.Variable(0, trainable = False) # Incremented by one after the variables have been updated.
    learning_rate_var = tf.Variable(0.0, trainable = False)

    #-----------------
    # regularization: can be customized to the parameters we like
    #-----------------
    if regularization_constant != 0:
        # l2_norm is a 0-D tensor: the sum of the L2 norms of every trainable variable.
        # NOTE(review): the gradient of sqrt is undefined at 0, so a variable that is
        # exactly all-zero may yield NaN gradients here - confirm the initializers used.
        l2_norm = tf.reduce_sum([tf.sqrt(tf.reduce_sum(tf.square(param))) for param in tf.trainable_variables()])
        loss = loss + regularization_constant*l2_norm
    #-----------------
    # optimizing
    #-----------------
    # Use a separate local name so the `optimizer` argument (a string) is not shadowed.
    opt = get_optimizer(learning_rate_var, optimizer=optimizer)
    # compute_gradients returns a list of (gradient, variable) pairs; the variable is
    # always present but the gradient can be None, and None cannot be clipped.
    grads = opt.compute_gradients(loss)
    clipped = [
        (tf.clip_by_value(g, -grad_clip, grad_clip), v_)
        for g, v_ in grads
        if g is not None
    ]
    step = opt.apply_gradients(clipped, global_step = global_step)
    print ('step - whtat optimizer.apply_gradients returns', step)
    #-----------------
    # if using moving average techniques
    #-----------------
    if enable_parameter_averaging:
        ema = tf.train.ExponentialMovingAverage(decay=0.995)
        # Create the averaging op inside the dependency scope so the averages are
        # always computed after the parameters have been updated.
        with tf.control_dependencies([step]):
            step = tf.group(ema.apply(tf.trainable_variables()))
    if return_step:
        return learning_rate_var, step
    return learning_rate_var


In [5]:
from tf_utils import temporal_convolution_layer
from tf_utils import time_distributed_dense_layer
from tf_utils import sequence_softmax_loss
from tf_utils import sequence_evaluation_metric
from data_utils import get_glove_vectors
from data_utils import load_vocab_and_return_word_to_id_dict
from tf_utils import shape_of_tensor


dim_word = 300 # word embedding dimension
hidden_size_char = 100 # hidden size of the bi-LSTM run over characters

trainable_embedding = False # whether the word embedding matrix is updated during training
USE_PRETRAINED = True
USE_CHARS = True
filename_words_vec = "../models/data/wordvec/word2vec.npz"
filename_words_voc = "../models/data/wordvec/words_vocab.txt"
filename_chars_voc = "../models/data/wordvec/chars_vocab.txt"

nwords = len(load_vocab_and_return_word_to_id_dict(filename_words_voc))
nchars = len(load_vocab_and_return_word_to_id_dict(filename_chars_voc))
# Only cast when pretrained vectors were actually loaded; `embeddings` stays None
# otherwise so the graph cell can fall back to randomly initialized vectors.
embeddings = (get_glove_vectors(filename_words_vec).astype(np.float32) if USE_PRETRAINED else None)
enable_parameter_averaging = False

#----------
# for debugging
#-------------
max_seq_length = 36 # second dimension of the word-level arrays
max_word_length = 54 # third dimension of the char-level array
num_layers = 2 # number of stacked CNN layers
hidden_size_cnn = 300 # output units of each CNN layer
dim_char = 100 # char embedding dimension
k = 3 # convolution width
ntags = 3 # output units of the final (linear) layer
dilation_rate = [2**i for i in range(num_layers)] # one dilation per CNN layer: 1, 2, 4, ...

In [6]:
with tf.Graph().as_default() as g:
    ####################################
    # Step1: get input_sequences
    ####################################

    #------------
    # 1-D
    #------------
    item_id = tf.placeholder(tf.int32, [None])
    history_length = tf.placeholder(tf.int32, [None]) # passed as sequence_lengths to the loss and the metric below
    #------------
    # 2-D
    #------------
    word_id = tf.placeholder(tf.int32, [None, max_seq_length])
    label = tf.placeholder(tf.int32, [None, max_seq_length]) # [batch_size, max_seq_length]
    #------------
    # char-level inputs (2-D word lengths, 3-D char ids)
    #------------
    if USE_CHARS:
        word_lengths = tf.placeholder(tf.int32, shape=[None, max_seq_length])
        char_ids = tf.placeholder(tf.int32, shape=[None, max_seq_length, max_word_length]) # [batch_size, max_seq_length, max_word_length]

    #------------
    # scalar control inputs
    #------------
    keep_prob = tf.placeholder(tf.float32)
    is_training = tf.placeholder(tf.bool)

    #------------
    # word_embedding: get word embeddings matrix
    #------------
    if embeddings is None:
        logging.info('WARNING: randomly initializing word vectors')
        word_embeddings = tf.get_variable(
            shape = [nwords, dim_word],
            name = 'word_embeddings',
            dtype = tf.float32,
        )
    else:
        word_embeddings = tf.get_variable(
            initializer = embeddings, # the pretrained matrix is the initial value
            trainable = trainable_embedding,
            name = 'word_embeddings',
            dtype = tf.float32
        )
    word_representation = tf.nn.embedding_lookup(params = word_embeddings, ids = word_id)
    # Features concatenated along the last axis; the char-level part is optional.
    features = [word_representation]

    #------------
    # char_embedding: get char embeddings matrix
    #------------
    if USE_CHARS:
        char_embeddings = tf.get_variable(
            shape=[nchars, dim_char],
            name="char_embeddings",
            trainable = True,
            dtype=tf.float32,
        )
        # 4-D: [batch_size, max_seq_length, max_word_length, dim_char]
        char_representation = tf.nn.embedding_lookup(params = char_embeddings, ids = char_ids)
        # Flatten batch and sequence axes so every word becomes one character sequence.
        s = tf.shape(char_representation) # 1-D tensor: (batch_size, max_seq_length, max_word_length, dim_char)
        char_representation = tf.reshape(char_representation, shape=[ s[0]*s[1], s[-2], dim_char]) # [batch_size * max_seq_length, max_word_length, dim_char]
        # per-word character counts, flattened the same way
        word_lengths_ = tf.reshape(word_lengths, shape=[s[0]*s[1]]) # 1-D tensor
        # bi lstm on chars
        cell_fw = tf.contrib.rnn.LSTMCell(hidden_size_char,
                state_is_tuple=True)
        cell_bw = tf.contrib.rnn.LSTMCell(hidden_size_char,
                state_is_tuple=True)
        _output = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                inputs = char_representation,
                sequence_length = word_lengths_,
                dtype=tf.float32)
        # bidirectional_dynamic_rnn returns a tuple (outputs, output_states):
        #   - outputs: (output_fw, output_bw), each of shape
        #     [batch_size*max_seq_length, max_word_length, hidden_size_char], e.g. (72, 54, 100)
        #   - output_states: (output_state_fw, output_state_bw), the final states of the
        #     forward and backward cells; with state_is_tuple=True each one is a pair whose
        #     second element has shape [batch_size*max_seq_length, hidden_size_char], e.g. (72, 100)
        # Build the word-level representation from the final states of both directions.
        _, ((_, output_fw_final_state), (_, output_bw_final_state)) = _output
        output = tf.concat([output_fw_final_state, output_bw_final_state], axis=-1) # [batch_size*max_seq_length, 2*hidden_size_char]
        # reshape back to one vector per word
        word_representation_extracted_from_char = tf.reshape(output, shape=[s[0], s[1], 2* hidden_size_char]) # [batch_size, max_seq_length, 2*hidden_size_char]
        features.append(word_representation_extracted_from_char)

    # Without USE_CHARS only the word embeddings are used (the char-level tensor
    # does not exist in that case).
    x = tf.concat(features, axis = 2) # [batch_size, max_seq_length, dim_word (+ 2*hidden_size_char)]

    ####################################
    # Step2: calculate_outputs
    ####################################

    #-------------------------
    # NN architecture - Simple CNN
    #-------------------------
    print ('Original features : {}'.format(x.shape))
    # NOTE(review): the output saved with this cell shows
    # "ValueError: len(dilation_rate)=2 but should be 1" right after the print above,
    # although a one-element list is passed below - confirm the format
    # temporal_convolution_layer expects for dilation_rate, or whether that output is stale.
    conv = x
    for i, dilation in zip(range(num_layers), dilation_rate):
        # Each layer consumes the previous layer's output (the first one consumes x).
        conv = temporal_convolution_layer(conv,
                                   output_units = hidden_size_cnn,
                                   convolution_width = k,
                                   dilated = True,
                                   dilation_rate = [dilation],
                                   causal = False,
                                   bias=True,
                                   activation=None,
                                   dropout=None,
                                   scope='cnn-{}'.format(i),
                                   reuse = False,
                                  )
        print ('CNN-{} layer : {}'.format(i, conv.shape))
    # output layer (linear)
    y_hat = time_distributed_dense_layer(conv, ntags, activation=None, scope='output-layer') # [batch_size, max_seq_length, ntags]
    print ('Output layer : {}'.format(y_hat.shape))
    print ('y_true : {}'.format(label.shape))
    #--------------
    # for second-level model
    #--------------
    prediction_tensors = {
        'item_id':item_id,
        'word_id':word_id,
        'final_states':conv, # TODO: do not store all max_seq_length steps, only the first history_length ones (save memory)
        'final_predictions':y_hat,
    }

    ####################################
    # Step3: calculate_loss + evaluation score + optimizing
    ####################################
    loss = sequence_softmax_loss(y = label, y_hat = y_hat, sequence_lengths = history_length, max_sequence_length = max_seq_length)
    learning_rate_var  = update_parameters(loss)

    labels_pred = tf.cast(tf.argmax(y_hat, axis= 2),tf.int32) # [batch_size, max_seq_length]
    score = sequence_evaluation_metric(y = label, y_hat = labels_pred, sequence_lengths = history_length, max_sequence_length = max_seq_length)['f1']

#     ####################################
#     # Step4: saving the model
#     ####################################
#     # create saver object
#     # max_to_keep: indicates the maximum number of recent checkpoint files to keep.
#     saver = tf.train.Saver(max_to_keep = 1)
#     if enable_parameter_averaging:
#         saver_averaged = tf.train.Saver(ema.variables_to_restore(), max_to_keep=1)

    #-------------------------
    # standard
    #-------------------------
    init = tf.global_variables_initializer()
    

Original features : (?, 36, 500)


ValueError: len(dilation_rate)=2 but should be 1

In [None]:
# Preview the dilation schedule: one power of two per CNN layer.
[pow(2, layer) for layer in range(num_layers)]