In [34]:
import os

# TF_CPP_MIN_LOG_LEVEL is consulted by TensorFlow's native runtime when the
# package is loaded, so it has to be in the environment *before*
# `import tensorflow`; assigning it afterwards does not silence those logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pickle as pk

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# Silence Python-side TensorFlow logging and deprecation warnings.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
try:
    # Newer TF 1.x releases expose the per-module warning limit here ...
    from tensorflow.python.util import module_wrapper as deprecation
except ImportError:
    # ... older ones keep it in `deprecation_wrapper`.
    from tensorflow.python.util import deprecation_wrapper as deprecation
deprecation._PER_MODULE_WARNING_LIMIT = 0


def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted pipeline.
    with open(path, 'rb') as f:
        return pk.load(f)


# Pre-split dataset: sentences (x_*) and their label arrays (y_*).
X_train = _load_pickle('x_tr')
X_test = _load_pickle('x_te')
y_train = _load_pickle('y_tr')
y_test = _load_pickle('y_te')


class Elmo_word_model(object):
    """ELMo embeddings -> BiLSTM -> dense layer with one sigmoid output per label.

    Building an instance resets the default TensorFlow graph and constructs the
    whole training graph in it. The tensors a caller needs are exposed as
    attributes:

    * ``placeholders`` -- dict with keys ``'sentence'``, ``'labels'``,
      ``'drop'`` and ``'sequence_length'`` mapping to the input placeholders.
    * ``targets``      -- the labels placeholder (int32, rank 2).
    * ``x_``           -- raw logits of the dense layer.
    * ``cross_entropy``, ``loss``, ``optimizer``, ``predictions``.
    """

    def __init__(self,
                 no_of_labels,
                 learning_rate,
                 rnn_units,
                 train_elmo = True,
                 output_type = 'state_output',
                 max_sentence_words = 150):
        """Build the graph.

        Args:
            no_of_labels: width of the output layer (number of labels).
            learning_rate: learning rate handed to the Adam optimizer.
            rnn_units: number of units in each LSTM direction.
            train_elmo: passed as ``trainable`` to the ELMo hub module.
            output_type: ``'flat'`` feeds the flattened forward LSTM outputs
                to the dense layer; any other value (default
                ``'state_output'``) feeds the concatenated final cell states
                of both directions.
            max_sentence_words: number of time steps assumed by the ``'flat'``
                head when sizing its weight matrix; unused otherwise.
        """
        tf.reset_default_graph()

        # feature extraction network  ------------------------------------------>>

        # Raw (already tokenized and padded) strings go in; labels are one-hot /
        # multi-hot integer rows.
        sentences = tf.placeholder(tf.string, (None), name='sentences')
        self.targets = tf.placeholder(tf.int32, [None, None], name='labels')
        sequence_length = tf.placeholder(tf.int32, (None), name='sequence_len')

        # Despite the historical "keep_prob" wording, this placeholder carries
        # the *drop* rate: the cells below keep outputs with prob. 1 - drop_rate.
        drop_rate = tf.placeholder(tf.float32, name='dropout')

        self.placeholders = {
            'sentence': sentences,
            'labels': self.targets,
            'drop': drop_rate,
            'sequence_length': sequence_length,
        }

        module = hub.Module('https://tfhub.dev/google/elmo/2', trainable = train_elmo)
        module_features = module(dict(tokens=sentences, sequence_len=sequence_length),
                                 signature='tokens', as_dict=True)
        embeddings = module_features["elmo"]

        # sequence learning network -------------------------------------------------------->
        # bilstm model
        with tf.variable_scope('forward'):
            fr_cell = tf.contrib.rnn.LSTMCell(num_units = rnn_units)
            dropout_fr = tf.contrib.rnn.DropoutWrapper(fr_cell, output_keep_prob = 1. - drop_rate)

        with tf.variable_scope('backward'):
            bw_cell = tf.contrib.rnn.LSTMCell(num_units = rnn_units)
            dropout_bw = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob = 1. - drop_rate)

        with tf.variable_scope('encoder') as scope:
            # sequence_length makes the RNN stop at each sentence's real end:
            # without it the '' padding tokens are processed too, the backward
            # pass starts on padding, and the final states used below describe
            # padding rather than the sentence.
            model, last_state = tf.nn.bidirectional_dynamic_rnn(dropout_fr,
                                                                dropout_bw,
                                                                inputs = embeddings,
                                                                sequence_length = sequence_length,
                                                                dtype = tf.float32)

        if output_type == 'flat':
            # Flatten the forward-direction outputs over time.
            # NOTE(review): this presumes every batch is padded to exactly
            # max_sentence_words time steps -- confirm against the data loader.
            feature_dim = rnn_units * max_sentence_words
            features = tf.reshape(model[0], (-1, feature_dim))
        else:
            # Final cell states of the forward and backward LSTMs, side by side.
            feature_dim = 2 * rnn_units
            features = tf.concat([last_state[0].c, last_state[1].c], axis=-1)

        # dense layer with xavier weights
        fc_layer = tf.get_variable(name='fully_connected',
                                   shape=[feature_dim, no_of_labels],
                                   dtype=tf.float32,
                                   initializer=tf.contrib.layers.xavier_initializer())

        # bias
        bias = tf.get_variable(name='bias',
                               shape=[no_of_labels],
                               dtype=tf.float32,
                               initializer=tf.contrib.layers.xavier_initializer())

        # final output (logits, one column per label)
        self.x_ = tf.add(tf.matmul(features, fc_layer), bias)

        # optimization and loss calculation ---------------------------------->>

        # Independent sigmoid cross-entropy per label, summed over labels and
        # averaged over the batch.
        self.cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
            logits = self.x_, labels = tf.cast(self.targets, tf.float32))
        self.loss = tf.reduce_mean(tf.reduce_sum(self.cross_entropy, axis=1))
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.loss)
        # A label is predicted when its sigmoid probability exceeds 0.5.
        self.predictions = tf.cast(tf.sigmoid(self.x_) > 0.5, tf.int32)
        
# # let's take one dataset for example
# import pickle as pk
# from multilab.datasets import reuter
# sentences, labels = reuter()

# from multilab.preprocess import Text_preprocessing
# text_preprocessing = Text_preprocessing()

# dataframe = text_preprocessing.labels_to_dataframe(sentences,labels)
# preprocessded_dataset = text_preprocessing.initial_preprocess(dataframe, chunk_value = 25)
# dataset, frequency_list = text_preprocessing.keep_labels(preprocessded_dataset,keep_ratio=0.10)
# slice_dataset = text_preprocessing.dataset_slice(dataset,ratio=0.25)

# import numpy as np
# all_sente = list(slice_dataset['text'])
# all_label = np.array(slice_dataset.drop('text', 1))

# X_train, X_test, y_train, y_test = text_preprocessing.split_dataset(all_sente, all_label)

In [25]:
from tqdm import tqdm
import nltk
def pad_sentences(sentences, show_progress=True):
    """Tokenize a batch of sentences and right-pad them to a common length.

    Args:
        sentences: iterable whose items are either raw strings (split on
            whitespace) or already-tokenized sequences of tokens.
        show_progress: when True (the default, matching the previous
            behaviour) the padding loop is wrapped in a ``tqdm`` progress bar.

    Returns:
        ``(padded_sentences, actual_length)`` where ``padded_sentences`` is a
        list of token lists, each extended with ``''`` up to the longest
        sentence in the batch, and ``actual_length`` holds every sentence's
        token count before padding. An empty batch yields ``([], [])``.
    """
    # Strings are whitespace-split; anything else is taken to be a token
    # sequence already, so pre-tokenized input no longer fails on `.split()`.
    tokenized = [
        sentence.split() if isinstance(sentence, (str, bytes)) else list(sentence)
        for sentence in sentences
    ]

    # max() on an empty batch would raise; an empty result is the natural answer.
    if not tokenized:
        return [], []

    actual_length = [len(tokens) for tokens in tokenized]
    sequence_len = max(actual_length)

    iterator = tqdm(tokenized) if show_progress else tokenized
    padded_sentences = [
        tokens + [''] * (sequence_len - len(tokens))
        for tokens in iterator
    ]
    return padded_sentences, actual_length

In [38]:
def actual_len(padded_list):
    """Return, for each padded sequence, how many entries are not the '' pad token."""
    return [
        sum(1 for token in padded if token != '')
        for padded in padded_list
    ]

def max_length(sequences):
    """Return the largest whitespace-separated token count among `sequences`."""
    longest = max(map(lambda text: len(text.split()), sequences))
    return longest


# X_train,_ = pad_sentences(X_train)
# y_train = y_train
# X_val,_   = pad_sentences(X_test)
# y_val   = y_test


# train_len    = actual_len(X_train)
# test_len     = actual_len(X_val)
# train_len.extend(test_len)

# max_len = max(train_len)



In [42]:
# The held-out pickles double as the validation split.
X_val = X_test
y_val = y_test

# Longest training sentence, counted in whitespace-separated tokens.
max_len = max_length(X_train)

# Hyper-parameters and run settings for the experiment.
config = {
    'no_of_labels': y_train.shape[1],   # one output per label column
    'learning_rate': 0.001,
    'rnn_units': 100,
    'epoch': 1,
    'batch_size': 12,
    'dropout': 0.2,
    'output_type': 'state_output',
    'train_elmo': True,
    # NOTE(review): machine-specific absolute path -- consider making it configurable.
    'result_path': '/Users/monk/Desktop',
}

# Separate shallow copy of the settings.
old_configuration = dict(config)

In [45]:
def get_train_data(batch_size, slice_no):
    """Return the `slice_no`-th training batch of `batch_size` examples.

    Slices the module-level `X_train` / `y_train`, pads the sentences with
    `pad_sentences`, and returns a dict with keys 'sentenc' (array of padded
    token rows), 'labels' (array of label rows) and 'sequence_len' (list of
    unpadded token counts).
    """
    # NOTE(review): the keys 'sentenc' / 'sequence_len' differ from the model's
    # placeholder keys ('sentence' / 'sequence_length'); kept as-is because
    # callers may depend on them -- confirm before renaming.
    start = slice_no * batch_size
    stop = (slice_no + 1) * batch_size

    raw_sentences = X_train[start:stop]
    label_rows = y_train[start:stop]
    padded_tokens, lengths = pad_sentences(raw_sentences)

    return {
        'sentenc': np.array(padded_tokens),
        'labels': np.array(label_rows),
        'sequence_len': lengths,
    }
    
    
    # test data loader
def get_test_data(self, batch_size,slice_no):
    """Return the `slice_no`-th validation batch of `batch_size` examples.

    Slices the module-level `X_val` / `y_val`, pads the sentences with
    `pad_sentences`, and returns a dict with keys 'sentenc' (array of padded
    token rows), 'labels' (array of label rows) and 'sequence_len' (list of
    unpadded token counts).
    """
    # NOTE(review): `self` is never used and this is a module-level function,
    # so callers must pass a placeholder first argument; it looks like a
    # leftover from a class method. Signature kept for compatibility.
    start = slice_no * batch_size
    stop = (slice_no + 1) * batch_size

    raw_sentences = X_val[start:stop]
    label_rows = y_val[start:stop]
    padded_tokens, lengths = pad_sentences(raw_sentences)

    return {
        'sentenc': np.array(padded_tokens),
        'labels': np.array(label_rows),
        'sequence_len': lengths,
    }



100%|██████████| 10/10 [00:00<00:00, 10425.81it/s]


{'sentenc': array([['argentine', 'port', 'workers', 'take', 'industrial', 'action',
         'argentine', 'port', 'workers', 'began', 'an', 'indefinite',
         'protest', 'against', 'safety', 'conditions', 'at', 'the',
         'port', 'of', 'buenos', 'aires', 'stopping', 'work', 'for'],
        ['green', 'tree', 'acceptance', 'inc', 'ltgnt', 'sets',
         'dividend', 'qtly', 'dividend', 'cts', 'vs', 'cts', 'pay',
         'march', 'record', 'march', '', '', '', '', '', '', '', '', ''],
        ['time', 'lttl', 'to', 'sell', 'part', 'of', 'unit', 'time',
         'inc', 'said', 'its', 'timelife', 'video', 'inc', 'subsidiary',
         'has', 'agreed', 'in', 'principle', 'to', 'sell', 'its',
         'institutional', 'training', 'business'],
        ['gulf', 'barge', 'freight', 'higher', 'in', 'nearbys', 'on',
         'call', 'gulf', 'barge', 'freight', 'rates', 'continued', 'to',
         'show', 'a', 'firmer', 'tone', 'in', 'the', 'nearbys', 'on',
         'the', 'assumption', 