### Load Dependencies

In [1]:
#!pip3 install tensorflow

In [2]:
import pandas as pd
import numpy as np
import csv
# from sklearn.naive_bayes import GaussianNB
# from sklearn import svm
# from sklearn import tree
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# from sklearn.cross_validation import train_test_split
import tensorflow as tf

### Load Data

In [3]:
def csv_to_list(csv_file):
    """Read a CSV file into a list of rows.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file to read.

    Returns
    -------
    list of list of str
        One inner list per CSV record; every field is kept as a string.
    """
    # newline='' passes line endings to the csv module untouched, as the csv
    # documentation asks for; without it, line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(csv_file, 'r', newline='') as f:
        return list(csv.reader(f))

In [4]:
# Load the raw log data; every field is still a string at this point.
# Rows of the two files are paired by position further down, so both files
# are expected to list the jobs in the same order.
job_result = csv_to_list('Log_Data_0307/job_result11740.csv')
keyword_in_job = csv_to_list('Log_Data_0307/keyword_in_job11740.csv')

In [5]:
# Length of the longest record, counted in keywords; used as the padding target
max_records = np.max([len(record) for record in keyword_in_job])
max_records

380503

In [6]:
def transform_to_NN_format(x, max_len=None):
    """Convert one record of keyword ids to integers and left-pad it with zeros.

    Parameters
    ----------
    x : iterable of str
        Keyword ids of a single record, as read from the CSV file.
    max_len : int, optional
        Length to pad to. When omitted, the notebook-level ``max_records``
        is used, so existing single-argument calls (e.g. through ``map``)
        behave as before.

    Returns
    -------
    list of int
        The ids as integers, preceded by as many zeros as needed to reach
        ``max_len``. Records that are already at least ``max_len`` long are
        returned without padding (and are not truncated).

    Raises
    ------
    ValueError
        If an element of ``x`` cannot be parsed by ``int``.
    """
    if max_len is None:
        max_len = max_records

    ids = [int(token) for token in x]  # 'str' -> 'int'

    # Only a record that is exactly [0] is remapped to [999]; zeros inside
    # longer records are kept. Presumably this keeps such a record
    # distinguishable from pure zero padding.
    # NOTE(review): 999 must be a valid index for whatever consumes these ids
    # (the embedding table later in the notebook is sized by n_words) -- confirm.
    if ids == [0]:
        ids = [999]

    missing = max_len - len(ids)
    if missing > 0:
        # Pad on the left so the real ids sit at the end of the sequence.
        return [0] * missing + ids
    return ids

In [7]:
# Convert every record to integer ids and pad it to the common length
keyword_in_job_transformed = [
    transform_to_NN_format(record) for record in keyword_in_job]

In [8]:
# transform job_result
from itertools import chain
# Flatten the nested rows and convert 'str' to 'int' in a single pass.
# job_result itself is deliberately left untouched: overwriting it with the
# flat list made this cell break on a second run, because chain.from_iterable
# would then split every string into single characters ('-1' -> '-', '1').
job_result_transformed = list(map(int, chain.from_iterable(job_result)))

### Display length of keyword in job and result

In [9]:
#keyword_in_job_transformed

In [10]:
#len(keyword_in_job_transformed) # this variable is in format [[l1],[l2],[l3],...] #11740

In [11]:
#job_result_transformed

In [12]:
#len(job_result_transformed) # this variable is in format [1,0,-1,...]

In [13]:
# Work on the first 1000 logs only; the next cell cleans this sample by
# removing the logs whose job result is -1.
keyword_in_job_transformed_sample = keyword_in_job_transformed[:1000]
job_result_transformed_sample = job_result_transformed[:1000]

In [14]:
# Drop every log whose job result is -1, keeping labels and keyword records
# aligned by position. clean_temp holds the positions of the dropped logs.
clean_temp = [
    i for i, value in enumerate(job_result_transformed_sample) if value == -1]
job_result_transformed_cleaned = [
    value for value in job_result_transformed_sample if value != -1]

# A set gives constant-time membership tests; checking against the list
# made this filter quadratic in the sample size.
dropped_positions = set(clean_temp)
keyword_in_job_transformed_cleaned = [
    record for i, record in enumerate(keyword_in_job_transformed_sample)
    if i not in dropped_positions]

In [15]:
len(job_result_transformed_cleaned)

515

In [16]:
len(keyword_in_job_transformed_cleaned)

515

In [17]:
# # Taking a sample of 100 logs as model is taking a lot of time
# keyword_in_job_transformed_cleaned_sample = keyword_in_job_transformed_cleaned[:100]
# job_result_transformed_cleaned_sample = job_result_transformed_cleaned[:100]

In [18]:
# The first 400 cleaned logs are used for training, the remainder for testing
X_train, X_test = (keyword_in_job_transformed_cleaned[:400],
                   keyword_in_job_transformed_cleaned[400:])
y_train, y_test = (job_result_transformed_cleaned[:400],
                   job_result_transformed_cleaned[400:])

In [19]:
def create_batch_generator(x, y=None, batch_size=50):
    """Yield consecutive full batches of ``x`` (and of ``y`` when given).

    Only complete batches are produced: the trailing ``len(x) % batch_size``
    samples are skipped. With ``y`` supplied each item is the pair
    ``(x_batch, y_batch)``; otherwise each item is just ``x_batch``.
    """
    usable = (len(x) // batch_size) * batch_size
    for start in range(0, usable, batch_size):
        stop = start + batch_size
        if y is None:
            yield x[start:stop]
        else:
            yield x[start:stop], y[start:stop]

In [20]:
class AnalyzeLogRNN(object):
    """Stacked-LSTM binary classifier for padded sequences of keyword ids.

    The model is built with the TensorFlow 1.x graph API inside its own
    ``tf.Graph``: embedding lookup -> ``num_layers`` LSTM layers with output
    dropout -> one dense logit taken from the last time step -> sigmoid.

    Parameters
    ----------
    n_words : int
        Number of rows of the embedding matrix.
    seq_len : int
        Length of every input sequence (second dimension of ``tf_x``).
    lstm_size : int
        Number of units in each LSTM layer.
    num_layers : int
        Number of stacked LSTM layers.
    batch_size : int
        Fixed batch size baked into the placeholders and the initial state.
    learning_rate : float
        Learning rate handed to the Adam optimizer.
    embed_size : int
        Width of each embedding vector.
    """

    def __init__(self, n_words, seq_len=380503,
                 lstm_size=256, num_layers=2, batch_size=50,
                 learning_rate=0.0001, embed_size=100):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size

        # Everything lives in a private graph so that several models can
        # coexist in one process.
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    def build(self):
        """Create all graph nodes.

        ``train`` and ``predict`` address several nodes by name ('tf_x',
        'tf_y', 'tf_keepprob', 'cost', 'train_op', 'probabilities',
        'labels'), so those names must stay as they are.
        """
        tf_x = tf.placeholder(tf.int32,
                              shape=(self.batch_size, self.seq_len),
                              name='tf_x')
        tf_y = tf.placeholder(tf.float32,
                              shape=(self.batch_size),
                              name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32,
                                     name='tf_keepprob')

        # Trainable embedding table, initialised uniformly in [-1, 1).
        embedding = tf.Variable(
            tf.random_uniform(
                (self.n_words, self.embed_size),
                minval=-1, maxval=1),
            name='embedding')
        embed_x = tf.nn.embedding_lookup(
            embedding, tf_x,
            name='embeded_x')

        # One dropout-wrapped LSTM cell per layer; dropout acts on the
        # outputs of each layer.
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                output_keep_prob=tf_keepprob)
                for i in range(self.num_layers)])

        self.initial_state = cells.zero_state(
            self.batch_size, tf.float32)
        print('  << initial state >> ', self.initial_state)

        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            cells, embed_x,
            initial_state=self.initial_state)

        print('\n  << lstm_output   >> ', lstm_outputs)
        print('\n  << final state   >> ', self.final_state)

        # Classify from the output of the last time step only.
        logits = tf.layers.dense(
            inputs=lstm_outputs[:, -1],
            units=1, activation=None,
            name='logits')

        logits = tf.squeeze(logits, name='logits_squeezed')
        print ('\n  << logits        >> ', logits)

        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        predictions = {
            'probabilities': y_proba,
            'labels' : tf.cast(tf.round(y_proba), tf.int32,
                               name='labels')
        }
        print('\n  << predictions   >> ', predictions)

        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf_y, logits=logits),
            name='cost')

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    def train(self, X_train, y_train, num_epochs):
        """Train the model and write checkpoints under ``model/``.

        Batches come from ``create_batch_generator``, so samples beyond the
        last full batch are not used. Within an epoch the final LSTM state
        of one batch is fed as the initial state of the next batch. The
        training loss is printed every 20 iterations.

        A checkpoint is written every 10th epoch and, additionally, after
        the last epoch, so that ``predict`` always finds the weights of the
        run that has just finished.
        """
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                # Start every epoch from the all-zero state.
                state = sess.run(self.initial_state)

                for batch_x, batch_y in create_batch_generator(
                        X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': 0.5,
                            self.initial_state : state}
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op',
                         self.final_state],
                        feed_dict=feed)

                    if iteration % 20 == 0:
                        print("Epoch: %d/%d Iteration: %d "
                              "| Train loss: %.5f" % (
                                  epoch + 1, num_epochs,
                                  iteration, loss))

                    iteration +=1

                # Previously nothing was saved unless num_epochs reached 10,
                # which left predict() without a checkpoint to restore.
                is_last_epoch = (epoch + 1 == num_epochs)
                if (epoch + 1) % 10 == 0 or is_last_epoch:
                    self.saver.save(sess,
                                    "model/sentiment-%d.ckpt" % epoch)

    def predict(self, X_data, return_proba=False):
        """Predict for every row of ``X_data`` using the latest checkpoint.

        Parameters
        ----------
        X_data : sequence of sequences of int
            Rows of length ``seq_len``.
        return_proba : bool
            When true return the sigmoid outputs, otherwise the rounded
            0/1 labels.

        Returns
        -------
        numpy.ndarray
            One value per input row, in input order.

        Raises
        ------
        ValueError
            If no checkpoint exists under ``model/``.
        """
        n_samples = len(X_data)
        if n_samples == 0:
            # Nothing to run; avoids np.concatenate failing on an empty list.
            return np.array(
                [], dtype=np.float32 if return_proba else np.int32)

        # The graph only accepts full batches. Pad the last batch with
        # all-zero rows instead of silently dropping the trailing samples;
        # the predictions for the padding rows are cut off again below.
        remainder = n_samples % self.batch_size
        if remainder:
            pad_row = [0] * self.seq_len
            X_data = list(X_data) + [pad_row] * (self.batch_size - remainder)

        fetch_name = 'probabilities:0' if return_proba else 'labels:0'

        preds = []
        with tf.Session(graph = self.g) as sess:
            ckpt_path = tf.train.latest_checkpoint('model/')
            if ckpt_path is None:
                raise ValueError(
                    "No checkpoint found in 'model/'; call train() first.")
            self.saver.restore(sess, ckpt_path)

            test_state = sess.run(self.initial_state)
            for batch_x in create_batch_generator(
                    X_data, None, batch_size=self.batch_size):
                feed = {'tf_x:0' : batch_x,
                        'tf_keepprob:0': 1.0,  # no dropout at inference time
                        self.initial_state : test_state}
                pred, test_state = sess.run(
                    [fetch_name, self.final_state],
                    feed_dict=feed)
                preds.append(pred)

        return np.concatenate(preds)[:n_samples]


In [21]:
# Number of rows of the model's embedding matrix
n_words = 42
# Length of every padded record (same value as max_records computed above)
sequence_length = 380503

In [None]:
# Build the TensorFlow graph; the constructor prints the main tensors as it creates them
rnn = AnalyzeLogRNN(n_words=n_words,
                   seq_len=sequence_length,
                   embed_size=100,
                   lstm_size=256,
                   num_layers=2,
                   batch_size=50,
                   learning_rate=0.0001)

  << initial state >>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(50, 256) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(50, 256) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState_1/BasicLSTMCellZeroState/zeros:0' shape=(50, 256) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState_1/BasicLSTMCellZeroState/zeros_1:0' shape=(50, 256) dtype=float32>))

  << lstm_output   >>  Tensor("rnn/transpose_1:0", shape=(50, 380503, 256), dtype=float32)

  << final state   >>  (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(50, 256) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(50, 256) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_5:0' shape=(50, 256) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_6:0' shape=(50, 256) dtype=float32>))

  << logits        >>  Ten

In [None]:
# Train on the training split for 5 epochs
rnn.train(X_train, y_train, num_epochs=5)

In [None]:
# Sigmoid outputs of the model for the test split
proba = rnn.predict(X_test, return_proba=True)

In [None]:
proba

In [None]:
# Rounded 0/1 labels for the test split
pred = rnn.predict(X_test)
pred #predicted by RNN LSTM model

In [None]:
y_test # Actual label

In [None]:
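# Label encoding: 1 = success, 0 = failure.
# In this run the test data contained three failure logs and two success logs.
# The RNN/LSTM model predicted both successes correctly and two of the three failures correctly;
# it misclassified one failure as a success.
# NOTE(review): these counts look like they come from a smaller run than the X_test split defined above -- verify.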