In [133]:
## Import the necessary libraries and packages
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np 

import os
import glob
import csv
import matplotlib.pyplot as plt

# ***************************************************************************************************************
# ***************************************************************************************************************


# Read the dataset
def read_data(file_path):
    """Load every CSV file found under ``file_path`` into a single DataFrame.

    Args:
        file_path: Directory prefix, including its trailing path separator.
            It is concatenated with ``'*.csv'`` to build the glob pattern.

    Returns:
        A DataFrame containing the rows of all matched files, re-indexed from
        0, with every underscore removed from the ``event_value`` column.

    Raises:
        FileNotFoundError: If no CSV file matches the pattern.
    """
    # glob returns matches in arbitrary order; sort so the row order of the
    # combined frame is reproducible between runs and machines.
    file_names = sorted(glob.glob(file_path + '*.csv'))
    if not file_names:
        raise FileNotFoundError(f'No CSV files found matching {file_path}*.csv')

    # DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
    # frame on every call; read all files first and concatenate once.
    frames = [pd.read_csv(file_name) for file_name in file_names]
    dfs = pd.concat(frames, ignore_index=True)

    # Remove the '_' (underscore) from the 'event_value' column.
    # regex=False makes the literal replacement explicit across pandas versions.
    dfs['event_value'] = dfs['event_value'].str.replace('_', '', regex=False)

    return dfs

# ****************************************************************************************************************
# ****************************************************************************************************************

def data_preprocessing(df, sentence_size=20, tokenizer=None):
    """Turn the ``event_value`` column into padded n-gram training data.

    Consecutive events are grouped into space-separated "sentences" of
    ``sentence_size`` events each. Every sentence is tokenized and expanded
    into all of its prefixes of length >= 2; the last token of each prefix is
    the label and the preceding tokens are the input.

    Args:
        df: DataFrame with a string-valued ``event_value`` column.
        sentence_size: Number of events per sentence (default 20).
        tokenizer: Optional Keras ``Tokenizer`` to fit. It is fitted in place,
            so the caller keeps a reference to the fitted vocabulary (needed
            to map predicted indices back to words). When omitted, a fresh
            tokenizer is created and used internally only.

    Returns:
        A tuple ``(xs, labels, ys, max_sequence_len, total_words)``:
        ``xs`` is every padded sequence without its last token, ``labels`` is
        the last token of every padded sequence, ``ys`` is ``labels`` one-hot
        encoded over ``total_words`` classes, ``max_sequence_len`` is the
        length of the longest n-gram sequence, and ``total_words`` is the
        vocabulary size plus one for the padding index ``0``.

    Raises:
        ValueError: If ``sentence_size`` is smaller than 1, or if the data
            does not yield a single training sequence.
    """
    if sentence_size < 1:
        raise ValueError('sentence_size must be at least 1')

    events = df['event_value'].values.flatten()

    # Only complete sentences are kept: trailing events that do not fill a
    # whole group of `sentence_size` are left out.
    usable = len(events) - len(events) % sentence_size
    corpus = [
        ' '.join(events[start:start + sentence_size])
        for start in range(0, usable, sentence_size)
    ]

    # Create (or reuse) the tokenizer and learn the vocabulary
    if tokenizer is None:
        tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)

    # Define the total words. Add 1 for the index `0` which is just the padding token.
    total_words = len(tokenizer.word_index) + 1

    # Expand every tokenized sentence into its prefixes of length 2..len
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        input_sequences.extend(
            token_list[:end] for end in range(2, len(token_list) + 1)
        )

    if not input_sequences:
        raise ValueError(
            'Not enough data to build training sequences '
            f'(sentence_size={sentence_size}, events={len(events)})'
        )

    # Get the length of the longest sequence
    max_sequence_len = max(len(sequence) for sequence in input_sequences)

    # Pad all sequences (pad_sequences already returns a NumPy array)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # Create inputs and label by splitting the last token in the subphrases
    xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

    # Convert the label into one-hot arrays
    ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

    return xs, labels, ys, max_sequence_len, total_words


# ****************************************************************************************************************
# ****************************************************************************************************************

# Build the model
def build_model(input_length, total_words):
    """Build and compile the bidirectional-LSTM next-token classifier.

    Args:
        input_length: Length of the full padded sequences. The Embedding
            layer is sized to ``input_length - 1`` because the last token of
            each sequence is split off as the label.
        total_words: Vocabulary size including the padding index; used as
            both the embedding input dimension and the number of output
            classes.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # Use the function's own argument rather than a name from an outer
        # scope, so the model shape depends only on what the caller passes.
        Embedding(total_words, 128, input_length=input_length - 1),
        Bidirectional(LSTM(128, return_sequences=True)),
        Bidirectional(LSTM(64)),
        Dense(total_words, activation='softmax')
    ])

    # Use categorical crossentropy because this is a multi-class problem
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Print the model summary
    print('\nModel Summary:')
    model.summary()

    return model


# ****************************************************************************************************************
# ****************************************************************************************************************

def predict_text(seed, next_words=50, *, model=None, tokenizer=None, max_sequence_len=None):
    """Greedily extend ``seed`` with the model's most likely next words.

    Args:
        seed: Starting text.
        next_words: Maximum number of words to generate (default 50).
        model: Trained model exposing ``predict``. Falls back to the
            module-level name ``model`` when omitted.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``. Falls back to the module-level name ``tokenizer``
            when omitted.
        max_sequence_len: Full padded sequence length used in training; the
            model input is padded to ``max_sequence_len - 1``. Falls back to
            the module-level name ``max_sequence_len`` when omitted.

    Returns:
        A tuple ``(prediction, prediction_unique)``: the seed followed by the
        generated words, and the same list with duplicates removed while
        keeping first-occurrence order.

    Raises:
        NameError: If a dependency is neither passed in nor defined at module
            level.
    """
    # Resolve dependencies: explicit arguments win, module-level names are the
    # backward-compatible fallback for callers that pass only the seed.
    resolved = {'model': model, 'tokenizer': tokenizer, 'max_sequence_len': max_sequence_len}
    module_scope = globals()
    for name, value in resolved.items():
        if value is None:
            if name not in module_scope:
                raise NameError(
                    f"predict_text needs '{name}': pass it as a keyword "
                    'argument or define it at module level'
                )
            resolved[name] = module_scope[name]
    model = resolved['model']
    tokenizer = resolved['tokenizer']
    max_sequence_len = resolved['max_sequence_len']

    seed_text = seed
    prediction = [seed_text]

    # Loop until desired length is reached
    for _ in range(next_words):

        # Convert the seed text to a token sequence
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        # Pad the sequence
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Feed to the model and get the probabilities for each index
        probabilities = model.predict(token_list)

        # Get the index with the highest probability
        predicted = np.argmax(probabilities, axis=-1)[0]

        # Index 0 is just the padding. The seed text is left unchanged in that
        # case, so every remaining iteration would feed the same input and get
        # the same answer; stop instead of repeating the call.
        if predicted == 0:
            break

        # Look up the word associated with the index.
        output_word = tokenizer.index_word[predicted]

        # Combine with the seed text
        seed_text += " " + output_word
        prediction.append(output_word)

    # dict keys keep insertion order, giving an order-preserving de-duplication
    prediction_unique = list(dict.fromkeys(prediction))

    return prediction, prediction_unique
        
# ****************************************************************************************************************
# ****************************************************************************************************************

def main():
    """Run the whole pipeline: load the CSVs, train the model, print a prediction."""
    # Locate the input folder relative to the current working directory
    file_path = os.path.join(os.getcwd(), 'sample_data/')
    print(f'\nFile Path: {file_path}')

    # Load and preview the combined data
    df = read_data(file_path)
    print(f'\nDataFrame: \n{df.head()}')

    # Build the training arrays (the integer labels are not needed here)
    features, _labels, targets, max_sequence_len, total_words = data_preprocessing(df)

    # Create and train the network
    network = build_model(max_sequence_len, total_words)
    print('\nTraining Time!')
    network.fit(features, targets, epochs=100)

    # NOTE(review): only the seed is handed to predict_text; the trained
    # network and max_sequence_len above are locals of this function and are
    # not passed along - confirm predict_text can reach what it needs.
    seed = 'dooron'
    predicted_list, unique_predicted_list = predict_text(seed)

    print(f'\nPredicted list for "{seed}" is: \n{predicted_list}')
    print(f'\nPredicted list for "{seed}" (uniqe) is: \n{unique_predicted_list}')

# ****************************************************************************************************************
# ****************************************************************************************************************
# Run the pipeline only when executed as a script or notebook cell, so that
# importing this module does not start a training run as a side effect.
if __name__ == '__main__':
    main()


File Path: /home/jupyter/ijahan1/textProcessing/sample_data/

DataFrame: 
                    timestamp event_value    sensor_value
0  2019-04-04 00:00:00.000049       camon    cam_on_obsrv
1  2019-04-04 00:00:14.747171       fanon    fan_on_obsrv
2  2019-04-04 00:00:15.031492        pcon  pc_on_no_obsrv
3  2019-04-04 00:00:15.139317       scron    scr_on_obsrv
4  2019-04-04 00:02:12.125632      fanoff   fan_off_obsrv

Model Summary:
Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 19, 128)           2944      
_________________________________________________________________
bidirectional_38 (Bidirectio (None, 19, 256)           263168    
_________________________________________________________________
bidirectional_39 (Bidirectio (None, 128)               164352    
_________________________________________________________________
dense_22 (