In [1]:
import os

# Silence TensorFlow's C++ info/warning log lines. The variable is placed in
# the environment *before* TensorFlow is imported so that the native library
# already sees it while it is being loaded; setting it afterwards is too late
# for messages emitted during import.
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
#import tensorflow_addons as tfa
import numpy as np
import sys
from random import randint
import datetime
from sklearn.utils import shuffle
import pickle

def createTrainMatrix(conversationFileName, wList, maxLen):
    """Build padded, integer-encoded encoder/decoder matrices from a saved conversation dict.

    The file is expected to hold a pickled ``dict`` (saved with ``np.save``)
    mapping an input message (key) to its reply (value), both plain strings.

    Each message is split on whitespace, every word is replaced by its index
    in ``wList``, an ``<EOS>`` index is written right after the last word and
    the remainder of the row is filled with the ``<pad>`` index.

    Args:
        conversationFileName: Path to the ``.npy`` file holding the dictionary.
        wList: Vocabulary list; must contain ``'<pad>'`` and ``'<EOS>'``.
        maxLen: Row width. Messages with more than ``maxLen - 1`` words (or
            with no words at all) are skipped, together with their partner.

    Returns:
        Tuple ``(numExamples, xTrain, yTrain)`` where both matrices are int32
        arrays of shape ``(numExamples, maxLen)`` and row ``i`` of ``xTrain``
        always corresponds to row ``i`` of ``yTrain``.

    Raises:
        ValueError: If ``'<pad>'`` or ``'<EOS>'`` is missing from ``wList``
            (only checked when the dictionary is non-empty).
    """
    # NOTE(security): allow_pickle=True unpickles the file, which can execute
    # arbitrary code. Only load conversation files from a trusted source.
    conversationDictionary = np.load(conversationFileName,allow_pickle=True).item()
    if not conversationDictionary:
        return 0, np.zeros((0, maxLen), dtype='int32'), np.zeros((0, maxLen), dtype='int32')

    padIndex = wList.index('<pad>')
    eosIndex = wList.index('<EOS>')

    # One-off word -> index table instead of a linear wList.index() scan per
    # word. setdefault keeps the FIRST occurrence, exactly like list.index.
    wordToIndex = {}
    for position, word in enumerate(wList):
        wordToIndex.setdefault(word, position)

    def integerize(words):
        """Encode one tokenised message as a padded int32 row ending in <EOS>."""
        row = np.full((maxLen), padIndex, dtype='int32')
        # TODO: mapping unknown words to index 0 isn't really the right way
        # to handle out-of-vocabulary tokens (kept from the original code).
        row[:len(words)] = [wordToIndex.get(word, 0) for word in words]
        row[len(words)] = eosIndex
        return row

    encoderRows = []
    decoderRows = []
    for key, value in conversationDictionary.items():
        keySplit = key.split()
        valueSplit = value.split()
        # Throw out pairs where either side is empty or leaves no room for <EOS>
        if not keySplit or not valueSplit:
            continue
        if len(keySplit) > (maxLen - 1) or len(valueSplit) > (maxLen - 1):
            continue
        # Rows are appended pairwise, so x and y can never get out of step
        # (the previous "drop all-zero rows" filter ran on each matrix
        # independently and could misalign them).
        encoderRows.append(integerize(keySplit))
        decoderRows.append(integerize(valueSplit))

    if not encoderRows:
        return 0, np.zeros((0, maxLen), dtype='int32'), np.zeros((0, maxLen), dtype='int32')

    xTrain = np.stack(encoderRows)
    yTrain = np.stack(decoderRows)
    numExamples = xTrain.shape[0]
    return numExamples, xTrain, yTrain

        

    
    

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def getTrainingBatch(localXTrain, localYTrain, localBatchSize, maxLen, wList=None):
    """Draw a random contiguous training batch in time-major list form.

    A window of ``localBatchSize`` consecutive rows is picked at a random
    offset. Encoder rows are reversed, and a "lagged" copy of each label row
    (shifted right by one with ``<EOS>`` as the first token) is built to serve
    as decoder input.

    Args:
        localXTrain: 2-D integer array of encoder rows, one example per row.
        localYTrain: 2-D integer array of decoder label rows aligned with
            ``localXTrain``; each row must contain the ``<EOS>`` index.
        localBatchSize: Number of examples in the batch.
        maxLen: Row width of the matrices.
        wList: Vocabulary containing ``'<pad>'`` and ``'<EOS>'``. When omitted,
            the notebook-level ``wordList`` is used (original behaviour).

    Returns:
        Tuple ``(reversedList, labels, laggedLabels)``; each is a list of
        ``maxLen`` lists with ``localBatchSize`` ints (i.e. transposed).

    Raises:
        ValueError: If the batch is larger than the data set, or a label row
            contains no ``<EOS>`` token.
    """
    if wList is None:
        # Backward-compatible fallback to the vocabulary defined by the notebook.
        wList = wordList

    # Size the window from the data actually passed in rather than from the
    # notebook global numTrainingExamples.
    numExamples = len(localXTrain)
    if localBatchSize > numExamples:
        raise ValueError(
            "Batch size {} exceeds the {} available training examples".format(
                localBatchSize, numExamples))

    # randint is inclusive on both ends, so the last valid start offset is
    # numExamples - localBatchSize (the former "- 1" never reached the last row).
    num = randint(0, numExamples - localBatchSize)
    arr = np.asarray(localXTrain[num:num + localBatchSize])
    labels = np.asarray(localYTrain[num:num + localBatchSize])

    # Reversing the order of encoder string apparently helps as per 2014 paper
    reversedInputs = arr[:, ::-1]

    # Lagged labels are for the training input into the decoder
    laggedLabels = []
    EOStokenIndex = wList.index('<EOS>')
    padTokenIndex = wList.index('<pad>')
    for example in labels:
        eosPositions = np.flatnonzero(example == EOStokenIndex)
        if eosPositions.size == 0:
            raise ValueError("Label row contains no <EOS> token")
        eosFound = int(eosPositions[0])
        shiftedExample = np.roll(example, 1)
        shiftedExample[0] = EOStokenIndex
        # After the shift the original <EOS> sits one slot to the right and
        # must become padding. If <EOS> was in the last slot it wrapped around
        # to index 0, which has just been overwritten, so nothing is left to pad.
        if eosFound != (maxLen - 1):
            shiftedExample[eosFound + 1] = padTokenIndex
        laggedLabels.append(shiftedExample)

    # Need to transpose these (batch-major -> time-major)
    reversedList = reversedInputs.T.tolist()
    labels = labels.T.tolist()
    laggedLabels = np.asarray(laggedLabels).T.tolist()
    return reversedList, labels, laggedLabels

def translateToSentences(inputs, wList, encoder=False):
    """Turn a time-major grid of token ids back into one string per example.

    ``inputs`` is indexed as ``inputs[timeStep][exampleIndex]``. Ids equal to
    the ``<EOS>`` or ``<pad>`` index are dropped; every other id is looked up
    in ``wList``. With ``encoder=True`` the word order of each string is
    flipped, because encoder sequences are stored reversed.

    Returns a list with ``len(inputs[0])`` whitespace-stripped strings.
    """
    ignoredIds = (wList.index('<EOS>'), wList.index('<pad>'))
    wordsPerExample = [[] for _ in range(len(inputs[0]))]

    for timeStep in inputs:
        for column, tokenId in enumerate(timeStep):
            if tokenId in ignoredIds:
                continue
            wordsPerExample[column].append(wList[tokenId])

    if encoder:
        # Encodings are in reverse!
        wordsPerExample = [words[::-1] for words in wordsPerExample]

    return [" ".join(words).strip() for words in wordsPerExample]

def getTestInput(inputMessage, wList, maxLen):
    """Encode a raw test message as reversed, time-major encoder input.

    The message is lower-cased and split on whitespace. Known words are
    replaced by their index in ``wList``; unknown words keep the ``<pad>``
    index at their position. ``<EOS>`` is written after the last word, the
    row is reversed, and every id is wrapped in a one-element list (a batch
    of size one per time step).

    Args:
        inputMessage: Text to encode. May be empty.
        wList: Vocabulary list containing ``'<pad>'`` and ``'<EOS>'``.
        maxLen: Number of encoder time steps. Messages longer than
            ``maxLen - 1`` words are truncated so ``<EOS>`` always fits.

    Returns:
        List of ``maxLen`` single-element lists of int32 ids.

    Raises:
        ValueError: If ``maxLen`` is smaller than 1 or a special token is
            missing from ``wList``.
    """
    if maxLen < 1:
        raise ValueError("maxLen must be at least 1 to hold the <EOS> token")

    encoderMessage = np.full((maxLen), wList.index('<pad>'), dtype='int32')
    # Keep at most maxLen - 1 words: the slot after the last word is for <EOS>.
    inputSplit = inputMessage.lower().split()[:maxLen - 1]
    for index,word in enumerate(inputSplit):
        try:
            encoderMessage[index] = wList.index(word)
        except ValueError:
            # Out-of-vocabulary word: leave the padding id in place.
            continue
    # len(inputSplit) rather than the loop variable, so an empty message
    # (loop never runs) still gets its <EOS> at position 0.
    encoderMessage[len(inputSplit)] = wList.index('<EOS>')
    # The encoder is fed reversed sequences, matching the training batches.
    return [[num] for num in encoderMessage[::-1]]

def intList2Sentence(ids, wList):
    """Decode a sequence of one-element id containers into response strings.

    Only the first element of each entry in ``ids`` is used. The ``<EOS>``
    and ``<pad>`` ids act as separators: the words collected since the last
    separator form one response. Each word is followed by a single space
    (so responses carry a trailing space), and empty responses are dropped.

    Returns the list of non-empty response strings in order of appearance.
    """
    separatorIds = (wList.index('<EOS>'), wList.index('<pad>'))
    responses = []
    pendingWords = []

    for step in ids:
        tokenId = step[0]
        if tokenId in separatorIds:
            # Close the response that was being built and start a new one
            responses.append("".join(pendingWords))
            pendingWords = []
        else:
            pendingWords.append(wList[tokenId] + " ")

    # Whatever is left after the last separator is a response as well
    responses.append("".join(pendingWords))
    return [response for response in responses if response]



In [None]:
#main model development
# Builds the training matrices, a TF1-style embedding seq2seq graph and runs
# the training loop. The graph code uses the TensorFlow 1.x API
# (tf.placeholder, tf.contrib, tf.Session) and therefore needs a TF 1.x runtime.
#tf.compat.v1.disable_eager_execution()   #tfv2
#disable_eager_execution()
datarange = {'2015-01'}
batchSize = 25
maxEncoderLength = 95                                        #represents max length of input /output sentence
maxDecoderLength = maxEncoderLength
lstmUnits = 112
embeddingDim = lstmUnits
numLayersLSTM = 3
numIterations = 15000
for timeframe in datarange:
    # NOTE(security): pickle.load can execute arbitrary code; only use a
    # wordList.txt that comes from a trusted source.
    with open("wordList.txt","rb") as fp:
        wordList = pickle.load(fp)

    # The two special tokens are appended last, so they get the highest ids.
    wordList.append('<pad>')
    wordList.append('<EOS>')
    vocabSize = len(wordList)
    #question = 'How many dimensions do you want your word vectors to be?: '
    #wordVecDimensions = int(input(question))
    #     if (os.path.isfile('embeddingMatrix.npy')):
    #         wordVectors = np.load('embeddingMatrix.npy')
    #         wordVecDimensions = wordVectors.shape[1]

    wordVecDimensions = 5
    padVector = np.zeros((1,wordVecDimensions), dtype = 'int32')
    EOSVector = np.zeros((1,wordVecDimensions), dtype = 'int32')
    # True: rebuild the matrices from the raw conversation file and cache them.
    # False: reuse the cached .npy files from an earlier run.
    generateTrainDataFlag= True
    if(generateTrainDataFlag):
        numTrainingExamples, xTrain, yTrain = createTrainMatrix("./chatdata/comrepTrain{}.npy".format(timeframe), wordList, maxEncoderLength)
        np.save('seq2sexXTrain.npy',xTrain)
        np.save('seq2sexYTrain.npy',yTrain)
    else:
        if (os.path.isfile('seq2sexXTrain.npy') and os.path.isfile('seq2sexYTrain.npy')):
            xTrain= np.load('seq2sexXTrain.npy')
            yTrain= np.load('seq2sexYTrain.npy')
            numTrainingExamples = xTrain.shape[0]
        else:
            # Fail fast: carrying on without data would only surface later as
            # a NameError (or silently reuse stale variables from the kernel).
            raise FileNotFoundError("Train Data not found ! please generate train data first.")


    print("Training matrix created !")

    print("padding index in wordlist",wordList.index('<pad>'),"eos at",wordList.index('<EOS>'))
    # Unused draft of a TF2 port, kept as a plain string for reference only.
    tfV2 = """    encoderInputs = [tf.keras.backend.placeholder(dtype=tf.int32, shape=(None,)) for i in range(maxEncoderLength)]
    decoderInputs = [tf.keras.backend.placeholder(dtype=tf.int32, shape=(None,)) for i in range(maxDecoderLength)]
    decoderLabels = [tf.keras.backend.placeholder(dtype=tf.int32, shape=(None,)) for i in range(maxDecoderLength)]
    feedPrevious = tf.keras.backend.placeholder(dtype=tf.bool)
    #long short-term memory
    encoderLstm = tf.compat.v1.nn.rnn_cell.LSTMCell(lstmUnits, state_is_tuple=True)
    encoderLstmKeras = tf.keras.layers.LSTMCell(lstmUnits)
    #     encoderLSTM = tf.nn.rnn_cell.MultiRNNCell([singleCell]*numLayersLSTM, state_is_tuple=True)
    #^ for cloud training 
    decoderOutput, decoderFinalState = tfa.seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLstm,vocabSize, vocabSize,embeddingDim,feedPrevious_previous=feedPrevious) 
    
    #PREV:09032021  [tf.nn.seq2seq.embedding_rnn_seq2seq][tf.models.rnn.seq2seq.embedding_rnn_seq2seq]
    #     decoderPrediction = tf.argmax(decoderOutput,2)
    
    #     lossWeights = [tf.ones_like(l,dtype=tf.float32) for l in decoderLabels]
    #     loss = tfa.legacy_seq2seq.sequence_loss(decoderOutput, decoderLabels, lossWeights, vocabSize)
    #     optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)
    
    """    
    #tf v1

    tf.reset_default_graph()
    # One placeholder per time step; each is fed a vector with one id per batch element.
    encoderInputs = [tf.placeholder(tf.int32, shape=(None,)) for i in range(maxEncoderLength)]
    decoderLabels = [tf.placeholder(tf.int32,shape=(None,)) for i in range(maxDecoderLength)]
    decoderInputs = [tf.placeholder(tf.int32,shape=(None,)) for i in range(maxDecoderLength)]
    # False while training (decoder is fed the lagged labels), True at test
    # time (fed as feed_previous to embedding_rnn_seq2seq).
    feedPrevious = tf.placeholder(tf.bool)
    encoderLSTM = tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True)

    decoderOutputs, decoderFinalState = tf.contrib.legacy_seq2seq.embedding_rnn_seq2seq(encoderInputs, decoderInputs, encoderLSTM,vocabSize,vocabSize,embedding_size= embeddingDim, feed_previous = feedPrevious)

    decoderPrediction = tf.argmax(decoderOutputs,2)

    # Every position (padding included) contributes to the loss with weight 1.
    lossWeights = [tf.ones_like(l,dtype= tf.float32) for l in decoderLabels]
    loss = tf.contrib.legacy_seq2seq.sequence_loss(decoderOutputs,decoderLabels,lossWeights,vocabSize)
    optimizer = tf.train.AdamOptimizer(1e-4).minimize(loss)
    # The session is intentionally left open so later cells can keep using it.
    sess = tf.Session()
    saver = tf.train.Saver()
    #loading a saved model
    #saver.restore(sess,tf.train.latest_checkpoint('models/'))
    sess.run(tf.global_variables_initializer())

    #forwarding result to tensorboard
    tf.summary.scalar('loss',loss)
    merged= tf.summary.merge_all()
    logDir = "tensorboard/"+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")+"/"
    writer = tf.summary.FileWriter(logDir, sess.graph)
    # test strings
    encoderTestStrings = ["Hi","hey ! who are you","hi there","you wanna chill","bob n vagene"]
    # Dummy single-element feed for the decoder placeholders at test time.
    zeroVector = np.zeros((1),dtype= 'int32')
    for i in range(numIterations):
        encoderTrain, decodertargerTrain, decoderInputTrain, = getTrainingBatch(xTrain,yTrain, batchSize , maxEncoderLength)
        feedDict = {encoderInputs[t]:encoderTrain[t] for t in range(maxEncoderLength)}
        feedDict.update({decoderLabels[t]:decodertargerTrain[t] for t in range(maxDecoderLength)})
        feedDict.update({decoderInputs[t]:decoderInputTrain[t] for t in range(maxDecoderLength)})
        feedDict.update({feedPrevious:False})
        curLoss ,_,pred = sess.run([loss, optimizer, decoderPrediction],feed_dict = feedDict)
        if(i%50 == 0):
            print("Current Loss",curLoss, "at i = ",i)
            summary = sess.run(merged,feed_dict=feedDict)
            writer.add_summary(summary,i)
        if(i%25==0 and i!=0):
            # Qualitative check: decode one random test prompt with the current weights.
            num = randint(0,len(encoderTestStrings)-1)
            print("Encoder Test String",encoderTestStrings[num])
            inputVector = getTestInput(encoderTestStrings[num],wordList,maxEncoderLength)
            feedDict = {encoderInputs[t]:inputVector[t] for t in range(maxEncoderLength)}
            feedDict.update({decoderLabels[t]:zeroVector for t in range(maxDecoderLength)})
            feedDict.update({decoderInputs[t]:zeroVector for t in range(maxDecoderLength)})
            feedDict.update({feedPrevious: True})
            ids = (sess.run(decoderPrediction,feed_dict= feedDict))
            print(intList2Sentence(ids,wordList))
        if(i%1000 == 0 and i!=0):
            # Saver.save does not create the target directory itself; make
            # sure it exists so the first checkpoint does not abort training.
            os.makedirs("models", exist_ok=True)
            savePath= saver.save(sess,"models/pretrained_seq2seq.ckpt",global_step=i)
            
            
        
    
    
    


Training matrix created !
padding index in wordlist 8866 eos at 8867
Current Loss 9.091503 at i =  0
Encoder Test String you wanna chill
["I'm I'm "]
Current Loss 8.988535 at i =  50
Encoder Test String Hi
['9/10 9/10 ']
Encoder Test String bob n vagene
['9/10 9/10 9/10 ']
Current Loss 6.918695 at i =  100
Encoder Test String you wanna chill
[]
Encoder Test String bob n vagene
[]
Current Loss 5.3039618 at i =  150
Encoder Test String hi there
[]
Encoder Test String bob n vagene
[]
Current Loss 3.9432845 at i =  200
Encoder Test String Hi
[]
Encoder Test String Hi
[]
Current Loss 2.693292 at i =  250
Encoder Test String hey ! who are you
[]
Encoder Test String you wanna chill
[]
Current Loss 2.3886664 at i =  300
Encoder Test String you wanna chill
[]
Encoder Test String Hi
[]
Current Loss 2.4439323 at i =  350
Encoder Test String Hi
[]
Encoder Test String hey ! who are you
[]
Current Loss 1.9808328 at i =  400
Encoder Test String hey ! who are you
[]
Encoder Test String bob n vagene
[]