In [24]:
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd
from tqdm import tqdm
from convokit import Corpus, Speaker, Utterance
from collections import defaultdict
import ast
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import os
# Hide every CUDA device from TensorFlow so the notebook runs on CPU only.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices
# is the supported check. bool() keeps the cell output a plain True/False.
bool(tf.config.list_physical_devices("GPU"))


False

In [2]:
# Prepare ending classifier
#
# Trains a TF-IDF + linear SVM pipeline that separates the last utterance of
# each conversation (label "e") from all earlier utterances (label "c").

# One conversation per line; utterances are delimited by '__eou__'.
# The context manager guarantees the corpus file is closed after reading.
with open("dialogues_text.txt", 'r', encoding='utf-8', errors='ignore') as f:
    convos = f.readlines()

ending = []
not_ending = []
starter = []
for convo in convos:
    # The piece after the last delimiter is discarded.
    sentences = convo.split('__eou__')[:-1]
    if not sentences:
        # Blank or delimiter-free line: there is no utterance to label.
        continue
    ending.append(sentences[-1])
    starter.append(sentences[0])
    not_ending += sentences[:-1]

X_train = np.array(ending+not_ending)
y_train_text = np.array(["e"]*len(ending)+["c"]*len(not_ending))

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train_text, test_size=0.2, random_state=42)

target_names = ['e', 'c']

# Every label is a one-character string, which MultiLabelBinarizer iterates
# as a single-label set. Classes are sorted, so column 0 is 'c' and column 1
# is 'e'; advanced_chatbot() reads column 0 of the prediction.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train)

classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train, Y)
predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

count = 0
for labels, truth in zip(all_labels, y_test):
    # A sample may come back with no label at all (neither per-class SVM
    # fires); treat that as a miss instead of indexing an empty tuple.
    if labels and labels[0] == truth:
        count += 1

print("accuracy: ", count/len(all_labels))

accuracy:  0.8767236356574092


In [3]:
# Prepare question classifier
#
# Trains a TF-IDF + linear SVM pipeline that separates questions (label "e")
# from non-questions (label "c"). The question mark itself is stripped from
# the training text, so the model has to rely on the wording.

# The context manager guarantees the corpus file is closed after reading.
with open("dialogues_text.txt", 'r', encoding='utf-8', errors='ignore') as f:
    convos = f.readlines()

questions = []
non_question = []
for convo in convos:
    sentences = convo.split('__eou__')[:-1]
    # The final utterance of each conversation is not used here.
    for sentence in sentences[:-1]:
        # An utterance counts as a question when its second-to-last character
        # is '?'. The length guard keeps very short utterances from raising
        # IndexError; they are filed as non-questions.
        if len(sentence) >= 2 and sentence[-2] == '?':
            questions.append(sentence[:-2])
        else:
            non_question.append(sentence)

X_train = np.array(questions+non_question)
y_train_text = np.array(["e"]*len(questions)+["c"]*len(non_question))

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train_text, test_size=0.2, random_state=42)

target_names = ['e', 'c']

# Classes are sorted by MultiLabelBinarizer, so column 0 is 'c' (non-question)
# and column 1 is 'e' (question); advanced_chatbot() reads column 0.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train)

question_classifier = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])

question_classifier.fit(X_train, Y)
predicted = question_classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)

count = 0
for labels, truth in zip(all_labels, y_test):
    # A sample may come back with no label at all; count it as a miss
    # instead of indexing an empty tuple.
    if labels and labels[0] == truth:
        count += 1

print("accuracy: ", count/len(all_labels))

accuracy:  0.8591776553719468


In [4]:
def prepare_model(data_path, max_samples=2000):
    """Rebuild the vocabularies and padded training arrays for one seq2seq model.

    Args:
        data_path: UTF-8 text file with one ``input<TAB>target`` pair per line.
        max_samples: Upper bound on the number of leading lines that are read
            (defaults to 2000).

    Returns:
        A 6-tuple ``(input_word_dict, max_input_length, encoder_input_data,
        decoder_input_data, output_word_dict, max_output_length)``.

    Raises:
        ValueError: if no ``input<TAB>target`` pair could be parsed.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    input_texts = []
    target_texts = []
    # The last element of the split is never read (len(lines) - 1).
    for line in lines[: min(max_samples, len(lines) - 1)]:
        fields = line.split('\t')
        if len(fields) < 2:
            # Blank or malformed line: skip it instead of raising IndexError.
            continue
        input_texts.append(fields[0])
        target_texts.append(fields[1])

    if not input_texts:
        raise ValueError('No input/target pairs found in {}'.format(data_path))

    input_word_dict, max_input_length, encoder_input_data = _fit_and_pad(
        input_texts, 'Input', 'Encoder')

    # The Tokenizer's default filters strip '<' and '>' and lowercase the text,
    # so these markers end up in the vocabulary as 'start' and 'end'.
    output_lines = ['<START> ' + text + ' <END>' for text in target_texts]
    output_word_dict, max_output_length, decoder_input_data = _fit_and_pad(
        output_lines, 'Output', 'Decoder')

    return input_word_dict, max_input_length, encoder_input_data, decoder_input_data, output_word_dict, max_output_length


def _fit_and_pad(texts, side, role):
    """Fit a fresh Tokenizer on ``texts`` and post-pad the sequences to equal length.

    ``side`` ('Input'/'Output') and ``role`` ('Encoder'/'Decoder') only label
    the progress messages. Returns ``(word_index, max_length, padded_array)``.
    """
    tokenizer = preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts( texts )
    sequences = tokenizer.texts_to_sequences( texts )

    max_length = max( len( seq ) for seq in sequences )
    print( '{} max length is {}'.format( side, max_length ))

    padded = np.array( preprocessing.sequence.pad_sequences( sequences , maxlen=max_length , padding='post' ) )
    print( '{} input data shape -> {}'.format( role, padded.shape ))

    word_dict = tokenizer.word_index
    # +1 accounts for index 0, which no word in word_index uses (padding value).
    print( 'Number of {} tokens = {}'.format( side, len( word_dict )+1 ))

    return word_dict, max_length, padded

In [5]:
def str_to_tokens( sentence : str, input_word_dict, max_input_length ):
    """Convert a raw sentence into a padded array of vocabulary indices.

    Args:
        sentence: Text typed by the user.
        input_word_dict: Mapping from word to integer index.
        max_input_length: Length every sequence is post-padded to.

    Returns:
        The result of ``pad_sequences`` for the single encoded sentence.
    """
    # Split with the same normalisation the Keras Tokenizer applies by default
    # (lowercasing, punctuation filtering) so that e.g. "you?" matches "you".
    words = preprocessing.text.text_to_word_sequence( sentence )
    # Words missing from the vocabulary are dropped rather than raising KeyError.
    tokens_list = [ input_word_dict[ word ] for word in words if word in input_word_dict ]
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=max_input_length , padding='post')


In [6]:
def simple_chatbot():
    """Run an interactive chat loop backed by the single q3 encoder-decoder pair.

    Loads ``enc_model_q3.h5`` / ``dec_model_q3.h5``, rebuilds the vocabularies
    from ``q3.txt`` and then answers one prompt per iteration. The number of
    turns is bounded by the number of rows in the encoder input data.
    """
    enc_model = tf.keras.models.load_model("enc_model_q3.h5")
    dec_model = tf.keras.models.load_model("dec_model_q3.h5")
    # prepare_model returns its values in the exact order respond() expects
    # after the two models, so the tuple can be forwarded as-is.
    model_data = prepare_model("q3.txt")
    encoder_input_data = model_data[2]

    for _ in range( encoder_input_data.shape[0] ):
        # Greedy decoding and the "Bot:" line are handled by respond(), the
        # same routine advanced_chatbot() uses, instead of a duplicated loop.
        respond( input( 'User: ' ), enc_model, dec_model, *model_data )
        print()


In [7]:
def respond(user_in, enc_model, dec_model, input_word_dict, max_input_length, encoder_input_data, decoder_input_data, output_word_dict, max_output_length):
    """Greedily decode a reply to ``user_in`` and print it prefixed with "Bot:".

    Args:
        user_in: Text typed by the user.
        enc_model: Encoder model; its ``predict`` output is used as the list of
            initial decoder states.
        dec_model: Decoder model; ``predict`` must return ``(outputs, h, c)``.
        input_word_dict: Word -> index mapping for the encoder vocabulary.
        max_input_length: Padding length for the encoder input.
        encoder_input_data: Unused; kept for interface compatibility.
        decoder_input_data: Unused; kept for interface compatibility.
        output_word_dict: Word -> index mapping for the decoder vocabulary;
            must contain the 'start' marker.
        max_output_length: Decoding stops after ``max_output_length + 1`` steps
            if the 'end' marker has not been produced.
    """
    # Build the reverse lookup once instead of scanning the vocabulary for
    # every decoded token.
    index_to_word = { index : word for word , index in output_word_dict.items() }

    states_values = enc_model.predict( str_to_tokens( user_in, input_word_dict, max_input_length ) )
    target_seq = np.zeros( ( 1 , 1 ) )
    target_seq[0, 0] = output_word_dict['start']

    reply_words = []
    # Bounding the number of decode steps (rather than the number of emitted
    # words) guarantees termination even if the decoder keeps predicting an
    # index that has no word, such as the padding index 0.
    for _ in range( max_output_length + 1 ):
        dec_outputs , h , c = dec_model.predict([ target_seq ] + states_values )
        sampled_word_index = int( np.argmax( dec_outputs[0, -1, :] ) )
        sampled_word = index_to_word.get( sampled_word_index )

        if sampled_word == 'end':
            break
        if sampled_word is not None:
            reply_words.append( sampled_word )

        # Feed the sampled token and the updated states back into the decoder.
        target_seq = np.zeros( ( 1 , 1 ) )
        target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]

    # The 'end' marker is never stored, so no string replacement is needed and
    # words that merely start with "end" are left intact.
    print( "Bot:" + ''.join( ' {}'.format( word ) for word in reply_words ) )

In [8]:
def advanced_chatbot():
    """Run an interactive chat loop that routes each prompt to one of three models.

    Three encoder-decoder pairs (q1, q2, q3) are loaded together with the data
    rebuilt from their training files. For every prompt the module-level
    ``question_classifier`` and ``classifier`` pick the state, the state name
    is printed, and ``respond`` prints the reply.
    """
    def _load_state(enc_path, dec_path, data_path):
        # Bundle the two models with prepare_model's outputs, in the positional
        # order that respond() expects after the user text.
        encoder = tf.keras.models.load_model(enc_path)
        decoder = tf.keras.models.load_model(dec_path)
        return (encoder, decoder) + tuple(prepare_model(data_path))

    starting_topic = _load_state("enc_model_q1.h5", "dec_model_q1.h5", "q1.txt")
    answering_question = _load_state("enc_model_q2.h5", "dec_model_q2.h5", "q2.txt")
    making_comment = _load_state("enc_model_q3.h5", "dec_model_q3.h5", "q3.txt")

    # Position 4 of a bundle is encoder_input_data; the q3 row count bounds
    # the number of conversation turns.
    turn_limit = making_comment[4].shape[0]

    for _ in range( turn_limit ):
        user_in = input( 'User: ' )
        # A 0 in the first prediction column of question_classifier routes to
        # q2; otherwise a 0 in the first column of classifier routes to q1,
        # and anything else goes to q3.
        if question_classifier.predict([user_in])[0][0] == 0:
            # q2: answering question
            banner, bundle = "currently at q2", answering_question
        elif classifier.predict([user_in])[0][0] == 0:
            # q1: starting topic
            banner, bundle = "currently at q1", starting_topic
        else:
            # q3: making comment
            banner, bundle = "currently at q3", making_comment
        print(banner)
        respond(user_in, *bundle)
        print()
    

In [None]:
# Entry point: ask which chatbot to run and start it.
print('Please choose a model to begin your conversation with our chatbot: ')
print('Press 1 to use the simple encoder-decoder chatbot')
print('Press 2 to use the advanced encoder-decoder chatbot which integrated a finite-state automaton')
user_input = input( "Your choice: " )

# Map each valid choice to its greeting and the function that runs the chat.
launchers = {
    '1': ('Start to chat with the simple encoder-decoder chatbot', simple_chatbot),
    '2': ('Start to chat with our advanced encoder-decoder chatbot', advanced_chatbot),
}
selection = launchers.get(user_input)
if selection is None:
    print("Invalid input, please input only 1 or 2, restart the program to begin your conversation")
else:
    greeting, run_chatbot = selection
    print(greeting)
    run_chatbot()

Please choose a model to begin your conversation with our chatbot: 
Press 1 to use the simple encoder-decoder chatbot
Press 2 to use the advanced encoder-decoder chatbot which integrated a finite-state automaton
Your choice: 2
Start to chat with our advanced encoder-decoder chatbot
Input max length is 256
Encoder input data shape -> (2000, 256)
Number of Input tokens = 2806
Output max length is 61
Decoder input data shape -> (2000, 61)
Number of Output tokens = 2798
Input max length is 47
Encoder input data shape -> (2000, 47)
Number of Input tokens = 1955
Output max length is 88
Decoder input data shape -> (2000, 88)
Number of Output tokens = 2758
Input max length is 148
Encoder input data shape -> (2000, 148)
Number of Input tokens = 2935
Output max length is 150
Decoder input data shape -> (2000, 150)
Number of Output tokens = 2561
User: hello
currently at q3
Bot: hello i'd like to have a vacant apartment

User: how are you
currently at q2
Bot: fine and you

User: I am good
currentl