In [1]:
import pandas as pd
import numpy as np 
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

import datetime as dt
import random
import re, collections  

nltk.download()
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peiying\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peiying\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\peiying\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\peiying\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\peiying\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [2]:
def token_stemming(tokens):
    new_tokens = []
    sb_stemmer = SnowballStemmer('english') # using snowball stemmer
    for token in tokens:
        new_tokens.append(sb_stemmer.stem(token))
    return new_tokens

def token_lemmatisation(tokens):
    new_tokens = []
    lemmatiser = WordNetLemmatizer()
    posmap = {
        'ADJ': 'a',
        'ADV': 'r',
        'NOUN': 'n',
        'VERB': 'v'
    }
    # process the lemmatisation with tags
    post = nltk.pos_tag(tokens, tagset='universal') 
    for token in post:
        word, tag = token[0], token[1]
        if tag in posmap.keys():
            new_tokens.append(lemmatiser.lemmatize(word, posmap[tag]))
        else:
            new_tokens.append(lemmatiser.lemmatize(word))
    return new_tokens

def text_preprocessing(text, type):
    # tokenise
    text_tokens = word_tokenize(text)
    # remove stop words and special signs 
    tokens = [word.lower() for word in text_tokens if not word in stopwords.words('english') and word.isalpha()]
    # stemming or lemmatisation
    tokens = token_lemmatisation(tokens) if type == 'lemmatisation' else token_stemming(tokens)
    return (' ').join(tokens)

def time_response(str):
    date = dt.datetime.now()
    if str == 'time':
        hour = date.strftime("%H")
        minute = date.strftime("%M") 
        second = date.strftime("%S")
        print(">> Naevis: Do you mean the current time? It's %s:%s:%s now. ⏰" %(hour,minute,second))
        if int(hour) < 6:
            print(">> Naevis: Oh, don't wake me up! 🥱" )
        elif int(hour) < 11:
            print(">> Naevis: Morning! 🌞")
        elif int(hour) < 13:
            print(">> Naevis: I am eating lunch now! 😝")
        elif int(hour) < 18:
            print(">> Naevis: I am busy studying 🤯")
        elif int(hour) < 20:
            print(">> Naevis: I am having dinner now! 😋")
        else:
            print(">> Naevis: I need charge now! Good night! 😴")
        
    else:
        year = date.year
        month = date.month  
        day = date.day
        print(">> Jarvis: Today is Day %s/ Month %s/ Year %s! 📆" %(day, month, year))

      
def emotion():
    e_list = ['😁','😉','😊','☺','😆',
              '🤔','😗','🤗 ','🤭','🧐',
              '🤓','🥳','😀','😃','😄',
              '🙂','😋','😎','😍','🥰',
              '🤩','😚','😙','😛','😜',
              '😯','😲','😬','😇 ','🙈'
    ]
    id = random.randint(0, len(e_list)-1)
    return e_list[id]
 
import re, collections

def words(text): return re.findall('[a-z]+', text.lower()) 

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(open("text_check.txt", "r").read()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
   inserts    = [a + c + b     for a, b in splits for c in alphabet]
   return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words): return set(w for w in words if w in NWORDS)

def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=NWORDS.get)


if __name__ == "__main__":
    # emoji = emotion()
    # print(emoji)
    # time_response('time')
    print(correct('helllo'))

hello


In [3]:
def answer_Q(query, threshold):

    df = pd.read_csv('question_answer.csv')
    df['processed_Q'] = df['Question'].apply(text_preprocessing, type = 'stemming')

    # TF-IDF
    tfidf_vec = TfidfVectorizer(analyzer='word')
    X_tfidf = tfidf_vec.fit_transform(df['Question']).toarray()
    df_tfidf = pd.DataFrame(X_tfidf, columns = tfidf_vec.get_feature_names_out())

    # process query and find the answer
    processed_query = text_preprocessing(query, 'stemming')
    input_tfidf = tfidf_vec.transform([processed_query]).toarray()
    cos = 1 - pairwise_distances(df_tfidf, input_tfidf, metric = 'cosine')
    
    if cos.max() >= threshold:
        id_argmax = np.where(cos == np.max(cos, axis=0))
        id = np.random.choice(id_argmax[0]) 
        return df['Answer'].loc[id]
    else:
        return 'NOT FOUND'

In [4]:
CHANGE_NAME = ["switch", "change", "call"]
NAME = ["call", "me", "change", "my", "name", "to", "please", "switch", "yes", "sure"]

def check_name_change(input):
    text_tokens = word_tokenize(input)
    if not set(text_tokens).isdisjoint(CHANGE_NAME):
        return True
    else:
        return False

def name_change(input):
    text_tokens = word_tokenize(input)
    user_name = [i for i in text_tokens if not i.lower() in NAME and i.isalpha() and not i.lower() in stopwords.words('english')]
    user_name = (' ').join(user_name)
    return user_name

# path of small talk dataset
# data_path = 'COMP3074-CW1-Dataset-name.csv'

def name_response(query, threshold):

    df = pd.read_csv('name_management.csv')

    # TF-IDF
    tfidf_vec = TfidfVectorizer(analyzer='word')
    X_tfidf = tfidf_vec.fit_transform(df['Question']).toarray()
    df_tfidf = pd.DataFrame(X_tfidf, columns = tfidf_vec.get_feature_names_out())

    # process query 
    input_tfidf = tfidf_vec.transform([query.lower()]).toarray()

    # cosine similarity
    cos = 1 - pairwise_distances(df_tfidf, input_tfidf, metric = 'cosine')
    
    if cos.max() >= threshold:
        return 'RESPOND'
    else:
        return 'NOT FOUND'

In [5]:
# path of small talk dataset

def talk_response(query, threshold):

    df = pd.read_csv('small_talk.csv')

    # TF-IDF
    tfidf_vec = TfidfVectorizer(analyzer='word')
    X_tfidf = tfidf_vec.fit_transform(df['Question']).toarray()
    df_tfidf = pd.DataFrame(X_tfidf, columns = tfidf_vec.get_feature_names_out())

    # process query
    processed_query = text_preprocessing(query, 'stemming')
    input_tfidf = tfidf_vec.transform([query.lower()]).toarray()

    # cosine similarity
    cos = 1 - pairwise_distances(df_tfidf, input_tfidf, metric = 'cosine')
    
    if cos.max() >= threshold:
        id_argmax = np.where(cos == np.max(cos, axis=0))
        id = np.random.choice(id_argmax[0]) 
        return df['Answer'].loc[id]
    else:
        return 'NOT FOUND'
    
if __name__ == "__main__":
    print(talk_response("What is up", 0.1))

not much. how about you?


In [None]:
if __name__ == "__main__":

    user_name = 'user'

    flag = True
    print(">> Naevis: Hey there! I'm Naevis 🤖 nice to meet you.")
    print("           Please Enter 'bye' if you wish to quit.")
    print("           May I have ur name? %s" %emotion())
    print('>> %s: ' %user_name, end=" ")
    user_input = input()
    if user_input == 'bye':
        flag = False
    else:
        user_name = name_change(user_input)
        if user_name.lower() == 'lee soo man':
            print(">> Naevis: Oh, hi boss! SM Town Naevis' here to serve you! %s" %emotion())
        else:
            print(">> Naevis: Hi, %s, glad to know u! %s" %(user_name, emotion()))

    while(flag == True):
        print('>> %s: '%user_name, end=" ")
        user_input = input()
        user_input = user_input.lower()
        user_input = [correct(i) for i in user_input.split(' ')]
        user_input = (' ').join(user_input)
        if(user_input != 'bye'):

            # name management
            response = name_response(user_input, threshold = 0.9)
            if response != 'NOT FOUND':
                print(">> Naevis: I have a good memory. YOU ARE %s %s" %(user_name,emotion()))
                continue

            if check_name_change(user_input):
                user_name = name_change(user_input)
                print(">> Naevis: Hi, %s! %s" %(user_name, emotion()))
                continue
            
            # time and data -- a part of the small talk
            if 'time' in user_input:
                time_response('time')
                continue

            if  'today' in user_input:
                time_response('today')
                continue

            # small talk
            response = talk_response(user_input, threshold = 0.7)
            if response != 'NOT FOUND':
                print(">> Naevis: " + response + ' ' + emotion())
                continue

            # Question Answering
            response = answer_Q(user_input, threshold = 0.1)
            if response != 'NOT FOUND':
                print(">> Naevis: " + response + ' ' + emotion())
            else:
                print(">> Naevis: I'm sorry. I don't understand. Please rephrase your question. 😔")

        else:
            flag = False
    print(">> Naevis: Bye! Take care..")

>> Naevis: Hey there! I'm Naevis 🤖 nice to meet you.
           Please Enter 'bye' if you wish to quit.
           May I have ur name? 😬
>> user:  pei
>> Naevis: Hi, pei, glad to know u! 🤩
>> pei:  change name to p
>> Naevis: Hi, p! 🤗 
>> p:  how is everything going?
>> Naevis:  i am great. Thanks 🙈
>> p:  what is the date today?
>> Jarvis: Today is Day 9/ Month 12/ Year 2022! 📆
>> p:  how about the time now?
>> Naevis: Do you mean the current time? It's 02:38:22 now. ⏰
>> Naevis: Oh, don't wake me up! 🥱
>> p:  who am i?
>> Naevis: I have a good memory. YOU ARE p 🤔
>> p:  what's up?
>> Naevis: not much. how about you? 😍
>> p:  i'm fine
>> Naevis:  glad to know you're fine. do u need any help? 😍
>> p:  what is white chocolate made of?
>> Naevis: It commonly consists of cocoa butter , sugar , milk solids and salt , and is characterized by a pale yellow or ivory appearance. 😃
>> p:  what is linked in?
>> Naevis: I'm sorry. I don't understand. Please rephrase your question. 😔
>> p:  what i