In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('wordnet')
import re 
from spellchecker import SpellChecker
import string
from string import punctuation

import re
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load Classifier

In [2]:
# Load the saved classifier
clf_logistic = joblib.load('.\\classifiers\\logistic_regression\\classifier.logistic_regression')
vectorizer_logistic = joblib.load('.\\classifiers\\logistic_regression\\vectors.logistic_regression')
clf_naive = joblib.load('.\\classifiers\\naive_bayes\\classifier.naive_bayes')
vectorizer_naive = joblib.load('.\\classifiers\\naive_bayes\\vectors.naive_bayes')

# Create synonyms

In [3]:
# Building a list of Keywords
list_words=['hello','analyze', 'quit']
list_syn={}
for word in list_words:
    synonyms=[]

    # Create synonyms for each Keyword
    for syn in wordnet.synsets(word):
        for lem in syn.lemmas():

            # Remove any special characters from synonym strings
            lem_name = re.sub('[^a-zA-Z0-9 \n\.]', ' ', lem.name())
            synonyms.append(lem_name)
    list_syn[word]=set(synonyms)


print (list_syn['analyze'])

{'study', 'take apart', 'analyze', 'canvas', 'break down', 'psychoanalyse', 'dissect', 'examine', 'analyse', 'psychoanalyze', 'canvass'}


# Input Cleaning

In [4]:
# English Stopwords for clean_user_input function
stop_words = set(stopwords.words("english"))
# Spellchecker
spell = SpellChecker()
# Lemmatzier
lemmatizer = WordNetLemmatizer()


def clean_user_input(user_input):
    # Remove HTML tags from user input
    user_input_no_html = re.sub('<.*?>', '', user_input)

    # Word Tokenization
    user_input_token = word_tokenize(user_input_no_html)

    # Lowercase and Remove punctuation
    user_input_punct_lower = [x.lower() for x in user_input_token if x not in punctuation]

    # Remove Numbers
    user_input_no_num = [x for x in user_input_punct_lower if not x.isdigit()]

    # Spellchecker
    correct_words = [spell.correction(word) for word in user_input_no_num]

    # Remove Stopwords
    filtered_words = [word for word in correct_words if word not in stop_words]

    # Lemmatization
    base_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    # Join words into sentence
    clean_response = ' '.join(base_words)

    return clean_response

# Chatbot

In [5]:
def generate_response(user_input, whole_input, test_logistic, test_naive):
    patterns = {
        r'(?i)({}).*'.format('|'.join(list_syn['hello'])): "Hi, which sentence should i analyze?",
        r'(?i)({}).*'.format('|'.join(list_syn['analyze'])): "I will {}, but with which technique? Logistic Regression or Naive Bayes?".format(user_input),
        r'(?i)logistic.*': "The result is {}".format(test_logistic),
        r'(?i)naive.*': "The result is {}".format(test_naive),
    }


    for pattern, response in patterns.items():
        match = re.match(pattern, user_input)
        if match:
            if pattern == r'logistic$' or pattern == r'naive$':
                return None
            else:
                return response
    return "I'm sorry, but I'm not sure I understand."

In [6]:
def chat():
    print("Hi")
    whole_input = ""
    test_logistic = ""
    test_naive = ""

    while True:
        user_input = input("You: ")

        response = generate_response(user_input, whole_input, test_logistic, test_naive)
        if response is not None:
            print(response)

        match_test = re.match(r'(?i)(?:{})\s*(.*)'.format('|'.join(list_syn['analyze'])), user_input)
        if match_test:
            whole_input = match_test.group(1)  # Extract the content after "analyze "
            print("whole_input:", whole_input)

        if user_input.lower() == 'logistic':
            test_logistic = clf_logistic.predict(vectorizer_logistic.transform([clean_user_input(whole_input)]))
            print(whole_input) # Remove - just testing
            if test_logistic == 0:
                test_logistic = 'negative'
                print(test_logistic)  
            else:
                test_logistic = 'psotive'
                print(test_logistic)        

        if user_input.lower() == 'naive':
            test_naive = clf_naive.predict(vectorizer_naive.transform([clean_user_input(whole_input)]))
            print(whole_input) # Remove - just testing
            if test_naive == 0:
                test_naive = 'negative'
                print(test_naive)
            else:
                test_naive = 'psotive'
                print(test_naive)
                

        else:
            None
# Example usage
chat()


Hi
I'm sorry, but I'm not sure I understand.
Hi, which sentence should i analyze?
Hi, which sentence should i analyze?
I will analyze best movie ever, but with which technique? Logistic Regression or Naive Bayes?
whole_input: best movie ever
I'm sorry, but I'm not sure I understand.
The result is 
best movie ever
psotive


# Testing

In [5]:
def generate_response(user_input, whole_input, test_logistic, test_naive):
    patterns = {
        r'(?i)({}).*'.format('|'.join(list_syn['hello'])): "Hi, which sentence should I analyze?",
        r'(?i)({}).*'.format('|'.join(list_syn['analyze'])): "I will {}, but with which technique? Logistic Regression or Naive Bayes?".format(user_input),
        r'(?i)logistic$': "The result is {}".format(test_logistic),
        r'(?i)naive$': "The result is {}".format(test_naive),
    }

    for pattern, response in patterns.items():
        match = re.match(pattern, user_input)
        if match:
            return response

    return "I'm sorry, but I'm not sure I understand."


In [6]:
def chat():
    print("Hi")
    whole_input = ""
    test_logistic = ""
    test_naive = ""

    while True:
        user_input = input("You: ")

        response = generate_response(user_input, whole_input, test_logistic, test_naive)
        if response is not None:
            print(response)

        match_test = re.match(r'(?i)(?:{})\s*(.*)'.format('|'.join(list_syn['analyze'])), user_input)
        if match_test:
            whole_input = match_test.group(1)  # Extract the content after "analyze "
            print("whole_input:", whole_input)

        if user_input.lower() == 'logistic':
            test_logistic = clf_logistic.predict(vectorizer_logistic.transform([clean_user_input(whole_input)]))
            print(whole_input) # Remove - just testing
            if test_logistic == 0:
                test_logistic = 'negative'
                print(test_logistic)  
            else:
                test_logistic = 'psotive'
                print(test_logistic)        

        if user_input.lower() == 'naive':
            test_naive = clf_naive.predict(vectorizer_naive.transform([clean_user_input(whole_input)]))
            print(whole_input) # Remove - just testing
            if test_naive == 0:
                test_naive = 'negative'
                print(test_naive)
            else:
                test_naive = 'psotive'
                print(test_naive)
                

        else:
            None
# Example usage
chat()


Hi
I'm sorry, but I'm not sure I understand.
Hi, which sentence should I analyze?
Hi, which sentence should I analyze?
I will analyze Best Love, but with which technique? Logistic Regression or Naive Bayes?
whole_input: Best Love
I will analyze Best Love, but with which technique? Logistic Regression or Naive Bayes?
whole_input: Best Love
I will analyze Best Love, but with which technique? Logistic Regression or Naive Bayes?
whole_input: Best Love
The result is 
Best Love
psotive
