In [1]:
# Required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.metrics import pairwise_distances
import pickle

In [2]:
# Reading data
# Loads the chatbot table from the Excel workbook in the current working
# directory; later cells read its 'Context' and 'Text Response' columns.
df = pd.read_excel('chat_bot.xlsx')
print("Reading data")

Reading data


In [3]:
df

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,I can't find function defined called yourself.
3,Describe yourself,All I can describe is I'll be at your service ...
4,tell me about yourself,People call me Bot. Located in a sever.
...,...,...
1500,can we chat,Ofcourse we are doing it
1501,I'll be back in a few minutes,I'll be waiting.
1502,I'll be back,All right. I'll be here.
1503,I'll get back to you in a moment,Till next time.


In [4]:
# Initialize lemmatizer
# One shared WordNetLemmatizer instance; text_normalization reads this
# module-level name, so this cell must run before that function is called.
lemmatizer = WordNetLemmatizer()

In [5]:
# Text Normalization
# Maps the first letter of a POS tag returned by nltk.pos_tag to the WordNet
# POS code accepted by WordNetLemmatizer.lemmatize. Any other tag is treated
# as a noun ("n").
_WORDNET_POS_BY_TAG_INITIAL = {"V": "v", "J": "a", "R": "r"}


def text_normalization(text):
    """Lower-case, strip punctuation from, and lemmatize a piece of text.

    Every character outside ``a-z0-9`` is replaced by a space (so "I'll"
    becomes the two tokens "i" and "ll"), the remainder is tokenized and
    POS-tagged with NLTK, and each token is lemmatized using the module-level
    ``lemmatizer`` with a POS derived from its tag.

    Parameters
    ----------
    text : str or scalar
        The text to normalize. Missing values (``None``, NaN, ``pd.NA``)
        normalize to the empty string; any other non-string is converted
        with ``str()`` first.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces. Always a string, so
        the result can be fed straight to a text vectorizer.

    Raises
    ------
    LookupError
        Propagated from NLTK when a required resource (tokenizer, tagger,
        WordNet) has not been downloaded. Previously this was swallowed and
        the function silently returned ``None``.
    """
    if not isinstance(text, str):
        # Empty spreadsheet cells arrive as NaN/None; give them an empty
        # document instead of failing (or returning None) downstream.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    cleaned = re.sub(r'[^a-z0-9]', " ", text.lower())
    if not cleaned.strip():
        # Nothing left to tokenize; skip the NLTK round-trip.
        return ""

    tagged_tokens = pos_tag(nltk.word_tokenize(cleaned), tagset=None)
    return " ".join(
        lemmatizer.lemmatize(token, _WORDNET_POS_BY_TAG_INITIAL.get(tag[:1], "n"))
        for token, tag in tagged_tokens
    )

In [13]:
# Spot-check the normalizer on a single 'Context' entry (row 1501).
# NOTE(review): despite its name, `lemmatized_df` holds one normalized string,
# not a DataFrame.
lemmatized_df = text_normalization(df['Context'][1501])

In [14]:
lemmatized_df

'i ll be back in a few minute'

In [18]:
# Normalize every 'Context' entry and store the result in a new
# 'lemmatized_text' column (adds the column to `df` in place).
df['lemmatized_text'] = df['Context'].apply(text_normalization)

In [19]:
df

Unnamed: 0,Context,Text Response,lemmatized_text
0,Tell me about your personality,Just think of me as the ace up your sleeve.,tell me about your personality
1,I want to know you better,I can help you work smarter instead of harder,i want to know you good
2,Define yourself,I can't find function defined called yourself.,define yourself
3,Describe yourself,All I can describe is I'll be at your service ...,describe yourself
4,tell me about yourself,People call me Bot. Located in a sever.,tell me about yourself
...,...,...,...
1500,can we chat,Ofcourse we are doing it,can we chat
1501,I'll be back in a few minutes,I'll be waiting.,i ll be back in a few minute
1502,I'll be back,All right. I'll be here.,i ll be back
1503,I'll get back to you in a moment,Till next time.,i ll get back to you in a moment


In [25]:
# With the text normalized, create the TF-IDF vectorizer. The same `tfidf`
# object is fitted on the corpus and later reused to transform user queries.
# NOTE(review): this import would conventionally live in the first cell with
# the other imports.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [26]:
# Fit the vectorizer on the normalized contexts and convert the result to a
# dense array; `validation` compares queries against the rows of this matrix.
x_tfidf = tfidf.fit_transform(df['lemmatized_text']).toarray()

In [27]:
x_tfidf

array([[0.        , 0.        , 0.41437693, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.64124787,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [28]:
def validation(x_tfidf, query_ask, responses=None, threshold=0.2, fallback_index=51):
    """Return the stored response whose context is most similar to the query.

    Cosine similarity is computed between every row of ``x_tfidf`` and the
    single query vector. If the best similarity exceeds ``threshold`` the
    response at the same row position is returned; otherwise the fallback
    response is returned.

    Parameters
    ----------
    x_tfidf : array-like of shape (n_rows, n_features)
        Vectorized contexts. Row ``i`` must correspond to the ``i``-th entry
        (by position) of ``responses``.
    query_ask : array-like of shape (1, n_features) or (n_features,)
        Vectorized user query. A 1-D vector is treated as a single row.
    responses : pandas.Series, optional
        Responses aligned with the rows of ``x_tfidf``. Defaults to the
        module-level ``df['Text Response']``.
    threshold : float, default 0.2
        Minimum cosine similarity for a match to be accepted.
    fallback_index : index label, default 51
        Label (``.loc``) of the response returned when nothing is similar
        enough.

    Returns
    -------
    The selected element of ``responses``.

    Raises
    ------
    ValueError
        If ``query_ask`` contains more than one query row.
    """
    if responses is None:
        responses = df['Text Response']

    if np.ndim(query_ask) == 1:
        query_ask = np.reshape(query_ask, (1, -1))
    if np.shape(query_ask)[0] != 1:
        raise ValueError("validation expects exactly one query vector")

    similarity = 1 - pairwise_distances(x_tfidf, query_ask, metric='cosine')
    # Flatten the (n_rows, 1) result so the best score is a scalar rather
    # than a one-element array.
    scores = np.asarray(similarity).ravel()
    best_row = int(scores.argmax())

    if scores[best_row] > threshold:
        # argmax is a row *position* in x_tfidf, so index positionally; .loc
        # would only agree when the index happens to be 0..n-1.
        return responses.iloc[best_row]
    return responses.loc[fallback_index]

In [40]:
# One chat turn: read a message, vectorize its normalized form with the
# already-fitted TF-IDF model, and print the best-matching stored response.
user_input = input("Enter any message: ")
query_ask = tfidf.transform([text_normalization(user_input)]).toarray()
response = validation(x_tfidf, query_ask)
print("bot replays:", response)


Enter any message: what happened
bot replays: I don't what is happening in backend
