Importing the required libraries

In [None]:
import numpy as np
import nltk
import string
import random

Importing and reading the corpus

In [None]:
# Read the whole corpus. The context manager closes the file handle as soon as
# the text is in memory, and the explicit encoding keeps the result independent
# of the platform's default; undecodable bytes are dropped (errors='ignore').
with open('Chatbot.txt', 'r', encoding='utf-8', errors='ignore') as f:
    raw_doc = f.read()
raw_doc = raw_doc.lower()  # converting text into lowercase

# NLTK resources needed by the tokenizers and the lemmatizer used below.
nltk.download('punkt')      # punkt sentence/word tokenizer models
nltk.download('punkt_tab')  # tabular punkt data requested by newer NLTK releases
nltk.download('wordnet')    # WordNet dictionary used by WordNetLemmatizer

sent_tokens = nltk.sent_tokenize(raw_doc)  # converts doc to list of sentences
word_tokens = nltk.word_tokenize(raw_doc)  # converts doc to list of words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Examples of sentence tokens

In [None]:
# Preview the first two entries of sent_tokens (the list built by
# nltk.sent_tokenize) as a quick check of what one "sentence" looks like.
sent_tokens[:2]

['\n\nwikipediathe free encyclopedia\nsearch wikipedia\nsearch\ndonate\ncreate account\nlog in\nbanner logo\t\nparticipate in the 2025 international science photo competition!',
 'contents hide\n(top)\nfoundations\netymology\n\nearly usage\nmodern usage\ndata science and data analysis\ncloud computing for data science\nethical consideration in data science\nsee also\nreferences\ndata science\n\narticle\ntalk\nread\nedit\nview history\n\ntools\nappearance hide\ntext\n\nsmall\n\nstandard\n\nlarge\nwidth\n\nstandard\n\nwide\ncolor (beta)\n\nautomatic\n\nlight\n\ndark\nfrom wikipedia, the free encyclopedia\nnot to be confused with information science or computer science.']

Examples of word tokens

In [None]:
# Preview the first two entries of word_tokens (the list built by
# nltk.word_tokenize) as a quick check of the word-level tokenization.
word_tokens[:2]

['wikipediathe', 'free']

Text pre-processing

In [None]:
# WordNet is a semantically-oriented dictionary of English included in NLTK.
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list holding the WordNet lemma of every token in `tokens`."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmer.lemmatize(token))
    return lemmas

# Translation table for str.translate: every character in string.punctuation
# maps to None, which deletes it from the text.
remove_punct_dict = dict.fromkeys(map(ord, string.punctuation))

def LemNormalize(text):
    """Lowercase `text`, delete punctuation, tokenize into words and lemmatize them."""
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    words = nltk.word_tokenize(without_punct)
    return LemTokens(words)

Defining the greeting function

In [None]:
GREET_INPUTS = ("hello", "hi", "greetings", "sup", "what's up","hey","hola")
GREET_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]
def great(sentence):
  """Return a random greeting reply if `sentence` contains a known greeting.

  The sentence is lowercased, split on whitespace, and leading/trailing
  punctuation is stripped from each word, so "Hello!" matches "hello".
  Greetings are matched on whole-word boundaries, which lets multi-word
  entries such as "what's up" match while "this" does not match "hi".

  Args:
    sentence: the user's message as a string.

  Returns:
    One element of GREET_RESPONSES chosen at random, or None when the
    sentence contains no entry of GREET_INPUTS.
  """
  words = [word.strip(string.punctuation) for word in sentence.lower().split()]
  # Pad with spaces so that `" " + greeting + " "` only matches whole words
  # (or whole word sequences for multi-word greetings).
  padded = " " + " ".join(words) + " "
  for greeting in GREET_INPUTS:
    if " " + greeting + " " in padded:
      return random.choice(GREET_RESPONSES)
  return None

Response generation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def response(user_response):
  """Return the corpus sentence most similar to `user_response`.

  A TF-IDF matrix is fitted over the corpus sentences plus the query, and the
  query row is compared against every other row with cosine similarity.

  The chat loop appends the query to the global `sent_tokens` before calling
  this function; when the last entry of `sent_tokens` is not the query, the
  query is added to a temporary copy instead, so the function also works when
  called on its own and never mutates `sent_tokens`.

  Args:
    user_response: the (already lowercased) user message.

  Returns:
    The best-matching sentence from the corpus, or
    "I am sorry! I don't understand you" when no corpus sentence shares any
    weighted term with the query (or the corpus is empty).
  """
  fallback = "I am sorry! I don't understand you"

  # Make sure the query is the last row of the matrix.
  if sent_tokens and sent_tokens[-1] == user_response:
    corpus = sent_tokens
  else:
    corpus = sent_tokens + [user_response]
  if len(corpus) < 2:
    # Nothing to compare the query against.
    return fallback

  TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
  tfidf = TfidfVec.fit_transform(corpus)

  # Similarity of the query (last row) to every corpus sentence. The final
  # column is the query compared with itself, so drop it explicitly rather
  # than assuming it always sorts last.
  scores = cosine_similarity(tfidf[-1], tfidf).flatten()[:-1]
  best = scores.argmax()
  if scores[best] == 0:
    return fallback
  return corpus[best]

Defining conversation start/end Protocols

In [None]:
# Interactive chat loop: reads one line per turn until the user says
# 'bye', 'thanks' or 'thank you'.
flag=True
print("BOT : My name is stark. lets have a conversation! Also if you want  to exit anytime, just type Bye!")
while flag:
  # Normalise once so that inputs such as 'Bye ' or ' THANKS' are recognised.
  user_response = input().strip().lower()
  if user_response == 'bye':
    flag=False
    print("BOT: Goodbye! take care <3 ")
  elif user_response in ('thanks', 'thank you'):
    flag=False
    print("BOT: You are welcome..")
  else:
    # Call great() once: it picks a random reply, so a second call would
    # draw a different (unrelated) answer from the one that was tested.
    greeting = great(user_response)
    if greeting is not None:
      print("BOT: "+greeting)
    else:
      # NOTE(review): word_tokens / final_words are not read anywhere in the
      # visible notebook; kept in case other cells rely on them.
      word_tokens=word_tokens+nltk.word_tokenize(user_response)
      final_words=list(set(word_tokens))
      # response() expects the query to be the last entry of sent_tokens.
      sent_tokens.append(user_response)
      print("BOT: ",end='')
      try:
        print(response(user_response))
      finally:
        # Always drop the query again, even if response() raises. pop()
        # removes exactly the entry appended above, whereas remove() would
        # delete the first equal sentence, which may belong to the corpus.
        sent_tokens.pop()

BOT : My name is stark. lets have a conversation! Also if you want  to exit anytime, just type Bye!
hii
BOT: I am sorry! I don't understand you
hey
BOT: I am glad! You are talking to me
foundations
BOT: wikipediaÂ® is a registered trademark of the wikimedia foundation, inc., a non-profit organization.
wikipedia
BOT: 

wikipediathe free encyclopedia
search wikipedia
search
donate
create account
log in
banner logo	
participate in the 2025 international science photo competition!
search
BOT: 

wikipediathe free encyclopedia
search wikipedia
search
donate
create account
log in
banner logo	
participate in the 2025 international science photo competition!
impact of data science
BOT: "big data's disparate impact".
bye
BOT: Goodbye! take care <3 
