<a href="https://colab.research.google.com/github/naveenrvr-data/DataBot/blob/main/DataBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import bs4 as bs
import warnings
import urllib.request
import nltk
import random
import string
import re
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet


In [None]:
# Fetch the NLTK resources this notebook relies on:
#   - 'wordnet'   : used for the greeting synonyms and by WordNetLemmatizer
#   - 'punkt'     : sentence/word tokenizer models used by older NLTK releases
#   - 'punkt_tab' : tokenizer tables that recent NLTK releases load instead of
#                   'punkt'; without it sent_tokenize/word_tokenize raise a
#                   LookupError on a fresh kernel with a current NLTK.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# Collect every WordNet lemma name for 'hello' to widen the set of recognised greetings.
synonyms = []
for syn in wordnet.synsets('hello'):
    for lem in syn.lemmas():
        # Drop bracketed numeric markers such as "[12]" ...
        lem_name = re.sub(r'\[[0-9]*\]', ' ', lem.name())
        # ... then collapse whitespace runs. This must operate on the result of
        # the previous substitution; re-reading lem.name() here would discard it.
        lem_name = re.sub(r'\s+', ' ', lem_name)
        synonyms.append(lem_name)

In [None]:
# Greeting phrases recognised by the bot: a hand-written list extended with
# the WordNet synonyms collected earlier.
greeting_inputs = [
    'hey',
    'whats up',
    'good morning',
    'good evening',
    'happy morning',
    'morning',
    'evening',
    'hello there',
    'hey there',
    *synonyms,
]

# Replies the bot picks from when it is greeted.
greeting_outputs = [
    'Hello! How can I help you?',
    'Hey there! So what do you want to know?',
    'Hi, you can ask me anything regarding Data Science.',
    'Hey! wanna know about Data Science ? Just ask',
]

# Small-talk prompts from the user ...
chat_inputs = ['how are you', 'how are you doing', 'you good']

# ... the bot's answer to them ...
chat_output = ['Great! what about you?']

# ... and the user replies that the bot acknowledges afterwards.
chat_replies = [
    'great',
    'i am fine',
    'fine',
    'good',
    'super',
    'superb',
    'super great',
    'nice',
]

# A handful of fixed questions about the bot itself, mapped to their answers.
question_answers = {
    'what are you': 'I am a data-bot',
    'who are you': 'I am a data-bot',
    'what can you do': 'Answer questions regarding Data Science!',
    'what do you do': 'Answer questions regarding Data Science!',
}

In [None]:
# fetching html data about Data Science from wiki
# An explicit User-Agent is sent because Wikimedia asks clients to identify
# themselves and may reject generic library agents; the timeout keeps the cell
# from hanging forever on a stalled connection.
request = urllib.request.Request(
    'https://en.wikipedia.org/wiki/Data_science',
    headers={'User-Agent': 'DataBot/1.0 (https://github.com/naveenrvr-data/DataBot)'},
)
# The context manager closes the HTTP response once the body has been read;
# `data` ends up holding the raw HTML bytes, as before.
with urllib.request.urlopen(request, timeout=30) as response:
    data = response.read()

In [None]:
# Preview only the first bytes of the page instead of dumping the whole HTML document.
data[:500]

In [None]:
# Parse the downloaded HTML bytes into a BeautifulSoup tree using the lxml parser.
article = bs.BeautifulSoup(markup=data, features='lxml')



In [None]:
# Sanity-check the parse by showing the page's <title> tag rather than the whole tree.
article.title

In [None]:
# Gather every <p> element of the parsed page and join their text, in document
# order and without a separator, into one lower-cased string.
paragraphs = article.find_all('p')
article_text = ''.join(p.text for p in paragraphs).lower()

In [None]:
# Preview the start of the extracted text instead of printing the full article.
article_text[:500]

In [None]:
# Clean-up passes, applied in order: first replace bracketed numeric citation
# markers such as "[12]" with a space, then collapse every whitespace run into
# a single space.
for pattern in (r'\[[0-9]*\]', r'\s+'):
    article_text = re.sub(pattern, ' ', article_text)


In [None]:
# Lemmatizer shared by the pre-processing helpers defined below.
lemma = nltk.stem.WordNetLemmatizer()

# Sentence-level split of the article: the corpus the bot answers from.
sentences = nltk.sent_tokenize(article_text)
# Word-level split of the same text.
words = nltk.word_tokenize(article_text)


In [None]:
# lemmatizing words for data pre-processing
def perform_lemmatization(tokens):
    """Return a list with each token in `tokens` passed through `lemma.lemmatize`."""
    lemmatized = []
    for token in tokens:
        lemmatized.append(lemma.lemmatize(token))
    return lemmatized


# Translation table for str.translate: every code point of string.punctuation
# maps to None, i.e. the character is deleted.
remove_punctuation = dict.fromkeys(map(ord, string.punctuation))


In [None]:

# Tokenizer pipeline handed to the TF-IDF vectorizer.
def processed_data(document):
    """Lower-case `document`, strip punctuation, tokenize it and lemmatize the tokens.

    Returns the list of lemmatized tokens.
    """
    cleaned = document.lower().translate(remove_punctuation)
    tokens = nltk.word_tokenize(cleaned)
    return perform_lemmatization(tokens)


# function for punctuation removal
def punc_remove(str):
    """Return `str` with every character found in the punctuation set removed.

    Characters outside the set (letters, digits, spaces, and symbols such as
    '+' or '=') are kept unchanged and in their original order.
    """
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    return ''.join(char for char in str if char not in punctuations)

In [None]:
# function to generate an output to greetings
def generate_greeting_output(hello):
    """Return a randomly chosen greeting reply, or None if `hello` is not a greeting.

    The input is lower-cased and stripped of punctuation before it is looked
    up in `greeting_inputs`.
    """
    normalized = punc_remove(hello.lower())
    if normalized not in greeting_inputs:
        return None
    return random.choice(greeting_outputs)


# function to generate an output to chats
def generate_chat_output(str):
    """Return a randomly chosen small-talk reply, or None if `str` is not small talk.

    The input is lower-cased and stripped of punctuation before it is looked
    up in `chat_inputs`.
    """
    normalized = punc_remove(str.lower())
    if normalized not in chat_inputs:
        return None
    return random.choice(chat_output)

In [None]:
# Function to generate answers to the fixed questions
def generate_answers(str):
    """Return the canned answer for `str`, or None if it is not a known question.

    The input is lower-cased and stripped of punctuation before it is used as
    a key into `question_answers`.
    """
    question = punc_remove(str.lower())
    return question_answers.get(question)

In [None]:
# Function to generate response to queries regarding Data Science
def generate_response(user):
    """Return the article sentence most similar to `user`, or an apology.

    The query is appended to the module-level `sentences` list, the whole list
    is TF-IDF vectorized (using `processed_data` as tokenizer) and the query
    vector is compared by cosine similarity against every other sentence.

    Side effect:
        `user` is left appended to `sentences`; the caller is responsible for
        removing it again after the call (the chat loop does this).

    Args:
        user: The user's query text.

    Returns:
        The best-matching sentence from `sentences`, or the fixed "Sorry, ..."
        message when no sentence has a non-zero similarity to the query or
        when there are no article sentences to compare against.
    """
    no_match_reply = 'Sorry, my database doesn\'t have the response. Please try ' \
                     'something different that is related to Data Science '

    # Kept for backward compatibility: callers expect the query to be appended.
    sentences.append(user)

    # Only the query itself is present, so there is nothing to match against.
    if len(sentences) < 2:
        return no_match_reply

    word_vectorizer = TfidfVectorizer(tokenizer=processed_data, stop_words='english')
    all_word_vectors = word_vectorizer.fit_transform(sentences)

    # Compare the query (last row) with the article sentences only. Leaving the
    # query out of the candidates means it can never be returned as its own
    # best match, even when another sentence ties with it at similarity 1.0.
    similarities = cosine_similarity(all_word_vectors[-1], all_word_vectors[:-1]).flatten()
    best_index = similarities.argmax()

    # A best score of exactly zero means no sentence shares a term with the query.
    if similarities[best_index] == 0.0:
        return no_match_reply
    return sentences[best_index]

In [None]:
# chatting with the databot -->
# Each turn: read a line, normalise it (lower-case, punctuation removed,
# surrounding whitespace trimmed), then either end the chat, acknowledge a
# small-talk reply, answer from the canned handlers, or fall back to the
# TF-IDF search over the article sentences.
chat = True
print('Hi! I am DataRobo. You can ask me anything regarding Data Science and I shall try answering them: ')
while chat:
    try:
        user_input = input().lower()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: end the chat politely.
        user_input = 'bye'
    # Trimming matters after punctuation removal: "hello !" becomes "hello ",
    # which would otherwise miss the exact-match lookups below.
    user_input = punc_remove(user_input).strip()

    if user_input == 'bye':
        chat = False
        print('DataRobo: Bye Bye, take care!')
    elif user_input in ('thanks', 'thank you very much', 'thank you'):
        chat = False
        print('DataRobo: Welcome, Any time...')
    elif user_input in chat_replies:
        print('That\'s nice! How may I assist you')
    else:
        # Try the canned handlers in priority order. Each one is evaluated once
        # (the greeting handler picks its reply at random, so calling it twice
        # would test one reply and print another).
        reply = None
        for handler in (generate_greeting_output, generate_chat_output, generate_answers):
            reply = handler(user_input)
            if reply is not None:
                break

        if reply is not None:
            print('DataRobo: ' + reply)
        else:
            print('DataRobo: ', end='')
            try:
                print(generate_response(user_input))
            finally:
                # generate_response appends the query to `sentences`; take it
                # back off the end even if the search failed, so the corpus is
                # not polluted. Popping the tail (instead of remove()) makes
                # sure an identical article sentence is never dropped.
                if sentences and sentences[-1] == user_input:
                    sentences.pop()

Hi! I am DataRobo. You can ask me anything regarding Data Science and I shall try answering them: 
data
DataRobo: this can involve tasks such as data cleaning, data visualization, and exploratory data analysis to gain insights into the data and develop hypotheses about relationships between variables.
data science
DataRobo: however, data science is different from computer science and information science.
cell phone
DataRobo: Sorry, my database doesn't have the response. Please try something different that is related to Data Science 
who is a data scientist
DataRobo: a data scientist is a professional who creates programming code and combines it with statistical knowledge to create insights from data.
what is data cleaning
DataRobo: this can involve tasks such as data cleaning, data visualization, and exploratory data analysis to gain insights into the data and develop hypotheses about relationships between variables.
statistics is data science
DataRobo: however, data science is differe