# CHATBOT FOR DATA SCIENCE QUESTIONS 

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD
import random
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NITHU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading dataset 

In [2]:
# Read the question/answer CSV and drop the 'Unnamed: 0' and 'Q.no.' columns;
# the trailing bare expression renders the resulting frame.
data = pd.read_csv('data_new.csv').drop(columns=['Unnamed: 0', 'Q.no.'])
data

Unnamed: 0,topics,questions,answers
0,basic data science,What is Data Science? List the differences bet...,"Data Science is a blend of various tools, algo..."
1,Basic Data Science,What is Selection Bias?,Selection bias is a kind of error that occurs ...
2,Basic Data Science,What is bias variance trade off?,Bias: Bias is an error introduced in your mode...
3,Basic Data Science,What is a confusion matrix?,The confusion matrix is a 2X2 table that conta...
4,Statistics,What is the difference between �long� and �wid...,"In the wide-format, a subject�s repeated respo..."
...,...,...,...
105,nlp,What do you understand by Natural Language Pro...,Natural Language Processing is a field of comp...
106,nlp,List any two real-life applications of Natural...,Google Translate: Google Translate is one of t...
107,nlp,What are stop words?,Stop words are said to be useless data for a s...
108,nlp,What is NLTK?,"NLTK is a Python library, which stands for Nat..."


## Data Preprocessing

In [3]:
import re
import string
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from collections import Counter
import joblib
from nltk import pos_tag
import gensim
from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NITHU\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NITHU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NITHU\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one piece of raw text before tokenisation.

    Steps, in order: lower-case and strip surrounding whitespace; remove any
    whitespace-delimited token containing ``http:`` or ``https:``; remove
    ``[...]`` spans; remove the characters ’ … �; replace each run of
    digits with a single space; delete ASCII punctuation.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a float NaN from a
        missing cell) is treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned text. Whitespace left behind by the removals is neither
        collapsed nor stripped again.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()                       # lower case
    text = re.sub(r"\S*https?:\S*", "", text)         # removing links
    text = re.sub(r"\[.*?\]", "", text)               # removing text in square brackets
    text = re.sub(r"[’…�]", "", text)                 # removing special characters
    text = re.sub(r"[0-9]+", " ", text)               # replacing numbers with a space
    text = text.translate(_PUNCTUATION_TABLE)         # removing punctuation
    return text


# Alias kept for the cells that call ``.apply(clean)``.
clean = clean_text

In [5]:
# Run the shared cleaner over each free-text column, overwriting it in place.
for _column in ('topics', 'questions', 'answers'):
    data[_column] = data[_column].apply(clean)

In [6]:
# Extra stop words added on top of NLTK's English list.
_EXTRA_STOP_WORDS = (
    'different', 'example', 'used', 'also', 'likely', 'non', 'use', 'take', 'often', 'recent', 'see', 'head',
    'tell', 'back', 'etc', 'give', 'using', 'would', 'given', 'due', 'called', 'make', 'many', 'image',
    'understand', 'difference', 'preferred', 'eg', 'fruit', 'help', 'may', 'try', 'u', 'explain', 'happen',
    'define', 'reason', 'describe', 'cite', 'coin', 'star', 'girl',
)


def lemmatize_text(questions):
    """Tokenise questions and filter out stop words and very short/long tokens.

    Despite the name, no lemmatisation is performed: each question is
    lower-cased, every character other than ``a-z ( ) + #`` is replaced by a
    space, the result is tokenised with ``nltk.word_tokenize``, and a token is
    kept only when it is not a stop word and its length is between 4 and 14
    characters inclusive.

    Parameters
    ----------
    questions : iterable of str
        The questions to process (e.g. a pandas Series of strings).

    Returns
    -------
    pandas.Series
        One list of surviving tokens per input question, in input order,
        with a default integer index.
    """
    # A set gives constant-time membership tests; the original list was
    # scanned linearly for every token.
    stop_words = set(stopwords.words("english"))
    stop_words.update(_EXTRA_STOP_WORDS)

    questions_stop = []
    for question in questions:
        normalised = re.sub('[^a-z(+)(#)]', ' ', question.lower())
        tokens = nltk.word_tokenize(normalised)
        questions_stop.append(
            [t for t in tokens if (t not in stop_words) and (3 < len(t.strip()) < 15)]
        )
    return pd.Series(questions_stop)

In [7]:
# Tokenise the cleaned question column into one token list per row.
questions = data.loc[:, 'questions']
questions_pp = lemmatize_text(questions)

In [8]:
# Keep the three text columns in the order topics / answers / questions on a
# fresh 0..n-1 index, then attach the token lists as a fourth column.
data = (
    data[['topics', 'answers', 'questions']]
    .reset_index(drop=True)
    .assign(questions_tokens=questions_pp)
)
data.head()

Unnamed: 0,topics,answers,questions,questions_tokens
0,basic data science,data science is a blend of various tools algor...,what is data science list the differences betw...,"[data, science, list, differences, supervised,..."
1,basic data science,selection bias is a kind of error that occurs ...,what is selection bias,"[selection, bias]"
2,basic data science,bias bias is an error introduced in your model...,what is bias variance trade off,"[bias, variance, trade]"
3,basic data science,the confusion matrix is a x table that conta...,what is a confusion matrix,"[confusion, matrix]"
4,statistics,in the wideformat a subjects repeated response...,what is the difference between long and wide f...,"[long, wide, format, data]"


In [9]:
# Number of questions available per topic.
data['topics'].value_counts()

machine learning      30
statistics            23
data analytics        20
deep learning         16
nlp                   15
basic data science     6
Name: topics, dtype: int64

## Word2Vec Modeling 

In [10]:
def train_model(train_data):
    """Fit a gensim Word2Vec model on ``train_data`` and return it.

    ``train_data`` is handed straight to ``gensim.models.Word2Vec`` together
    with ``min_count=1``; every other hyper-parameter is left at gensim's
    default.
    """
    return gensim.models.Word2Vec(train_data, min_count=1)

In [11]:
dict_language = {'0': 'machine learning', '1': 'statistics', '2': 'deep learning', '3': 'data analytics', '4': 'nlp',
                 '5': 'basic data science'}

# Object-dtype placeholder columns; each row is filled in below once the model
# for that row's topic has been trained.
data['Question_Vectors'] = None
data['Average_Pooling'] = None

for topic in dict_language.values():
    # Train one Word2Vec model per topic, using only that topic's token lists.
    questions_data = list(data[data['topics'] == topic]['questions_tokens'])
    model_name = 'word2vec_model_' + topic
    trained_model = train_model(questions_data)
    trained_model.save(model_name)
    print('Saved %s model successfully' % model_name)

    # Second copy of the model, pickled under the name the conversation cell loads.
    word2vec_pickle_path = 'chatbot_word2vec_' + topic + '.bin'
    with open(word2vec_pickle_path, 'wb') as f:
        pickle.dump(trained_model, f)

    # Reload from disk so the vectors below come from exactly what was saved.
    # 'model.pkl' is overwritten on every iteration, so after the loop it holds
    # the model of the last topic only.
    model = gensim.models.KeyedVectors.load(word2vec_pickle_path)
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    for i in data.index[data['topics'] == topic]:
        question_vectors = []
        for token in data.at[i, 'questions_tokens']:
            try:
                question_vectors.append(model.wv[token])
            except KeyError:
                # Token missing from this topic's vocabulary: skip it.
                continue
        # .at writes straight into the frame. The previous chained form
        # data[col][i] = ... may write to a temporary copy
        # (SettingWithCopyWarning) and does nothing under copy-on-write.
        data.at[i, 'Question_Vectors'] = question_vectors
        # Column-wise mean over the token vectors; an empty list when the
        # question has no token vectors.
        data.at[i, 'Average_Pooling'] = list(pd.DataFrame(question_vectors).mean())

Saved word2vec_model_machine learning model successfully
Saved word2vec_model_statistics model successfully
Saved word2vec_model_deep learning model successfully
Saved word2vec_model_data analytics model successfully
Saved word2vec_model_nlp model successfully
Saved word2vec_model_basic data science model successfully


In [12]:
# Collapse each token list into one space-separated string and record its
# character length. The isinstance guard keeps the cell safe to re-run: a value
# that is already a joined string is left alone instead of having a space
# inserted between each of its characters.
data['questions_tokens'] = [
    " ".join(tokens) if isinstance(tokens, (list, tuple)) else tokens
    for tokens in data['questions_tokens']
]
length = data['questions_tokens'].apply(len)
data = data.assign(question_length=length)
data.head()

Unnamed: 0,topics,answers,questions,questions_tokens,Question_Vectors,Average_Pooling,question_length
0,basic data science,data science is a blend of various tools algor...,what is data science list the differences betw...,data science list differences supervised unsup...,"[[0.009735549, -0.009780383, -0.0064994907, 0....","[0.0013802927652640001, -0.0022631725961608546...",62
1,basic data science,selection bias is a kind of error that occurs ...,what is selection bias,selection bias,"[[-0.007139015, 0.0012410306, -0.0071767163, -...","[-0.0035222255246480927, 0.0021591746481135488...",14
2,basic data science,bias bias is an error introduced in your model...,what is bias variance trade off,bias variance trade,"[[9.456396e-05, 0.0030773187, -0.0068126465, -...","[0.00012164851068519056, 0.006934074296926458,...",19
3,basic data science,the confusion matrix is a x table that conta...,what is a confusion matrix,confusion matrix,"[[-0.008620501, 0.0036665255, 0.0051904307, 0....","[-0.004578363907057792, 0.0019514778177835979,...",16
4,statistics,in the wideformat a subjects repeated response...,what is the difference between long and wide f...,long wide format data,"[[-0.0006022507, 0.0014123637, -0.001195034, -...","[0.00039192516123875976, 0.0003868712228722870...",21


## Greetings function for bot 

In [13]:
GREETING_INPUTS = ("hello", "hi", "greetings", "hello i need help", "good day", "hey", "i need help")
GREETING_RESPONSES = ["Good day, How may i help u?", "Hello, How can i help?", "hello", "I am glad! You are talking to me."]


def greeting(sentence):
    """Return a random greeting reply if ``sentence`` contains a greeting.

    The sentence is lower-cased and split on whitespace, and leading/trailing
    ``. , ! ? ; :`` are stripped from every word. Each entry of
    ``GREETING_INPUTS`` is then looked for as a whole word or whole-word
    phrase, so multi-word entries such as "good day" can match and "hi" does
    not match inside "this".

    Parameters
    ----------
    sentence : str
        The user's input.

    Returns
    -------
    str or None
        One element of ``GREETING_RESPONSES`` chosen at random, or ``None``
        when no greeting is found.
    """
    words = (raw.strip(".,!?;:") for raw in sentence.lower().split())
    # Pad with spaces so substring tests only succeed on word boundaries.
    normalised = " " + " ".join(word for word in words if word) + " "
    if any(" " + phrase + " " in normalised for phrase in GREETING_INPUTS):
        return random.choice(GREETING_RESPONSES)
    return None

## Function for talking to bot

In [14]:
def talk_to_bot(data_language, model, sentence=None, threshold=0.5):
    """Return the stored answer whose question best matches the user's sentence.

    The sentence is tokenised with ``lemmatize_text``, every token present in
    the model's vocabulary is mapped to its vector, and the vectors are
    averaged. That average is compared by cosine similarity with each entry of
    ``data_language['Average_Pooling']``; the answer of the best-scoring row
    is returned when its score reaches ``threshold``.

    Parameters
    ----------
    data_language : pandas.DataFrame
        Rows of the selected topic. Must have an ``Average_Pooling`` column;
        the answer is read from the ``answer`` column when present, otherwise
        from the third column.
    model : object
        Either an object exposing token vectors as ``model.wv[token]`` or one
        that is itself indexable by token.
    sentence : str, optional
        The user's question. When omitted, the module-level variable
        ``sentence`` is used, which is how existing two-argument callers
        pass the question.
    threshold : float, optional
        Minimum cosine similarity for a row to qualify (default 0.5).

    Returns
    -------
    tuple of (str, float)
        ``(answer, score)`` for the best match, or the fallback message and
        the sentinel score ``999`` when no token is known to the model or no
        row reaches the threshold.
    """
    not_understood = "OOPS, I don't understand. Can you rephrase it?"

    if sentence is None:
        # Backward compatibility with callers that leave the question in the
        # notebook-level variable `sentence`.
        sentence = globals().get('sentence', '')

    # lemmatize_text returns one token list per input string; take the only one.
    tokens = lemmatize_text(pd.Series([sentence])).iloc[0]

    vectors = getattr(model, 'wv', model)
    question_vectors = []
    for token in tokens:
        try:
            question_vectors.append(vectors[token])
        except KeyError:
            # Out-of-vocabulary token: ignore it and keep the rest. Looking the
            # whole token list up at once rejected the question as soon as one
            # word was unknown.
            continue
    if not question_vectors:
        return not_understood, 999

    question_ap = list(pd.DataFrame(question_vectors).mean())

    best_position, best_score = None, None
    for position, pooled in enumerate(data_language['Average_Pooling']):
        # Rows without a usable averaged vector cannot be compared.
        if pooled is None or not hasattr(pooled, '__len__') or len(pooled) != len(question_ap):
            continue
        score = float(cosine_similarity([question_ap], [pooled])[0][0])
        if score >= threshold and (best_score is None or score > best_score):
            best_position, best_score = position, score

    if best_position is None:
        # Nothing similar enough; the original indexed an empty frame here
        # and raised IndexError.
        return not_understood, 999

    if 'answer' in data_language.columns:
        answers = data_language['answer']
    else:
        answers = data_language.iloc[:, 2]
    # Positional lookup: best_position counts rows in iteration order.
    reply = str(answers.iloc[best_position])
    return reply, best_score

## Python framework for conversation

In [15]:
SEPARATOR = '......................................................................................'
BOT_TAG = '\x1b[1;37;40m' + 'Dat-Sci' + '\x1b[0m' + ': '
USER_PROMPT = '\x1b[0;30;47m' + 'USER  ' + '\x1b[0m' + ':'


def _bot_say(message):
    """Print one line of bot output, prefixed with the highlighted bot name."""
    print(BOT_TAG + message)


flag_language = True
flag_query = True
dict_language = {'0': 'machine learning', '1': 'statistics', '2': 'deep learning', '3': 'data analytics', '4': 'nlp',
                 '5': 'basic data science'}
print(SEPARATOR)
_bot_say('My name is Dat-Sci, Data Science Question Answer Bot.')
_bot_say('I will try my best to answer your query.')
_bot_say('If you want to exit, you can type < bye >.')

# Phase 1: keep asking until a valid topic number or 'bye' is entered.
while flag_language:
    print(SEPARATOR)
    _bot_say('Please select which area you want to enquire, you can type:')
    _bot_say('< 0 > for machine learning     < 1 > for statistics     < 2 > deep learning')
    _bot_say('< 3 > for data analytics       < 4 > for nlp            < 5 > for basic data science')
    print(SEPARATOR)
    sentence = input(USER_PROMPT)
    print(SEPARATOR)

    choice = sentence.strip().lower()
    if choice == 'bye':
        flag_language = False
        flag_query = False
    elif choice in dict_language:
        language = dict_language[choice]
        topic_rows = data[data['topics'] == language]
        # Rebuilt with a fresh 0..n-1 index and this column order; later cells
        # read `data_language`.
        data_language = pd.DataFrame({'question': list(topic_rows['questions']),
                                      'question_tokens': list(topic_rows['questions_tokens']),
                                      'answer': list(topic_rows['answers']),
                                      'topics': list(topic_rows['topics']),
                                      'Question_Vectors': list(topic_rows['Question_Vectors']),
                                      'Average_Pooling': list(topic_rows['Average_Pooling'])
                                     })

        word2vec_pickle_path = 'chatbot_word2vec_' + language + '.bin'
        # NOTE(review): this unpickles the file; only load model files that
        # this notebook produced itself.
        model = gensim.models.KeyedVectors.load(word2vec_pickle_path)

        flag_language = False
        flag_query = True
    else:
        # Invalid input used to re-prompt without any explanation.
        _bot_say('Please type a number from 0 to 5, or < bye > to exit.')

# Only announce the question phase when the user did not quit during selection.
if flag_query:
    _bot_say("Let's start! Please input your question now.")

# Phase 2: answer questions until the user types 'bye'.
while flag_query:
    print(SEPARATOR)
    # talk_to_bot reads the question from this notebook-level `sentence`.
    sentence = input(USER_PROMPT)
    print(SEPARATOR)

    if sentence.strip().lower() == 'bye':
        flag_query = False
        continue

    # Evaluate the greeting once so the check and the printed reply agree.
    greeting_reply = greeting(sentence.lower())
    if greeting_reply is not None:
        _bot_say(greeting_reply)
    else:
        reply, score = talk_to_bot(data_language, model)
        _bot_say(reply)

_bot_say('Bye! Hope that I am useful to you. Have a nice day.')

......................................................................................
[1;37;40mDat-Sci[0m: My name is Dat-Sci, Data Science Question Answer Bot.
[1;37;40mDat-Sci[0m: I will try my best to answer your query.
[1;37;40mDat-Sci[0m: If you want to exit, you can type < bye >.
......................................................................................
[1;37;40mDat-Sci[0m: Please select which area you want to enquire, you can type:
[1;37;40mDat-Sci[0m: < 0 > for machine learning     < 1 > for statistics     < 2 > deep learning
[1;37;40mDat-Sci[0m: < 3 > for data analytics       < 4 > for nlp            < 5 > for basic data science
......................................................................................
USER  :0
......................................................................................
[1;37;40mDat-Sci[0m: Lets start! Please input your question now.
...............................................................................

In [16]:
# `data_language` holds only the topic chosen in the conversation cell, so this
# file is written only when that topic is machine learning; otherwise another
# topic's rows would be saved under this name.
if set(data_language['topics']) == {'machine learning'}:
    # The with-block closes the handle so the pickle is fully flushed to disk.
    with open('data_language_ml.pkl', 'wb') as fh:
        pickle.dump(data_language, fh)
else:
    print("Skipped data_language_ml.pkl: data_language is not the 'machine learning' subset.")

In [None]:
# `data_language` holds only the topic chosen in the conversation cell, so this
# file is written only when that topic is data analytics; otherwise another
# topic's rows would be saved under this name.
if set(data_language['topics']) == {'data analytics'}:
    # The with-block closes the handle so the pickle is fully flushed to disk.
    with open('data_language_da.pkl', 'wb') as fh:
        pickle.dump(data_language, fh)
else:
    print("Skipped data_language_da.pkl: data_language is not the 'data analytics' subset.")

In [None]:
# `data_language` holds only the topic chosen in the conversation cell, so this
# file is written only when that topic is statistics; otherwise another
# topic's rows would be saved under this name.
if set(data_language['topics']) == {'statistics'}:
    # The with-block closes the handle so the pickle is fully flushed to disk.
    with open('data_language_stat.pkl', 'wb') as fh:
        pickle.dump(data_language, fh)
else:
    print("Skipped data_language_stat.pkl: data_language is not the 'statistics' subset.")

In [None]:
# `data_language` holds only the topic chosen in the conversation cell, so this
# file is written only when that topic is basic data science; otherwise another
# topic's rows would be saved under this name.
if set(data_language['topics']) == {'basic data science'}:
    # The with-block closes the handle so the pickle is fully flushed to disk.
    with open('data_language_bds.pkl', 'wb') as fh:
        pickle.dump(data_language, fh)
else:
    print("Skipped data_language_bds.pkl: data_language is not the 'basic data science' subset.")

In [None]:
# `data_language` holds only the topic chosen in the conversation cell, so this
# file is written only when that topic is deep learning; otherwise another
# topic's rows would be saved under this name.
if set(data_language['topics']) == {'deep learning'}:
    # The with-block closes the handle so the pickle is fully flushed to disk.
    with open('data_language_dl.pkl', 'wb') as fh:
        pickle.dump(data_language, fh)
else:
    print("Skipped data_language_dl.pkl: data_language is not the 'deep learning' subset.")

In [None]:
# `data_language` holds only the topic chosen in the conversation cell, so this
# file is written only when that topic is nlp; otherwise another topic's rows
# would be saved under this name.
if set(data_language['topics']) == {'nlp'}:
    # The with-block closes the handle so the pickle is fully flushed to disk.
    with open('data_language_nlp.pkl', 'wb') as fh:
        pickle.dump(data_language, fh)
else:
    print("Skipped data_language_nlp.pkl: data_language is not the 'nlp' subset.")