In [22]:
import pandas as pd
import os
import json
import re
import numpy as np
import random
import string
import pickle
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
import gensim
from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
#from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter('ignore')
import tensorflow
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Convolution1D
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.applications import VGG16
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD

In [23]:
# Load the question/answer corpus and split it roughly 50/50 into train and test.
SEED = 42  # fixed seed so that Restart & Run All reproduces the same split
rng = np.random.RandomState(SEED)

#path = './Data/'
data = pd.read_csv('StackOverflow.csv')
# Random helper column (1-D, one value per row); the split below does not read it.
data['split'] = rng.randn(len(data))

# Boolean mask: rows where it is True go to the test split, the rest to train.
msk = rng.rand(len(data)) <= 0.5

# .copy() makes both halves independent frames, so the column assignments done
# in later cells never act on a slice of `data`.
test = data[msk].copy()
train = data[~msk].copy()
train.head()

Unnamed: 0,Question,Answer,Class,split
0,Should I use nested classes in this case?,I would be a bit reluctant to use nested class...,c++,0.802965
1,How do I connect to a database and loop over a...,Very roughly and from memory since I don't hav...,c#,0.28215
2,"How to get the value of built, encoded ViewState?","Rex, I suspect a good place to start looking i...",c#,-0.200608
3,How do I delete a file which is locked by anot...,"You can use this program, Handle, to find whic...",c#,-0.677644
4,.NET Unit Testing packages?,"I like MbUnit, er, Gallio. Most importantly t...",c#,-0.3725


In [24]:
# Remove the label column from the test split. drop() returns a new frame (no
# in-place mutation of a slice) and errors='ignore' keeps the cell re-runnable
# after the column is already gone.
test = test.drop(columns=['Class'], errors='ignore')
test.head()

Unnamed: 0,Question,Answer,split
5,How do you disable browser Autocomplete on web...,"Firefox 30 ignores autocomplete=""off"" for pass...",0.321705
7,What code analysis tools do you use for your J...,We use FindBugs and JDepend integrated with An...,0.863964
10,"Is a ""Confirm Email"" input good practice when ...",I agree with you in that it is quite an annoya...,-0.233234
13,How do you open a file in C++?,"There are three ways to do this, depending on ...",-0.266663
16,Any good advice on using emacs for C++ project?,"No specific article, really, but I've found Em...",-0.07855


In [25]:
def pre_process(questions):
    """Clean, tokenize and filter a collection of question strings.

    Parameters
    ----------
    questions : iterable of str
        Raw question texts.

    Returns
    -------
    pandas.Series
        One list of tokens per input question, in input order. Tokens are
        lower-cased, are not English stop words and are 4 to 14 characters long.
    """
    # A set gives O(1) membership tests; the stop-word list is consulted once
    # per token of every question.
    stop_words = set(stopwords.words("english"))

    # Replace every character outside the class below with a space.
    # NOTE(review): the pattern is ONE character class, so it keeps a-z, '(',
    # ')', '+' and '#'; it does not treat "c++" / "c#" as whole words, and the
    # length filter below drops those short tokens anyway.
    questions = [re.sub('[^a-z(c++)(c#)]', ' ', x.lower()) for x in questions]
    # Tokenization
    questions_tokens = [nltk.word_tokenize(t) for t in questions]
    # Remove stop words and tokens shorter than 4 or longer than 14 characters
    questions_stop = [[t for t in tokens if (t not in stop_words) and (3 < len(t.strip()) < 15)]
                      for tokens in questions_tokens]

    return pd.Series(questions_stop)

In [26]:
# Initial preprocessing training data
questions = data['Question']
questions_tokens = pre_process(questions)
questions_tokens

0                                   [nested, classes, case]
1                      [connect, database, loop, recordset]
2                        [value, built, encoded, viewstate]
3                  [delete, file, locked, another, process]
4                                 [unit, testing, packages]
                                ...                        
159202                          [print, random, line, file]
159203          [answer, user, validation, restart, option]
159204            [datatable, column, value, without, loop]
159205    [convert, stream, request, containing, json, j...
159206     [execute, multiline, python, code, bash, script]
Length: 159207, dtype: object

In [27]:
# Encode the string labels of the training split as integer class ids.
# The fitted encoder is reused later to map predicted ids back to label names.
category_le = LabelEncoder()
train['Class_no']= category_le.fit_transform(train['Class'])
# Distinct class ids, listed in order of first appearance in `train`.
train['Class_no'].unique()

array([2, 1, 5, 6, 8, 3, 9, 7, 4, 0])

In [28]:
# Integer class id of every training question (target of the classifier).
y_train = train['Class_no']

In [29]:
# `questions_tokens` covers all rows of `data`; column assignment aligns on the
# row index, so each training row receives its own token list.
train['question_tokens'] = questions_tokens

In [30]:
# Raw question strings of both splits, used as TF-IDF corpora.
x_train_corpus = train['Question'].values
x_test_corpus = test['Question'].values

# Unigram TF-IDF; min_df=0.001 is a document-frequency floor expressed as a
# proportion of the fitted corpus.
cat_vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,1), min_df=0.001)
cat_vectorizer.fit(x_train_corpus)

# One row per vocabulary term with its learned IDF weight, lowest weight first.
df_tfidf = pd.DataFrame(
    cat_vectorizer.idf_,
    index=cat_vectorizer.get_feature_names(),
    columns=['idf_weights'],
)
df_tfidf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
to,1.706742
how,1.716413
in,2.187459
the,2.558452
is,2.776964
...,...
expected,7.893256
rendering,7.893256
plot,7.893256
cookie,7.893256


In [31]:
# Inspect the IDF weights of the test questions with a SEPARATE vectorizer that
# copies the settings of `cat_vectorizer`. Calling cat_vectorizer.fit() on the
# test corpus would replace the vocabulary and IDF weights learned from the
# training corpus.
test_vectorizer = TfidfVectorizer(**cat_vectorizer.get_params())
test_vectorizer.fit(x_test_corpus)
df_tfidf = pd.DataFrame(test_vectorizer.idf_, index=test_vectorizer.get_feature_names(), columns=['idf_weights'])
df_tfidf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
to,1.704003
how,1.711083
in,2.172820
the,2.553698
is,2.772235
...,...
footer,7.887490
100,7.887490
zend,7.887490
pressed,7.887490


In [32]:
# Vocabulary terms held by `cat_vectorizer` after its most recent fit.
cat_vocab = cat_vectorizer.get_feature_names()
# NOTE(review): this displays the complete vocabulary list; a slice such as
# cat_vocab[:20] would keep the output short.
cat_vocab

['10',
 '100',
 '11',
 '2d',
 'able',
 'about',
 'abstract',
 'accept',
 'access',
 'across',
 'action',
 'active',
 'activity',
 'actually',
 'add',
 'added',
 'adding',
 'address',
 'after',
 'again',
 'ajax',
 'alert',
 'algorithm',
 'align',
 'all',
 'allow',
 'already',
 'also',
 'alternative',
 'always',
 'am',
 'amount',
 'an',
 'anchor',
 'and',
 'android',
 'angular',
 'angularjs',
 'animate',
 'animation',
 'anonymous',
 'another',
 'any',
 'anyone',
 'anything',
 'apache',
 'api',
 'apk',
 'app',
 'appear',
 'append',
 'apple',
 'application',
 'applications',
 'apply',
 'approach',
 'apps',
 'are',
 'area',
 'argument',
 'arguments',
 'around',
 'array',
 'arraylist',
 'arrays',
 'as',
 'asp',
 'assign',
 'async',
 'asynchronous',
 'at',
 'attribute',
 'attributes',
 'audio',
 'authentication',
 'auto',
 'autocomplete',
 'automatically',
 'available',
 'avoid',
 'back',
 'backbone',
 'background',
 'bad',
 'bar',
 'base',
 'based',
 'basic',
 'be',
 'been',
 'before',
 'beh

In [33]:
# TF-IDF matrices of both splits, built with the current state of `cat_vectorizer`.
# NOTE(review): .toarray() converts the sparse result into a dense array; with
# many rows this can need a lot of memory - confirm the dense form is required.
x_train_vec = cat_vectorizer.transform(x_train_corpus).toarray()
x_test_vec = cat_vectorizer.transform(x_test_corpus).toarray()
x_train_vec

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
# Raw question strings fed to the Keras tokenizer / LSTM pipeline
# (the same columns as x_train_corpus / x_test_corpus).
lstm_x_train = train['Question'].values
lstm_x_test = test['Question'].values

In [35]:
# Upper bound on the number of words the Keras tokenizer keeps.
num_words = 1000000
tokenizer = Tokenizer(num_words=num_words)
# The word index is learned from the training questions only.
tokenizer.fit_on_texts(x_train_corpus)
# Each training question becomes a list of integer word ids.
encoded_docs = tokenizer.texts_to_sequences(lstm_x_train)
# NOTE(review): this displays every encoded question; encoded_docs[:5] would
# keep the output short.
encoded_docs

[[96, 6, 27, 350, 235, 4, 33, 419],
 [2, 12, 6, 486, 1, 3, 102, 11, 139, 252, 3, 6404, 4, 25],
 [2, 1, 22, 5, 48, 8, 922, 1490, 4538],
 [2, 12, 6, 287, 3, 35, 77, 7, 2651, 57, 85, 319, 4, 25],
 [101, 654, 714, 1272],
 [2, 12, 70, 2028, 3, 356, 255, 25, 176, 15, 931],
 [2, 9, 6, 55, 851, 216, 135, 30, 4, 25, 101],
 [2, 9, 6, 32, 6405, 71, 1491, 493, 706],
 [2, 1, 749, 15, 180, 909, 238, 3470],
 [201, 1015, 8, 619, 1306, 15, 30, 4, 529, 207],
 [130, 602, 15, 187, 48, 1458, 38, 465],
 [16, 12, 6, 22, 5, 87, 910, 1, 178, 5, 655, 26, 401, 12439],
 [20, 53, 5, 577, 1459, 15, 14, 368, 375, 10, 31, 698],
 [2, 96, 6, 654, 220, 3, 45, 1537],
 [891, 189, 4, 18, 693, 296, 380, 2725, 4, 3188],
 [2, 12, 70, 55, 3, 524, 91, 37, 44, 1207, 3, 75, 507, 62, 135, 3618],
 [2, 12, 6, 128, 3, 47, 1, 14, 764, 4, 25],
 [153, 5, 130, 36, 1, 22, 1825, 10, 3471],
 [9, 70, 63, 34, 118, 7, 5, 330, 290, 1058],
 [871, 30, 13, 2029, 7, 33, 30, 37, 3320],
 [852, 3, 32, 634, 1, 856, 1249, 17, 4860],
 [2, 29, 64, 1102, 1

In [36]:
# The longest training question, counted in whitespace-separated words, fixes
# the length every sequence is padded to.
max_length = max(len(s.split()) for s in lstm_x_train)
# padding='post' appends the padding after the word ids.
x_train_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
x_train_padded.shape

(79833, 32)

In [37]:
# Number of distinct words known to the tokenizer, plus one.
# (presumably the extra slot is for padding id 0 - confirm against the Embedding layer)
vocab_size = len(tokenizer.word_index) + 1
vocab_size

25529

In [38]:
# Encode and pad the test questions exactly like the training questions.
# Note: `encoded_docs` is overwritten here and now holds the TEST sequences.
encoded_docs = tokenizer.texts_to_sequences(lstm_x_test)
x_test_padded = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [39]:
# Sorted distinct class ids of the training labels and how many there are.
y_train_classes = np.unique(y_train)
y_train_classes_len = y_train_classes.shape[0]

# The "test" class list is derived from the training labels as well.
y_test_classes = y_train_classes.copy()
y_test_classes_len = y_test_classes.shape[0]

# One-hot encode the training labels for the softmax output layer.
categorical_y_train = to_categorical(y_train, num_classes=y_train_classes_len)
categorical_y_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [40]:
from sklearn.utils import compute_class_weight

# Balanced class weights for training, keyed by class id.
# np.unique() returns the ids sorted; compute_class_weight() returns one weight
# per entry of `classes` in the same order, so zip() pairs each weight with its
# own class id. (Series.unique() lists ids in order of appearance, which must
# not be combined with enumerate().)
class_labels = np.unique(y_train)
balanced_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train)
classWeight = dict(zip(class_labels.tolist(), balanced_weights.tolist()))

#### The LSTM training section below is commented out to avoid retraining the model.

In [41]:
# Disabled training cell, kept for reference: it defines and fits the
# bidirectional LSTM classifier and saves it as 'chat_model.h5'.
# Un-comment the lines below to train the model again.
# batch_size = 16
# epochs = 30
# model = Sequential()
# model.add(Embedding(vocab_size, 100, input_length=max_length))
# model.add(SpatialDropout1D(rate=0.5))
# model.add(Bidirectional(LSTM(50, return_sequences=True, name='lstm_layer')))
# model.add(GlobalMaxPool1D())
# model.add(Dropout(rate=0.5))
# model.add(Dense(100,kernel_regularizer=l2(0.0001), bias_regularizer=l2(0.0001), activation='sigmoid'))
# model.add(Dropout(rate=0.001))
# model.add(Dense(y_train_classes_len, activation='softmax'))

# opt = SGD(lr=0.09)
# model.compile(loss='categorical_crossentropy',
#               optimizer=opt,
#               metrics=[categorical_accuracy])
# model.summary()

# hist = model.fit(x_train_padded, categorical_y_train, batch_size=batch_size, epochs=epochs, validation_split=0.33, class_weight=classWeight)

# model.save('chat_model.h5')

In [42]:
# Disabled: accuracy curves of the training run. Needs `epochs` and `hist`
# from the (disabled) training cell.
# e = range(epochs)
# acc = hist.history['categorical_accuracy']
# plt.plot(e, acc, label='Accuracy')
# acc = hist.history['val_categorical_accuracy']
# plt.plot(e, acc, label='val_Accuracy')
# plt.show()

NameError: name 'epochs' is not defined

In [43]:
# Disabled: training vs. validation loss curves. Needs `hist` from the
# (disabled) training cell.
# plt.plot(hist.history['loss'])
# plt.plot(hist.history['val_loss'])
# plt.title('model train vs validation loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'validation'], loc='upper right')
# plt.show()

In [44]:
# Load the trained classifier and predict a label for every test question.
model_1 = load_model('chat_model.h5')
# argmax over the softmax outputs gives the predicted class id; this is the
# same form the chat loop uses and does not rely on Sequential.predict_classes.
y_pred = np.argmax(model_1.predict(x_test_padded), axis=-1)
print(len(y_pred))
# Map class ids back to the original label names.
prediction = list(category_le.inverse_transform(y_pred))
test['prediction'] = prediction
# Number of test questions assigned to each label.
count = test['prediction'].value_counts()
print(count)
count_prediction = pd.DataFrame(count)
dictt = count_prediction.to_dict()
json_dump = json.dumps(dictt, ensure_ascii=False)
print(json_dump)

# Write the counts to disk, then read them back to check the file content.
with open('New_chat_output.JSON', 'w') as outfile:
    json.dump(dictt, outfile)
with open('New_chat_output.JSON', 'r') as infile:
    data_json = json.load(infile)

print(data_json)

79374
javascript    17122
java          10975
php           10329
android       10246
python         7191
c#             6962
html           5368
jquery         4617
ios            3361
c++            3203
Name: prediction, dtype: int64
{"prediction": {"javascript": 17122, "java": 10975, "php": 10329, "android": 10246, "python": 7191, "c#": 6962, "html": 5368, "jquery": 4617, "ios": 3361, "c++": 3203}}
{'prediction': {'javascript': 17122, 'java': 10975, 'php': 10329, 'android': 10246, 'python': 7191, 'c#': 6962, 'html': 5368, 'jquery': 4617, 'ios': 3361, 'c++': 3203}}


In [45]:
# Working frame for the retrieval part: one row per question of the full dataset.
ques_tok = []
ques_vec = []
avg_pool = []

# Pull the columns out directly instead of looping over rows with iterrows().
ques = data['Question'].tolist()
ans = data['Answer'].tolist()
labels = data['Class'].tolist()

train_copy = pd.DataFrame({'Label': labels,
                             'Question': ques,
                             'Question_Tokens': None,
                             'Answer': ans,
                             'Question_Vectors': None,
                             'Average_Pooling': None})
# Token lists are aligned on the row index; the vector columns are filled in
# by the Word2Vec cell.
train_copy['Question_Tokens'] = questions_tokens
train_copy

Unnamed: 0,Label,Question,Question_Tokens,Answer,Question_Vectors,Average_Pooling
0,c++,Should I use nested classes in this case?,"[nested, classes, case]",I would be a bit reluctant to use nested class...,,
1,c#,How do I connect to a database and loop over a...,"[connect, database, loop, recordset]",Very roughly and from memory since I don't hav...,,
2,c#,"How to get the value of built, encoded ViewState?","[value, built, encoded, viewstate]","Rex, I suspect a good place to start looking i...",,
3,c#,How do I delete a file which is locked by anot...,"[delete, file, locked, another, process]","You can use this program, Handle, to find whic...",,
4,c#,.NET Unit Testing packages?,"[unit, testing, packages]","I like MbUnit, er, Gallio. Most importantly t...",,
...,...,...,...,...,...,...
159202,python,Print random line from txt file?,"[print, random, line, file]",You want to use random.choice import random wi...,,
159203,python,Yes or No answer from user with Validation and...,"[answer, user, validation, restart, option]","def get_choice(prompt=""Enter y/n?"",choices=[""Y...",,
159204,c#,C# DataTable - How to set a column to value wi...,"[datatable, column, value, without, loop]",Use 10 assignment statements. No loop required.,,
159205,c#,How to convert a txt stream web request contai...,"[convert, stream, request, containing, json, j...",Problem one - you use reader.ReadToEnd() twice...,,


In [46]:
# Train one Word2Vec model per label and store, for every question of that
# label, its token vectors and their element-wise mean ("average pooling").
for value in train_copy['Label'].unique():
    # Row labels of the questions that belong to the current label.
    label_rows = train_copy.index[train_copy['Label'] == value]
    questions_data = train_copy.loc[label_rows, 'Question_Tokens'].tolist()

    # Train model
    model_name = 'word2vec_model_' + value
    trained_model = gensim.models.Word2Vec(questions_data, min_count=2)
    trained_model.save(model_name)
    print('Saved %s model successfully' % model_name)

    # Save Word2Vec model; the context manager closes the file even on error.
    word2vec_pickle_path = 'word2vec_' + value + '.bin'
    with open(word2vec_pickle_path, 'wb') as f:
        pickle.dump(trained_model, f)

    # Load the file that was just written, the same way the chat loop does.
    model = gensim.models.KeyedVectors.load(word2vec_pickle_path)
    # Look words up through `.wv` when the loaded object has it (a full
    # Word2Vec model), otherwise through the object itself.
    word_vectors = getattr(model, 'wv', model)

    # Calculate the vectors for each question of this label only.
    for i in label_rows:
        question_tokens = train_copy.at[i, 'Question_Tokens']
        question_vectors = []
        for token in question_tokens:
            try:
                question_vectors.append(word_vectors[token])
            except KeyError:
                # Token is not in this model's vocabulary: skip it.
                continue
        # .at writes straight into the frame (no chained indexing).
        # Vectors for each token
        train_copy.at[i, 'Question_Vectors'] = question_vectors
        # Average pooling of all token vectors (empty list when none was found)
        train_copy.at[i, 'Average_Pooling'] = list(pd.DataFrame(question_vectors).mean())

Saved word2vec_model_c++ model successfully
Saved word2vec_model_c# model successfully
Saved word2vec_model_html model successfully
Saved word2vec_model_java model successfully
Saved word2vec_model_javascript model successfully
Saved word2vec_model_php model successfully
Saved word2vec_model_python model successfully
Saved word2vec_model_jquery model successfully
Saved word2vec_model_ios model successfully
Saved word2vec_model_android model successfully


In [47]:
# Number of kept tokens per question, added as the 'Question_Length' column.
length = train_copy['Question_Tokens'].map(len)
train_copy = train_copy.assign(Question_Length=length)
train_copy.head()

Unnamed: 0,Label,Question,Question_Tokens,Answer,Question_Vectors,Average_Pooling,Question_Length
0,c++,Should I use nested classes in this case?,"[nested, classes, case]",I would be a bit reluctant to use nested class...,"[[0.09820967, 0.11215254, -0.13629147, 0.12434...","[0.15067571898301443, 0.17731636514266333, -0....",3
1,c#,How do I connect to a database and loop over a...,"[connect, database, loop, recordset]",Very roughly and from memory since I don't hav...,"[[-0.09296864, 0.045143556, 0.104177065, 0.066...","[-0.1222715672920458, 0.05410991114331409, 0.1...",4
2,c#,"How to get the value of built, encoded ViewState?","[value, built, encoded, viewstate]","Rex, I suspect a good place to start looking i...","[[-0.21630205, 0.088317715, 0.2325599, 0.14812...","[-0.10228477697819471, 0.04468563664704561, 0....",4
3,c#,How do I delete a file which is locked by anot...,"[delete, file, locked, another, process]","You can use this program, Handle, to find whic...","[[-0.15794575, 0.075462796, 0.16259193, 0.1027...","[-0.16060101464390755, 0.08878587856888771, 0....",5
4,c#,.NET Unit Testing packages?,"[unit, testing, packages]","I like MbUnit, er, Gallio. Most importantly t...","[[-0.15983889, 0.06931193, 0.17412436, 0.11733...","[-0.11168848723173141, 0.045669072618087135, 0...",3


In [48]:
# Serialise the frame as a list of row records and store it in 'Word2Vec.json'.
records_json = train_copy.to_json(orient='records')
data_json = json.loads(records_json)

with open('Word2Vec.json', 'w') as outfile:
    json.dump(data_json, outfile)

In [55]:
def top_five(top_five_q, language_df=None):
    """Attach question and answer texts to the first five candidate rows.

    Parameters
    ----------
    top_five_q : pandas.DataFrame
        Candidate matches with an 'index' column holding row labels of the
        question frame. Not modified.
    language_df : pandas.DataFrame, optional
        Question frame whose first column holds the question text and whose
        third column holds the answer text. Defaults to the notebook-level
        `data_language` frame.

    Returns
    -------
    pandas.DataFrame
        Copy of `top_five_q` with two added columns, 'rep' (question text) and
        'ans' (answer text), filled for the first five rows; in those rows
        'index' is replaced by the menu number 1..5.
    """
    if language_df is None:
        language_df = data_language  # frame prepared by the chat loop

    # Work on a copy and write by position; chained assignment such as
    # frame['rep'].iloc[j] = ... may write to a temporary object.
    result = top_five_q.copy()
    result['rep'] = None
    result['ans'] = None

    rep_col = result.columns.get_loc('rep')
    ans_col = result.columns.get_loc('ans')
    index_col = result.columns.get_loc('index')

    # Read the source row labels before 'index' is overwritten below.
    source_rows = result['index'].iloc[:5].tolist()
    for position, source_row in enumerate(source_rows):
        source_row = int(source_row)
        result.iloc[position, rep_col] = str(language_df.iloc[:, 0][source_row])
        result.iloc[position, ans_col] = str(language_df.iloc[:, 2][source_row])
        result.iloc[position, index_col] = position + 1
    return result

In [56]:
def similarity(data_language, model, sentence_tokens=None):
    """Find the stored question most similar to the user's question.

    The user's tokens are turned into word vectors, averaged, and compared by
    cosine similarity with the 'Average_Pooling' vector of every row of
    `data_language`.

    Parameters
    ----------
    data_language : pandas.DataFrame
        Questions of one label. Must have an 'Average_Pooling' column; the
        first column holds the question text and the third the answer text.
    model : object
        Word-vector lookup: either an object exposing `.wv` or one that can be
        indexed by word directly.
    sentence_tokens : list, optional
        Tokens of the user's question, as a flat list of strings or a list of
        token lists. Defaults to the notebook-level `sentence_pp`.

    Returns
    -------
    tuple
        (matched question text, similarity score). When nothing can be
        matched, the apology message and the sentinel score 999.

    Side effects
    ------------
    Sets the global `top_five_df` to the next five best candidates (empty when
    nothing matched) and prints the answer of the best match.
    """
    global top_five_df
    not_understood = "Apology, I do not understand. Can you rephrase?"
    # Start from an empty candidate table so a failed lookup never leaves the
    # candidates of an earlier question behind.
    top_five_df = pd.DataFrame({'index': [], 'score': []})

    if sentence_tokens is None:
        sentence_tokens = sentence_pp  # tokens prepared by the chat loop

    # Accept a flat token list as well as a list of token lists.
    tokens = []
    for item in sentence_tokens:
        if isinstance(item, str):
            tokens.append(item)
        else:
            tokens.extend(item)

    # Look up each token on its own, so one unknown word does not discard the
    # whole question.
    word_vectors = getattr(model, 'wv', model)
    question_vectors = []
    for token in tokens:
        try:
            question_vectors.append(word_vectors[token])
        except KeyError:
            continue

    # If not in the topic trained
    if not question_vectors:
        return not_understood, 999

    # Average pooling of the question's token vectors
    question_ap = list(pd.DataFrame(question_vectors).mean())

    # Calculate cosine similarity; rows without a comparable vector score 0.
    cosines = []
    for t in data_language['Average_Pooling']:
        if t is not None and hasattr(t, '__len__') and len(t) == len(question_ap):
            val = cosine_similarity([question_ap], [t])
            cosines.append(val[0][0])
        else:
            cosines.append(0)

    # Keep only candidates that reach the similarity threshold.
    index_s = []
    score_s = []
    for i, x in enumerate(cosines):
        if x >= 0.2:
            index_s.append(i)
            score_s.append(x)

    # No candidate is similar enough: same reply as for an unknown topic.
    if not index_s:
        return not_understood, 999

    reply_indexes = pd.DataFrame({'index': index_s, 'score': score_s})
    reply_indexes = reply_indexes.sort_values(by="score" , ascending=False)
    # Runner-up candidates (ranks 2 to 6) offered when the best one does not help.
    top_five_df = reply_indexes.iloc[1:6]

    # Find Top Questions and Score
    r_index = int(reply_indexes['index'].iloc[0])
    r_score = float(reply_indexes['score'].iloc[0])

    reply = str(data_language.iloc[:,0][r_index])
    print('\x1b[1;37;40m' + 'BOT'+'\x1b[0m'+': '+ str(data_language.iloc[:,2][r_index]))
    return reply, r_score

In [57]:
# Exact (lower-cased) user inputs recognised as small talk, and the replies to
# choose from.
GREETING_INPUTS = (
    "hello", "hi", "greetings", "hello i need help",
    "good day", "hey", "i need help", "greetings",
)
GREETING_RESPONSES = [
    "Good day, How may i of help?",
    "Hello, How can i help?",
    "hello",
    "I am glad! You are talking to me.",
]
GOODBYE_INPUTS = ('thanks', 'thank you', 'tnx', 'bye')
GOODBYE_RESPONSES = ['Good Luck!', 'GoodBye!' , 'Hope to see you soon!']


def greeting(sentence, query):
    """Answer a greeting or a goodbye.

    Returns a tuple (reply, continue_flag): for a greeting the flag is `query`
    unchanged, for a goodbye it is False. Returns None when `sentence`
    (compared case-insensitively, as a whole) is neither.
    """
    normalized = sentence.lower()
    if normalized in GREETING_INPUTS:
        return random.choice(GREETING_RESPONSES), query
    if normalized in GOODBYE_INPUTS:
        return random.choice(GOODBYE_RESPONSES), False
    return None

In [58]:
def yes_no(inp):
    """Interpret a yes/no answer, ignoring case.

    Returns True for 'yes'/'ye'/'y' and False for 'no'/'n'. For anything else
    a hint is printed and None is returned.
    """
    answer = inp.lower()
    if answer in ('yes', 'ye', 'y'):
        return True
    if answer in ('no', 'n'):
        return False
    print('\x1b[1;37;40m' + 'BOT' + '\x1b[0m' + ': ' + 'Please answer with <yes> or <no> !')
    return None

In [59]:
def was_helpful():
    """Ask the user whether the last answer helped.

    Returns True only for a 'yes' answer; a 'no' or an unrecognised answer
    gives False.
    """
    print('\x1b[1;37;40m' + 'BOT' + '\x1b[0m' + ': ' + 'Was this answer helpful for you? [yes/no]')
    user_answer = input('\x1b[0;30;47m' + 'USER  ' + '\x1b[0m' + ':')
    return bool(yes_no(user_answer))

In [60]:
# Interactive chat loop: classify the question, retrieve the most similar
# stored question of that label and show its answer.
SEPARATOR = "......................................................................................"
BOT_PREFIX = '\x1b[1;37;40m' + 'BOT' + '\x1b[0m' + ': '
USER_PROMPT = '\x1b[0;30;47m' + 'USER  ' + '\x1b[0m' + ':'


def _bot_say(message):
    """Print one line with the highlighted BOT prefix."""
    print(BOT_PREFIX + str(message))


def _offer_next_question():
    """Invite the user to ask something else and close the exchange visually."""
    _bot_say('Do you have another question?')
    print(SEPARATOR)


flag_query = True

print(SEPARATOR)
_bot_say('Hi! Ask me a question :)')

while flag_query:

    print(SEPARATOR)
    sentence = input(USER_PROMPT)
    print(SEPARATOR)

    # Small talk: call greeting() once and reuse its result.
    greeting_result = greeting(sentence.lower(), flag_query)
    if greeting_result is not None:
        res, flag_query = greeting_result
        _bot_say(res)
        continue

    # Tokens for the Word2Vec similarity search (stop words removed).
    # Kept as a one-element list of token lists because similarity() reads
    # this notebook-level name.
    sentence_pp = [list(pre_process(pd.Series(sentence))[0])]
    if not sentence_pp[0]:
        # Nothing left after preprocessing: there is nothing to search for.
        _offer_next_question()
        continue

    # Classify the RAW sentence: the classifier was trained on raw question
    # text encoded by `tokenizer`, not on the stop-word-filtered tokens.
    encoded_input_docs = tokenizer.texts_to_sequences([sentence])
    x_input_padded = pad_sequences(encoded_input_docs, maxlen=max_length, padding='post')
    y_predict = np.argmax(model_1.predict(x_input_padded), axis=-1)
    question_label = category_le.inverse_transform(y_predict)
    _bot_say('Your question label is : < ' + str(question_label[0]) + ' >')

    # Questions of the predicted label, re-indexed from 0. The column ORDER
    # matters: similarity() and top_five() read question and answer by position.
    data_language = (
        train_copy.loc[train_copy['Label'] == question_label[0],
                       ['Question', 'Question_Tokens', 'Answer', 'Label',
                        'Question_Vectors', 'Average_Pooling']]
        .rename(columns={'Label': 'Class'})
        .reset_index(drop=True)
    )

    # Read word2vec model
    word2vec_pickle_path = 'word2vec_' + question_label[0] + '.bin'
    model = gensim.models.KeyedVectors.load(word2vec_pickle_path)

    reply, score = similarity(data_language, model)
    _bot_say(reply)
    print("SCORE: " + str(score))
    if score == 999:
        # Sentinel score of similarity(): nothing matched, so there is no
        # answer to rate and no alternatives to offer.
        continue

    try:
        if was_helpful():
            _offer_next_question()
            continue

        # Offer the runner-up questions and let the user pick one by number.
        top5 = top_five(top_five_df)
        print(top5[['rep', 'index']])
        print(SEPARATOR)
        _bot_say('Please insert your considered number... ')
        try:
            ind = int(input(USER_PROMPT))
        except ValueError:
            ind = 0  # not a number: handled like an out-of-range choice
        if not 1 <= ind <= len(top5):
            _offer_next_question()
            continue

        ans = top5['ans'].iloc[ind - 1]
        _bot_say(ans)
        print(SEPARATOR)
        if was_helpful():
            _offer_next_question()
        else:
            _bot_say('Sorry for not being helpful :( ')
            print(SEPARATOR)
            flag_query = False
    except Exception as e:
        # Best effort: report the problem and keep the conversation going.
        print(e)

......................................................................................
[1;37;40mBOT[0m: Hi! Ask me a question :)
......................................................................................
USER  :hey
......................................................................................
[1;37;40mBOT[0m: Good day, How may i of help?
......................................................................................
USER  :How do I connect to a database
......................................................................................
[1;37;40mBOT[0m: Your question label is : < android >
[1;37;40mBOT[0m: I see from the stacktrace that you create DB helper in the activity constructor or field initializer possibly. In both cases activity isn't initialized yet (moreover you don't need to override activity constructor in the most cases). Move DB helper assignment inside the activity onCreate() method. 
[1;37;40mBOT[0m: Where get database path in dat