##  Chat bot for Stackoverflow

In [2]:
import pandas as pd


tsv_file='tagged_posts.tsv'
csv_table=pd.read_table(tsv_file,sep='\t')
csv_table.to_csv('tagged.csv',index=False)

In [3]:
import numpy as np
import pandas as pd
import pickle
import re
import csv

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Utility functions

In [4]:
RESOURCE_PATH = {
    'INTENT_RECOGNIZER': 'intent_recognizer.pkl',
    'TAG_CLASSIFIER': 'tag_classifier.pkl',
    'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl',
    'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags',
    'WORD_EMBEDDINGS': 'starspace_embedding.tsv',
}

In [5]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

def text_prepare(text):

    """Performs tokenization and simple preprocessing."""
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile('[^0-9a-z #+_]')
    stopwords_set = set(stopwords.words('english'))
    text = text.lower()
    text = replace_by_space_re.sub(' ', text)
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = GOOD_SYMBOLS_RE.sub('', text)
    text = bad_symbols_re.sub('', text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])
    return text.strip()

def array_to_string(arr):
    return '\n'.join(str(num) for num in arr)

def matrix_to_string(matrix):
    return '\n'.join('\t'.join(str(num) for num in line) for line in matrix)

def load_embeddings(embeddings_path):
    embeddings = {}
    with open(embeddings_path, newline='') as embedding_file:
        reader = csv.reader(embedding_file, delimiter='\t')
        for line in reader:
            word = line[0]
            embedding = np.array(line[1:]).astype(np.float32)
            embeddings[word] = embedding
        dim = len(line) - 1
    return embeddings, dim
  
def question_to_vec(question, embeddings, dim):
    """Transforms a string to an embedding by averaging word embeddings."""
    vec = np.zeros((dim,), dtype=np.float32)
    count = 0
    for w in question.split():
        if w in embeddings:
            count += 1
            vec += embeddings[w]
    if count == 0:
        return vec
    return vec/count


def unpickle_file(filename):
    """Returns the result of unpickling the file content."""
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [36]:
load_embeddings(RESOURCE_PATH['WORD_EMBEDDINGS'])

({'using': array([ 2.90864e-03, -2.88960e-02,  2.12205e-02,  1.35664e-02,
         -6.30750e-03, -1.41167e-02, -5.61143e-03,  1.61088e-02,
         -2.12005e-02, -1.30414e-02,  5.43032e-03, -2.51913e-02,
         -7.73055e-03,  1.07451e-02,  1.10539e-02, -4.47586e-03,
          1.27072e-02, -1.04604e-03, -2.11246e-02, -9.24311e-03,
          6.33884e-03, -2.11617e-02,  7.28058e-03, -2.08950e-03,
          1.98767e-03, -3.33090e-02,  8.17718e-03,  1.36657e-02,
         -2.67670e-02,  7.93403e-03, -7.74065e-03, -7.34807e-03,
         -7.99624e-03, -1.14872e-02,  2.30938e-02,  5.42704e-03,
         -2.48428e-03,  1.02364e-02,  9.13954e-03, -8.89186e-03,
          1.37481e-02, -6.46873e-03, -1.19451e-02,  1.14664e-02,
         -1.81378e-02, -1.39776e-02,  1.99984e-02,  2.07600e-02,
         -1.24146e-02,  1.03035e-02,  1.57176e-03,  4.19409e-03,
         -2.15659e-02, -1.19237e-02, -1.28772e-02,  8.11553e-03,
         -1.77140e-02,  4.94526e-03, -1.01367e-03,  7.24465e-03,
         -1.2495

In [6]:
def tfidf_features(X_train, X_test):
    """Performs TF-IDF transformation and dumps the model."""
    
    #Training a vectorizer on X_train data
    tfidf_vectorizer = TfidfVectorizer(token_pattern='(\S+)', min_df=5, max_df=0.9, ngram_range=(1,2)) 

    #Transforming X_train and X_test data
    tfidf_vectorizer.fit(X_train)
    X_train = tfidf_vectorizer.transform(X_train)
    X_test = tfidf_vectorizer.transform(X_test)
    
    #Pickling the trained vectorizer to 'vectorizer_path'
    
#    
    with open('tfidf_vectorizer.pkl', 'wb') as fin:
        pickle.dump(tfidf_vectorizer, fin)

    return X_train, X_test

In [7]:
sample_size = 200000

dialogue_df = pd.read_csv('dialogues.tsv', sep='\t').sample(sample_size, random_state=0)
stackoverflow_df = pd.read_csv('tagged_posts.tsv', sep='\t').sample(sample_size, random_state=0)

#stackoverflow_df = pd.read_csv()

In [29]:
dialogue_df.head()

Unnamed: 0,text,tag
82925,"Donna, you are a muffin.",dialogue
48774,He was here last night till about two o'clock....,dialogue
55394,"All right, then make an appointment with her s...",dialogue
90806,"Hey, what is this-an interview? We're supposed...",dialogue
107758,Yeah. He's just a friend of mine I was trying ...,dialogue


In [15]:
stackoverflow_df.head()

Unnamed: 0,post_id,title,tag
2168983,43837842,Efficient Algorithm to compose valid expressio...,python
1084095,15747223,Why does this basic thread program fail with C...,c_cpp
1049020,15189594,Link to scroll to top not working,javascript
200466,3273927,Is it possible to implement ping on windows ph...,c#
1200249,17684551,GLSL normal mapping issue,c_cpp


In [8]:
dialogue_df['text'] = dialogue_df['text'].map(lambda text:  text_prepare(text))
stackoverflow_df['title'] = stackoverflow_df['title'].map(lambda text:  text_prepare(text))

## TF-IDF for Tone tuning

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
X = np.concatenate([dialogue_df['text'].values, stackoverflow_df['title'].values])
y = ['dialogue'] * dialogue_df.shape[0] + ['stackoverflow'] * stackoverflow_df.shape[0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0) 
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

X_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test)


Train size = 360000, test size = 40000


In [97]:
y_df = pd.DataFrame(y)
y_df

X_df = pd.DataFrame(X)
X_df

Unnamed: 0,0
0,donna muffin
1,last night till two oclock hear really got stu...
2,right make appointment see
3,hey thisan interview supposed making love
4,yeah hes friend mine trying help
...,...
399995,"Cakephp, ordering associated tables"
399996,setProperty value from requested getParameter
399997,"How can I send some HTML, render a view, then ..."
399998,What's the standard way of trapping system mes...


In [11]:
intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)
intent_recognizer.fit(X_train_tfidf, y_train)



LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
#Checking test accuracy.
y_test_pred = intent_recognizer.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.991575


In [13]:
pickle.dump(intent_recognizer, open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb'))

## Topic classifier for Programming topics

In [14]:
X = stackoverflow_df['title'].values
y = stackoverflow_df['tag'].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

Train size = 160000, test size = 40000


In [16]:
vectorizer = pickle.load(open(RESOURCE_PATH['TFIDF_VECTORIZER'], 'rb'))

X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [17]:
from sklearn.multiclass import OneVsRestClassifier

In [18]:
tag_classifier=OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=0))
tag_classifier.fit(X_train_tfidf, y_train)



OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=0, solver='warn',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False),
                    n_jobs=None)

In [19]:
#Checking test accuracy.
y_test_pred = tag_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.800725


In [20]:
pickle.dump(tag_classifier, open(RESOURCE_PATH['TAG_CLASSIFIER'], 'wb'))

## Relevant Questions

In [21]:
starspace_embeddings , embeddings_dim = load_embeddings('starspace_embedding.tsv')

In [22]:
posts_df = pd.read_csv('tagged_posts.tsv', sep='\t')

In [98]:
posts_df

Unnamed: 0,post_id,title,tag
0,9,Calculate age in C#,c#
1,16,Filling a DataSet or DataTable from a LINQ que...,c#
2,39,Reliable timer in a console application,c#
3,42,Best way to allow plugins for a PHP application,php
4,59,"How do I get a distinct, ordered list of names...",c#
...,...,...,...
2171570,45887455,What is the difference between node.js and ayo...,javascript
2171571,45887857,Why do sequential containers have both size_ty...,c_cpp
2171572,45892983,"why 1 + + ""1"" === 2; +""1"" + + ""1"" === 2 and ""1...",javascript
2171573,45893693,Why does the first line work but the second li...,javascript


In [23]:
counts_by_tag = posts_df.groupby(['tag']).count()

In [58]:
counts_by_tag

Unnamed: 0_level_0,post_id,title
tag,Unnamed: 1_level_1,Unnamed: 2_level_1
c#,394451,394451
c_cpp,281300,281300
java,383456,383456
javascript,375867,375867
php,321752,321752
python,208607,208607
r,36359,36359
ruby,99930,99930
swift,34809,34809
vb,35044,35044


In [24]:
counts_by_tag = posts_df['tag'].value_counts().to_dict()

In [25]:
import os
os.makedirs(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], exist_ok=True)

for tag, count in counts_by_tag.items():
    tag_posts = posts_df[posts_df['tag'] == tag]
    
    tag_post_ids = tag_posts['post_id'].values
    
    tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)
    for i, title in enumerate(tag_posts['title']):
        tag_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim) 

    # Dumping post ids and vectors to a file.
    filename = os.path.join(RESOURCE_PATH['THREAD_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % tag))
    pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))

# Dialogue Manager

In [26]:
import os
from sklearn.metrics.pairwise import pairwise_distances_argmin

In [27]:
class ThreadRanker(object):
    def __init__(self, paths):
        self.word_embeddings, self.embeddings_dim = load_embeddings(paths['WORD_EMBEDDINGS'])
        self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER']

    def __load_embeddings_by_tag(self, tag_name):
        embeddings_path = os.path.join(self.thread_embeddings_folder, tag_name + ".pkl")
        thread_ids, thread_embeddings = unpickle_file(embeddings_path)
        return thread_ids, thread_embeddings

    def get_best_thread(self, question, tag_name):
        """ Returns id of the most similar thread for the question.
            The search is performed across the threads with a given tag.
        """
        thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
        question_vec = question_to_vec(question, self.word_embeddings, self.embeddings_dim)
        best_thread = pairwise_distances_argmin(
            X=question_vec.reshape(1, self.embeddings_dim),
            Y=thread_embeddings,
            metric='cosine'
        )
        return thread_ids[best_thread][0]

    
class DialogueManager(object):
    def __init__(self, paths):
        print("Loading resources...")
        # Intent recognition:
        self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER'])
        self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER'])
        self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s'
        # Goal-oriented part:
        self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER'])
        self.thread_ranker = ThreadRanker(paths)

    def generate_answer(self, question):
        """Combines stackoverflow and chitchat parts using intent recognition."""
        prepared_question = text_prepare(question)
        features = self.tfidf_vectorizer.transform([prepared_question])
        intent = self.intent_recognizer.predict(features)[0]

        # Chit-chat part:
        if intent == 'dialogue':
            # Passing question to chitchat_bot to generate a response.
            return "Hi, please post your specific programming related query"
        # Goal-oriented part:
        else:
            # Passing features to tag_classifier to get predictions.
            tag = self.tag_classifier.predict(features)[0]
            # Passing prepared_question to thread_ranker to get predictions.
            thread_id = self.thread_ranker.get_best_thread(prepared_question, tag)
            return self.ANSWER_TEMPLATE % (tag, thread_id)

In [37]:
dm_obj=DialogueManager(RESOURCE_PATH)

Loading resources...


# Main Bot

In [28]:
import requests
import time
import argparse
import os
import json

In [29]:
def get_answer(question):
    if question == '/start':
        return "Hi, I am your project bot. How can I help you today?"
    
    return dialogue_manager.generate_answer(question)
    
dialogue_manager = DialogueManager(RESOURCE_PATH)


Loading resources...


In [30]:
def reply(query):
    return dialogue_manager.generate_answer(query)

In [52]:
reply("php")


'I think its about php\nThis thread might help you: https://stackoverflow.com/questions/104329'

In [None]:
inp1 = True
print("Hi, I am your project bot. How can I help you today?")

while inp1:
    a = input()
    if a == "exit":
        inp1 = False
    else:
        print(get_answer(a))

Hi, I am your project bot. How can I help you today?
i have a query
I think its about php
This thread might help you: https://stackoverflow.com/questions/2900
hi i am 
Hi, please post your specific programming related query
doubt in functions and methods
I think its about javascript
This thread might help you: https://stackoverflow.com/questions/1188464
Best way to allow plugins for a PHP application
I think its about php
This thread might help you: https://stackoverflow.com/questions/42


In [31]:
def bothandler():
    inp1 = True
    print("Hi, I am your project bot. How can I help you today?")
    while inp1:
        a = input()
        if a == "exit":
            inp1 = False
        else:
            print(get_answer(a))

In [36]:
bothandler()

Hi, I am your project bot. How can I help you today?
hi
Hi, please post your specific programming related query
jo
Hi, please post your specific programming related query
array and linked list
I think its about java
This thread might help you: https://stackoverflow.com/questions/12390430
exit


In [None]:
def send():
    msg = EntryBox.get("1.0", 'end-lc').strip()
    EntryBox.delete("0.0", END)
    
    if msg != '':
        ChatLog.config(state=NORMAL)
        ChatLog.insert(END, "You: " + msg + '\n\n')
        ChatLog.config(foreground="#442265", font=("Verdana", 12))
        
        res = chatbot_response(msg)
        ChatLog.insert(END, "Bot: " + res + '\n\n')
        
        ChatLog.config(state=DISABLED)
        ChatLog.yview(END)

### GUI 

from tkinter import *

root = Tk()

root.title('Stackoverflow - Chat Bot')

root.geometry('400x500')

chatWindow =Text(root, bd=1, width = 50, height = 8)
chatWindow.place(x=6, y=6, height= 385, width = 370)

messageWindow = Text(root, width = 30 )
messageWindow.place(x=128, y=400, height=88, width = 260)



root.mainloop()

In [93]:
from tkinter import *

window = Tk()
window.title('Chat Bot')


messages = Text(window, width= 50 )
messages.place(x=6, y=6, height= 385, width = 370)
messages.pack()

input_user = StringVar()
input_field = Entry(window, text=input_user, width = 30)
input_field.place(x=128, y=400, height=88, width = 260)
input_field.pack(side=BOTTOM, fill=X)

def Enter_pressed(event):
    input_get = input_field.get()
    #print(input_get)
    messages.insert(INSERT, '%s\n' % input_get) ##goes inside the window
    # label = Label(window, text=input_get)
    
    input_user.set('')
    # label.pack()
    ans= reply(input_get)
    messages.insert(INSERT, '%s\n' % ans)
    return "break"


frame = Frame(window)  # , width=300, height=300)
input_field.bind("<Return>", Enter_pressed)
frame.pack()

window.mainloop()
