In [1]:
import os 
import numpy as np
import pandas as pd
from pathlib import Path
import csv
import pickle
import nmslib
from datetime import datetime
from collections import defaultdict
import sqlite3
DB_NAME = 'StackOverflow.db'

connection = sqlite3.connect(DB_NAME)
c = connection.cursor()


DATA_PATH = Path('data/')
MODEL_PATH = Path('models/')

In [2]:
# def avg_word_vectors(question, embeddings, dim):
#     words_embedding = [embeddings[word] for word in question.lower().split() if word in embeddings]
#     if not words_embedding:
#         return np.zeros(dim)
#     words_embedding = np.array(words_embedding).astype(np.float32)
#     return words_embedding.mean(axis=0)

# def average_tfidf_vectors(question, embeddings, dim, vect):
#     # get idf weights
#     split_question = [word for word in question.lower().split() if word in embeddings]
#     if not split_question:
#         return np.zeros(dim).astype(np.float32)
#     words_embedding = np.zeros((dim, len(split_question))).astype(np.float32)
#     for i, token in enumerate(split_question):
#         if token in embeddings:
#             embed_score = embeddings[token]
#         else: embed_score = 0
#         idf_score = idf_scores[token]
#         # word vectors multiply by their TF-IDF scores
#         words_embedding[:, i] = embed_score * idf_score    
#     return words_embedding.mean(axis=1)


# def get_embeddings(filename):
#     embeddings = {}
#     with open(MODEL_PATH/filename, newline='') as f:
#         reader = csv.reader(f, delimiter='\t')
#         embed_list = list(reader)
#     for line in embed_list:
#         embeddings[line[0]] = np.asarray(line[1:], dtype=np.float32)
        
#     dim = len(embeddings['code'])
#     return embeddings, dim


# def unpickle(filename):
#     with open(filename, 'rb') as f:
#         return pickle.load(f)
    

# with open(MODEL_PATH/'tf_idf.pkl', mode='rb') as f:
#     vect = pickle.load(f)
    
# idf_scores = defaultdict(lambda:0, zip(vect.get_feature_names(), vect.idf_))

# parent_comment_map = pickle.load((DATA_PATH/'parent_comment_map.pkl').open('rb'))

In [2]:
# utils file 

import nltk
import pickle
import re
import numpy as np
import html

from nltk.corpus import stopwords
stopwords_set = set(stopwords.words('english'))


def avg_word_vectors(question, embeddings, dim):
    words_embedding = [embeddings[word] for word in question.lower().split() if word in embeddings]
    if not words_embedding:
        return np.zeros(dim)
    words_embedding = np.array(words_embedding).astype(np.float32)
    return words_embedding.mean(axis=0)

def average_tfidf_vectors(question, embeddings, dim, vect, idf_scores):
    # get idf weights
    split_question = [word for word in question.lower().split() if word in embeddings]
    if not split_question:
        return np.zeros(dim).astype(np.float32)
    words_embedding = np.zeros((dim, len(split_question))).astype(np.float32)
    for i, token in enumerate(split_question):
        if token in embeddings:
            embed_score = embeddings[token]
        else: embed_score = 0
        idf_score = idf_scores[token]
        # word vectors multiply by their TF-IDF scores
        words_embedding[:, i] = embed_score * idf_score    
    return words_embedding.mean(axis=1)


def get_embeddings(filename):
    embeddings = {}
    with open(MODEL_PATH/filename, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        embed_list = list(reader)
    for line in embed_list:
        embeddings[line[0]] = np.asarray(line[1:], dtype=np.float32)
        
    dim = len(embeddings['code'])
    return embeddings, dim


def unpickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)



re1 = re.compile(r'  +')
# def clean_title(text, remove_html=False, other=False):
def clean_text(text, remove_html=False, other=False):
    
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile('[^0-9a-z #+_]')
    
    if remove_html:
        x = re.sub(r'<code>[^>]*</code>', '', x)
        x = re.sub(r'<[^>]*>', '', x)
        x = re.sub(r'[^A-Za-z0-9]', ' ', x)
    text = text.lower()
    text = text.replace('π', 'pi').replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
            'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
            '<br />', "\n").replace('\\"', '').replace('<unk>','u_n').replace(' @.@ ','.').replace(
            ' @-@ ','-').replace('\\', ' \\ ').replace('"',"'").replace('\n', ' ').replace('\r', ' ')
    text = replace_by_space_re.sub(' ', text)
    text = bad_symbols_re.sub('', text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])
    return re1.sub(' ', html.unescape(text).strip())


# def get_top_preds(X, clf, k=3):
#     preds = clf.predict_proba(X)
#     return np.argsort(preds)[:,::-1][0][:3]

def get_top_preds(X, clf, class_map, k=3, cutoff=0.8):
        preds = clf.predict_proba(X)
        sorted_preds = np.argsort(preds)[:,::-1][0][:3]
        top_scores = [preds[0][i] for i in sorted_preds]
        if top_scores[0] > cutoff:
            # if top_scores above threshold only need to return top 
            return [class_map[sorted_preds[0]]]
        else:
            return [class_map[p] for p in sorted_preds]


# -------------------------------------------------------------------------------------------------------------------



# with open(MODEL_PATH/'tf_idf.pkl', mode='rb') as f:
#     vect = pickle.load(f)
    
# idf_scores = defaultdict(lambda:0, zip(vect.get_feature_names(), vect.idf_))

# parent_comment_map = pickle.load((DATA_PATH/'parent_comment_map.pkl').open('rb'))


class DialogueManager(object):
    def __init__(self, data_path, model_path):
        self.model_path = model_path
        self.data_path = data_path
        self.thread_embeddings_path = model_path/'thread_embeddings_by_tags'
        self.knn_path = model_path/'knn_embeddings_path'
        self.word_embeddings, self.dim = get_embeddings('starspace_embedding100_ngram2.tsv')
        self.parent_comment_map = unpickle(data_path/'parent_comment_map.pkl')
        
        self.tag_classifier = unpickle(model_path/'LR_tag_classifier_all.pkl')
        self.class_map = unpickle(data_path/'class_map.pkl')
        
        self.tfid_vectorizer = unpickle(model_path/'tf_idf.pkl')
        self.idf_scores = defaultdict(lambda:0, zip(self.tfid_vectorizer.get_feature_names(), 
                                                    self.tfid_vectorizer.idf_))
        
        self.parent_comment_map = unpickle(data_path/'parent_comment_map.pkl')
        
        
    def __get_embeddings_by_tag(self, tag):
        
#         embeddings_files = [self.thread_embeddings_path/tag for tag in tags]
#         for file in embeddings_files:
#             ids, vectors = unpickle(embeddings_files)
        tag_path = tag + '.pkl'
        embeddings_file = self.thread_embeddings_path/tag_path
        ids, vectors = unpickle(embeddings_file)
        return ids, vectors
    
    def __create_nmslib_index(self, a, space, load=True, filepath=None, save=False):
        M = 25
        efC = 100
        
        num_threads = 4
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 
                             'post': 0, 'skip_optimized_index':1}
        query_time_params = {'efSearch': efC}

        index = nmslib.init(space=space)
        if load:
            # only need to init if indexed is saved
            index.loadIndex(filepath)
#             index.setQueryTimeParams(query_time_params)
            return index
        else:
            index.addDataPointBatch(a)
            index.createIndex()
#             index.setQueryTimeParams(query_time_params)
            if save: index.saveIndex(filepath)
            return index

    def __create_annoy_index(self, data, space='angular', n_trees=30, load=True, filepath=None, save=False):
                
        index = annoy.AnnoyIndex(self.dim, metric=space)
        if load:
            # only need to init if indexed is saved
            index.load(filepath)
        else:
            for i, vect in enumerate(data):
                index.add_item(i, vect)
            index.build(n_trees)            
            if save: index.save(filepath)
        return index
    
    def get_similar(self, question, question_to_vec=average_tfidf_vectors, topk=5, space='cosinesimil', 
                    load=True, save=False, return_dist=True, *args, **kwargs):
        
        tags = self.get_tags(question)
        # need to update for multiple tags
        tag = tags[0]
        print('get vects')
        start = datetime.now()
        thread_ids, thread_vectors = self.__get_embeddings_by_tag(tag)
        print(datetime.now() - start)
        print('create index')
        start = datetime.now()
        tag_path = tag + '.bin'
#         index = self.__create_nmslib_index(thread_vectors, space=space, load=load, 
#                                            filepath=str(self.knn_path/tag_path), save=save)
        index = self.__create_annoy_index(thread_vectors, space=space, load=load, 
                                           filepath=str(self.knn_path/tag_path), save=save)
        print(datetime.now() - start)
        print('question creation')

        question2vec = question_to_vec(question, self.word_embeddings, self.dim, vect=self.tfid_vectorizer, 
                                        idf_scores=self.idf_scores, *args, **kwargs)
        print('query')
        start = datetime.now()
        idxs, distances = index.knnQuery([question2vec], k=topk)
        print(datetime.now() - start)
        output = [thread_ids[i] for i in idxs]
        if return_dist:
            output = output, distances
        return output
    
    
    def get_comments(self, post_ids):
        # need to weight by distance 
        # need to get for multiple tags, combine and weight by distance
#         post_ids = self.get_similar('How do I calculate someones age in C', load=True, save=False, return_dist=False)
        df_parent = self.get_df(np.array(post_ids).flatten().tolist())

        knns = [j for i in post_ids if i in self.parent_comment_map for j in self.parent_comment_map[i]]
        df_comments = self.get_df(knns)

        return df_parent, df_comments 
    
    def get_tags(self, question, k=3): 
        cleaned_question = clean_text(question)
        features = self.tfid_vectorizer.transform([cleaned_question])
        preds = get_top_preds(features, self.tag_classifier, self.class_map, k)
        return preds
    
    def clean_output(self, df):
        # get rid of bad (negative / 0 scores)
        # clean html
        # print in cool format
#         for 
        pass

    def get_df(self, ids):
        neighbor_length = '?,' * len(ids)
        df = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
                                         connection, params=tuple(ids))
        return df
    

In [3]:
%%time
dm = DialogueManager(DATA_PATH, MODEL_PATH)

CPU times: user 18.6 s, sys: 1.7 s, total: 20.3 s
Wall time: 20.3 s


In [4]:
results = dm.get_similar('How do I calculate someones age in C', load=True, save=False, return_dist=False)
df_parent, df_comment = dm.get_comments(results)

get vects
0:00:00.219250
create index
0:00:00.147380
question creation
query
0:00:00.000110


In [6]:
df_comment

Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,982166,982129,<p>It forces a <a href='http://en.wikipedia.or...,,2009-06-11,3,
1,982179,982129,<p>It is a atomic builtin for <a href='http://...,,2009-06-11,23,
2,1733721,1733705,<p>You use a NSSet to ensure uniqueness. <code...,,2009-11-14,3,
3,2842876,2842866,<p>If you have <code>i+i++</code> it is actual...,,2010-05-16,-4,
4,2842882,2842866,<p>This will post-increment <code>l</code> i.e...,,2010-05-16,0,
5,2842896,2842866,<p>Be warned - many languages don't dictate th...,,2010-05-16,-1,
6,2842897,2842866,<p><code>l+l++</code> is undefined. There is n...,,2010-05-16,16,
7,4228582,4228494,<p>Could it be that when you run the program a...,,2010-11-19,0,
8,4228713,4228494,<pre><code>int pivot=a[left]; while(i<=j){ whi...,,2010-11-19,0,
9,4228742,4228494,<p>The program gets stuck in the call to <code...,,2010-11-19,5,


In [87]:
%%time
x = "How do I calculate someone's age in C#?"
x2 = "Given a representing a person's birthday how do I calculate their age in years in actionscript"
print(dm.get_tags(x2))

['sql', 'actionscript3-flex-flash', 'other']
CPU times: user 2.34 ms, sys: 179 µs, total: 2.51 ms
Wall time: 1.96 ms


In [15]:
# # post_ids = [7675136, 982129, 2842866, 4228494, 1733705]
post_ids = [7675136, 2842866, 4228494, 982129, 1733705]
neighbor_length = get_neighbor_inputs(post_ids)
df_parent = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
                                         connection, params=tuple(post_ids))
df_parent

NameError: name 'get_neighbor_inputs' is not defined

In [130]:
knns = [7675136, 982129, 2842866, 4228494, 1733705]

neighbor_length = '?,' * len(knns)
df = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
                                 connection, params=tuple(knns))


df.head()

Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,982129,,<p>I saw an <a href='https://stackoverflow.com...,What does __sync_synchronize do?,2009-06-11,17,c++ c linux
1,1733705,,<p>ruby code</p> <pre><code>irb(main):001:0> a...,about NSMutabuleArray,2009-11-14,0,objective-c ruby
2,2842866,,"<p>Say,the trailing <code>++</code> has no act...",l+l++ is the same as l+l?,2010-05-16,0,c++ syntax
3,4228494,,<p>this code does not show me any output</p> <...,randomized_quicksort,2010-11-19,-6,c++ algorithm
4,7675136,,<p>How can I calculate the execution time in t...,How to calculate the execution time in C?,2011-10-06,0,c timing


In [145]:
score_df = df[df.score > 0]
for i in score_df.comment.values:
    print(i)

<p>It forces a <a href='http://en.wikipedia.org/wiki/Memory_fence' rel='nofollow noreferrer'>memory fence</a> I guess.</p>
<p>It is a atomic builtin for <a href='http://gcc.gnu.org/onlinedocs/gcc-4.6.2/gcc/Atomic-Builtins.html' rel='noreferrer'>full memory barrier</a>.</p> <blockquote> <p>No memory operand will be moved across the operation, either forward or backward. Further, instructions will be issued as necessary to prevent the processor from speculating loads across the operation and from queuing stores after the operation.</p> </blockquote> <p>Check details on the link above.</p>
<p>You use a NSSet to ensure uniqueness. <code>setWithArray</code> receives an array containing the objects to add to the new set. If the same object appears more than once in <em>anArray</em>, it is added only once to the returned set:</p> <pre><code>NSArray *arr = [[NSSet setWithArray: [NSArray arrayWithObjects: @'a', @'b', @'b', @'a', @'b', nil]] allObjects]; //If you want to obtain a mutable array: 

In [132]:
%%time
dm.get_similar('Compressing / Decompressing Folders & Files', load=True, save=False, vect=vect)

get vects


Check failed: input


0:00:00.292297
create index


RuntimeError: Check failed: Cannot open file 'models/knn_embeddings_path/c#.bin' for reading

In [81]:
%%time
dm.get_similar('How do I calculate someones age in C', 'c', load=True, save=False, vect=vect)

In [87]:
results[0]

[25642565, 50661263, 44366395, 50656792, 50646159]

In [None]:
# from torch.nn import CosineSimilarity
# import torch 
# # need to change / test for NOT CUDA 
# def get_similar2(self, question, tag, question_to_vec=avg_word_vectors, topk=5, return_score=False, *args, **kwargs):

#     thread_ids, thread_vectors = self.__get_embeddings_by_tag(tag)
#     cos = CosineSimilarity(dim=1)
#     question2vec = question_to_vec(question, self.word_embeddings, self.dim, *args, **kwargs)
#     output = cos(torch.Tensor(question2vec.reshape(1, -1)).cuda(), torch.Tensor(thread_vectors).cuda())
#     output = output.cpu().numpy()
#     data = [(i, thread_ids[i]) for i in range(len(output))] 
#     if return_score:
#         output = [(x, score) for score, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
#     else:
#         output = [x for _, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
#     if topk: return output[:topk]
#     else: return output

In [4]:
%%time
dm.get_similar('How do I calculate someones age in C', 'c', load=False, save=True, vect=vect, return_dist=True)

get vects
0:00:02.558332
create index
0:04:49.197026
question creation
query
0:00:00.001622
CPU times: user 30min 31s, sys: 24.7 s, total: 30min 55s
Wall time: 4min 52s


([5157079, 50660067, 50657512, 50661696, 50656291],
 array([0.8260471 , 0.89971733, 0.94661146, 0.97068864, 0.97275424],
       dtype=float32))

In [7]:
parent_comment_map = pickle.load((DATA_PATH/'parent_comment_map.pkl').open('rb'))

In [124]:
parent_ids = [25642565, 50661263, 2551706, 50656792, 50646159]
all_comment_ids = [j for i in results for j in parent_comment_map[i]]
all_comment_ids

[25807038, 50661315, 2551741, 2551760, 2551787, 2551789, 50658829, 50646212]

In [69]:
l = '?,'*len(parent_comment_map[2551706])
l[:-1]

'?,?,?,?'

In [127]:
import sqlite3
DB_NAME = 'StackOverflow.db'
connection = sqlite3.connect(DB_NAME)
c = connection.cursor()

parent_ids = [25642565, 50661263, 2551706, 50656792, 50646159]
all_comment_ids = [j for i in results for j in parent_comment_map[i]]
# all_comment_ids


neighbor_length = '?,' * len(all_comment_ids)
print(neighbor_length)
df = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
                                 connection, params=tuple(all_comment_ids))




?,?,?,?,?,?,?,?,


In [128]:
df

Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,2551741,2551706,<p>There are some good articles about that her...,,2010-03-31,0,
1,2551760,2551706,<p>The best way to handle this scenario would ...,,2010-03-31,0,
2,2551787,2551706,<p>You have following to choose from:</p> <ol>...,,2010-03-31,0,
3,2551789,2551706,<p>Something like below (the first part just w...,,2010-03-31,1,
4,25807038,25642565,<p>I specified parameter for 'file'</p> <pre><...,,2014-09-12,0,
5,50646212,50646159,<p>Why not wrap your task in a function and ca...,,2018-06-01,3,
6,50658829,50656792,"<p>Macros are expanded exactly once, when the ...",,2018-06-02,0,
7,50661315,50661263,"<p>This is a definition of a global variable, ...",,2018-06-02,3,


In [5]:
x = 25642565

df = pd.read_sql("SELECT * FROM posts WHERE comment_id = 5157079;", 
                                 connection)

In [6]:
df

Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,5157079,,<p>I want to deserialize a JSON string which d...,Overlay data from JSON string to existing obje...,2011-03-01,35,c# .net json serialization
