In [None]:
#TODO 
# - Need to keep http links 
# - Need to weight best answers based on distance, number of matching words, Levenshtein distance, 

In [1]:
import os 
import numpy as np
import pandas as pd
from pathlib import Path
import csv
import pickle
import nmslib
import annoy
from datetime import datetime
from collections import defaultdict
from IPython.display import FileLink
import sqlite3
DB_NAME = 'StackOverflow_python.db'


connection = sqlite3.connect(DB_NAME)
c = connection.cursor()


DATA_PATH = Path('data/')
MODEL_PATH = Path('models/')
FLASK_PATH = Path('ui')
FLASK_PATH.mkdir(exist_ok=True)

In [2]:
# utils file 

import nltk
import pickle
import re
import numpy as np
import html

from nltk.corpus import stopwords
stopwords_set = set(stopwords.words('english'))


def avg_word_vectors(question, embeddings, dim):
    words_embedding = [embeddings[word] for word in question.lower().split() if word in embeddings]
    if not words_embedding:
        return np.zeros(dim)
    words_embedding = np.array(words_embedding).astype(np.float32)
    return words_embedding.mean(axis=0)

def average_tfidf_vectors(question, embeddings, dim, vect, idf_scores):
    # get idf weights
    split_question = [word for word in question.lower().split() if word in embeddings]
    if not split_question:
        return np.zeros(dim).astype(np.float32)
    words_embedding = np.zeros((dim, len(split_question))).astype(np.float32)
    for i, token in enumerate(split_question):
        if token in embeddings:
            embed_score = embeddings[token]
        else: embed_score = 0
        idf_score = idf_scores[token]
        # word vectors multiply by their TF-IDF scores
        words_embedding[:, i] = embed_score * idf_score    
    return words_embedding.mean(axis=1)


def get_embeddings(filename):
    embeddings = {}
    with open(MODEL_PATH/filename, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        embed_list = list(reader)
    for line in embed_list:
        embeddings[line[0]] = np.asarray(line[1:], dtype=np.float32)
        
    dim = len(embeddings['code'])
    return embeddings, dim


def unpickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)



re1 = re.compile(r'  +')
# def clean_title(text, remove_html=False, other=False):
def clean_text(text, remove_html=False, other=False):
    
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile('[^0-9a-z #+_]')
    
    if remove_html:
        x = re.sub(r'<code>[^>]*</code>', '', text)
        x = re.sub(r'<[^>]*>', '', x)
        x = re.sub(r'[^A-Za-z0-9]', ' ', x)
    text = text.lower()
    text = text.replace('π', 'pi').replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
            'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
            '<br />', "\n").replace('\\"', '').replace('<unk>','u_n').replace(' @.@ ','.').replace(
            ' @-@ ','-').replace('\\', ' \\ ').replace('"',"'").replace('\n', ' ').replace('\r', ' ')
    text = replace_by_space_re.sub(' ', text)
    text = bad_symbols_re.sub('', text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])
    return re1.sub(' ', html.unescape(text).strip())


# def get_top_preds(X, clf, k=3):
#     preds = clf.predict_proba(X)
#     return np.argsort(preds)[:,::-1][0][:3]

def get_top_preds(X, clf, class_map, k=3, cutoff=0.8):
        preds = clf.predict_proba(X)
        sorted_preds = np.argsort(preds)[:,::-1][0][:3]
        top_scores = [preds[0][i] for i in sorted_preds]
        if top_scores[0] > cutoff:
            # if top_scores above threshold only need to return top 
            return [class_map[sorted_preds[0]]]
        else:
            return [class_map[p] for p in sorted_preds]

        
def merge_parent_comment(df_parent, df_comments, results, distances):
    distance_df = pd.DataFrame.from_dict(dict(zip(results, distances)), orient='index')
    distance_df.columns = ['distances']
    parent_len = len(df_parent)
    df_parent = df_parent.merge(distance_df, how='left', left_on='comment_id', right_index=True)
    assert len(df_parent) == parent_len 
    
    final_df = df_parent[['comment_id', 'comment', 'title', 'score', 'distances']].merge(
            df_comments[['parent_id', 'comment', 'score']], how='left', left_on='comment_id', right_on='parent_id', 
            suffixes=('_parent', '_comment')).dropna(subset=['comment_comment']).sort_values('distances')
    return final_df.reset_index(drop=True)


from sklearn.metrics.pairwise import linear_kernel

def tfidf_cosine_similarities(X, question, topk=5):
    tfid_vectorizer = unpickle(MODEL_PATH/'tf_idf_python_title_stopwords.pkl')

    tfidf_X = tfid_vectorizer.transform(X)
    tfidf_question = tfid_vectorizer.transform([question])

    cosine_similarities = linear_kernel(tfidf_question, tfidf_X).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-topk:-1]
    print(related_docs_indices)
    return final_df.loc[related_docs_indices]


# -------------------------------------------------------------------------------------------------------------------



# with open(MODEL_PATH/'tf_idf.pkl', mode='rb') as f:
#     vect = pickle.load(f)
    
# idf_scores = defaultdict(lambda:0, zip(vect.get_feature_names(), vect.idf_))

# parent_comment_map = pickle.load((DATA_PATH/'parent_comment_map.pkl').open('rb'))


class DialogueManager(object):
    def __init__(self, data_path, model_path):
        self.model_path = model_path
        self.data_path = data_path
        self.thread_embeddings_path = model_path/'thread_embeddings_by_tags'
        self.knn_path = model_path/'knn_embeddings_path'
        self.word_embeddings, self.dim = get_embeddings('starspace_embedding100_ngram2.tsv')
        self.parent_comment_map = unpickle(data_path/'parent_comment_map.pkl')
        
#         self.tag_classifier = unpickle(model_path/'LR_tag_classifier_all.pkl')
#         self.class_map = unpickle(data_path/'class_map.pkl')
        
        self.tfid_vectorizer = unpickle(model_path/'tf_idf_python_title_stopwords.pkl')
        self.idf_scores = defaultdict(lambda:0, zip(self.tfid_vectorizer.get_feature_names(), 
                                                    self.tfid_vectorizer.idf_))
        
        self.parent_comment_map = unpickle(data_path/'parent_comment_map.pkl')
        
        
    def __get_embeddings_by_tag(self, tag):
        
#         embeddings_files = [self.thread_embeddings_path/tag for tag in tags]
#         for file in embeddings_files:
#             ids, vectors = unpickle(embeddings_files)
        
        # tfidf-avg
        tag_path = 'python_only.pkl'
        embeddings_file = self.thread_embeddings_path/tag_path
        ids, vectors = unpickle(embeddings_file)
        return ids, vectors
    
    def __create_nmslib_index(self, a, space, load=True, filepath=None, save=False):
        M = 25
        efC = 100
        
        num_threads = 4
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 
                             'post': 0, 'skip_optimized_index':1}
        query_time_params = {'efSearch': efC}

        index = nmslib.init(space=space)
        if load:
            # only need to init if indexed is saved
            index.loadIndex(filepath)
#             index.setQueryTimeParams(query_time_params)
            return index
        else:
            index.addDataPointBatch(a)
            index.createIndex()
#             index.setQueryTimeParams(query_time_params)
            if save: index.saveIndex(filepath)
            return index
        
    def __create_annoy_index(self, data, space='angular', n_trees=30, load=True, filepath=None, save=False):
                
        index = annoy.AnnoyIndex(self.dim, metric=space)
        if load:
            # only need to init if indexed is saved
            index.load(filepath)
        else:
            for i, vect in enumerate(data):
                index.add_item(i, vect)
            index.build(n_trees)            
            if save: index.save(filepath)
        return index

    
    def get_similar(self, question, question_to_vec=average_tfidf_vectors, topk=5, space='angular', 
                    load=True, save=False, return_dist=True, *args, **kwargs):
        
#         tags = self.get_tags(question)
        # need to update for multiple tags
#         tag = tags[0]
        print('get vects')
        start = datetime.now()
#         tag
        thread_ids, thread_vectors = self.__get_embeddings_by_tag(tag=None)
#         candidate2vecs = np.load(MODEL_PATH/'candidate2vecs_python_title.npy')
        print(datetime.now() - start)
        print('create index')
        start = datetime.now()
#         tag_path = tag + '.bin'
#         tag_path = 'python_only.bin'
        tag_path = 'python_only.annoy'
#         index = self.__create_nmslib_index(thread_vectors, space=space, load=load, 
#                                            filepath=str(self.knn_path/tag_path), save=save)
        index = self.__create_annoy_index(thread_vectors, space=space, load=load, 
                                           filepath=str(self.knn_path/tag_path), save=save)
        print(datetime.now() - start)
        print('question creation')

        question2vec = question_to_vec(question, self.word_embeddings, self.dim, vect=self.tfid_vectorizer, 
                                        idf_scores=self.idf_scores, *args, **kwargs)
        print('query')
        start = datetime.now()
#         idxs, distances = index.knnQuery([question2vec], k=topk)
        idxs, distances = index.get_nns_by_vector(question2vec, n=topk, include_distances=return_dist)
        print(datetime.now() - start)
        output = [thread_ids[i] for i in idxs]
        if return_dist: output = output, distances
        return output
    
    
    def get_comments(self, post_ids):
        # need to weight by distance 
        # need to get for multiple tags, combine and weight by distance
        df_parent = self.get_df(np.array(post_ids).flatten().tolist())

        knns = [j for i in post_ids if i in self.parent_comment_map for j in self.parent_comment_map[i]]
        df_comments = self.get_df(knns)

        return df_parent, df_comments 
    
    def get_tags(self, question, k=3): 
        cleaned_question = clean_text(question)
        features = self.tfid_vectorizer.transform([cleaned_question])
        preds = get_top_preds(features, self.tag_classifier, self.class_map, k)
        return preds
    
    def clean_output(self, df):
        # get rid of bad (negative / 0 scores)
        # only show good comments (based on distance?)
        # clean html
        # print in cool format
        pass
    
    def merge_parent_comment(self, df_parent, df_comments, results, distances):
        distance_df = pd.DataFrame.from_dict(dict(zip(results, distances)), orient='index')
        distance_df.columns = ['distances']
        parent_len = len(df_parent)
        df_parent = df_parent.merge(distance_df, how='left', left_on='comment_id', right_index=True)
        assert len(df_parent) == parent_len 

        final_df = df_parent[['comment_id', 'comment', 'title', 'score', 'distances']].merge(
                df_comments[['parent_id', 'comment', 'score']], how='left', left_on='comment_id', right_on='parent_id', 
                suffixes=('_parent', '_comment')).dropna(subset=['comment_comment']).sort_values('distances')
        return final_df.reset_index(drop=True)

    def get_df(self, ids):
        neighbor_length = '?,' * len(ids)
        df = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
                                         connection, params=tuple(ids))
        return df
    
    

In [4]:
%%time
dm = DialogueManager(DATA_PATH, MODEL_PATH)

CPU times: user 17.2 s, sys: 1.63 s, total: 18.8 s
Wall time: 18.8 s


In [5]:
question = 'How do you implement logistic regression in python?'
clean_question = clean_text(question)

results, distances = dm.get_similar(clean_question, topk=100, load=True, save=False, return_dist=True)
print(list(zip(results, distances)))
df_parent, df_comments = dm.get_comments(results)
final_df = dm.merge_parent_comment(df_parent, df_comments, results, distances)
final_df.head()

get vects
0:00:00.495709
create index
0:00:00.899871
question creation
query
0:00:00.001123
[(47278604, 0.1676356941461563), (37391923, 0.3394250273704529), (38621685, 0.3394250273704529), (40223553, 0.3394250273704529), (40865717, 0.3394250273704529), (46349963, 0.3394250273704529), (48962697, 0.3394250273704529), (20328835, 0.34385648369789124), (14262755, 0.3649623394012451), (40251587, 0.3673402667045593), (49135107, 0.3728841543197632), (46999321, 0.4175085425376892), (33616265, 0.41781094670295715), (40070064, 0.42642542719841003), (45590515, 0.42642542719841003), (45705490, 0.4515378773212433), (47728309, 0.45285671949386597), (30499018, 0.464110791683197), (41926679, 0.46783876419067383), (43426454, 0.4716132879257202), (40051478, 0.4758162498474121), (24154466, 0.4758162498474121), (46588190, 0.4840378761291504), (13794754, 0.487010657787323), (37031276, 0.49025967717170715), (34302840, 0.49104371666908264), (50283653, 0.4915216863155365), (24935415, 0.49579480290412903), (171

Unnamed: 0,comment_id,comment_parent,title,score_parent,distances,parent_id,comment_comment,score_comment
0,47278604,<p>I am applying multiple ML algorithm to this...,Implement Logistic Regression,0,0.167636,47278604.0,<p><code>predict</code> returns log-odds for a...,4.0
1,48962697,<p>I created a simple Logistic Regression mode...,Chainer - Python - Logistic Regression,0,0.339425,48962697.0,<p>At first what is your main question? The be...,1.0
2,46349963,<p>I'm trying to build a Logistic regression m...,Logistic Regression in Python,0,0.339425,46349963.0,<p>You can just use a linear function as activ...,0.0
3,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,"<p>I think it will be more easily, when you po...",0.0
4,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,<p>The Predict method as mentioned in the scik...,0.0


In [195]:
distance_df = pd.DataFrame.from_dict(dict(zip(results, distances)), orient='index')
distance_df.columns = ['distances']
distance_df

Unnamed: 0,distances
47278604,0.167636
37391923,0.339425
38621685,0.339425
40223553,0.339425
40865717,0.339425
46349963,0.339425
48962697,0.339425
20328835,0.343856
14262755,0.364962
40251587,0.367340


In [196]:
def merge_parent_comment(df_parent, df_comments, results, distances):
    distance_df = pd.DataFrame.from_dict(dict(zip(results, distances)), orient='index')
    distance_df.columns = ['distances']
    parent_len = len(df_parent)
    df_parent = df_parent.merge(distance_df, how='left', left_on='comment_id', right_index=True)
    assert len(df_parent) == parent_len 
    
    final_df = df_parent[['comment_id', 'comment', 'title', 'score', 'distances']].merge(
            df_comments[['parent_id', 'comment', 'score']], how='left', left_on='comment_id', right_on='parent_id', 
            suffixes=('_parent', '_comment')).dropna(subset=['comment_comment']).sort_values('distances')
    return final_df.reset_index(drop=True)

final_df = merge_parent_comment(df_parent, df_comments, results, distances)
print(final_df.shape)
final_df.head()

(114, 8)


Unnamed: 0,comment_id,comment_parent,title,score_parent,distances,parent_id,comment_comment,score_comment
0,47278604,<p>I am applying multiple ML algorithm to this...,Implement Logistic Regression,0,0.167636,47278604.0,<p><code>predict</code> returns log-odds for a...,4.0
1,48962697,<p>I created a simple Logistic Regression mode...,Chainer - Python - Logistic Regression,0,0.339425,48962697.0,<p>At first what is your main question? The be...,1.0
2,46349963,<p>I'm trying to build a Logistic regression m...,Logistic Regression in Python,0,0.339425,46349963.0,<p>You can just use a linear function as activ...,0.0
3,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,"<p>I think it will be more easily, when you po...",0.0
4,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,<p>The Predict method as mentioned in the scik...,0.0


In [156]:
from sklearn.metrics.pairwise import linear_kernel

def tfidf_cosine_similarities(X, question, topk=5):
    tfid_vectorizer = unpickle(MODEL_PATH/'tf_idf_python_title_stopwords.pkl')

    tfidf_X = tfid_vectorizer.transform(X)
    tfidf_question = tfid_vectorizer.transform([question])

    cosine_similarities = linear_kernel(tfidf_question, tfidf_X).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-topk:-1]
    print(related_docs_indices)
    return final_df.loc[related_docs_indices]


tfidf_cosine_similarities(final_df['comment_parent'], question)

[38 87 86 19]


Unnamed: 0,comment_id,comment_parent,title,score_parent,distances,parent_id,comment_comment,score_comment
38,33002621,<p>I am compiling the framework for a logistic...,Logistic Regression Framework,0,0.519785,33002621.0,<p>Most of your questions can be addressed by ...,1.0
87,49860809,<h2>I used logistic regression with python and...,how do I find the actual logistic regression m...,1,0.578273,49860809.0,<p>I'm assuming you're using SkLearn. But what...,0.0
86,49860809,<h2>I used logistic regression with python and...,how do I find the actual logistic regression m...,1,0.578273,49860809.0,<p>The <code>model</code> object has an attrib...,2.0
19,30499018,<p>According to what I have understood linear ...,Difference between linear and logistic regress...,4,0.464111,30499018.0,<p>There is a strict link between linear regre...,6.0


In [201]:
from fuzzyset import FuzzySet

# Levenshtein distance
    
def num_matching_words(query, df):
    f = FuzzySet()
    for title, comment_parent, comment_comment in zip(df['title'], df['comment_parent'], df['comment_comment']):
        f.add(title + ' ' + comment_parent + ' ' + comment_comment)
#         f.add(clean_text(comment_parent))
    return f.use_levenshtein(query)
    

print(question)
print(num_matching_words(question, final_df))

How do you implement logistic regression in python?


TypeError: 'bool' object is not callable

In [157]:
final_df

Unnamed: 0,comment_id,comment_parent,title,score_parent,distances,parent_id,comment_comment,score_comment
0,47278604,<p>I am applying multiple ML algorithm to this...,Implement Logistic Regression,0,0.167636,47278604.0,<p><code>predict</code> returns log-odds for a...,4.0
1,48962697,<p>I created a simple Logistic Regression mode...,Chainer - Python - Logistic Regression,0,0.339425,48962697.0,<p>At first what is your main question? The be...,1.0
2,46349963,<p>I'm trying to build a Logistic regression m...,Logistic Regression in Python,0,0.339425,46349963.0,<p>You can just use a linear function as activ...,0.0
3,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,"<p>I think it will be more easily, when you po...",0.0
4,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,<p>The Predict method as mentioned in the scik...,0.0
5,40865717,<p>I have been at this for a couple of hours a...,Python Logistic Regression,0,0.339425,40865717.0,"<p>First of all, double-check if your problem ...",1.0
6,38621685,<p>I have been trying to implement logistic re...,Logistic Regression Python,1,0.339425,38621685.0,<p>You are not configuring the C parameter - w...,0.0
7,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,<p>If I am not mistaken the 'u' is just notati...,0.0
8,20328835,<p>I want to implement Logisitic regression fr...,Logistic regression using python,1,0.343856,20328835.0,"<p>In order to implement Logistic Regression, ...",1.0
9,20328835,<p>I want to implement Logisitic regression fr...,Logistic regression using python,1,0.343856,20328835.0,<p>You may want to try to translate your octav...,2.0


In [197]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA


def tfidf_cosine_similarities(df, col, question, topk=5):
#     tfid_vectorizer = unpickle(MODEL_PATH/'tf_idf_python_title_stopwords.pkl')
    tfidf_X = tfid_vectorizer.transform(df[col])
    tfidf_question = tfid_vectorizer.transform([question])

    cosine_similarities = linear_kernel(tfidf_question, tfidf_X).flatten()
    df['tfidf_' + col] = cosine_similarities
#     related_docs_indices = cosine_similarities.argsort()[:-topk:-1]
#     return df.loc[related_docs_indices]
    return df



def symbolic_ngrams(df, col, question, pca_dim=50, topk=5):
    vect = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=10)
    X = vect.fit_transform(df[col]).todense()
    pca = PCA(pca_dim)
    pca_X = pca.fit_transform(X)
    X_test = pca.transform(vect.transform([question]).todense())
    cosine_similarities = linear_kernel(tfidf_question, tfidf_X).flatten()
#     related_docs_indices = cosine_similarities.argsort()[:-topk:-1]
    df['ngram_' + col] = cosine_similarities
#     return df.loc[related_docs_indices]
    return df

#     return df.loc[linear_kernel(X_test, pca_X).flatten().argsort()[:-topk:-1]]


def tfidf_symbolic_ngrams_cosine_similarities(df, col, question, pca_dim=50, topk=5):
    
    # tfid 
    tfid_vectorizer = unpickle(MODEL_PATH/'tf_idf_python_title_stopwords.pkl')
    tfid_vectorizer = TfidfVectorizer(token_pattern='(\S+)', min_df=3, max_df=0.9, ngram_range=(1,1))
    tfidf_X = tfid_vectorizer.fit_transform(df[col]).todense()
    
    # symbolic n-grams (1 - 5)
    vect = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=3, max_df=0.9)
    count_X = vect.fit_transform(df[col]).todense()
    X = np.concatenate((tfidf_X, count_X), axis=1)
    
    # PCA 
    pca = PCA(pca_dim)
    pca_X = pca.fit_transform(X)
    
    # question
    tfidf_question = tfid_vectorizer.transform([question]).todense()
    count_question = vect.transform([question]).todense()
    transformed_question = np.concatenate((tfidf_question, count_question), axis=1)
    transformed_question = pca.transform(transformed_question)
    
    # cosine similarity 
    cosine_similarities = linear_kernel(transformed_question, pca_X).flatten()
#     print(cosine_similarities)?
    df['tfidf_ngram_combo_' + col] = cosine_similarities
#     related_docs_indices = cosine_similarities.argsort()[:-topk:-1]
#     return df.loc[related_docs_indices]
    return df
    
for col in ['comment_comment', 'title', 'comment_parent']:
    final_df = tfidf_cosine_similarities(final_df, col, question, topk=100)
    final_df = symbolic_ngrams(final_df, col, question, topk=100)
    final_df = tfidf_symbolic_ngrams_cosine_similarities(final_df, col, question, topk=100)
final_df

Unnamed: 0,comment_id,comment_parent,title,score_parent,distances,parent_id,comment_comment,score_comment,tfidf_comment_comment,ngram_comment_comment,tfidf_ngram_combo_comment_comment,tfidf_title,ngram_title,tfidf_ngram_combo_title,tfidf_comment_parent,ngram_comment_parent,tfidf_ngram_combo_comment_parent
0,47278604,<p>I am applying multiple ML algorithm to this...,Implement Logistic Regression,0,0.167636,47278604.0,<p><code>predict</code> returns log-odds for a...,4.0,0.279356,0.248157,2329.829127,1.000000,0.248157,20.178589,0.248157,0.248157,7932.416314
1,48962697,<p>I created a simple Logistic Regression mode...,Chainer - Python - Logistic Regression,0,0.339425,48962697.0,<p>At first what is your main question? The be...,1.0,0.000000,0.100467,-2951.342950,0.575795,0.100467,8.008265,0.100467,0.100467,7249.518118
2,46349963,<p>I'm trying to build a Logistic regression m...,Logistic Regression in Python,0,0.339425,46349963.0,<p>You can just use a linear function as activ...,0.0,0.000000,0.074215,3678.065843,0.829932,0.074215,24.062918,0.074215,0.074215,2559.153092
3,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,"<p>I think it will be more easily, when you po...",0.0,0.000000,0.221597,2892.922842,0.829932,0.221597,24.062918,0.221597,0.221597,7027.858467
4,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,<p>The Predict method as mentioned in the scik...,0.0,0.000000,0.221597,1949.986364,0.829932,0.221597,24.062918,0.221597,0.221597,7027.858467
5,40865717,<p>I have been at this for a couple of hours a...,Python Logistic Regression,0,0.339425,40865717.0,"<p>First of all, double-check if your problem ...",1.0,0.058089,0.000000,-995.975860,0.829932,0.000000,6.437111,0.000000,0.000000,-50960.543671
6,38621685,<p>I have been trying to implement logistic re...,Logistic Regression Python,1,0.339425,38621685.0,<p>You are not configuring the C parameter - w...,0.0,0.000000,0.161612,1714.893022,0.829932,0.161612,13.066688,0.161612,0.161612,1144.946519
7,37391923,<p>I am currently doing the Logistic Regressio...,Logistic Regression in python,0,0.339425,37391923.0,<p>If I am not mistaken the 'u' is just notati...,0.0,0.000000,0.221597,1271.823543,0.829932,0.221597,24.062918,0.221597,0.221597,7027.858467
8,20328835,<p>I want to implement Logisitic regression fr...,Logistic regression using python,1,0.343856,20328835.0,"<p>In order to implement Logistic Regression, ...",1.0,0.368484,0.229025,115.464275,0.793694,0.229025,5.344199,0.229025,0.229025,9691.134640
9,20328835,<p>I want to implement Logisitic regression fr...,Logistic regression using python,1,0.343856,20328835.0,<p>You may want to try to translate your octav...,2.0,0.166114,0.229025,2532.404389,0.793694,0.229025,5.344199,0.229025,0.229025,9691.134640


In [180]:
symbolic_ngrams(final_df, 'comment_comment', question, topk=10)

Unnamed: 0,comment_id,comment_parent,title,score_parent,distances,parent_id,comment_comment,score_comment
22,40051478,<p>Here is the line of code. I know the issue ...,Simple Logistic Regression Error in Python,0,0.475816,40051478.0,<pre><code>data[col_name].values.reshape(len(d...,0.0
88,36760000,<p>I have a test dataset and train dataset as ...,Python : How to use Multinomial Logistic Regre...,5,0.581021,36760000.0,<p>You could try </p> <pre><code>LogisticRegre...,2.0
2,46349963,<p>I'm trying to build a Logistic regression m...,Logistic Regression in Python,0,0.339425,46349963.0,<p>You can just use a linear function as activ...,0.0
39,34033189,"<p>Good night, community!</p> <p>I have a simp...",sklearn Python and Logistic regression,1,0.52076,34033189.0,<p>if <code>model</code> is your <code>sklearn...,1.0
92,47845610,<p>I have the following variables:</p> <pre><c...,Regression using Python,0,0.582225,47845610.0,<p>The <code>return</code> statement should be...,0.0
42,12146914,<p>When we have to predict the value of a <a h...,What is the difference between linear regressi...,117,0.522536,12146914.0,<p>In short: Linear Regression gives continuou...,0.0
91,47845610,<p>I have the following variables:</p> <pre><c...,Regression using Python,0,0.582225,47845610.0,<p>At the start of your line</p> <pre><code>n ...,0.0
86,49860809,<h2>I used logistic regression with python and...,how do I find the actual logistic regression m...,1,0.578273,49860809.0,<p>The <code>model</code> object has an attrib...,2.0
69,45157944,<p>I'm following this <a href='https://www.ten...,coefficients of logistic regression model in t...,1,0.556687,45157944.0,<p>You can use the two methods: <code>get_vari...,1.0


In [47]:
final_df.to_csv(FLASK_PATH/'final_df.csv', index=False)

In [48]:
from IPython.display import FileLink

# FileLink(str(FLASK_PATH/'df_parent.csv'))
FileLink(str(FLASK_PATH/'final_df.csv'))

In [8]:
df_parent.to_csv(FLASK_PATH/'df_parent.csv', index=False)
df_comments.to_csv(FLASK_PATH/'df_comments.csv', index=False)

In [48]:
post_ids = [40251587, 49135107, 40223553]

knns = [j for i in post_ids if i in dm.parent_comment_map for j in dm.parent_comment_map[i]]
knns

[41124800, 49137743]

In [28]:
# def get_df(ids):
#         neighbor_length = '?,' * len(ids)
#         df = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
#                                          connection, params=tuple(ids))
#         return df
    
    
# df_parent = get_df(results)

def get_df(ids):
        neighbor_length = '?,' * len(ids)
        df = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
                                         connection, params=tuple(ids))
        return df
    
    

most_similar = get_df(np.array(results).flatten().tolist())
most_similar


Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,36139,,<p>What is the best way of creating an alphabe...,How to sort a list of strings?,2008-08-30,321,python string sorting
1,6198836,,<p>How would I sort this ? </p> <pre><code>>>>...,Python sorting a list of strings,2011-06-01,1,python sorting
2,11620088,,<blockquote> <p><strong>Possible Duplicate:</s...,Sorting a list of strings in Python,2012-07-23,2,python string sorting
3,36698259,,<p>I would like to sort this list:</p> <p><cod...,Python - How to sort a list of strings,2016-04-18,0,python sorting python-3.x
4,48408866,,<p>I have the following list of lists of strin...,How to sort a list of strings in Python?,2018-01-23,-3,python string list sorting


In [130]:
knns = [7675136, 982129, 2842866, 4228494, 1733705]

neighbor_length = '?,' * len(knns)
df = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
                                 connection, params=tuple(knns))


df.head()

Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,982129,,<p>I saw an <a href='https://stackoverflow.com...,What does __sync_synchronize do?,2009-06-11,17,c++ c linux
1,1733705,,<p>ruby code</p> <pre><code>irb(main):001:0> a...,about NSMutabuleArray,2009-11-14,0,objective-c ruby
2,2842866,,"<p>Say,the trailing <code>++</code> has no act...",l+l++ is the same as l+l?,2010-05-16,0,c++ syntax
3,4228494,,<p>this code does not show me any output</p> <...,randomized_quicksort,2010-11-19,-6,c++ algorithm
4,7675136,,<p>How can I calculate the execution time in t...,How to calculate the execution time in C?,2011-10-06,0,c timing


In [145]:
score_df = df[df.score > 0]
for i in score_df.comment.values:
    print(i)

<p>It forces a <a href='http://en.wikipedia.org/wiki/Memory_fence' rel='nofollow noreferrer'>memory fence</a> I guess.</p>
<p>It is a atomic builtin for <a href='http://gcc.gnu.org/onlinedocs/gcc-4.6.2/gcc/Atomic-Builtins.html' rel='noreferrer'>full memory barrier</a>.</p> <blockquote> <p>No memory operand will be moved across the operation, either forward or backward. Further, instructions will be issued as necessary to prevent the processor from speculating loads across the operation and from queuing stores after the operation.</p> </blockquote> <p>Check details on the link above.</p>
<p>You use a NSSet to ensure uniqueness. <code>setWithArray</code> receives an array containing the objects to add to the new set. If the same object appears more than once in <em>anArray</em>, it is added only once to the returned set:</p> <pre><code>NSArray *arr = [[NSSet setWithArray: [NSArray arrayWithObjects: @'a', @'b', @'b', @'a', @'b', nil]] allObjects]; //If you want to obtain a mutable array: 

In [132]:
%%time
dm.get_similar('Compressing / Decompressing Folders & Files', load=True, save=False, vect=vect)

get vects


Check failed: input


0:00:00.292297
create index


RuntimeError: Check failed: Cannot open file 'models/knn_embeddings_path/c#.bin' for reading

In [81]:
%%time
dm.get_similar('How do I calculate someones age in C', 'c', load=True, save=False, vect=vect)

In [87]:
results[0]

[25642565, 50661263, 44366395, 50656792, 50646159]

In [None]:
# from torch.nn import CosineSimilarity
# import torch 
# # need to change / test for NOT CUDA 
# def get_similar2(self, question, tag, question_to_vec=avg_word_vectors, topk=5, return_score=False, *args, **kwargs):

#     thread_ids, thread_vectors = self.__get_embeddings_by_tag(tag)
#     cos = CosineSimilarity(dim=1)
#     question2vec = question_to_vec(question, self.word_embeddings, self.dim, *args, **kwargs)
#     output = cos(torch.Tensor(question2vec.reshape(1, -1)).cuda(), torch.Tensor(thread_vectors).cuda())
#     output = output.cpu().numpy()
#     data = [(i, thread_ids[i]) for i in range(len(output))] 
#     if return_score:
#         output = [(x, score) for score, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
#     else:
#         output = [x for _, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
#     if topk: return output[:topk]
#     else: return output

In [4]:
%%time
dm.get_similar('How do I calculate someones age in C', 'c', load=False, save=True, vect=vect, return_dist=True)

get vects
0:00:02.558332
create index
0:04:49.197026
question creation
query
0:00:00.001622
CPU times: user 30min 31s, sys: 24.7 s, total: 30min 55s
Wall time: 4min 52s


([5157079, 50660067, 50657512, 50661696, 50656291],
 array([0.8260471 , 0.89971733, 0.94661146, 0.97068864, 0.97275424],
       dtype=float32))

In [7]:
parent_comment_map = pickle.load((DATA_PATH/'parent_comment_map.pkl').open('rb'))

In [124]:
parent_ids = [25642565, 50661263, 2551706, 50656792, 50646159]
all_comment_ids = [j for i in results for j in parent_comment_map[i]]
all_comment_ids

[25807038, 50661315, 2551741, 2551760, 2551787, 2551789, 50658829, 50646212]

In [69]:
l = '?,'*len(parent_comment_map[2551706])
l[:-1]

'?,?,?,?'

In [127]:
import sqlite3
DB_NAME = 'StackOverflow.db'
connection = sqlite3.connect(DB_NAME)
c = connection.cursor()

parent_ids = [25642565, 50661263, 2551706, 50656792, 50646159]
all_comment_ids = [j for i in results for j in parent_comment_map[i]]
# all_comment_ids


neighbor_length = '?,' * len(all_comment_ids)
print(neighbor_length)
df = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
                                 connection, params=tuple(all_comment_ids))




?,?,?,?,?,?,?,?,


In [128]:
df

Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,2551741,2551706,<p>There are some good articles about that her...,,2010-03-31,0,
1,2551760,2551706,<p>The best way to handle this scenario would ...,,2010-03-31,0,
2,2551787,2551706,<p>You have following to choose from:</p> <ol>...,,2010-03-31,0,
3,2551789,2551706,<p>Something like below (the first part just w...,,2010-03-31,1,
4,25807038,25642565,<p>I specified parameter for 'file'</p> <pre><...,,2014-09-12,0,
5,50646212,50646159,<p>Why not wrap your task in a function and ca...,,2018-06-01,3,
6,50658829,50656792,"<p>Macros are expanded exactly once, when the ...",,2018-06-02,0,
7,50661315,50661263,"<p>This is a definition of a global variable, ...",,2018-06-02,3,


In [21]:
x = 25642565

df = pd.read_sql("SELECT * FROM posts WHERE comment_id = 36698259;", 
                                 connection)

In [22]:
df

Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,36698259,,<p>I would like to sort this list:</p> <p><cod...,Python - How to sort a list of strings,2016-04-18,0,python sorting python-3.x
