In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import csv


# Relative directory roots. MODEL_PATH is where the pickled vectorizer, the embedding
# TSV, the thread-vector pickles and the Annoy indexes are read/written below.
# DATA_PATH is not referenced by any cell visible in this notebook.
DATA_PATH = Path('data/')
MODEL_PATH = Path('models/')

In [2]:
import nltk
import pickle
import re
import numpy as np
import html

# nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words from NLTK, stored as a set; clean_title drops any token found in it.
stopwords_set = set(stopwords.words('english'))

# Pattern for scheme://... URLs that are not immediately preceded by `="`.
# It is a raw string so that `\b` is a regex word boundary rather than a backspace
# character, and the hyphen inside the character class is escaped so it is a literal
# instead of the start of an invalid `\w-?` range (which makes re.compile raise).
url_re = r'(?<!=")(\b[\w]+:\/\/[\w\-?&;#~=\.\/\@]+[\w\/])'


re1 = re.compile(r'  +')
def clean_text(x, remove_html=True, other=False):
    """Normalise a post body into plain, single-spaced text.

    Parameters
    ----------
    x : str
        Raw text to clean.
    remove_html : bool
        When True, drop ``<code>...</code>`` spans, drop remaining tags, then turn
        every character that is not an ASCII letter or digit into a space.
    other : bool
        When True, apply a fixed sequence of literal replacements (entity
        fragments, escaped newlines/quotes, ``<unk>``, ``@.@``/``@-@`` markers,
        backslashes, newlines) and strip the result.

    Returns
    -------
    str
        The text after ``html.unescape``, stripping, and collapsing runs of
        two or more spaces into one.
    """
    if remove_html:
        html_rules = (
            (r'<code>[^>]*</code>', ''),
            (r'<[^>]*>', ''),
            (r'[^A-Za-z0-9]', ' '),
        )
        for pattern, substitute in html_rules:
            x = re.sub(pattern, substitute, x)
    if other:
        # Order matters: later rules act on the output of earlier ones.
        literal_rules = (
            ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '),
            ('#36;', '$'), ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"),
            ('\\"', ''), ('<unk>', 'u_n'), (' @.@ ', '.'), (' @-@ ', '-'),
            ('\\', ' \\ '), ('"', "'"), ('\n', ' '), ('\r', ' '),
        )
        for old, new in literal_rules:
            x = x.replace(old, new)
        x = x.strip()
    unescaped = html.unescape(x).strip()
    return re1.sub(' ', unescaped)




re1 = re.compile(r'  +')
def clean_title(text, remove_html=False, other=False, stop_words=None):
    """Lower-case a title, strip punctuation and remove stop words.

    Parameters
    ----------
    text : str
        Raw title.
    remove_html : bool
        When True, first drop ``<code>...</code>`` spans and remaining tags and
        turn every non-alphanumeric character into a space. (This branch used
        to reference an undefined name and raised NameError.)
    other : bool
        Unused; retained so existing calls keep working.
    stop_words : container of str, optional
        Tokens to drop. Defaults to the module-level ``stopwords_set``.

    Returns
    -------
    str
        Space-joined tokens that survived cleaning.
    """
    replace_by_space_re = re.compile(r'[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile(r'[^0-9a-z #+_]')

    if stop_words is None:
        # Resolved at call time so the notebook-level set is used by default.
        stop_words = stopwords_set

    if remove_html:
        text = re.sub(r'<code>[^>]*</code>', '', text)
        text = re.sub(r'<[^>]*>', '', text)
        text = re.sub(r'[^A-Za-z0-9]', ' ', text)
    text = text.lower()
    text = text.replace('π', 'pi').replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
            'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
            '<br />', "\n").replace('\\"', '').replace('<unk>','u_n').replace(' @.@ ','.').replace(
            ' @-@ ','-').replace('\\', ' \\ ').replace('"',"'").replace('\n', ' ').replace('\r', ' ')
    text = replace_by_space_re.sub(' ', text)
    # Anything outside [0-9a-z #+_] is deleted outright (not replaced by a space).
    text = bad_symbols_re.sub('', text)
    text = ' '.join(token for token in text.split() if token not in stop_words)
    return re1.sub(' ', html.unescape(text).strip())


import sqlite3
# SQLite file containing the `posts` table queried by get_data and get_df.
DB_NAME = 'StackOverflow_python.db'

# Module-level connection shared by the query helpers below; no cell in view closes it.
connection = sqlite3.connect(DB_NAME)
# Cursor is created here but not used by any cell visible in this notebook.
c = connection.cursor() 

def get_data(column):
    """Read the rows of ``posts`` whose ``parent_id`` is NULL and preprocess them.

    ``column`` is interpolated verbatim into the SELECT list, so it must be a
    trusted column expression such as ``'comment_id, title, comment'``.
    The query is read in chunks of 10000 rows and the chunk iterator is handed
    to ``preprocess``, whose result is returned.
    """
    query = "SELECT {} FROM posts WHERE parent_id is NULL".format(column)
    chunk_iter = pd.read_sql(query, connection, chunksize=10000)
    return preprocess(chunk_iter)
    

    
def preprocess(df, title_cleaner=None):
    """Clean the ``title`` column of every chunk and combine the chunks.

    Parameters
    ----------
    df : iterable of pandas.DataFrame
        Chunks, e.g. the iterator returned by ``pd.read_sql(..., chunksize=...)``.
        Each chunk must contain a ``title`` column.
    title_cleaner : callable, optional
        Function applied to every title. Defaults to ``clean_title``.

    Returns
    -------
    pandas.DataFrame
        All chunks stacked in order, with ``comment_id``, ``title``, ``comment``
        as the leading columns. Row labels of the individual chunks are kept
        (they are not renumbered).
    """
    if title_cleaner is None:
        title_cleaner = clean_title
    expected_columns = ['comment_id', 'title', 'comment']

    # Collect first and concatenate once: appending inside the loop copies the
    # accumulated frame on every iteration, and DataFrame.append no longer
    # exists in pandas >= 2.0.
    cleaned_chunks = [
        chunk.assign(title=chunk['title'].map(title_cleaner)) for chunk in df
    ]
    if not cleaned_chunks:
        return pd.DataFrame(columns=expected_columns)

    combined = pd.concat(cleaned_chunks, sort=False)
    extra_columns = [col for col in combined.columns if col not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

# Load every row of `posts` that has no parent_id (presumably the question posts —
# confirm against the schema); titles are cleaned, comment bodies are left raw.
df = get_data('comment_id, title, comment')
print(df.shape)
df.head()

(1222892, 3)


Unnamed: 0,comment_id,title,comment
0,287312,tix documentation python,<p>I've recently starting playing around with ...
1,354124,statistical studies indicates python productive,"<p>If I do a google search with the string ""p..."
2,370733,trac documentation,<p>I'm trying to write my first little plugin ...
3,405582,function class documentation best practices py...,<p>I am looking for best practices for functio...
4,527134,python java language exposed self taught progr...,<p>Of the two which one would exposed someone ...


In [3]:
# only used to create tf_idf for `idf_scores`

def compute_tfidf(X, X_test=None, save_path=MODEL_PATH/'tf_idf_python_title_stopwords.pkl', load=True, save=False):
    """Transform text with a TF-IDF vectorizer that is either unpickled or freshly fitted.

    Parameters
    ----------
    X : iterable of str
        Documents to transform; also used to fit the vectorizer when ``load`` is False.
    X_test : iterable of str, optional
        Extra documents to transform with the same vectorizer.
    save_path : path-like
        Pickle file read when ``load`` is True and written when ``save`` is True.
    load : bool
        If True, unpickle the vectorizer from ``save_path`` instead of fitting.
    save : bool
        If True (and ``load`` is False), pickle the fitted vectorizer to ``save_path``.

    Returns
    -------
    tuple
        ``(X, vect)`` when ``X_test`` is None, otherwise ``(X, X_test, vect)``.
    """
    if load:
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        with open(save_path, mode='rb') as f:
            vect = pickle.load(f)
    else:
        # Tokens are maximal runs of non-whitespace; raw string avoids an invalid '\S' escape.
        vect = TfidfVectorizer(token_pattern=r'(\S+)', min_df=3, max_df=0.9, ngram_range=(1,1))
        vect.fit(X)
        if save:
            with open(save_path, mode='wb') as f:
                pickle.dump(vect, f)
            print('SAVED')

    X = vect.transform(X)
    # Compare against None explicitly: the truth value of a Series/ndarray is ambiguous
    # and `if X_test:` raises ValueError for those inputs.
    if X_test is not None:
        X_test = vect.transform(X_test)
        return X, X_test, vect
    return X, vect


X, vect = compute_tfidf(df.title, load=True, save=False)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back otherwise.
if hasattr(vect, 'get_feature_names_out'):
    vocab_terms = vect.get_feature_names_out()
else:
    vocab_terms = vect.get_feature_names()
# token -> IDF weight; tokens outside the vocabulary get weight 0.
idf_scores = defaultdict(lambda: 0, zip(vocab_terms, vect.idf_))
print(len(idf_scores))

41166


In [4]:
# read in star-space word embeddings 
def get_embeddings(filename):
    """Load word vectors from a tab-separated file under ``MODEL_PATH``.

    Each row is read as a token in the first field followed by its vector
    components; the components are converted to a float32 NumPy array.

    Returns
    -------
    dict
        Maps token -> 1-D float32 array. If a token occurs more than once the
        last row wins.
    """
    with open(MODEL_PATH/filename, newline='') as tsv_file:
        rows = list(csv.reader(tsv_file, delimiter='\t'))
    return {row[0]: np.asarray(row[1:], dtype=np.float32) for row in rows}

# embeddings = get_embeddings('starspace_embedding300_ngram2.tsv')
# The file name suggests 100-dimensional vectors, which matches the dim=100 /
# embeddings_dim = 100 used by the cells below — confirm if the file changes.
embeddings = get_embeddings('starspace_embedding100_ngram2.tsv')

In [5]:
def avg_word_vectors(question, embeddings, dim):
    """Return the mean embedding of the known words in ``question``.

    Parameters
    ----------
    question : str
        Text to embed; it is lower-cased and split on whitespace.
    embeddings : mapping
        token -> vector. Tokens missing from the mapping are ignored.
    dim : int
        Length of the zero vector returned when no token is known.

    Returns
    -------
    numpy.ndarray
        1-D float32 array. The all-zero fallback is float32 as well, so the
        dtype no longer depends on whether any token was found.
    """
    tokens = question.lower().split()
    known_vectors = [embeddings[token] for token in tokens if token in embeddings]
    if not known_vectors:
        return np.zeros(dim, dtype=np.float32)
    return np.asarray(known_vectors, dtype=np.float32).mean(axis=0)

def average_tfidf_vectors(question, embeddings, dim, vect, idf=None):
    """Return the mean of the IDF-weighted embeddings of the known words in ``question``.

    Parameters
    ----------
    question : str
        Text to embed; it is lower-cased and split on whitespace.
    embeddings : mapping
        token -> vector. Tokens missing from the mapping are ignored.
    dim : int
        Embedding length; also the length of the zero vector returned when no
        token is known.
    vect : object
        Not used by this function; kept so existing positional calls still work.
    idf : mapping, optional
        token -> IDF weight. Defaults to the module-level ``idf_scores``.
        Tokens without an entry get weight 0.

    Returns
    -------
    numpy.ndarray
        1-D float32 array of length ``dim``.
    """
    if idf is None:
        idf = idf_scores
    tokens = [word for word in question.lower().split() if word in embeddings]
    if not tokens:
        return np.zeros(dim, dtype=np.float32)

    # One column per token, as before, so the reduction order is unchanged.
    weighted = np.zeros((dim, len(tokens)), dtype=np.float32)
    for col, token in enumerate(tokens):
        # .get() avoids inserting a new key into a defaultdict for every unseen token.
        weighted[:, col] = embeddings[token] * idf.get(token, 0)
    return weighted.mean(axis=1)

In [21]:
# from torch.nn import CosineSimilarity
# import torch

# def rank_candidates(question, candidates, embeddings, dim=300, question_to_vec=avg_word_vectors, topk=5, 
#                     return_score=False, save=False, *args, **kwargs):
#     cos = CosineSimilarity(dim=1)
#     question2vec = question_to_vec(question, embeddings, dim, *args, **kwargs)
#     candidate2vecs = np.array([question_to_vec(cand, embeddings, dim, *args, **kwargs) for cand in candidates])
#     if save:
#         np.save(MODEL_PATH/'candidate2vecs_python_title.npy', candidate2vecs)
#     candidate2vecs = np.load(MODEL_PATH/'candidate2vecs_python_title.npy')

#     output = cos(torch.Tensor(question2vec.reshape(1, -1)).cuda(), torch.Tensor(candidate2vecs).cuda())
#     output = output.cpu().numpy()
#     print(output.shape)
#     data = [(i, candidates[i]) for i in range(len(output))]   
#     if return_score:
#         output = [(x, score) for score, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
#     else:
#         output = [x for _, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
#     if topk: return output[:topk]
#     else: return output

In [60]:
# %%time
# ex = questions.iloc[105].Title
# print(ex)
# ex = 'How do I sort a list of strings in Python?'
# clean_example = clean_title(ex)
# print(clean_example)
# print(rank_candidates(clean_example, df.title, embeddings, 100, average_tfidf_vectors, return_score=True, vect=vect))

sort list strings python
CPU times: user 76 µs, sys: 0 ns, total: 76 µs
Wall time: 54.8 µs


In [6]:
# File name of the pickled (ids, vectors) pair for this subset.
tag_path = 'python_only.pkl'
# Written by the TF-IDF-weighted title cell below.
THREAD_PATH = MODEL_PATH/'thread_embeddings_by_tags'/tag_path
# Read by the Annoy build/load cells; the only visible writer is a commented-out line.
THREAD_PATH_AVG = MODEL_PATH/'thread_embeddings_by_tags/python_only_avg.pkl'

In [32]:
# create python threads (average_tfidf_vectors for each title in df)
# Guard for the expensive recomputation: set run = True to rebuild the title vectors.
run = False
if run:    
    import os
    embeddings_dim = 100

    tag_posts = df.title.values
    tag_post_ids = df.comment_id.values.astype(np.int32)    
    tag_vectors = np.zeros((len(df), embeddings_dim), dtype=np.float32)

    # One row per title, in the same order as tag_post_ids.
    for i, question in enumerate(tag_posts):
        tag_vectors[i, :] = average_tfidf_vectors(question, embeddings, embeddings_dim, vect)
#         tag_vectors[i, :] = avg_word_vectors(question, embeddings, embeddings_dim)

    # Context manager guarantees the pickle is flushed and the handle closed.
    with THREAD_PATH.open('wb') as thread_file:
        pickle.dump((tag_post_ids, tag_vectors), thread_file)
#     pickle.dump((tag_post_ids, tag_vectors), (THREAD_PATH_AVG).open('wb'))

In [49]:
# Peek at the first ten comment bodies; preprocess only cleaned `title`, so these are still raw HTML.
df.comment[:10]

0    <p>I've recently starting playing around with ...
1    <p>If I do a google search with the string  "p...
2    <p>I'm trying to write my first little plugin ...
3    <p>I am looking for best practices for functio...
4    <p>Of the two which one would exposed someone ...
5    <p>What's the point of both? When do you think...
6    <p>A common task in programming interviews (no...
7    <p>I'm using Notepad++ as an editor to write p...
8    <p>I'm a long-time PHP developer looking to tr...
9    <p>I have been teaching myself the rudiments o...
Name: comment, dtype: object

In [50]:
# create python threads (average_tfidf_vectors for each comment in df)
THREAD_PATH_AVG_COMMENT = MODEL_PATH/'thread_embeddings_by_tags/python_only_avg_comment.pkl'
# Guard for the expensive recomputation: set run = True to rebuild the comment vectors.
run = False
if run:    
    import os
    embeddings_dim = 100

    tag_posts = df.comment.values
    tag_post_ids = df.comment_id.values.astype(np.int32)    
    tag_vectors = np.zeros((len(df), embeddings_dim), dtype=np.float32)

    # Comment bodies are raw HTML, so they go through clean_text before averaging.
    for i, comment in enumerate(tag_posts):
        tag_vectors[i, :] = avg_word_vectors(clean_text(comment), embeddings, embeddings_dim)

    # Context manager guarantees the pickle is flushed and the handle closed.
    with THREAD_PATH_AVG_COMMENT.open('wb') as thread_file:
        pickle.dump((tag_post_ids, tag_vectors), thread_file)

In [51]:
%%time
import annoy

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with THREAD_PATH_AVG.open('rb') as thread_file:
    thread_ids, thread_vectors = pickle.load(thread_file)
# thread_ids, thread_vectors = pickle.load((THREAD_PATH_AVG_COMMENT).open('rb'))
dim = 100
n_trees = 30
# filepath = str(MODEL_PATH/'knn_embeddings_path/python_only.annoy')
# NOTE(review): the vectors come from THREAD_PATH_AVG but the index is saved under the
# *_comment name — confirm which pairing is intended before re-running.
filepath = str(MODEL_PATH/'knn_embeddings_path/python_only_comment.annoy')

# Pass the metric explicitly; 'angular' is what Annoy used when the argument was omitted.
a = annoy.AnnoyIndex(dim, 'angular')

print("Adding")
# The loop variable must not be called `vect`: that name holds the TF-IDF vectorizer
# returned by compute_tfidf and would be overwritten for every later cell.
for i, thread_vector in enumerate(thread_vectors):
    a.add_item(i, thread_vector)

print("Starting to build")
a.build(n_trees)
print("Saving")
a.save(filepath)


# def build_annoy_index(metric, input_filename, output_filename, n_trees):
# # Creates an index for Approximate Nearest Neighbors retrieval, using the annoy library.
#     print 'Aproximate Nearest Neighbors for: ' + input_filename
#     centroids_array = np.load(input_filename)
#     n_dimensions = centroids_array.shape[1]
#     t = AnnoyIndex(n_dimensions, metric=metric)
#     for i in range(centroids_array.shape[0]):
#         t.add_item(i, centroids_array[i][:])
#     print "Building Index - Number of Trees: ",str(n_trees)
#     t.build(n_trees)
#     t.save(output_filename) 

Adding
Starting to build
Saving
CPU times: user 1min 1s, sys: 873 ms, total: 1min 1s
Wall time: 1min 1s


In [45]:
%%time
import annoy

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with THREAD_PATH_AVG.open('rb') as thread_file:
    thread_ids, thread_vectors = pickle.load(thread_file)
dim = 100
n_trees = 30
filepath = str(MODEL_PATH/'knn_embeddings_path/python_only.annoy')

# Pass the metric explicitly; 'angular' is what Annoy used when the argument was omitted.
a = annoy.AnnoyIndex(dim, 'angular')
a.load(filepath)

CPU times: user 74.6 ms, sys: 232 ms, total: 307 ms
Wall time: 306 ms


In [55]:
# Row position in df whose title is used as the query.
test_idx = 1100

# question = 'How do I sort a list of strings in Python?'
# df.title was already cleaned by preprocess, so this is a cleaned title.
question = df.iloc[test_idx].title
print(question)
# question = 'What is a transformation in python'
clean_question = clean_title(question)
# question2vec = average_tfidf_vectors(clean_question, embeddings, 100, vect)
# Unweighted mean of the word vectors; 100 is the embedding length used for the index.
question2vec = avg_word_vectors(clean_question, embeddings, 100)

# 100 nearest items in the Annoy index `a`, together with their distances.
idxs, dist = a.get_nns_by_vector(question2vec, 100, include_distances=True)

# Annoy item numbers are positions in thread_ids, so map them back to comment_ids.
print([(thread_ids[i], d) for i, d in zip(idxs, dist)])

weird behavior pandas data as_matrix
[(19330561, 0.7556686997413635), (42706059, 0.7725855112075806), (36333507, 0.7814211249351501), (50505284, 0.7818649411201477), (47550765, 0.7994747757911682), (37107796, 0.802489697933197), (46017678, 0.8028570413589478), (34532226, 0.8033103346824646), (39398821, 0.8052082061767578), (45942222, 0.80625319480896), (30518588, 0.8069236278533936), (24813961, 0.8092679381370544), (33699330, 0.8096696138381958), (46757603, 0.8113886713981628), (48472156, 0.8113936185836792), (35230388, 0.8125177025794983), (27991786, 0.8135660886764526), (17020763, 0.815342903137207), (31690821, 0.8162144422531128), (34259637, 0.8175956010818481), (19544948, 0.8177072405815125), (36153944, 0.8180021047592163), (50456590, 0.8184759616851807), (13689512, 0.8193232417106628), (28429082, 0.8213790059089661), (38272465, 0.8216429948806763), (27887681, 0.8218681216239929), (44299013, 0.8221104741096497), (42268930, 0.8225754499435425), (40125654, 0.8242794871330261), (22210

In [23]:
# a.get_distance(question2vec, 140)
# Display the DataFrame built by get_data.
# NOTE(review): the recorded output for this cell (1222892) looks like it came from a
# different version of the cell — confirm.
df

1222892

In [56]:
def get_df(ids):
    """Fetch all columns of the ``posts`` rows whose ``comment_id`` is in ``ids``.

    One ``?`` placeholder is generated per id and the ids are bound as query
    parameters. The query has no ORDER BY, so the rows are not guaranteed to
    come back in the order of ``ids``.
    """
    placeholders = ','.join(['?'] * len(ids))
    query = "SELECT * FROM posts WHERE comment_id IN ({})".format(placeholders)
    return pd.read_sql(query, connection, params=tuple(ids))
    
    

# Map Annoy positions to comment_ids; .tolist() turns them into plain Python ints
# (presumably because sqlite3 cannot bind NumPy integer scalars — confirm).
most_similar = get_df(np.array([thread_ids[i] for i in idxs]).flatten().tolist())
most_similar

Unnamed: 0,comment_id,parent_id,comment,title,score,tags
0,11548005,,<p>Is there a preferred way to keep the data t...,NumPy or Pandas: Keeping array type as integer...,76,python numpy int pandas type-conversion
1,11668446,,<p>I'm trying to cast the type of a DataFrame ...,pandas DataFrame type cast,1,python numpy pandas
2,13689512,,<p>I want to use numpy.diff on a pandas Series...,numpy diff on a pandas Series,11,python numpy pandas
3,17020763,,<p>I have a huge dataframe with unique index. ...,pandas 0.10.1 to 0.11.0 .ix method,2,python pandas
4,19330561,,"<p>I'm loading a data file, extracting certain...",Python - Trouble plotting datetime index with ...,2,python numpy matplotlib pandas
5,19544948,,<p>I have a pandas data frame like this:</p> <...,How to set up a new value in pandas but keep t...,1,python pandas
6,22210865,,"<p>Why in Python, using Pandas, we cannot use ...",Python: Assign values to first observation of ...,1,python group-by pandas
7,23451244,,<p>I have a pandas dataframe with a column of ...,how to zscore normalize pandas column with nans?,13,python numpy pandas scipy
8,24017710,,<p>My PANDAS data has columns that were read a...,changing column types of a pandas data frame -...,1,python pandas
9,24813961,,<p>I'm using pandas 0.13.1. This dataframe:</p...,why statistics methods of DataFrames return Se...,0,python numpy pandas


In [6]:
# # create knn embeddings (essentially a look up for the word vectors)

# import nmslib

# M = 15
# efC = 100

# num_threads = 4
# index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0,
#                      'skip_optimized_index' : 0 }

# efS = 100
# query_time_params = {'efSearch': efS}

# def get_knns(index, vecs):
#     return zip(*index.knnQueryBatch(vecs, k=3, num_threads=4))

# def get_knn(index, vec):
#     return index.knnQuery(vec, k=3)


# def create_index(data, space='cosinesimil', load=True, save=False, 
#                  filepath=str(MODEL_PATH/'knn_embeddings_path/python_only.bin')):
#     index = nmslib.init(space=space)
#     if load:
#         index.loadIndex(filepath)
#         index.setQueryTimeParams(query_time_params)
#         print('LOADED')
#         return index
#     else:
#         print('CREATING')
#         index.addDataPointBatch(data)
#         index.createIndex(index_time_params)
#         index.setQueryTimeParams(query_time_params)
#         if save: 
#             index.saveIndex(filepath)
#             print('SAVED')
#         return index

In [1]:
# %%time
# # thread_ids, thread_vectors = pickle.load((THREAD_PATH).open('rb'))
# thread_ids, thread_vectors = pickle.load((THREAD_PATH_AVG).open('rb'))
        
# nms_index = create_index(thread_vectors, space='cosinesimil', save=False, load=True, 
#                         filepath=str(MODEL_PATH/'knn_embeddings_path/python_only_avg.bin'))

In [62]:
# test_idx = 140

# # question = 'How do I sort a list of strings in Python?'
# question = df.iloc[test_idx].title
# print(question)
# clean_question = clean_title(question)
# # question2vec = average_tfidf_vectors(clean_question, embeddings, 100, vect)
# question2vec = avg_word_vectors(clean_question, embeddings, 100)


# idxs, distances = get_knns(nms_index, [question2vec])
# print([(thread_ids[i], d) for i, d in zip(idxs[0], distances[0])])
# # print([thread_ids[i] for i in idxs[0]])

use subprocess tar compress folder
[(31179080, 0.3577078), (50553004, 0.5768412), (50654215, 0.62134665)]


In [63]:
# def get_df(ids):
#         neighbor_length = '?,' * len(ids)
#         df = pd.read_sql("SELECT * FROM posts WHERE comment_id IN ({})".format(neighbor_length[:-1]), 
#                                          connection, params=tuple(ids))
#         return df
    
    

# get_df(np.array([thread_ids[i] for i in idxs[0]]).flatten().tolist())

Unnamed: 0,comment_id,parent_id,comment,title,score,tags
0,31179080,,<p>I am trying to make a program in Python tha...,Deleting all files in a folder in Python,0,python python-3.x temporary-files temp
1,50553004,,<p>Using Python I want to upload a file to a r...,Upload a file with a ip folder on an FTP server,-1,python urllib
2,50654215,,<p>I have have python file <code>a.py</code> w...,How to make sleep a file and use its function ...,0,python multithreading python-3.x function caching


In [52]:
# Spot-check a single post by its comment_id.
pd.read_sql("SELECT * FROM posts WHERE comment_id = 15575160", connection)

Unnamed: 0,comment_id,parent_id,comment,title,score,tags
0,15575160,,<p>I'm developing an app that needs to simulat...,Drag.start in pyqt,2,python qt drag-and-drop pyqt pyside


In [48]:
# from fuzzywuzzy import fuzz, StringMatcher
# from difflib import Differ, SequenceMatcher, context_diff, get_close_matches

# try:
#     from cfuzzyset import cFuzzySet as FuzzySet
# except ImportError:
from fuzzyset import FuzzySet

# Levenshtein distance
    
def num_matching_words(query):
    """Fuzzy-match ``query`` against the text of the rows in ``most_similar``.

    For every row of the module-level ``most_similar`` frame, the title and the
    comment are joined with a space, passed through ``clean_text`` and added to
    a fresh ``FuzzySet``. The result of ``FuzzySet.get(query)`` is returned
    (per the recorded output, a list of ``(score, matched_text)`` tuples).
    """
    fuzzy_index = FuzzySet()
    documents = (
        title + ' ' + comment
        for title, comment in zip(most_similar['title'], most_similar['comment'])
    )
    for document in documents:
        fuzzy_index.add(clean_text(document))
    return fuzzy_index.get(query)
    

# `question` is the query title assigned in an earlier cell; match it against the
# text of the rows currently held in most_similar.
print(question)
num_matching_words(question)

python temperature humidity extech rht10 usb drive


[(0.9090909090909091,
  'Python Temperature Humidity from Extech RHT10 USB Drive')]

In [58]:
# cosine distance between TF-IDF vectors
def unpickle(filename):
    """Return the object stored in the pickle file at ``filename``.

    The file is opened in binary mode and closed before returning.
    """
    with open(filename, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

    
    
from sklearn.metrics.pairwise import linear_kernel
# tfid_vectorizer = unpickle(MODEL_PATH/'tf_idf_python_title_stopwords.pkl')

# Dot product of the first title's TF-IDF row with every row of X. This equals cosine
# similarity only if the rows are L2-normalised (TfidfVectorizer's default norm;
# the settings of the unpickled vectorizer are not visible here — confirm).
cosine_similarities = linear_kernel(X[0:1], X).flatten()

# argsort is ascending, so [:-5:-1] walks backwards from the end and yields the
# positions of the FOUR highest scores (the query row itself is among them).
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
related_docs_indices

array([     0, 496181,  17821,  41049])

In [101]:
# Row at position 496181, the second entry of related_docs_indices in the recorded output.
df.iloc[496181]

comment_id                                             29474967
title                        python tix help documentation view
comment       <p>Doing an application in Tix (Python), I wan...
Name: 6181, dtype: object

In [102]:
# Row 0 is the title whose TF-IDF row (X[0:1]) was used as the query above.
df.iloc[0]

comment_id                                               287312
title                                  tix documentation python
comment       <p>I've recently starting playing around with ...
Name: 0, dtype: object

Question-Answer ideas

- vector similarities based on average word vector
 - try fastText n-grams
- number of matching words 
- cosine distance between TF-IDF vectors 
- Levenshtein distance

- symbolic n-grams (1 - 5)
 - SVD and take first 300 components 

In [None]:
# TODO: Need to think of a way to evaluate the model.