In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from collections import defaultdict

from pathlib import Path
MODEL_PATH = Path('models/')

Three Approaches
- Doc2Vec : you can train your dataset using Doc2Vec and then use the sentence vectors.
- Average of Word2Vec vectors : You can just take the average of all the word vectors in a sentence. This average vector will represent your sentence vector.
- Average of Word2Vec vectors with TF-IDF : this is one of the best approach which I will recommend. Just take the word vectors and multiply it with their TF-IDF scores. Just take the average and it will represent your sentence vector.

In [2]:
def avg_word_vectors(question, embeddings, dim):
    words_embedding = [embeddings[word] for word in question.lower().split() if word in embeddings]
    if not words_embedding:
        return np.zeros(dim)
    words_embedding = np.array(words_embedding).astype(np.float32)
    return words_embedding.mean(axis=0)

In [3]:
def compute_tfidf(X, X_test=None, save_path=MODEL_PATH/'tf_idf.pkl'):
    vect = TfidfVectorizer(token_pattern='(\S+)', min_df=2, max_df=0.9, ngram_range=(1,1))
    vect.fit(X)
    # save vect
    with open(save_path, mode='wb') as f:
        pickle.dump(vect, f)
        
    X = vect.transform(X)
    if X_test: 
        X_test = vect.transform(X_test)
        return X, X_test   
    return X, vect

# X, vect = compute_tfidf(data)
# idf_scores = defaultdict(lambda:0, zip(vect.get_feature_names(), vect.idf_))

def average_tfidf_vectors(question, embeddings, dim, vect):
    # if blank question 
    if not question:
        return np.zeros(dim).astype(np.float32)
    
    # get idf weights
    split_question = question.lower().split()
    words_embedding = np.zeros((dim, len(split_question))).astype(np.float32)
    for i, token in enumerate(split_question):
        if token in embeddings:
            embed_score = embeddings[token]
        else: embed_score = 0
        idf_score = idf_scores[token]
        # word vectors multiply by their TF-IDF scores
        words_embedding[:, i] = embed_score * idf_score
    
    return words_embedding.mean(axis=1)


# question = 'How do I use with WPF bindings dfhfohsodfhokhv what are the different use cases'
# question = ''
# average_tfidf_vectors(question, embeddings, 300, vect)

In [None]:
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
from gensim.models.doc2vec import Doc2Vec



class LabeledLineSentence(object):
    def __init__(self):
        self.data = all_text
    def __iter__(self):
        for i, line in enumerate(self.data):
            yield TaggedDocument(words=line.split(), tags=['SENT_{}'.format(i)])



labeled_sent = LabeledLineSentence()
# print(next(iter(labeled_sent)))
model = Doc2Vec(vector_size=300, window=10, min_count=1, workers=4, alpha=0.025, min_alpha=0.025, epochs=10) 
model.build_vocab(labeled_sent)
model.train(labeled_sent, total_examples=model.corpus_count, epochs=model.epochs, )

In [None]:
model.save(MODEL_PATH/'doc2vec')

In [27]:
list(labeled_sent)[:1]
# model.wv.most_similar([labeled_sent][0])

model.infer_vector(labeled_sent)

[TaggedDocument(words=['I', 'want', 'to', 'use', 'a', 'track', 'bar', 'to', 'change', 'a', 'form', 's', 'opacity', 'This', 'is', 'my', 'code', 'When', 'I', 'build', 'the', 'application', 'it', 'gives', 'the', 'following', 'error', 'Cannot', 'implicitly', 'convert', 'type', 'to', 'I', 'tried', 'using', 'and', 'but', 'then', 'the', 'control', 'doesn', 't', 'work', 'This', 'code', 'worked', 'fine', 'in', 'a', 'past', 'VB', 'NET', 'project'], tags=['SENT_0'])]

In [39]:
ex = 'Given a representing a person s birthday how do I calculate their age in years'

# model.infer_vector(ex.split())
model.wv.most_similar_to_given(ex.split(), all_text[0])

KeyError: "word ' ' not in vocabulary"

In [11]:
from collections import Counter

c = Counter(' '.join(all_text).split())
c

Counter({'I': 8405,
         'want': 665,
         'to': 6625,
         'use': 711,
         'a': 5227,
         'track': 15,
         'bar': 18,
         'change': 127,
         'form': 79,
         's': 1051,
         'opacity': 6,
         'This': 301,
         'is': 2836,
         'my': 1188,
         'code': 565,
         'When': 198,
         'build': 53,
         'the': 8297,
         'application': 170,
         'it': 2132,
         'gives': 39,
         'following': 256,
         'error': 237,
         'Cannot': 9,
         'implicitly': 2,
         'convert': 75,
         'type': 119,
         'tried': 194,
         'using': 623,
         'and': 3777,
         'but': 1082,
         'then': 226,
         'control': 32,
         'doesn': 178,
         't': 947,
         'work': 338,
         'worked': 37,
         'fine': 55,
         'in': 3032,
         'past': 40,
         'VB': 1,
         'NET': 59,
         'project': 129,
         'Given': 27,
         'representing': 8,

In [21]:
from gensim.models.doc2vec import Doc2Vec, Doc2VecVocab
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import TaggedDocument
from gensim.models.phrases import Phrases
from gensim.models.word2vec import LineSentence


model = Doc2Vec([all_text[:10]], vector_size=100, min_count=2, epochs=1, verbose=1)

model.build_vocab_from_freq(c)
model.train(c, total_examples=len(c), epochs=model.epochs)

AttributeError: 'list' object has no attribute 'words'

In [None]:
model

In [20]:
[all_text[:10]]

[['I want to use a track bar to change a form s opacity This is my code When I build the application it gives the following error Cannot implicitly convert type to I tried using and but then the control doesn t work This code worked fine in a past VB NET project',
  'Given a representing a person s birthday how do I calculate their age in years',
  'Given a specific value how do I display relative time like 2 hours ago 3 days ago a month ago',
  'Is there any standard way for a Web Server to be able to determine a user s timezone within a web page Perhaps from an HTTP header or part of the user agent string',
  'I often have to sort a dictionary consisting of keys values by value For example I have a hash of words and respective frequencies that I want to order by frequency There is a which is good for a single value say frequency that I want to map it back to the word SortedDictionary orders by key not value Some resort to a custom class but is there a cleaner way',
  'Form based auth

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import html

re1 = re.compile(r'  +')
def clean_text(x, remove_html=True, other=False):
    if remove_html:
        x = re.sub(r'<code>[^>]*</code>', '', x)
        x = re.sub(r'<[^>]*>', '', x)
        x = re.sub(r'[^A-Za-z0-9]', ' ', x)
    if other:
        x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
            'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
            '<br />', "\n").replace('\\"', '').replace('<unk>','u_n').replace(' @.@ ','.').replace(
            ' @-@ ','-').replace('\\', ' \\ ').replace('"',"'").replace('\n', ' ').replace('\r', ' ').strip()
    return re1.sub(' ', html.unescape(x).strip())


In [5]:
import sqlite3
DB_NAME = 'StackOverflow.db'

start_index = 0
# limit = 5000
limit = 50
last_unix = 0
cur_length = limit
counter = 0
connection = sqlite3.connect(DB_NAME)
c = connection.cursor()

# while cur_length == limit:
#     df = pd.read_sql("SELECT comment, title FROM posts WHERE title is NOT NULL LIMIT {}".format(limit), connection)
#     df
        

In [43]:
df = pd.read_sql("SELECT comment, title FROM posts WHERE parent_id is NULL and score > 0", 
                 connection, chunksize=10000)

def preprocess(df, field):
    all_text = []
    for i, data in enumerate(df):  
#         print(i)
        all_text.extend([clean_text(i) for i in data[field]])
    return all_text

all_text = preprocess(df, 'title')
print(len(all_text))

7480341


In [7]:
X, vect = compute_tfidf(all_text)

In [8]:
# idf_scores = dict(zip(vect.get_feature_names(), vect.idf_))
idf_scores = defaultdict(lambda:0, zip(vect.get_feature_names(), vect.idf_))

In [54]:
idf_scores['cool']
# avg_word_vectors(X, )

6.605802066295998

In [33]:
X.getrow(0).data

array([0.14185559, 0.08583777, 0.09819513, 0.07051642, 0.07526899,
       0.06991629, 0.16301519, 0.06954121, 0.13013352, 0.07046131,
       0.11755379, 0.17314393, 0.09804197, 0.1647598 , 0.09804197,
       0.16862811, 0.13692706, 0.10050308, 0.09584303, 0.14989091,
       0.09830537, 0.16662482, 0.09460471, 0.09265912, 0.15983128,
       0.09736615, 0.1296375 , 0.06346664, 0.06333943, 0.11294143,
       0.13893036, 0.12639903, 0.14109408, 0.05996348, 0.17856848,
       0.04762009, 0.14185559, 0.03917769, 0.0904954 , 0.03942107,
       0.07703553, 0.11514935, 0.09505124, 0.17079183, 0.13824596,
       0.13013352, 0.15205463, 0.09143574, 0.18175239, 0.13446791,
       0.10380805, 0.09881659, 0.09881659, 0.1259665 , 0.14035342,
       0.15680724, 0.11215783, 0.12512168, 0.17856848, 0.05471944,
       0.13893036, 0.15698316, 0.10832387, 0.17856848, 0.04083706,
       0.18175239, 0.10540373])

In [10]:
def get_embeddings(filename):
    embeddings = {}
    with open(MODEL_PATH/filename, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        embed_list = list(reader)
    for line in embed_list:
        embeddings[line[0]] = np.asarray(line[1:], dtype=np.float32)
    return embeddings

embeddings = get_embeddings('starspace_embedding300.tsv')

In [139]:
def question_to_vec_tests(func, embeddings, dim, *args):
    if (np.zeros(dim) != func('', embeddings, dim, *args)).any():
        return "You need to return zero vector for empty question."
    if (np.zeros(dim) != func('thereisnosuchword', embeddings, dim, *args)).any():
        return "You need to return zero vector for the question, which consists only unknown words."
    if (embeddings['word'] != func('word', embeddings, dim, *args)).any():
        return "You need to check the corectness of your function."
    if ((embeddings['cool'] + embeddings['beans']) / 2 != func('Cool Beans', embeddings, dim, *args)).any():
        return "Your function should calculate a mean of word vectors."
    if (embeddings['word'] != func('thereisnosuchword word', embeddings, dim, *args)).any():
        return "You should not consider words which embeddings are unknown."
    return "Basic tests are passed."

In [117]:
question_to_vec_tests(avg_word_vectors, embeddings, 300)

'Basic tests are passed.'

In [140]:
question_to_vec_tests(average_tfidf_vectors, embeddings, 300, vect)

'You should not consider words which embeddings are unknown.'

In [None]:
from torch.nn import CosineSimilarity
import torch

def rank_candidates_pytorch(question, candidates, embeddings, dim=300):
    """
        question: a string
        candidates: a list of strings (candidates) which we want to rank
        embeddings: some embeddings
        dim: dimension of the current embeddings
        
        result: a list of pairs (initial position in the list, question)
    """
    cos = CosineSimilarity(dim=1)
    question2vec = question_to_vec(question, embeddings, dim)
    candidate2vecs = np.array([question_to_vec(cand, embeddings, dim) for cand in candidates])
    output = cos(torch.Tensor(question2vec.reshape(1, -1)).cuda(), torch.Tensor(candidate2vecs).cuda())
    output = output.cpu().numpy()
    data = [(i, candidates[i]) for i in range(len(output))]   
    return [x for _, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]


In [None]:
def hits_count(dup_ranks, k):
    dup_ranks = np.array(dup_ranks)
    return len(dup_ranks[dup_ranks <= k]) / len(dup_ranks)

def dcg_score(dup_ranks, k):
    dup_ranks = np.array(dup_ranks)
    N = len(dup_ranks)
    dup_ranks = dup_ranks[dup_ranks <= k]
    out = np.sum((np.ones_like(dup_ranks)) / (np.log2(1.0 + dup_ranks))) / float(N)
    if np.isnan(out): out = 0.0
    return out

In [None]:
def read_corpus(filename):
    data = []
    for line in open(filename, encoding='utf-8'):
        data.append(line.strip().split('\t'))
    return data

validation = read_corpus('data/validation.tsv')

In [None]:
wv_ranking = []
for line in validation:
    q, *ex = line
    ranks = rank_candidates(q, ex, wv_embeddings)
    wv_ranking.append([r[0] for r in ranks].index(0) + 1)
    