# Vector Representation and Question Answer Similarity 

In [1]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from collections import defaultdict
import re
import html

from pathlib import Path
MODEL_PATH = Path('models/')

In [2]:
re1 = re.compile(r'  +')
def clean_text(x, remove_html=True, other=False):
    if remove_html:
        x = re.sub(r'<code>[^>]*</code>', '', x)
        x = re.sub(r'<[^>]*>', '', x)
        x = re.sub(r'[^A-Za-z0-9]', ' ', x)
    if other:
        x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
            'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
            '<br />', "\n").replace('\\"', '').replace('<unk>','u_n').replace(' @.@ ','.').replace(
            ' @-@ ','-').replace('\\', ' \\ ').replace('"',"'").replace('\n', ' ').replace('\r', ' ').strip()
    return re1.sub(' ', html.unescape(x).strip())


import nltk
import pickle
import re
import numpy as np

# nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_set = set(stopwords.words('english'))


re1 = re.compile(r'  +')
def clean_title(text, remove_html=False, other=False):
    
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile('[^0-9a-z #+_]')
    
    if remove_html:
        x = re.sub(r'<code>[^>]*</code>', '', x)
        x = re.sub(r'<[^>]*>', '', x)
        x = re.sub(r'[^A-Za-z0-9]', ' ', x)
    text = text.lower()
    text = text.replace('π', 'pi').replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
            'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
            '<br />', "\n").replace('\\"', '').replace('<unk>','u_n').replace(' @.@ ','.').replace(
            ' @-@ ','-').replace('\\', ' \\ ').replace('"',"'").replace('\n', ' ').replace('\r', ' ')
    text = replace_by_space_re.sub(' ', text)
    text = bad_symbols_re.sub('', text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])
    return re1.sub(' ', html.unescape(text).strip())


import sqlite3
DB_NAME = 'StackOverflow.db'

connection = sqlite3.connect(DB_NAME)
c = connection.cursor() 

def get_data(column):
    df = pd.read_sql("SELECT {} FROM posts WHERE parent_id is NULL".format(column), 
                 connection, chunksize=10000)
    all_text = preprocess(df, 'title')
    return all_text
    

def preprocess(df, field):
    comment_id = []
    all_text = []
    for i, data in enumerate(df):  
#         all_text.extend([clean_text(x, remove_html=True) for x in data['title']])
        all_text.extend([clean_title(x) for x in data['title']])
        comment_id.extend(x for x in data['comment_id'])
    return all_text, comment_id


all_titles, comment_ids = get_data('comment_id, title')
print(len(all_titles), len(all_titles))
all_titles[:10]

16022817 16022817


['applying opacity form use decimal double value',
 'percentage width child element absolutely positioned parent internet explorer 7',
 'calculate someones age c#',
 'calculate relative time c#',
 'determine users timezone',
 'difference mathfloor mathtruncate',
 'filling dataset datatable linq query result set',
 'binary data mysql',
 'fastest way get value pi',
 'throw error mysql trigger']

- Need to compare them to the COMMENTS as well
- Need to link the post ids to the comments to return the top ANSWERS for the question 

In [3]:
# RECOMPUTE with ALL text !!!!!!!!!!!!!!

def compute_tfidf(X, X_test=None, save_path=MODEL_PATH/'tf_idf.pkl', load=True, save=False):
    
    if load:
        with open(save_path, mode='rb') as f:
            vect = pickle.load(f) 
    else:
        vect = TfidfVectorizer(token_pattern='(\S+)', min_df=5, max_df=0.9, ngram_range=(1,1))
        vect.fit(X)
        if save:
            # save vect
            with open(save_path, mode='wb') as f:
                pickle.dump(vect, f)
            print('SAVED')

    X = vect.transform(X)
    if X_test: 
        X_test = vect.transform(X_test)
        return X, X_test, vect   
    return X, vect


X, vect = compute_tfidf(all_titles, load=True, save=False)
idf_scores = defaultdict(lambda:0, zip(vect.get_feature_names(), vect.idf_))
print(len(idf_scores))

168402


Read in Word Embeddings

In [4]:
def get_embeddings(filename):
    embeddings = {}
    with open(MODEL_PATH/filename, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        embed_list = list(reader)
    for line in embed_list:
        embeddings[line[0]] = np.asarray(line[1:], dtype=np.float32)
    return embeddings

# embeddings = get_embeddings('starspace_embedding300_ngram2.tsv')
embeddings = get_embeddings('starspace_embedding100_ngram2.tsv')

Three Approaches
- Doc2Vec : you can train your dataset using Doc2Vec and then use the sentence vectors.
- Average of Word2Vec vectors : You can just take the average of all the word vectors in a sentence. This average vector will represent your sentence vector.
- Average of Word2Vec vectors with TF-IDF : this is one of the best approach which I will recommend. Just take the word vectors and multiply it with their TF-IDF scores. Just take the average and it will represent your sentence vector.

In [6]:
def avg_word_vectors(question, embeddings, dim):
    words_embedding = [embeddings[word] for word in question.lower().split() if word in embeddings]
    if not words_embedding:
        return np.zeros(dim)
    words_embedding = np.array(words_embedding).astype(np.float32)
    return words_embedding.mean(axis=0)

def average_tfidf_vectors(question, embeddings, dim, vect):
    # get idf weights
    split_question = [word for word in question.lower().split() if word in embeddings]
    if not split_question:
        return np.zeros(dim).astype(np.float32)
    words_embedding = np.zeros((dim, len(split_question))).astype(np.float32)
    for i, token in enumerate(split_question):
        if token in embeddings:
            embed_score = embeddings[token]
        else: embed_score = 0
        idf_score = idf_scores[token]
        # word vectors multiply by their TF-IDF scores
        words_embedding[:, i] = embed_score * idf_score    
    return words_embedding.mean(axis=1)

def question_to_vec_tests(func, embeddings, dim, *args):
    if (np.zeros(dim) != func('', embeddings, dim, *args)).any():
        return "You need to return zero vector for empty question."
    if (np.zeros(dim) != func('thereisnosuchword', embeddings, dim, *args)).any():
        return "You need to return zero vector for the question, which consists only unknown words."
    if (embeddings['word'] != func('word', embeddings, dim, *args)).any():
        return "You need to check the corectness of your function."
    if ((embeddings['cool'] + embeddings['beans']) / 2 != func('Cool Beans', embeddings, dim, *args)).any():
        return "Your function should calculate a mean of word vectors."
    if (embeddings['word'] != func('thereisnosuchword word', embeddings, dim, *args)).any():
        return "You should not consider words which embeddings are unknown."
    return "Basic tests are passed."

def question_to_vec_tfidf_tests(func, embeddings, dim, *args):
    if (np.zeros(dim) != func('', embeddings, dim, *args)).any():
        return "You need to return zero vector for empty question."
    if (np.zeros(dim) != func('thereisnosuchword', embeddings, dim, *args)).any():
        return "You need to return zero vector for the question, which consists only unknown words."
    if (embeddings['word'] * idf_scores['word'] != func('word', embeddings, dim, *args)).any():
        return "You need to check the corectness of your function."
    if (((embeddings['cool'] * idf_scores['cool']) + (embeddings['beans'] * idf_scores['beans'])) / 2 != func('Cool Beans', embeddings, dim, *args)).any():
        return "Your function should calculate a mean of word vectors."
    if (embeddings['word'] * idf_scores['word'] != func('thereisnosuchword word', embeddings, dim, *args)).any():
        return "You should not consider words which embeddings are unknown."
    return "Basic tests are passed."

print(question_to_vec_tests(avg_word_vectors, embeddings, 100))
print(question_to_vec_tfidf_tests(average_tfidf_vectors, embeddings, 100, vect))

Basic tests are passed.
Basic tests are passed.


In [7]:
from torch.nn import CosineSimilarity
import torch

def rank_candidates(question, candidates, embeddings, dim=300, question_to_vec=avg_word_vectors, topk=5, 
                    return_score=False, *args, **kwargs):
    """
        question: a string
        candidates: a list of strings (candidates) which we want to rank
        embeddings: some embeddings
        dim: dimension of the current embeddings
        
        result: a list of pairs (initial position in the list, question)
    """
    cos = CosineSimilarity(dim=1)
    question2vec = question_to_vec(question, embeddings, dim, *args, **kwargs)
    candidate2vecs = np.array([question_to_vec(cand, embeddings, dim, *args, **kwargs) for cand in candidates])
    output = cos(torch.Tensor(question2vec.reshape(1, -1)).cuda(), torch.Tensor(candidate2vecs).cuda())
    output = output.cpu().numpy()
    data = [(i, candidates[i]) for i in range(len(output))]   
    if return_score:
        output = [(x, score) for score, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
    else:
        output = [x for _, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
    if topk: return output[:topk]
    else: return output

In [8]:
%%time
ex = 'While applying opacity to a form should we use a decimal or a double value'
print(rank_candidates(ex, all_titles[:2314688], embeddings, 100, average_tfidf_vectors, return_score=True, vect=vect))

[((0, 'applying opacity form use decimal double value'), 1.0000001), ((133296, 'use double instead decimal'), 0.6872146), ((2298201, 'double decimal values'), 0.6751053), ((369862, 'decimal double'), 0.6741582), ((1530132, 'convert double c# decimal c++'), 0.65882087)]
CPU times: user 57.6 s, sys: 575 ms, total: 58.2 s
Wall time: 58.3 s


In [12]:
import nmslib

M = 1
efC = 1000
num_threads = 4
index_time_params = {'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}

def create_index(a):
    index = nmslib.init(space='cosinesimil')
    index.addDataPointBatch(a)
    index.createIndex()
    return index

def get_knns(index, vecs):
    return zip(*index.knnQueryBatch(vecs, k=10, num_threads=4))

def get_knn(index, vec):
    return index.knnQuery(vec, k=10)

In [None]:
# %%time
# candidate2vecs = np.array([average_tfidf_vectors(cand, embeddings, 300, vect) for cand in all_titles[:2314688]])

# nms_index = create_index(candidate2vecs)

In [139]:
question = 'How do I calculate someones age in C#?'
question2vec = average_tfidf_vectors(question, embeddings, 300, vect)

idxs, distances = get_knns(nms_index, [question2vec])
# idxs[0]#, distances

[all_titles[i] for i in idxs[0]]

['What s the best way to get speakers for my Users Group',
 'Testing for inequality in T SQL',
 'SQL query count and group by',
 'How do I most elegantly express left join with aggregate SQL as LINQ query',
 'How do I use T SQL Group By',
 'Can I logically reorder columns in a table',
 'Why all the Active Record hate',
 'How do I Transform Sql Columns into Rows',
 'Access a SQL Server 2005 Express Edition from a network computer',
 'Select all columns except one in MySQL']

In [98]:
# def unpickle(filename):
#     with open(filename, 'rb') as f:
#         return pickle.load(f)

# thread_embeddings_path = MODEL_PATH/'thread_embeddings_by_tags'

# tag_path = 'c' + '.pkl'
# embeddings_file = thread_embeddings_path/tag_path
# ids, vectors = unpickle(embeddings_file)

In [99]:
# tag_path = 'c' + '.bin'
# knn_path = MODEL_PATH/'knn_embeddings_path'
# filepath = str(knn_path/tag_path)

# index = nmslib.init()
# index.loadIndex(filepath)
# question = 'How do I calculate someones age in C#?'
# question2vec = average_tfidf_vectors(question, embeddings, 300, vect)
# idxs, distances = index.knnQuery([question2vec], k=5)

In [76]:
i = 30
print(all_titles[i])
idxs, distances = get_knn(nms_index, average_tfidf_vectors(all_titles[i], embeddings, 300, vect))
print([all_titles[i] for i in idxs])
print(idxs)

How would you access Object properties from within an object method
['How would you access Object properties from within an object method', 'Using object property as default for method property', 'Should I provide accessor methods Getter Setters for public protected components on a form', 'Get a new object instance from a Type', 'Adding a Method to an Existing Object Instance', 'Linq to objects select first object', 'How to generate getters and setters in Visual Studio', 'Using in to match an attribute of Python objects in an array', 'User Control Property Designer Properties', 'What is Object Mocking and when do I need it']
[ 30 166 610  94 117 737 314  87 858 435]


# Tests

In [9]:
def hits_count(dup_ranks, k):
    dup_ranks = np.array(dup_ranks)
    return len(dup_ranks[dup_ranks <= k]) / len(dup_ranks)

def dcg_score(dup_ranks, k):
    dup_ranks = np.array(dup_ranks)
    N = len(dup_ranks)
    dup_ranks = dup_ranks[dup_ranks <= k]
    out = np.sum((np.ones_like(dup_ranks)) / (np.log2(1.0 + dup_ranks))) / float(N)
    if np.isnan(out): out = 0.0
    return out

def read_corpus(filename):
    data = []
    for line in open(filename, encoding='utf-8'):
        data.append(line.strip().split('\t'))
    return data

def test_embeddings(embeddings, dim, func, *args, **kwargs):
    wv_ranking = []
    for line in validation:
        q, *ex = line
        ranks = rank_candidates(q, ex, embeddings, dim, func, topk=None, *args, **kwargs)
        wv_ranking.append([r[0] for r in ranks].index(0) + 1)
        
    for k in [1, 5, 10, 100, 500, 1000]:
        print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(wv_ranking, k), 
                                              k, hits_count(wv_ranking, k)))

In [10]:
validation = read_corpus('data/validation.tsv')

In [45]:
test_embeddings(embeddings, 300, avg_word_vectors)

DCG@   1: 0.423 | Hits@   1: 0.423
DCG@   5: 0.524 | Hits@   5: 0.612
DCG@  10: 0.547 | Hits@  10: 0.683
DCG@ 100: 0.585 | Hits@ 100: 0.867
DCG@ 500: 0.598 | Hits@ 500: 0.966
DCG@1000: 0.602 | Hits@1000: 1.000


In [24]:
test_embeddings(embeddings, 100, avg_word_vectors)

DCG@   1: 0.434 | Hits@   1: 0.434
DCG@   5: 0.528 | Hits@   5: 0.610
DCG@  10: 0.548 | Hits@  10: 0.670
DCG@ 100: 0.583 | Hits@ 100: 0.843
DCG@ 500: 0.598 | Hits@ 500: 0.956
DCG@1000: 0.602 | Hits@1000: 1.000


In [46]:
test_embeddings(embeddings, 300, average_tfidf_vectors, vect=vect)

DCG@   1: 0.453 | Hits@   1: 0.453
DCG@   5: 0.548 | Hits@   5: 0.631
DCG@  10: 0.567 | Hits@  10: 0.690
DCG@ 100: 0.602 | Hits@ 100: 0.861
DCG@ 500: 0.616 | Hits@ 500: 0.964
DCG@1000: 0.619 | Hits@1000: 1.000


In [28]:
test_embeddings(embeddings, 100, average_tfidf_vectors, vect=vect)

DCG@   1: 0.413 | Hits@   1: 0.413
DCG@   5: 0.503 | Hits@   5: 0.579
DCG@  10: 0.520 | Hits@  10: 0.632
DCG@ 100: 0.557 | Hits@ 100: 0.812
DCG@ 500: 0.574 | Hits@ 500: 0.946
DCG@1000: 0.580 | Hits@1000: 1.000


In [11]:
# n-gram 2 100 dimension 
test_embeddings(embeddings, 100, avg_word_vectors)

DCG@   1: 0.417 | Hits@   1: 0.417
DCG@   5: 0.516 | Hits@   5: 0.605
DCG@  10: 0.539 | Hits@  10: 0.677
DCG@ 100: 0.578 | Hits@ 100: 0.859
DCG@ 500: 0.592 | Hits@ 500: 0.967
DCG@1000: 0.595 | Hits@1000: 1.000


In [12]:
# n-gram 2 100 dimension 
test_embeddings(embeddings, 100, average_tfidf_vectors, vect=vect)

DCG@   1: 0.441 | Hits@   1: 0.441
DCG@   5: 0.533 | Hits@   5: 0.614
DCG@  10: 0.554 | Hits@  10: 0.678
DCG@ 100: 0.591 | Hits@ 100: 0.854
DCG@ 500: 0.605 | Hits@ 500: 0.965
DCG@1000: 0.609 | Hits@1000: 1.000


CHECK out using [nmslib](https://github.com/nmslib/nmslib)

In [74]:
from sklearn.datasets import make_multilabel_classification, make_classification
from sklearn.linear_model import LogisticRegression

x, y = make_classification(n_classes=5, n_clusters_per_class=5, n_informative=5)


clf = LogisticRegression(random_state=100)
clf.fit(x, y)
preds = clf.predict_proba(x)
preds_full = clf.predict(x)

In [76]:
preds_full

array([1, 1, 1, 3, 2, 1, 1, 0, 3, 0, 1, 2, 4, 1, 4, 1, 4, 4, 0, 4, 0, 4,
       3, 1, 0, 4, 4, 2, 1, 1, 0, 0, 3, 1, 4, 4, 2, 4, 0, 3, 2, 1, 3, 2,
       2, 3, 0, 0, 4, 0, 1, 4, 1, 0, 1, 3, 0, 2, 4, 4, 2, 2, 3, 4, 0, 3,
       1, 0, 0, 3, 0, 3, 1, 3, 1, 2, 2, 1, 2, 0, 2, 3, 2, 3, 2, 0, 4, 0,
       1, 2, 2, 4, 3, 3, 4, 1, 4, 4, 0, 3])

In [112]:
def get_top_preds(X, k=3):
    preds = clf.predict_proba([X])
    return np.argsort(preds)[:,::-1][0][:3]

get_top_preds(x[0])

array([1, 4, 2])

In [None]:
output = [x for _, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]

In [31]:
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
from gensim.models.doc2vec import Doc2Vec


class LabeledLineSentence(object):
    def __init__(self, data):
        self.data = data
    def __iter__(self):
        for i, line in enumerate(self.data):
            yield TaggedDocument(words=line.split(), tags=['SENT_{}'.format(i)])



labeled_sent = LabeledLineSentence(all_titles[:100000])
model = Doc2Vec(vector_size=300, window=10, min_count=1, workers=4, alpha=0.025, min_alpha=0.025, epochs=10) 
model.build_vocab(labeled_sent)
model.train(labeled_sent, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
wv_ranking = []
    for line in validation:
        q, *ex = line
        ranks = rank_candidates(q, ex, embeddings, dim, func, topk=None, *args, **kwargs)
        wv_ranking.append([r[0] for r in ranks].index(0) + 1)

In [39]:
ex = 'random state'
ex_infer = model.infer_vector(ex.split())
model.docvecs.most_similar([ex_infer], topn=10)

[('SENT_37495', 0.5060877799987793),
 ('SENT_92897', 0.4604107737541199),
 ('SENT_85994', 0.4344367980957031),
 ('SENT_79654', 0.4209355413913727),
 ('SENT_51531', 0.4206792116165161),
 ('SENT_95806', 0.4194100499153137),
 ('SENT_86052', 0.4170324206352234),
 ('SENT_79209', 0.41459929943084717),
 ('SENT_4856', 0.41097453236579895),
 ('SENT_80020', 0.40738117694854736)]

In [43]:
model.docvecs.

TypeError: n_similarity() missing 2 required positional arguments: 'ds1' and 'ds2'

In [33]:
' '.join(list(labeled_sent)[855].words)

'Entity Framework vs LINQ to SQL'

In [158]:
list(labeled_sent)[0].words

['while',
 'applying',
 'opacity',
 'to',
 'form',
 'should',
 'we',
 'use',
 'decimal',
 'or',
 'double',
 'value']

In [161]:
ranks = []
second_ranks = []
for doc_id in range(len(all_text)):
    inferred_vector = model.infer_vector(list(labeled_sent)[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])
    

ValueError: 0 is not in list

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import html

re1 = re.compile(r'  +')
def clean_text(x, remove_html=True, other=False):
    if remove_html:
        x = re.sub(r'<code>[^>]*</code>', '', x)
        x = re.sub(r'<[^>]*>', '', x)
        x = re.sub(r'[^A-Za-z0-9]', ' ', x)
    if other:
        x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
            'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
            '<br />', "\n").replace('\\"', '').replace('<unk>','u_n').replace(' @.@ ','.').replace(
            ' @-@ ','-').replace('\\', ' \\ ').replace('"',"'").replace('\n', ' ').replace('\r', ' ').strip()
    return re1.sub(' ', html.unescape(x).strip())


In [5]:
import sqlite3
DB_NAME = 'StackOverflow.db'

start_index = 0
# limit = 5000
limit = 50
last_unix = 0
cur_length = limit
counter = 0
connection = sqlite3.connect(DB_NAME)
c = connection.cursor()

# while cur_length == limit:
#     df = pd.read_sql("SELECT comment, title FROM posts WHERE title is NOT NULL LIMIT {}".format(limit), connection)
#     df
        

In [194]:
%%time
df = pd.read_sql("SELECT comment, title FROM posts WHERE parent_id is NULL", 
                 connection, chunksize=10000)

def preprocess(df, field):
    all_text = []
    for i, data in enumerate(df):  
#         print(i)
        all_text.extend([clean_text(i) for i in data[field]])
    return all_text

all_text = preprocess(df, 'title')
print(len(all_text))

16022817
CPU times: user 1min 44s, sys: 20.2 s, total: 2min 5s
Wall time: 2min 53s


In [192]:
df = pd.read_sql("SELECT * FROM posts WHERE parent_id is NULL and score > 500 LIMIT 10", 
                 connection)
df.head()

Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,4,,<p>I want to use a track-bar to change a form'...,"While applying opacity to a form, should we us...",2008-07-31,557,c# winforms type-conversion decimal opacity
1,9,,<p>Given a <code>DateTime</code> representing ...,How do I calculate someone's age in C#?,2008-07-31,1745,c# .net datetime
2,11,,<p>Given a specific <code>DateTime</code> valu...,Calculate relative time in C#,2008-07-31,1317,c# datetime time datediff relative-time-span
3,13,,<p>Is there any standard way for a Web Server ...,Determine a User's Timezone,2008-08-01,529,javascript html browser timezone timezoneoffset
4,289,,"<p>I often have to sort a dictionary, consisti...",How do you sort a dictionary by value?,2008-08-02,671,c# sorting dictionary


In [7]:
X, vect = compute_tfidf(all_text)

In [55]:
# idf_scores = dict(zip(vect.get_feature_names(), vect.idf_))
idf_scores = defaultdict(lambda:0, zip(vect.get_feature_names(), vect.idf_))

In [54]:
idf_scores['cool']
# avg_word_vectors(X, )

6.605802066295998

In [33]:
X.getrow(0).data

array([0.14185559, 0.08583777, 0.09819513, 0.07051642, 0.07526899,
       0.06991629, 0.16301519, 0.06954121, 0.13013352, 0.07046131,
       0.11755379, 0.17314393, 0.09804197, 0.1647598 , 0.09804197,
       0.16862811, 0.13692706, 0.10050308, 0.09584303, 0.14989091,
       0.09830537, 0.16662482, 0.09460471, 0.09265912, 0.15983128,
       0.09736615, 0.1296375 , 0.06346664, 0.06333943, 0.11294143,
       0.13893036, 0.12639903, 0.14109408, 0.05996348, 0.17856848,
       0.04762009, 0.14185559, 0.03917769, 0.0904954 , 0.03942107,
       0.07703553, 0.11514935, 0.09505124, 0.17079183, 0.13824596,
       0.13013352, 0.15205463, 0.09143574, 0.18175239, 0.13446791,
       0.10380805, 0.09881659, 0.09881659, 0.1259665 , 0.14035342,
       0.15680724, 0.11215783, 0.12512168, 0.17856848, 0.05471944,
       0.13893036, 0.15698316, 0.10832387, 0.17856848, 0.04083706,
       0.18175239, 0.10540373])

In [135]:
def get_embeddings(filename):
    embeddings = {}
    with open(MODEL_PATH/filename, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        embed_list = list(reader)
    for line in embed_list:
        embeddings[line[0]] = np.asarray(line[1:], dtype=np.float32)
    return embeddings

embeddings = get_embeddings('starspace_embedding100.tsv')

In [130]:
def question_to_vec_tests(func, embeddings, dim, *args):
    if (np.zeros(dim) != func('', embeddings, dim, *args)).any():
        return "You need to return zero vector for empty question."
    if (np.zeros(dim) != func('thereisnosuchword', embeddings, dim, *args)).any():
        return "You need to return zero vector for the question, which consists only unknown words."
    if (embeddings['word'] != func('word', embeddings, dim, *args)).any():
        return "You need to check the corectness of your function."
    if ((embeddings['cool'] + embeddings['beans']) / 2 != func('Cool Beans', embeddings, dim, *args)).any():
        return "Your function should calculate a mean of word vectors."
    if (embeddings['word'] != func('thereisnosuchword word', embeddings, dim, *args)).any():
        return "You should not consider words which embeddings are unknown."
    return "Basic tests are passed."

def question_to_vec_tfidf_tests(func, embeddings, dim, *args):
    if (np.zeros(dim) != func('', embeddings, dim, *args)).any():
        return "You need to return zero vector for empty question."
    if (np.zeros(dim) != func('thereisnosuchword', embeddings, dim, *args)).any():
        return "You need to return zero vector for the question, which consists only unknown words."
    if (embeddings['word'] * idf_scores['word'] != func('word', embeddings, dim, *args)).any():
        return "You need to check the corectness of your function."
    if (((embeddings['cool'] * idf_scores['cool']) + (embeddings['beans'] * idf_scores['beans'])) / 2 != func('Cool Beans', embeddings, dim, *args)).any():
        return "Your function should calculate a mean of word vectors."
    if (embeddings['word'] * idf_scores['word'] != func('thereisnosuchword word', embeddings, dim, *args)).any():
        return "You should not consider words which embeddings are unknown."
    return "Basic tests are passed."

In [136]:
question_to_vec_tests(avg_word_vectors, embeddings, 100)

'Basic tests are passed.'

In [137]:
question_to_vec_tests(avg_word_vectors, embeddings, 100)
question_to_vec_tfidf_tests(average_tfidf_vectors, embeddings, 100, vect)

'Basic tests are passed.'

In [129]:
def average_tfidf_vectors(question, embeddings, dim, vect):
    # get idf weights
    split_question = [word for word in question.lower().split() if word in embeddings]
    if not split_question:
        return np.zeros(dim).astype(np.float32)
    words_embedding = np.zeros((dim, len(split_question))).astype(np.float32)
    for i, token in enumerate(split_question):
        if token in embeddings:
            embed_score = embeddings[token]
        else: embed_score = 0
        idf_score = idf_scores[token]
        # word vectors multiply by their TF-IDF scores
        words_embedding[:, i] = embed_score * idf_score    
    return words_embedding.mean(axis=1)



In [134]:
average_tfidf_vectors('word', embeddings, dim=300, vect=vect)[:10]

array([-0.33523113,  0.25927806, -0.75544304, -0.3199052 , -0.28468618,
       -0.03379241,  0.08405037, -0.19328661, -0.3096007 ,  0.19111453],
      dtype=float32)

In [133]:
average_tfidf_vectors('word thereisnosuchword', embeddings, dim=300, vect=vect)[:10]

array([-0.33523113,  0.25927806, -0.75544304, -0.3199052 , -0.28468618,
       -0.03379241,  0.08405037, -0.19328661, -0.3096007 ,  0.19111453],
      dtype=float32)

In [132]:
average_tfidf_vectors('thereisnosuchword', embeddings, dim=300, vect=vect)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [196]:
from torch.nn import CosineSimilarity
import torch

def rank_candidates(question, candidates, embeddings, dim=300, topk=5, question_to_vec=avg_word_vectors, *args, **kwargs):
    """
        question: a string
        candidates: a list of strings (candidates) which we want to rank
        embeddings: some embeddings
        dim: dimension of the current embeddings
        
        result: a list of pairs (initial position in the list, question)
    """
    cos = CosineSimilarity(dim=1)
    question2vec = question_to_vec(question, embeddings, dim, *args, **kwargs)
    candidate2vecs = np.array([question_to_vec(cand, embeddings, dim, *args, **kwargs) for cand in candidates])
    output = cos(torch.Tensor(question2vec.reshape(1, -1)).cuda(), torch.Tensor(candidate2vecs).cuda())
    output = output.cpu().numpy()
    data = [(i, candidates[i]) for i in range(len(output))]   
    output = [x for _, x in sorted(zip(output, data), key=lambda pair: pair[0], reverse=True)]
    if topk: return output[:topk]
    else: return output


In [11]:
def hits_count(dup_ranks, k):
    dup_ranks = np.array(dup_ranks)
    return len(dup_ranks[dup_ranks <= k]) / len(dup_ranks)

def dcg_score(dup_ranks, k):
    dup_ranks = np.array(dup_ranks)
    N = len(dup_ranks)
    dup_ranks = dup_ranks[dup_ranks <= k]
    out = np.sum((np.ones_like(dup_ranks)) / (np.log2(1.0 + dup_ranks))) / float(N)
    if np.isnan(out): out = 0.0
    return out

In [12]:
def read_corpus(filename):
    data = []
    for line in open(filename, encoding='utf-8'):
        data.append(line.strip().split('\t'))
    return data

validation = read_corpus('data/validation.tsv')

In [141]:
wv_ranking = []
for line in validation:
    q, *ex = line
    ranks = rank_candidates(q, ex, embeddings, 100, avg_word_vectors)
    wv_ranking.append([r[0] for r in ranks].index(0) + 1)

In [142]:
# avg_word_vectors 100
for k in [1, 5, 10, 100, 500, 1000]:
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(wv_ranking, k), 
                                              k, hits_count(wv_ranking, k)))

DCG@   1: 0.434 | Hits@   1: 0.434
DCG@   5: 0.528 | Hits@   5: 0.610
DCG@  10: 0.548 | Hits@  10: 0.670
DCG@ 100: 0.583 | Hits@ 100: 0.843
DCG@ 500: 0.598 | Hits@ 500: 0.956
DCG@1000: 0.602 | Hits@1000: 1.000


In [20]:
# avg_word_vectors 300
for k in [1, 5, 10, 100, 500, 1000]:
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(wv_ranking, k), 
                                              k, hits_count(wv_ranking, k)))

DCG@   1: 0.441 | Hits@   1: 0.441
DCG@   5: 0.540 | Hits@   5: 0.624
DCG@  10: 0.560 | Hits@  10: 0.686
DCG@ 100: 0.593 | Hits@ 100: 0.846
DCG@ 500: 0.607 | Hits@ 500: 0.959
DCG@1000: 0.611 | Hits@1000: 1.000


In [139]:
# average_tfidf_vectors 100
for k in [1, 5, 10, 100, 500, 1000]:
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(wv_ranking, k), 
                                              k, hits_count(wv_ranking, k)))

DCG@   1: 0.248 | Hits@   1: 0.248
DCG@   5: 0.332 | Hits@   5: 0.408
DCG@  10: 0.354 | Hits@  10: 0.473
DCG@ 100: 0.393 | Hits@ 100: 0.666
DCG@ 500: 0.419 | Hits@ 500: 0.873
DCG@1000: 0.432 | Hits@1000: 1.000


In [128]:
# average_tfidf_vectors
for k in [1, 5, 10, 100, 500, 1000]:
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(wv_ranking, k), 
                                              k, hits_count(wv_ranking, k)))

DCG@   1: 0.244 | Hits@   1: 0.244
DCG@   5: 0.328 | Hits@   5: 0.405
DCG@  10: 0.351 | Hits@  10: 0.477
DCG@ 100: 0.391 | Hits@ 100: 0.671
DCG@ 500: 0.417 | Hits@ 500: 0.874
DCG@1000: 0.430 | Hits@1000: 1.000


In [199]:
%%time
question = 'Given a representing a person s birthday how do I calculate their age in years'

rank_candidates(question, all_text[:1000], embeddings, 100, avg_word_vectors)

RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1518244507981/work/torch/lib/THC/generated/../generic/THCTensorMathPointwise.cu:367

In [None]:
need to run tag classifier 