## Using Doc2Vec to find potentials users to answer a novel question

1. Get the data
2. Split the data into train and test
<br/><br/>
3. Using all the questions in the training set, train a Doc2Vec model. 
4. For all the users, compute a vector represenatation for the user from the questions that he has asked in the system using Doc2Vec. datastructure:(userid, vecRepresentation)
5. For a novel question, compute the vector representation. 
6. Return the nearest neighbour from (4)
<br/><br/>
7. Evaluate the result.

# Get the data

In [1]:
#Import Libraries
import xml.etree.ElementTree
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import re
import matplotlib.pyplot as plt
import math 
import time
from nltk import FreqDist
from cStringIO import StringIO

#Get root tag of XML
e = xml.etree.ElementTree.parse('../data/android.meta.stackexchange.com.backup/Posts.xml').getroot()
u = xml.etree.ElementTree.parse('../data/android.meta.stackexchange.com.backup/Users.xml').getroot()

stop_words = set(stopwords.words('english'))

In [2]:
#Functions

def remove_html_tags(text):
    if text==None:
        return ""
    text=text.encode('utf-8')
    p=re.compile('<[^>]*>|\n')
    return p.sub(" ", text)
    
def clean_tags(text):
    if text==None:
        return ""
    text=text.encode('utf-8')
    p=re.compile('<|>')
    return p.sub(" ", text)

def get_alpha(x):
    x = 0.25 * (abs(x)-5)
    return 1 / (1 + math.exp(-x))

def get_posts(user_id):
    rows = e.findall('.//row[@OwnerUserId="'+ user_id +'"][@PostTypeId="1"]')
    textual_content = ""
    
    for row in rows:
        content = remove_html_tags(row.get('Body'))
        content += remove_html_tags(row.get('Title'))
        content += clean_tags(row.get('Tags'))
        textual_content += content  
        
    return textual_content,get_alpha(len(rows))

def get_ground_truth(question_id):
    # Get the users who answered the question.
    rows = e.findall('.//row[@ParentId="'+ str(question_id) +'"][@PostTypeId="2"]')
    content = []
    for row in rows:
        if (row.get('OwnerUserId') != None):
            content.append(int(row.get('OwnerUserId')))
    return content

def get_epoch(t):
    pattern = '%Y-%m-%dT%H:%M:%S.%f'
    epoch = int(time.mktime(time.strptime(t, pattern)))
    return epoch
    
def get_prior(user_id, question_id):
    user_time  = last_access[user_id]
    que_time = time_of_question[question_id]
    return 1
#     return math.exp(-np.abs(que_time - user_time) / 86400) 

def get_frequency(text):
    return Counter(text.split()).most_common()


In [3]:
int_nonqa = {} # key = userID, value = description  
ext_nonqa_twitter = {} # key = userID, value = description
ext_nonqa_hp = {} # key = userID, value = description

import re
#About
p=re.compile(r'&lt;(\S)*|p&gt;|\&#xA;|&quot;|/a&gt;|/li&gt;|/ul&gt;|/b&gt;|&gt;|;|#\d(\d)+|&amp')
a = open('user_internal_nonqa_info.txt')
for line in a:
    tmp=p.sub('', line) #remove all punctuations in i(single sentence) and store in tmp
    list_line = tmp.split('\t')
    int_nonqa[list_line[0]] = list_line[3]

#Twitter
a = open('user_external_nonqa_info.txt')
for line in a:
    list_line = line.split('\t')
    ext_nonqa_twitter[list_line[0]] = list_line[1]

#HomePage
a = open('user_external_nonqa_info_homepage_nouns.txt')
for line in a:
    list_line = line.split('\t')
    if(len(list_line) >= 2):
        ext_nonqa_hp[list_line[0]] = list_line[1]
        
        
def get_non_qa(accountid):
    data=""
    if ext_nonqa_hp.get(accountid) is not None:
        data += ext_nonqa_hp[accountid]
    if ext_nonqa_twitter.get(accountid) is not None:
        data += ext_nonqa_twitter[accountid]
    if int_nonqa.get(accountid) is not None:
        data += int_nonqa[accountid]
        
    data = re.sub(r"[\n\t\s]*", "", data)       
    return data

# 2. Split the data

In [4]:
questions = {}
ground_truths = {}
time_of_question = {}

#Get each question
for atype in e.findall('row'):
    id=atype.get('PostTypeId')
    if int(id)==1:
        body=atype.get('Body')
        title=atype.get('Title')
        tags=atype.get('Tags')
        que_id=int(atype.get('Id'))
        
        ground_truths[que_id] = get_ground_truth(que_id)
        body = remove_html_tags(body)
        title = remove_html_tags(title)
        tags=clean_tags(tags)
        post=str(body)+" "+str(title)+str(tags)

        questions[que_id] = post
        time_of_question[que_id] = get_epoch(atype.get('CreationDate'))
l=len(questions)

index=int(l*0.8)

#split
train_questions=dict(questions.items()[:index])
test_questions=dict(questions.items()[index:])
print(len(train_questions),len(test_questions))


#writing data to file
train_file_name = './stage/question_meta_train.txt'
test_file_name = './stage/question_meta_test.txt'

questions = train_questions.values()
train_handler = open(train_file_name, 'w')
test_handler = open(test_file_name, 'w')

train_list = []
test_list = []
len_list = len(questions)
for i in range(len_list):
    if i < 3*len_list/4:
        train_list.append(questions[i])
    else:
        test_list.append(questions[i])
        
train_handler.write("\n".join(train_list))
test_handler.write("\n".join(test_list))

train_handler.close()
test_handler.close()

print 'Done creating train and test data'

(692, 174)
Done creating train and test data


# Training Doc2Vec on the questions 

In [5]:
import gensim.models as g
import logging

#doc2vec parameters
vector_size = 300
window_size = 15
min_count = 1
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 0 #0 = dbow; 1 = dmpv
worker_count = 1 #number of parallel processes

In [6]:
#input corpus
train_corpus = train_file_name

#output model
saved_path = "./stage/doc2vec_model.bin"

#enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#train doc2vec model
docs = g.doc2vec.TaggedLineDocument(train_corpus)
model = g.Doc2Vec(docs, size=vector_size, window=window_size, min_count=min_count, sample=sampling_threshold, workers=worker_count, hs=0, dm=dm, negative=negative_size, dbow_words=1, dm_concat=1, iter=train_epoch)

2018-04-18 13:01:29,984 : INFO : collecting all words and their counts
2018-04-18 13:01:29,986 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-04-18 13:01:30,031 : INFO : collected 10119 word types and 519 unique tags from a corpus of 519 examples and 62863 words
2018-04-18 13:01:30,033 : INFO : Loading a fresh vocabulary
2018-04-18 13:01:30,070 : INFO : min_count=1 retains 10119 unique words (100% of original 10119, drops 0)
2018-04-18 13:01:30,071 : INFO : min_count=1 leaves 62863 word corpus (100% of original 62863, drops 0)
2018-04-18 13:01:30,114 : INFO : deleting the raw counts dictionary of 10119 items
2018-04-18 13:01:30,116 : INFO : sample=1e-05 downsamples 4018 most-common words
2018-04-18 13:01:30,118 : INFO : downsampling leaves estimated 16967 word corpus (27.0% of prior 62863)
2018-04-18 13:01:30,161 : INFO : estimated required memory for 10119 words and 300 dimensions: 29967900 bytes
2018-04-18 13:01:30,162 : INFO : resetting layer we

2018-04-18 13:01:42,672 : INFO : EPOCH - 30 : training on 62863 raw words (17607 effective words) took 0.4s, 39399 effective words/s
2018-04-18 13:01:43,125 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-18 13:01:43,127 : INFO : EPOCH - 31 : training on 62863 raw words (17481 effective words) took 0.5s, 38573 effective words/s
2018-04-18 13:01:43,508 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-18 13:01:43,509 : INFO : EPOCH - 32 : training on 62863 raw words (17477 effective words) took 0.4s, 45939 effective words/s
2018-04-18 13:01:43,971 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-18 13:01:43,973 : INFO : EPOCH - 33 : training on 62863 raw words (17421 effective words) took 0.5s, 37743 effective words/s
2018-04-18 13:01:44,352 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-18 13:01:44,354 : INFO : EPOCH - 34 : training on 62863 raw words (17541 effective words) took 0

2018-04-18 13:01:58,745 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-18 13:01:58,747 : INFO : EPOCH - 67 : training on 62863 raw words (17452 effective words) took 0.5s, 37870 effective words/s
2018-04-18 13:01:59,192 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-18 13:01:59,193 : INFO : EPOCH - 68 : training on 62863 raw words (17375 effective words) took 0.4s, 39190 effective words/s
2018-04-18 13:01:59,681 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-18 13:01:59,683 : INFO : EPOCH - 69 : training on 62863 raw words (17520 effective words) took 0.5s, 35930 effective words/s
2018-04-18 13:02:00,105 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-18 13:02:00,107 : INFO : EPOCH - 70 : training on 62863 raw words (17445 effective words) took 0.4s, 41263 effective words/s
2018-04-18 13:02:00,566 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-18 13

## Getting the Doc2Vec vectors for all the users based on their previous questions

In [7]:
import pickle
pickle_file = './stage/user_vectors_qa.pkl'
try:
    user_vector = pickle.load(open(pickle_file, 'rb'))
except (OSError, IOError) as error:
    print error
    #inference hyper-parameters
    start_alpha=0.01
    infer_epoch=1000

    post_by_user = {}
    user_vector = {}
    for users_row in u.findall('row'):
        user_id = users_row.get('Id')
        AccountId = users_row.get('AccountId')
#         post_by_user[int(user_id)] = get_posts(user_id)[0] + str(get_non_qa(AccountId))
        post_by_user[int(user_id)] = get_posts(user_id)[0]
#         post_by_user[int(user_id)] = get_non_qa(AccountId)

        user_vector[int(user_id)] = model.infer_vector(post_by_user[int(user_id)], alpha=start_alpha, steps=infer_epoch)
    pickle.dump(user_vector, open(pickle_file, 'wb'))

## Infering

In [8]:
def calculate_MRR_per_qn(pred, actual):
    for i, p in enumerate(pred):
        if(p in actual):
            return (1./(i+1))
    return 0

def calculate_precision_per_qn(pred, actual):
    n_common_items = len(set(pred) - (set(pred) - set(actual)))
    return (n_common_items * 1.) / len(pred)


In [9]:
import numpy as np
import numpy.linalg as lin
import operator
start_alpha=0.01
infer_epoch=1000
mrr_s = []
precision_s_5 = []
precision_s_10 = []
user_id_question_map = {}

for que_id in test_questions.keys():
    userids = []
    userids_5 = []
    test_questions[que_id]
    vector = model.infer_vector(test_questions[que_id], alpha=start_alpha, steps=infer_epoch)
    complete_list = sorted(user_vector.items(), key=lambda (_, v): lin.norm(map(operator.sub, v,  vector), ord=2))
    
    for item in complete_list[:10]:
        userids.append(item[0])
    for item in complete_list[:5]:
        userids_5.append(item[0])
    
    mrr_s.append(calculate_MRR_per_qn(userids, ground_truths[que_id]))
    precision_s_10.append(calculate_precision_per_qn(userids, ground_truths[que_id]))
    precision_s_5.append(calculate_precision_per_qn(userids_5, ground_truths[que_id]))
    
    user_id_question_map[que_id] = complete_list
    print ('(current)Mean Reciprocal Rank: ', np.mean(mrr_s))
    print ('(current)Precision @ 5', np.mean(precision_s_5))
    print ('(current)Precision @ 10', np.mean(precision_s_10))

    print ('\n')
    
mrr = np.mean(mrr_s)
prec_5 = np.mean(precision_s_5)
prec_10 = np.mean(precision_s_10)

print ('(final)Mean Reciprocal Rank: ', mrr)
print ('(final)Precision @ 5', prec_5)
print ('(final)Precision @ 10', prec_10)

('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 5', 0.0)
('(current)Precision @ 10', 0.0)


('(current)Mean Reciprocal Rank: ', 0.1111111111111111)
('(current)Precision @ 5', 0.022222222222222223)
('(current)Precision @ 10', 0.0

('(current)Mean Reciprocal Rank: ', 0.034482758620689655)
('(current)Precision @ 5', 0.006896551724137932)
('(current)Precision @ 10', 0.003448275862068966)


('(current)Mean Reciprocal Rank: ', 0.03389830508474576)
('(current)Precision @ 5', 0.006779661016949153)
('(current)Precision @ 10', 0.0033898305084745766)


('(current)Mean Reciprocal Rank: ', 0.03333333333333333)
('(current)Precision @ 5', 0.006666666666666667)
('(current)Precision @ 10', 0.0033333333333333335)


('(current)Mean Reciprocal Rank: ', 0.03278688524590164)
('(current)Precision @ 5', 0.006557377049180328)
('(current)Precision @ 10', 0.003278688524590164)


('(current)Mean Reciprocal Rank: ', 0.03225806451612903)
('(current)Precision @ 5', 0.0064516129032258064)
('(current)Precision @ 10', 0.0032258064516129032)


('(current)Mean Reciprocal Rank: ', 0.031746031746031744)
('(current)Precision @ 5', 0.006349206349206349)
('(current)Precision @ 10', 0.0031746031746031746)


('(current)Mean Reciprocal Rank: ', 0.03125)


('(current)Mean Reciprocal Rank: ', 0.036036036036036036)
('(current)Precision @ 5', 0.007207207207207207)
('(current)Precision @ 10', 0.0036036036036036037)


('(current)Mean Reciprocal Rank: ', 0.03571428571428571)
('(current)Precision @ 5', 0.0071428571428571435)
('(current)Precision @ 10', 0.0035714285714285718)


('(current)Mean Reciprocal Rank: ', 0.035398230088495575)
('(current)Precision @ 5', 0.007079646017699116)
('(current)Precision @ 10', 0.003539823008849558)


('(current)Mean Reciprocal Rank: ', 0.03508771929824561)
('(current)Precision @ 5', 0.007017543859649123)
('(current)Precision @ 10', 0.0035087719298245615)


('(current)Mean Reciprocal Rank: ', 0.034782608695652174)
('(current)Precision @ 5', 0.006956521739130435)
('(current)Precision @ 10', 0.0034782608695652175)


('(current)Mean Reciprocal Rank: ', 0.034482758620689655)
('(current)Precision @ 5', 0.006896551724137932)
('(current)Precision @ 10', 0.003448275862068966)


('(current)Mean Reciprocal Rank: ', 0.03418

('(current)Mean Reciprocal Rank: ', 0.036585365853658534)
('(current)Precision @ 5', 0.007317073170731709)
('(current)Precision @ 10', 0.0036585365853658543)


('(current)Mean Reciprocal Rank: ', 0.03636363636363636)
('(current)Precision @ 5', 0.007272727272727274)
('(current)Precision @ 10', 0.003636363636363637)


('(current)Mean Reciprocal Rank: ', 0.03614457831325301)
('(current)Precision @ 5', 0.0072289156626506035)
('(current)Precision @ 10', 0.0036144578313253017)


('(current)Mean Reciprocal Rank: ', 0.03592814371257485)
('(current)Precision @ 5', 0.007185628742514971)
('(current)Precision @ 10', 0.0035928143712574854)


('(current)Mean Reciprocal Rank: ', 0.03571428571428571)
('(current)Precision @ 5', 0.0071428571428571435)
('(current)Precision @ 10', 0.0035714285714285718)


('(current)Mean Reciprocal Rank: ', 0.03550295857988166)
('(current)Precision @ 5', 0.007100591715976333)
('(current)Precision @ 10', 0.0035502958579881664)


('(current)Mean Reciprocal Rank: ', 0.035294

In [106]:
import pickle
pickle_file = './stage/user_id_question_map_doc2vec_qa_p_non_qa.pkl'
pickle.dump(user_id_question_map, open(pickle_file, 'wb'))

# pickle_file = 'groundtruth.pkl'
# pickle.dump(ground_truths, open(pickle_file, 'wb'))