In [1]:
# 1. Get the data
# 2. Split the data into train and test
# 3. Train an LDA on all the questions from the training set

# 4. Accept a question for which we need to find the set of users
# 5. Get questions asked by this user (User-QA)
# 6. Get Twitter/HP/About-me data about the user (User-non-QA)
# 7. Calculate p(q|theta_qa) from User-QA
# 8. Calculate p(q|theta_non_qa) from User-non-QA
# 9. Calculate Equation (2)
# 10. Calculate Equation (1)

# 11. Rank the users

# 12. From the test set, get the set of users that have answered the question
# 13. Evaluate the result

# 1. Get the data

In [18]:
#Import Libraries
import math
import re
import sys
import time
import xml.etree.ElementTree
from cStringIO import StringIO
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy
import numpy as np
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#Get root tag of XML
e = xml.etree.ElementTree.parse('data/Posts.xml').getroot()
u = xml.etree.ElementTree.parse('data/Users.xml').getroot()

stop_words = set(stopwords.words('english'))

In [19]:
def remove_html_tags(text):
    """Replace every HTML tag and every newline in `text` with a single space.

    text: the markup to clean, or None.
    Returns "" for None, otherwise the cleaned native string.
    """
    if text is None:
        return ""
    # Only encode values that are not already a native `str` (i.e. Python 2
    # `unicode`). Encoding a native str is at best a no-op and at worst fails:
    # implicit ASCII decoding on Python 2, bytes-vs-str pattern mismatch on 3.
    if not isinstance(text, str):
        text = text.encode('utf-8', 'ignore')
    return re.sub('<[^>]*>|\n', " ", text)
    
def clean_tags(text):
    """Replace every '<' and '>' in `text` with a space.

    text: a tag string such as "<a><b>", or None.
    Returns "" for None, otherwise the native string with the angle brackets
    turned into spaces (so "<a><b>" becomes " a  b ").
    """
    if text is None:
        return ""
    # Only encode values that are not already a native `str` (i.e. Python 2
    # `unicode`); encoding a native str can fail or produce bytes that the
    # str pattern below cannot be applied to.
    if not isinstance(text, str):
        text = text.encode('utf-8', 'ignore')
    return re.sub('<|>', " ", text)

def get_alpha(x):
    """Logistic squashing of |x|, centred at 5 with slope 0.25.

    Returns a float in (0, 1); exactly 0.5 when |x| == 5.
    """
    shifted = abs(x) - 5
    logit = 0.25 * shifted
    denominator = 1 + math.exp(-logit)
    return 1 / denominator

def get_posts(user_id):
    """Collect the text of every question (PostTypeId 1) owned by `user_id`.

    user_id: the user's id as a string (it is spliced into the XPath query).
    Returns a 3-tuple:
      - cleaned body + title + tags of all the user's questions, concatenated;
      - the cleaned tags alone, concatenated;
      - get_alpha(number of questions found).
    """
    rows = e.findall('.//row[@OwnerUserId="'+ user_id +'"][@PostTypeId="1"]')
    texts = []
    tag_texts = []
    for row in rows:
        tags = clean_tags(row.get('Tags'))
        body = remove_html_tags(row.get('Body'))
        title = remove_html_tags(row.get('Title'))
        # A space between body and title keeps the last body word from fusing
        # with the first title word (same layout as the question-building loop).
        texts.append(body + " " + title + tags)
        tag_texts.append(tags)
    # join() builds each result in one pass instead of repeated `+=`.
    return "".join(texts), "".join(tag_texts), get_alpha(len(rows))

def get_ground_truth(question_id):
    """Return the integer ids of the users who answered `question_id`.

    Answers are the rows with PostTypeId 2 whose ParentId is the question;
    answers without an OwnerUserId attribute are skipped.
    """
    xpath = './/row[@ParentId="'+ str(question_id) +'"][@PostTypeId="2"]'
    owner_ids = (answer.get('OwnerUserId') for answer in e.findall(xpath))
    return [int(owner) for owner in owner_ids if owner is not None]

def get_epoch(t):
    """Convert a 'YYYY-MM-DDTHH:MM:SS.fff' timestamp string to integer seconds.

    The string is parsed with time.strptime and converted with time.mktime,
    i.e. it is interpreted in the machine's local time zone.
    """
    parsed = time.strptime(t, '%Y-%m-%dT%H:%M:%S.%f')
    return int(time.mktime(parsed))
    
def get_prior(user_id, question_id):
    """Prior weight of `user_id` for `question_id`; currently always 1.

    Both timestamps are still looked up, so an id missing from `last_access`
    or `time_of_question` raises KeyError.
    """
    last_seen = last_access[user_id]
    asked_at = time_of_question[question_id]
    # Disabled time-decay alternative (one-day scale):
    #     math.exp(-np.abs(asked_at - last_seen) / 86400)
    return 1

def get_frequency(text):
    """Return (token, count) pairs for the whitespace-split tokens of `text`,
    ordered from most to least frequent."""
    tokens = text.split()
    counts = Counter(tokens)
    return counts.most_common()

# 2. Split the data

In [20]:
# Scan every question row (PostTypeId == 1) of Posts.xml and build the
# containers used by the rest of the notebook.
label_set = []          # distinct tag strings, in first-seen order
tags_for_q = []         # one list of tags per question, in document order
questions = {}          # question id -> whitespace-split tokens (body + title + tags)
ground_truths = {}      # question id -> user ids returned by get_ground_truth
time_of_question = {}   # question id -> creation time from get_epoch

for atype in e.findall('row'):
    id=atype.get('PostTypeId')
    if int(id)==1:
        body=atype.get('Body')
        title=atype.get('Title')
        tags=atype.get('Tags')
        que_id=int(atype.get('Id'))        
        ground_truths[que_id] = get_ground_truth(que_id)
        body = remove_html_tags(body)
        title = remove_html_tags(title)
        tags=clean_tags(tags)
        post=(str(body)+" "+str(title)+str(tags)).split()
        current = list(str(tags).split())
        tags_for_q.append(current)
        # Append only the tags not seen before (linear scan of label_set per tag).
        label_set.extend(x for x in current if x not in label_set)
        questions[que_id] = post
        time_of_question[que_id] = get_epoch(atype.get('CreationDate'))
l=len(questions)

# 80/20 train/test split.
# NOTE(review): slicing `questions.items()` requires Python 2 (items() is a
# list there) and follows dict iteration order, while `tags_for_q[:index]`
# follows document order -- the two splits may not cover the same questions;
# confirm.
index=int(l*0.8)
train_questions=dict(questions.items()[:index])
test_questions=dict(questions.items()[index:])
#split
train_tags=tags_for_q[:index]
test_tags=tags_for_q[index:]
#writing data to file
# NOTE(review): nothing is written in this cell (the write calls below are
# commented out), so the file named here must already exist on disk.
train_file_name = 'question_meta_test.txt'
# test_file_name = 'question_meta_train.txt'

# `questions` is rebound here from the id->tokens dict to a list of token lists.
questions = train_questions.values()
# train_handler = open(train_file_name, 'w')
# test_handler = open(test_file_name, 'w')

train_list = []
# test_list = []
len_list = len(questions)
for i in range(len_list):
#     if i < 3*len_list/4:
    train_list.append(questions[i])
#     else:
#         test_list.append(questions[i])
        
# train_handler.write("\n".join(train_list))
# test_handler.write("\n".join(test_list))
# train_handler.close()
# test_handler.close()

print 'Done creating train and test data'

Done creating train and test data


# Train LDA on Question profile

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
n_top_words = 2000 
# NOTE(review): an integer max_df is an absolute document count, so terms that
# occur in more than 5 documents are dropped -- confirm this is intended.
tf_vectorizer = CountVectorizer(max_df = 5, min_df = 1, max_features = n_top_words, stop_words = 'english')
# Every line of the file is treated as one document. The with-block closes
# the handle once the matrix is built instead of leaking it.
with open('question_meta_test.txt') as collection:
    tf = tf_vectorizer.fit_transform(collection)
tf_feature_names = tf_vectorizer.get_feature_names()
n_topics = 20
import scipy
from scipy.sparse import coo_matrix
# Convert the sparse term-count matrix to a dense array for LLDA.
cx = scipy.sparse.coo_matrix(tf)
tf = cx.toarray()

In [24]:
class LLDA:
    """Labeled LDA fitted by Gibbs sampling on a dense term-count matrix.

    Every document is a row of counts over the vocabulary, and each vocabulary
    term of a document carries ONE topic assignment (not one per token).
    The K given to the constructor is replaced by the number of labels when
    set_corpus is called.
    """

    def __init__(self, K, alpha, beta):
        self.K = K            # number of topics; overwritten by set_corpus
        self.alpha = alpha    # smoothing added to document-topic counts
        self.beta = beta      # smoothing added to topic-term counts
        self.labelmap = {}    # label -> topic index, filled by set_corpus

    def complement_label(self, label):
        """Return a 0/1 vector of length len(labelmap) marking allowed topics.

        An empty or None `label` allows every topic. Otherwise topic 0 is
        always allowed, plus the topic of each element of `label` that is a
        key of labelmap.
        NOTE(review): topic 0 is simply the first entry of the label set given
        to set_corpus, not a dedicated "common" topic -- confirm intent.
        """
        if not label: return numpy.ones(len(self.labelmap))
        vec = numpy.zeros(len(self.labelmap))
        vec[0] = 1.0
        for x in label: 
            if x in self.labelmap:
                vec[self.labelmap[x]] = 1.0
        return vec

    def set_corpus(self, labelset, corpus, labels, vocab):
        """Load the training corpus and draw a random initial assignment.

        labelset: distinct labels; their order defines the topic indices.
        corpus:   2-D array, corpus[m][t] = count of term t in document m.
        labels:   one iterable of labels per document, same order as corpus.
        vocab:    vocabulary, stored as self.vocas; only its length is used.
        """
        self.labelmap = dict(zip(labelset, range(len(labelset))))
        self.K = len(self.labelmap)
        self.vocas = vocab
        self.labels = numpy.array([self.complement_label(label) for label in labels])
        self.docs = corpus

        M = numpy.shape(self.docs)[0]
        V = len(vocab)

        self.z_m_n = []                                       # topic of each term, per document
        self.n_m_z = numpy.zeros((M, self.K), dtype=int)      # document x topic counts
        self.n_z_t = numpy.zeros((self.K, V), dtype=int)      # topic x term counts
        self.n_z = numpy.zeros(self.K, dtype=int)             # total count per topic

        for m, doc, label in zip(range(M), self.docs, self.labels):
            N_m = numpy.shape(self.docs)[1]
            # One initial topic per vocabulary position, drawn uniformly
            # from the topics this document's labels allow.
            z_n = [numpy.random.multinomial(1, label / label.sum()).argmax() for x in range(N_m)]
            self.z_m_n.append(z_n)
            for do, z in zip(range(V), z_n):
                if doc[do] > 0:
                    self.n_m_z[m, z] += doc[do]
                    self.n_z_t[z, do] += doc[do]
                    self.n_z[z] += doc[do]

    def inference(self):
        """Run one Gibbs sweep over every (document, term) with a positive count."""
        V = len(self.vocas)
        for m, doc, label in zip(range(numpy.shape(self.docs)[0]), self.docs, self.labels):
            for do in range(V):
                z = self.z_m_n[m][do]
                if doc[do] > 0:
                    # Remove the term's current assignment from the counts...
                    self.n_m_z[m, z] -= doc[do]
                    self.n_z_t[z, do] -= doc[do]
                    self.n_z[z] -= doc[do]

                    # ...resample a topic restricted to the document's labels...
                    denom_a = self.n_m_z[m].sum() + self.K * self.alpha
                    denom_b = self.n_z_t.sum(axis=1) + V * self.beta
                    p_z = label * (self.n_z_t[:, do] + self.beta) / denom_b * (self.n_m_z[m] + self.alpha) / denom_a
                    new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

                    # ...and add it back under the new topic.
                    self.z_m_n[m][do] = new_z
                    self.n_m_z[m, new_z] += doc[do]
                    self.n_z_t[new_z, do] += doc[do]
                    self.n_z[new_z] += doc[do]

    def phi(self):
        """topic-term distribution, shape (K, V); each row sums to 1"""
        V = len(self.vocas)
        return (self.n_z_t + self.beta) / (self.n_z[:, numpy.newaxis] + V * self.beta)

    def theta(self):
        """document-topic distribution"""
        n_alpha = self.n_m_z + self.labels * self.alpha
        return n_alpha / n_alpha.sum(axis=1)[:, numpy.newaxis]

    def perplexity(self, docs=None):
        """Perplexity of `docs` (default: the training corpus) under the model.

        docs uses the same layout as the training corpus: docs[m][t] is the
        count of term t in document m. Documents are paired in order with the
        training thetas.
        """
        # `is None`: comparing an ndarray with `==` is element-wise.
        if docs is None: docs = self.docs
        phi = self.phi()
        thetas = self.theta()

        log_per = 0.0
        N = 0
        for doc, theta in zip(docs, thetas):
            # Index phi by the TERM id and weight by its count; the count
            # itself is not a term id.
            for t, count in enumerate(doc):
                if count > 0:
                    log_per -= count * numpy.log(numpy.inner(phi[:, t], theta))
                    N += count
        return numpy.exp(log_per / N)
    
    def test_inference(self, tf_test, tags_2):
        """Estimate topic proportions for the unseen count rows `tf_test`.

        tf_test: 2-D array of term counts, one row per document.
        tags_2:  list whose elements are each passed to complement_label; the
                 i-th element is paired with the i-th row of tf_test.

        Returns a uniform 1-D vector of length K when `tags_2` is empty or as
        soon as a zero count is met during sampling; otherwise a 2-D array of
        row-normalised topic proportions.
        NOTE(review): real count rows almost always contain zeros, so the
        uniform vector is the usual result -- confirm whether zero counts
        should be skipped instead. The return shapes are kept as they were
        because get_theta_u consumes the 1-D form.
        """
        if tags_2 == []:
            # Without labels nothing constrains the topics. The previous code
            # for this case read loop variables that were never bound; it
            # could only return this uniform vector or fail.
            return numpy.full((self.K), 1.0/self.K)

        M = numpy.shape(tf_test)[0]
        V = len(self.vocas)
        labels2 = numpy.array([self.complement_label(label) for label in tags_2])
        docs2 = tf_test
        z_m_n_1 = []
        n_m_z_1 = numpy.zeros((M, self.K), dtype=int)
        n_z_t_1 = numpy.zeros((self.K, V), dtype=int)
        n_z_1 = numpy.zeros(self.K, dtype=int)

        # Random initial assignment, restricted to each row's allowed topics.
        for m, doc, label in zip(range(M), docs2, labels2):
            N_m_1 = numpy.shape(docs2)[1]
            z_n_1 = [numpy.random.multinomial(1, label / label.sum()).argmax() for x in range(N_m_1)]
            z_m_n_1.append(z_n_1)
            for do, z in zip(range(V), z_n_1):
                # numpy.size replaces `doc != []`, whose result depends on
                # the numpy version when shapes differ.
                if numpy.size(doc) > 0 and doc[do] > 0:
                    n_m_z_1[m, z] += doc[do]
                    n_z_t_1[z, do] += doc[do]
                    n_z_1[z] += doc[do]

        # Two Gibbs sweeps over the new documents, using only their own counts.
        for i in range(2):
            for m, doc, label in zip(range(numpy.shape(docs2)[0]), docs2, labels2):
                for do in range(V):
                    z = z_m_n_1[m][do]
                    if numpy.size(doc) > 0 and doc[do] > 0:
                        n_m_z_1[m, z] -= doc[do]
                        n_z_t_1[z, do] -= doc[do]
                        n_z_1[z] -= doc[do]

                        denom_a = n_m_z_1[m].sum() + self.K * self.alpha
                        denom_b = n_z_t_1.sum(axis=1) + V * self.beta
                        p_z = label * (n_z_t_1[:, do] + self.beta) / denom_b * (n_m_z_1[m] + self.alpha) / denom_a
                        new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

                        z_m_n_1[m][do] = new_z
                        n_m_z_1[m, new_z] += doc[do]
                        n_z_t_1[new_z, do] += doc[do]
                        n_z_1[new_z] += doc[do]
                    else:
                        return numpy.full((self.K), 1.0/self.K)

        n_alpha = n_m_z_1 + labels2 * self.alpha
        return n_alpha / n_alpha.sum(axis=1)[:, numpy.newaxis]

In [25]:
# Build the labelled-LDA model (alpha = beta = 0.1) on the training term-count
# matrix. The 20 passed as K is replaced by len(label_set) inside set_corpus.
llda = LLDA(20, 0.1, 0.1)
llda.set_corpus(label_set, tf, train_tags, tf_feature_names)
# Ten Gibbs sweeps; the sweep number is written to stderr as progress.
for i in range(10):
    sys.stderr.write("-- %d " % (i + 1))
    llda.inference()

-- 1 -- 2 -- 3 -- 4 -- 5 -- 6 -- 7 -- 8 -- 9 -- 10 

In [26]:
# Topic-term matrix of the trained model (one row per topic); it is passed to
# get_theta_u when the user profiles are built.
comp = llda.phi()

In [27]:
# Non-QA text per user. Each dict is keyed by the first tab-separated field of
# its file; get_theta_u looks these dicts up with AccountId values.
int_nonqa = {} # key = first field, value = fourth field (about-me text)
ext_nonqa_twitter = {} # key = first field, value = second field
ext_nonqa_hp = {} # key = first field, value = second field

import re
#About
# Strips escaped-HTML fragments (&lt;..., &gt;, &quot;, &#xA;, ...) from each line.
p=re.compile(r'&lt;(\S)*|p&gt;|\&#xA;|&quot;|/a&gt;|/li&gt;|/ul&gt;|/b&gt;|&gt;|;|#\d(\d)+|&amp')
with open('user_internal_nonqa_info.txt') as a:
    for line in a:
        tmp=p.sub('', line)
        list_line = tmp.split('\t')
        # Skip short lines, as the homepage loop below already does.
        if(len(list_line) >= 4):
            int_nonqa[list_line[0]] = list_line[3]

#Twitter
with open('user_external_nonqa_info.txt') as a:
    for line in a:
        list_line = line.split('\t')
        if(len(list_line) >= 2):
            ext_nonqa_twitter[list_line[0]] = list_line[1]

#HomePage
with open('user_external_nonqa_info_homepage_nouns.txt') as a:
    for line in a:
        list_line = line.split('\t')
        if(len(list_line) >= 2):
            ext_nonqa_hp[list_line[0]] = list_line[1]
        

In [None]:
def get_prob_q_given_u(question,userid):
    """Score `question` for `userid` by blending the QA and non-QA profiles.

    The user's alpha (from alpha_of_user) weights the QA score and
    (1 - alpha) weights the non-QA score.
    """
    weight = alpha_of_user[userid]
    qa_score = get_prob_q_given_u_qa(question, userid, 'qa')
    non_qa_score = get_prob_q_given_u_qa(question, userid, 'non-qa')
    blended = weight * qa_score
    blended += (1 - weight) * non_qa_score
    return blended


def get_theta_u(userid,comp, t):
    """Build and cache the topic/word profile of one user.

    userid: integer user id (key of LLDA_UP, tags_UP and AccountId).
    comp:   topic-term matrix; row i holds the term weights of topic i.
    t:      'qa' to profile the user's questions; any other value profiles
            the non-QA text (homepage, twitter, about-me).

    Nothing is returned. The results are stored under `userid` in
    UP_QA_topic / UP_QA_word / UP_QA_feature, or in the UP_NON_QA_*
    dictionaries.
    """
    if t=='qa':  #For QA data 
        data = StringIO(LLDA_UP[userid]) # single line file of all questions
    else:        #For NonQA data
        accountid=AccountId[userid]
        parts = []
        for source in (ext_nonqa_hp, ext_nonqa_twitter, int_nonqa):
            text = source.get(accountid)
            if text is not None:
                parts.append(text)
        # Collapse whitespace to single spaces so the text becomes ONE line
        # (one document). Deleting the whitespace outright would fuse every
        # word into a single token.
        data = re.sub(r"\s+", " ", " ".join(parts)).strip()
        data=StringIO(data)
        
    tf_test = tf_vectorizer.transform(data)
    tf_test_features = tf_vectorizer.get_feature_names()
    tf_test = scipy.sparse.coo_matrix(tf_test).toarray()
    tags = tags_UP[userid].split()
    pred = numpy.transpose(llda.test_inference(tf_test, tags))
    pred = pred.tolist()
    # NOTE(review): sorting drops the link between a probability and its topic
    # index, while feature_prob below stays keyed by the original index; confirm.
    pred2 = sorted(pred, reverse=True)[:20]
    total = sum(pred2)
    pred = [float(i)/total for i in pred2]
    
    # Slicing tolerates a vocabulary smaller than n_top_words.
    list_features = list(tf_test_features)[:n_top_words]
    feature_prob = {}
    for topic_i, topic in enumerate(comp): 
        feature_prob[topic_i] = list(topic)
    
    # Store under the `userid` argument rather than a notebook-level loop variable.
    if t=='qa':
        UP_QA_topic[int(userid)],UP_QA_word[int(userid)],UP_QA_feature[int(userid)]=pred,feature_prob,list_features
    else:
        UP_NON_QA_topic[int(userid)],UP_NON_QA_word[int(userid)],UP_NON_QA_feature[int(userid)]=pred,feature_prob,list_features

def get_prob_q_given_u_qa(question,userid,t):
    """Score `question` against one cached profile of `userid`.

    t == 'qa' selects the QA profile; any other value selects the non-QA one.
    Returns 0 when the user has no non-QA profile, or (for 'qa') no QA
    profile. Otherwise returns the accumulated score: for every question word
    found in the profile's feature list, the sum over topics of
    log(topic probability) + log(word weight in that topic); every other word
    adds 1e-10.
    NOTE(review): unknown words add the raw constant 1e-10 to a sum of
    logarithms -- confirm this is the intended smoothing.
    """
    # A missing non-QA profile short-circuits both modes.
    if userid not in UP_NON_QA_topic:
        return 0
    pred = UP_NON_QA_topic[userid]
    feature_prob = UP_NON_QA_word.get(userid, [])
    list_features = UP_NON_QA_feature.get(userid, [])
    if t=='qa':  #For QA data 
        if userid not in UP_QA_topic:
            return 0
        pred = UP_QA_topic[userid]
        # Fall back to the non-QA values when a QA entry is absent.
        feature_prob = UP_QA_word.get(userid, feature_prob)
        list_features = UP_QA_feature.get(userid, list_features)

    # word -> first position in list_features, built once so that each word
    # costs one dict lookup instead of two list scans.
    feature_index = {}
    for position, feature in enumerate(list_features):
        feature_index.setdefault(feature, position)

    eq5=0
    for word in question.split():
        word_i = feature_index.get(word.lower())
        if word_i is None:
            eq5 += 1e-10
            continue
        for topic_i, topic_prob in enumerate(pred):
            eq5 += math.log(float(topic_prob)) + math.log(feature_prob[topic_i][word_i])
    return eq5    

# User QA Data

In [None]:
# Per-user containers, all keyed by the integer user id.
LLDA_UP={}            # concatenated question text from get_posts
tags_UP = {}          # concatenated tag text from get_posts
alpha_of_user={}      # QA / non-QA mixing weight from get_posts
last_access={}        # LastAccessDate converted by get_epoch
AccountId={}          # AccountId attribute of the user row
# Profile caches filled by get_theta_u.
UP_QA_topic={}
UP_QA_word={}
UP_QA_feature={}
UP_NON_QA_topic={}
UP_NON_QA_word={}
UP_NON_QA_feature={}
for users_row in u.findall('row'):
    # `user_id` stays a string here; get_posts splices it into an XPath query.
    user_id = users_row.get('Id')
    print('UserId',user_id)
    LLDA_UP[int(user_id)],tags_UP[int(user_id)], alpha_of_user[int(user_id)] = get_posts(user_id)
    last_access[int(user_id)] = get_epoch(users_row.get('LastAccessDate'))
    AccountId[int(user_id)] = users_row.get('AccountId')
    # Build the QA profile, then the non-QA one (any mode other than 'qa'
    # selects the non-QA branch of get_theta_u).
    get_theta_u(int(user_id), comp,'qa')
    get_theta_u(int(user_id), comp, 'non_qa')


In [None]:
# print(LDA_UP)
# print(AccountId)

# Non QA Data

# Evaluation

In [None]:
def calculate_MRR_per_qn(pred, actual):
    """Reciprocal rank of the first item of `pred` that occurs in `actual`.

    Ranks start at 1; returns 0 when no predicted item is in `actual`.
    """
    for rank, candidate in enumerate(pred, 1):
        if candidate in actual:
            return 1. / rank
    return 0

def calculate_precision_per_qn(pred, actual):
    """Fraction of `pred` that is correct.

    The numerator counts the distinct predicted items found in `actual`;
    the denominator is len(pred), so an empty `pred` raises
    ZeroDivisionError.
    """
    hits = set(pred).intersection(actual)
    return (len(hits) * 1.) / len(pred)


In [None]:
# Rebuild the question containers; here each question is kept as one string
# (it is split into words later, inside get_prob_q_given_u_qa).
questions = {}
ground_truths = {}
time_of_question = {}

#Get each question
for atype in e.findall('row'):
    id=atype.get('PostTypeId')
    if int(id)==1:
        body=atype.get('Body')
        title=atype.get('Title')
        tags=atype.get('Tags')
        que_id=int(atype.get('Id'))
        
        #print('>>>>', que_id)
        ground_truths[que_id] = get_ground_truth(que_id)
        #print('<<<<')
        
        body = remove_html_tags(body)
        title = remove_html_tags(title)
        tags=clean_tags(tags)
        post=str(body)+" "+str(title)+str(tags)

        questions[que_id] = post
        time_of_question[que_id] = get_epoch(atype.get('CreationDate'))
l=len(questions)

index=int(l*0.8)


# NOTE(review): slicing `questions.items()` requires Python 2 and follows dict
# iteration order; the held-out 20% matches the earlier split only if that
# order is the same in both cells -- confirm.
train_questions=dict(questions.items()[:index])
test_questions=dict(questions.items()[index:])
mrr_s = []
precision_s = []
for que_id in test_questions.keys():
    user_list = []
    user_prob = []
    # Score every profiled user for this question.
    for userid in LLDA_UP.keys():
        prob = get_prob_q_given_u(test_questions[que_id],userid) + math.log(get_prior(userid, que_id))
        user_list.append(userid)
        user_prob.append(prob)
    # NOTE(review): argsort is ascending, so this keeps the ten LOWEST scores;
    # confirm that lower scores are meant to rank first.
    ids = np.array(user_prob).argsort()[:10]
    userids = np.array(user_list)[ids]
    
    mrr_s.append(calculate_MRR_per_qn(userids, ground_truths[que_id]))
    precision_s.append(calculate_precision_per_qn(userids, ground_truths[que_id]))
    
#     print ('que_id, ids, groud_truth, userids ', que_id, ids,  ground_truths[que_id],userids)
    # Running averages over the questions evaluated so far.
    print ('(current)Mean Reciprocal Rank: ', np.mean(mrr_s))
    print ('(current)Precision @ 10', np.mean(precision_s))
    print ('\n')
#     break
    
mrr = np.mean(mrr_s)
prec = np.mean(precision_s)

print ('(final)Mean Reciprocal Rank: ', mrr)
print ('(final)Precision @ 10', prec)

('(current)Mean Reciprocal Rank: ', 0.0)
('(current)Precision @ 10', 0.0)
('(current)Precision @ 5', 0.0)


('(current)Mean Reciprocal Rank: ', 0.08333333333333333)
('(current)Precision @ 10', 0.05)
('(current)Precision @ 5', 0.0)


('(current)Mean Reciprocal Rank: ', 0.09722222222222221)
('(current)Precision @ 10', 0.06666666666666667)
('(current)Precision @ 5', 0.0)


('(current)Mean Reciprocal Rank: ', 0.07291666666666666)
('(current)Precision @ 10', 0.05)
('(current)Precision @ 5', 0.0)


('(current)Mean Reciprocal Rank: ', 0.05833333333333333)
('(current)Precision @ 10', 0.04)
('(current)Precision @ 5', 0.0)


('(current)Mean Reciprocal Rank: ', 0.048611111111111105)
('(current)Precision @ 10', 0.03333333333333333)
('(current)Precision @ 5', 0.0)


('(current)Mean Reciprocal Rank: ', 0.041666666666666664)
('(current)Precision @ 10', 0.028571428571428574)
('(current)Precision @ 5', 0.0)


('(current)Mean Reciprocal Rank: ', 0.03645833333333333)
('(current)Precision @ 10', 0.025)
('



('(current)Mean Reciprocal Rank: ', 0.07663690476190475)
('(current)Precision @ 10', 0.03214285714285715)
('(current)Precision @ 5', 0.017857142857142856)


('(current)Mean Reciprocal Rank: ', 0.0752923976608187)
('(current)Precision @ 10', 0.03157894736842106)
('(current)Precision @ 5', 0.017543859649122806)


('(current)Mean Reciprocal Rank: ', 0.0739942528735632)
('(current)Precision @ 10', 0.031034482758620693)
('(current)Precision @ 5', 0.017241379310344827)


('(current)Mean Reciprocal Rank: ', 0.08121468926553671)
('(current)Precision @ 10', 0.03220338983050848)
('(current)Precision @ 5', 0.020338983050847456)


('(current)Mean Reciprocal Rank: ', 0.0798611111111111)
('(current)Precision @ 10', 0.03166666666666667)
('(current)Precision @ 5', 0.02)


('(current)Mean Reciprocal Rank: ', 0.078551912568306)
('(current)Precision @ 10', 0.031147540983606562)
('(current)Precision @ 5', 0.019672131147540982)


('(current)Mean Reciprocal Rank: ', 0.07728494623655913)
('(current)Precisi



('(current)Mean Reciprocal Rank: ', 0.07564102564102564)
('(current)Precision @ 10', 0.03076923076923077)
('(current)Precision @ 5', 0.01846153846153846)


('(current)Mean Reciprocal Rank: ', 0.07638888888888888)
('(current)Precision @ 10', 0.03181818181818182)
('(current)Precision @ 5', 0.01818181818181818)


('(current)Mean Reciprocal Rank: ', 0.07524875621890546)
('(current)Precision @ 10', 0.03134328358208956)
('(current)Precision @ 5', 0.01791044776119403)


('(current)Mean Reciprocal Rank: ', 0.07414215686274508)
('(current)Precision @ 10', 0.030882352941176472)
('(current)Precision @ 5', 0.01764705882352941)


('(current)Mean Reciprocal Rank: ', 0.07306763285024154)
('(current)Precision @ 10', 0.030434782608695653)
('(current)Precision @ 5', 0.017391304347826087)


('(current)Mean Reciprocal Rank: ', 0.07202380952380952)
('(current)Precision @ 10', 0.030000000000000002)
('(current)Precision @ 5', 0.017142857142857144)


('(current)Mean Reciprocal Rank: ', 0.07805164319248825)
('



('(current)Mean Reciprocal Rank: ', 0.10931372549019605)
('(current)Precision @ 10', 0.03647058823529412)
('(current)Precision @ 5', 0.03411764705882353)


('(current)Mean Reciprocal Rank: ', 0.10940545808966859)
('(current)Precision @ 10', 0.0368421052631579)
('(current)Precision @ 5', 0.033918128654970764)


('(current)Mean Reciprocal Rank: ', 0.10876937984496121)
('(current)Precision @ 10', 0.03662790697674419)
('(current)Precision @ 5', 0.03372093023255814)


('(current)Mean Reciprocal Rank: ', 0.108140655105973)
('(current)Precision @ 10', 0.03641618497109827)
('(current)Precision @ 5', 0.033526011560693646)


('(current)Mean Reciprocal Rank: ', 0.10751915708812258)
('(current)Precision @ 10', 0.03620689655172414)
('(current)Precision @ 5', 0.03333333333333334)


('(final)Mean Reciprocal Rank: ', 0.10751915708812258)
('(final)Precision @ 10', 0.03620689655172414)
('(final)Precision @ 5', 0.03333333333333334)


In [13]:
import pickle
pickle_file = 'user_id_question_map3.pkl'
# NOTE(review): `user_id_question_map` is not assigned in any visible cell of
# this notebook; confirm which cell creates it before running this one.
# The with-block flushes and closes the file once the dump is complete.
with open(pickle_file, 'wb') as pickle_handle:
    pickle.dump(user_id_question_map, pickle_handle)