# Summary

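Evaluate the CirrusSearch `morelike` search as a recommender on reading sessions: for randomly sampled pairs of consecutive pageviews (source, target), query morelike for the source page and report rank-based metrics (MRR, Recall@k) for the target.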

### Imports etc.

In [2]:
import datetime
import numpy as np
import os,sys
import json
import time
import random
import requests
print('ready')

ready


In [62]:
## evaluation setup: which reading-session corpus to use and how to evaluate it
wiki = 'simplewiki'
ids = 'pageid'
day_start = datetime.date(2020,4,1)
n_days = 30
N_sel = 785726

N_eval_max = 10   ## upper bound on the number of query pairs drawn per split
k = 100           ## number of morelike results requested per query

## the dev/test splits share one base name and differ only in the extension
filename = '../../reader_navigation/output/reading-sessions-corpora/%s-%s_%s_ndays-%s_sample-%s'%(wiki,ids,day_start,n_days,N_sel)
fdev, ftest = filename+'.dev', filename+'.test'

## sanity check: report whether the split files exist (test first, then dev)
print(*(os.path.isfile(split_file) for split_file in (ftest, fdev)), sep='\n')

True
True


In [65]:
## get the query data (page-ids)

def prepare_queries_pairs(f, N_max = -1 ):
    '''
    sample one random pair of consecutive pageviews from each session in a file.

    f: path of a text file with one session per line; a session is a
       whitespace-separated sequence of page identifiers.
    N_max: get at most N_max pairs; any negative value (default -1) == all,
       0 == no pairs.

    sessions with fewer than 2 pageviews are skipped.
    returns a list of tuples [(src,trg)], where src, trg are of type str.
    prints the number of collected pairs.
    '''
    queries = []
    ## context manager so the file is closed even if we stop early or fail
    with open(f) as fin:
        for line in fin:
            ## stop as soon as the requested number of pairs is reached
            ## (checked before sampling so that N_max=0 really yields nothing)
            if 0 <= N_max <= len(queries):
                break
            ## split() without argument ignores repeated blanks, so no empty
            ## strings end up as page identifiers
            session = line.split()
            if len(session) < 2:
                continue
            idx_src = random.randint(0, len(session)-2)
            queries.append(( session[idx_src], session[idx_src+1] ))
    print("Completed "+str(len(queries))+" paths")
    return queries

## draw the evaluation pairs for both splits (dev first, then test, so the
## sequence of random draws stays the same)
queries_dev, queries_test = [
    prepare_queries_pairs(split_file, N_max=N_eval_max)
    for split_file in (fdev, ftest)
]

Completed 10 paths
Completed 10 paths


In [60]:
def titleFromPageid(page_id,wiki):
    '''
    query wikipedia-API to get the pagetitle from a pageid

    page_id: page id (str or int) in the given wiki.
    wiki: wiki database name, e.g. 'simplewiki'; the 'wiki' part is stripped
        to build the hostname (simplewiki -> simple.wikipedia.org).

    returns the title as str; returns '' (best effort) if the request fails,
    the answer cannot be parsed, or the page is not part of the answer.
    '''
    api_url_base = 'https://%s.wikipedia.org/w/api.php'%( wiki.replace('wiki','') )
    ## keys of a JSON object are always strings, so look the page up by str
    page_id = str(page_id)
    params = {
        "action": "query",
        "pageids": page_id,
        "prop": "pageprops",
        "format": "json",
    }
    ## default result; previously `title` stayed unbound (UnboundLocalError)
    ## whenever the answer contained no 'query'/'pages' entry
    title = ''
    try:
        response = requests.get( api_url_base,params=params).json()
        pages = response.get('query',{}).get('pages',{})
        title = pages.get(page_id,{}).get('title','')
    except Exception:
        ## deliberate best effort: any failure yields an empty title.
        ## `Exception` (instead of a bare except) lets Ctrl-C through.
        title = ''
    return title

## morelike search
def morelikeFromTitle(title,wiki,k=100):
    '''
    do morelike search https://www.mediawiki.org/wiki/Help:CirrusSearch#Morelike
    get k recommendations for a page-title in a given wiki.

    title: title of the page to find similar pages for.
    wiki: wiki database name, e.g. 'simplewiki'; the 'wiki' part is stripped
        to build the hostname (simplewiki -> simple.wikipedia.org).
    k: number of results requested from the API (srlimit).

    returns the list found under response['query']['search'] (one entry per
    result); returns [] and prints a message if the request fails or the
    answer has no search results section.
    '''
    api_url_base = 'https://%s.wikipedia.org/w/api.php'%( wiki.replace('wiki','') )
    msg_failed = 'Could not do morelike search for %s in %s. Try another article or another language.' % (title,wiki)
    ## https://www.mediawiki.org/wiki/API:Search
    ## https://www.mediawiki.org/wiki/Help:CirrusSearch#Morelike

    params = {
        'action': 'query',
        'list': 'search',
        'format': 'json',
        'srsearch': 'morelike:'+title,
        'srnamespace' : 0,
        'srwhat': 'text',
        'srprop': 'wordcount',
        'srlimit': k
    }
    try:
        response = requests.get( api_url_base,params=params).json()
    except Exception:
        ## best effort on network/parsing problems; `Exception` instead of a
        ## bare except so that Ctrl-C can still stop a long evaluation loop
        print(msg_failed)
        return []

    if 'query' not in response or 'search' not in response['query']:
        print(msg_failed)
        return []
    return response['query']['search']

def morelikeFromPageid(page_id,wiki,k=100):
    '''
    morelike search starting from a pageid instead of a title.

    the pageid is first resolved to its title (titleFromPageid); if no title
    is found the result is an empty list, otherwise the k morelike results
    for that title (morelikeFromTitle).
    '''
    page_title = titleFromPageid(str(page_id),wiki)
    ## no title -> nothing to search for
    if len(page_title) == 0:
        return []
    return morelikeFromTitle(page_title,wiki,k=k)

def queriesPairsToRank(queries,wiki,k=100):
    '''
    from a list of pairs (src,target)
    - get the k nearest neighbors of src via morelike in specific wiki
    - check rank of trg among nearest neighbors

    queries: iterable of (src,trg) page ids.
    wiki: wiki database name, e.g. 'simplewiki'.
    k: number of neighbors requested per query.

    returns a numpy array with one rank per pair: the 1-based position of
    trg among the neighbors of src, or 1e6 if trg is not among them.
    '''
    t_rest = 0.1 ## be nice to morelike API
    rank_not_found = 1e6 ## sentinel for "trg not among the neighbors"; metrics() checks for this value
    rank_list = []
    for pid_src,pid_trg in queries:
        ## forward k; it used to be dropped so that always 100 results were requested
        result = morelikeFromPageid(pid_src,wiki,k=k)
        pid_src_nn = [str(nn['pageid']) for nn in result  ]
        try:
            ## compare as str: the neighbor ids are converted to str above
            rank = pid_src_nn.index(str(pid_trg))+1
        except ValueError:
            rank = rank_not_found
        rank_list.append(rank)

        time.sleep(t_rest)
    return np.array(rank_list)

 
def metrics(mrr_list):
    '''
    calculate metrics associated with rank querying from a list of ranks
    - mrr (mean reciprocal rank)
    - recall@k, whether trg was among top-k in mrr-list

    mrr_list: 1-d sequence (numpy array or list) of ranks, where the value
        1e6 marks "not found". Such entries never count as a hit for
        recall; they add 1/1e6 to the reciprocal-rank sum.

    returns a dict with the keys 'N' (number of ranks), 'MRR' and
    'Recall@1', 'Recall@10', 'Recall@50', 'Recall@100'.
    For empty input 'N' is 0 and all metrics are nan.
    '''
    ## accept plain lists as well; float so that 1/ranks is a true division
    ranks = np.asarray(mrr_list, dtype=float)
    n = ranks.shape[0]
    cutoffs = (1, 10, 50, 100)

    ## nothing to average over: avoid ZeroDivisionError / numpy warnings
    if n == 0:
        dict_result = {'N': 0, 'MRR': float('nan')}
        for cutoff in cutoffs:
            dict_result['Recall@%s' % cutoff] = float('nan')
        return dict_result

    found = ranks != 1e6
    dict_result = {
        'N': n,
        'MRR': np.mean(1/ranks),
    }
    for cutoff in cutoffs:
        hits = int(np.count_nonzero((ranks <= cutoff) & found))
        dict_result['Recall@%s' % cutoff] = hits/n
    return dict_result
#     return mrr_list.shape[0], mrr, recall1, recall10, recall50, recall100

def queriesPairsEval(queries,wiki,k=100):
    '''
    evaluate morelike on a list of (src,trg) pairs:
    rank every trg among the morelike results of its src (queriesPairsToRank)
    and summarize the ranks (metrics).
    '''
    return metrics(queriesPairsToRank(queries,wiki,k=k))

In [64]:
## evaluate both splits (dev first, then test) and show the metric dicts
result_dev = queriesPairsEval(queries=queries_dev, wiki=wiki, k=k)
result_test = queriesPairsEval(queries=queries_test, wiki=wiki, k=k)
print(result_dev, result_test, sep='\n')

{'N': 10, 'MRR': 0.1500008, 'Recall@1': 0.1, 'Recall@10': 0.2, 'Recall@50': 0.2, 'Recall@100': 0.2}
{'N': 10, 'MRR': 0.1833340333333333, 'Recall@1': 0.1, 'Recall@10': 0.3, 'Recall@50': 0.3, 'Recall@100': 0.3}
