In [24]:
import pandas as pd
import numpy as np
import math

In [31]:
results = [
    {'1954994128': {'rank': 2, 'relevancy': True},
 '2018589324': {'rank': 3, 'relevancy': True},
 '2027088710': {'rank': 0, 'relevancy': True},
 '2120281063': {'rank': 7, 'relevancy': True},
 '2130881427': {'rank': 5, 'relevancy': True},
 '2182961668': {'rank': 1, 'relevancy': True},
 '2325036826': {'rank': 4, 'relevancy': True},
 '2341666688': {'rank': 6, 'relevancy': True},
 '2589367414': {'rank': 8, 'relevancy': True},
 '466258520': {'rank': 9, 'relevancy': False}},
    {'2102150147': {'rank': 3, 'relevancy': True},
 '2121430311': {'rank': 1, 'relevancy': True},
 '2130456104': {'rank': 4, 'relevancy': True},
 '2136794847': {'rank': 8, 'relevancy': True},
 '2144292550': {'rank': 7, 'relevancy': True},
 '2152966300': {'rank': 0, 'relevancy': True},
 '2153604853': {'rank': 5, 'relevancy': True},
 '2600424145': {'rank': 2, 'relevancy': True},
 '2656069216': {'rank': 6, 'relevancy': True},
 '74609370': {'rank': 9, 'relevancy': False}},
    {'1151530292': {'rank': 3, 'relevancy': False},
 '2004366870': {'rank': 5, 'relevancy': True},
 '2130711152': {'rank': 4, 'relevancy': True},
 '2135488253': {'rank': 0, 'relevancy': True},
 '2616859289': {'rank': 2, 'relevancy': True},
 '374805745': {'rank': 7, 'relevancy': False},
 '56197593': {'rank': 6, 'relevancy': True},
 '798588419': {'rank': 1, 'relevancy': False}},
    {'2026708028': {'rank': 0, 'relevancy': False},
 '2113084173': {'rank': 2, 'relevancy': True},
 '2137656347': {'rank': 5, 'relevancy': False},
 '2151819869': {'rank': 8, 'relevancy': False},
 '2251781635': {'rank': 6, 'relevancy': False},
 '2311840536': {'rank': 4, 'relevancy': True},
 '2345687710': {'rank': 1, 'relevancy': False},
 '2577876495': {'rank': 7, 'relevancy': False},
 '61728793': {'rank': 3, 'relevancy': True}}
]

## $MRR = \frac{1}{Q}\sum\limits_{i=1}^{|Q|}\frac{1}{rank_i}$

In [64]:
def mean_reciprocal_rank(results):
    partial_ranks = []
    
    for result in results:
        sortd = sorted(result.items(), key=lambda item: item[1]['rank'])

        for s in sortd:
            if s[1]['relevancy'] == True:
                # We had to do rank from 1 on instead of 0 on because of the 1 / rank formula.
                partial_ranks.append(1 / (s[1]['rank']+1))
                break
    
    mrr = np.around(np.mean(partial_ranks), decimals=2)
    
    return mrr
    

In [77]:
def mean_average_precision(results):
    
    average_precision_scores = []
    
    for result in results:
        sortd = sorted(result.items(), key=lambda item: item[1]['rank'])
        
        average_precison_partials_list = []
        current_sublist_size = 0
        relevant_found = 0
        
        for s in sortd:
            if s[1]['relevancy'] == True:
                current_sublist_size += 1
                relevant_found += 1
                average_precision_partial = relevant_found / current_sublist_size
                average_precison_partials_list.append(average_precision_partial)
            else:
                current_sublist_size += 1

        average_precision = np.sum(average_precison_partials_list) / len(sortd)
        average_precision_scores.append(average_precision)
    
    mapr = np.around(np.mean(average_precision_scores), decimals=2)
    
    return mapr
            

In [78]:
mean_average_precision(results)

0.6

In [92]:
def mean_precision_at_n(results, n=5):
    
    average_precision_scores = []
    
    for result in results:
        
        sortd = sorted(result.items(), key=lambda item: item[1]['rank'])
        
        correct = 0
        
        for s in sortd[:n]:
            if s[1]['relevancy'] == True:
                correct += 1
        
        average_precision_scores.append(correct / n)
    
    mpan = np.around(np.mean(average_precision_scores), decimals=2)
    
    return mpan


In [95]:
mean_precision_at_n(results, 10)

0.65

In [11]:
import pickle

with open('../../Cloud/rankings.pickle', 'rb') as handle:
    rankings = pickle.load(handle)
    rankings = {k.lower(): v for k, v in rankings.items()}

In [38]:
from collections import defaultdict

In [13]:
ndcg_queries = [q.lower() for q in ['Interpolation', 'Wireless sensor network', 'Genetic algorithm', 'Cluster analysis', 'Fuzzy logic', 'Biometrics', 
                'Support vector machine', 'Deconvolution', 'Image segmentation', 'Fast Fourier transform', 'Sample size determination', 
                'Semantic similarity', 'Transfer of learning', 'Novelty detection', 'Knapsack problem', 'Linear algebra', 'Parallel algorithm', 
                'Computer architecture', 'Monte Carlo method', 'Dynamic programming', 'Automatic image annotation', 'Computational geometry', 
                'Search algorithm', 'Medical imaging', 'Combinatorial optimization', 'Middleware', 'Cloud computing', 
                'k-nearest neighbors algorithm', 'Big data', 'Eye tracking', 'Evolutionary algorithm', 'User interface', 
                "Newton's method", 'Wearable computer', 'Computational biology', 'Best-first search', 'Hierarchical clustering', 
                'Social network', 'Bayesian statistics', 'Knowledge extraction', 'World Wide Web', 'Multi-task learning', 'Speech processing', 
                'Gaussian random field', 'Hyperspectral imaging', 'Belief propagation', 'Semantic Web', 'Gibbs sampling', 
                'Dimensionality reduction', 'Latent Dirichlet allocation', 'Facial recognition system', 'Game theory', 
                'Machine translation', 'Kernel density estimation', 'Convex optimization', 'Web search query', 'Generative model', 
                'Query optimization', 'Activity recognition', 'Automatic summarization', 'Propagation of uncertainty', 'Topic model', 
                'Social web', 'Web service', 'Information privacy', 'Web 2.0', 'Learning to rank', 'Entity linking', 'Sentiment analysis', 
                'Clustering high-dimensional data', 'Relational database', 'Random forest', 'Batch processing', 'Anomaly detection', 
                'Image restoration', 'Security token', 'Wavelet transform', 'WordNet', 'Ensemble learning', 'Multi-agent system', 
                'Quantum information science', 'Continuous-time Markov chain', 'Video denoising', 'Color quantization', 'Logic programming', 
                'Constraint satisfaction', 'Mean field theory', 'Categorial grammar', 'Rule induction', 'Linear separability', 'Network theory', 
                'Open Knowledge Base Connectivity', 'Semantic grid', 'OWL-S', 'Uncertainty quantification', 'Ontology language', 
                'Semantic role labeling']]

In [46]:
def create_author_rankings(query_rankings):
    author_ranking = defaultdict(dict)
    for query, value in query_rankings.items():
        for v in value:
            author = v['author']
            label = v["label"]
            author_ranking[author][query] = label
    return author_ranking

In [47]:
author_rankings = create_author_rankings(rankings)

In [52]:
with open('author_rankings_with_relevance_labels.pickle', 'wb') as handle:
    pickle.dump(author_rankings, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [51]:
author_rankings['2152361128']

{'gaussian random field': 1, 'quantum information science': 3}

In [14]:
rankings["quantum information science"]

[{'author': '1639338439', 'label': 3, 'weight': 3627},
 {'author': '2155150338', 'label': 3, 'weight': 2835},
 {'author': '2114421234', 'label': 3, 'weight': 2293},
 {'author': '2265313210', 'label': 3, 'weight': 1383},
 {'author': '2469105369', 'label': 3, 'weight': 1134},
 {'author': '2325221658', 'label': 3, 'weight': 1039},
 {'author': '1498996866', 'label': 3, 'weight': 954},
 {'author': '2810605424', 'label': 3, 'weight': 911},
 {'author': '2100120781', 'label': 3, 'weight': 845},
 {'author': '2676713218', 'label': 3, 'weight': 837},
 {'author': '2420235564', 'label': 3, 'weight': 821},
 {'author': '2089688319', 'label': 3, 'weight': 769},
 {'author': '2569856224', 'label': 3, 'weight': 723},
 {'author': '2120853311', 'label': 3, 'weight': 700},
 {'author': '677148088', 'label': 3, 'weight': 676},
 {'author': '2152361128', 'label': 3, 'weight': 656},
 {'author': '2615151663', 'label': 3, 'weight': 626},
 {'author': '917844952', 'label': 2, 'weight': 607},
 {'author': '48597468', 

In [28]:
def ideal_dcg(query, n=10):
    ranking = rankings[query.lower()][:n]
    idcg = sum([r["label"]/math.log((i+1)+1) for i, r in enumerate(ranking)])
    return idcg

In [70]:
ideal_dcg("quantum information science", 10)

19.664911575134393

In [None]:
{'139622989': {'rank': 2, 'relevancy': False},
 '2021943728': {'rank': 4, 'relevancy': True},
 '2104552492': {'rank': 8, 'relevancy': True},
 '2148053431': {'rank': 1, 'relevancy': False},
 '2148083837': {'rank': 9, 'relevancy': False},
 '2152361128': {'rank': 7, 'relevancy': True},
 '2157056412': {'rank': 5, 'relevancy': True},
 '2420235564': {'rank': 0, 'relevancy': True},
 '2569856224': {'rank': 6, 'relevancy': True},
 '917844952': {'rank': 3, 'relevancy': False}}

In [36]:
one_result =  [{'139622989': {'rank': 2, 'relevancy': False},
 '2021943728': {'rank': 4, 'relevancy': True},
 '2104552492': {'rank': 8, 'relevancy': True},
 '2148053431': {'rank': 1, 'relevancy': False},
 '2148083837': {'rank': 9, 'relevancy': False},
 '2152361128': {'rank': 7, 'relevancy': True},
 '2157056412': {'rank': 5, 'relevancy': True},
 '2420235564': {'rank': 0, 'relevancy': True},
 '2569856224': {'rank': 6, 'relevancy': True},
 '917844952': {'rank': 3, 'relevancy': False}}]

In [73]:
def actual_dcg(result, query, n=10):
    sortd = sorted(result[0].items(), key=lambda item: item[1]['rank'])[:n]
    labels = []
    for author_id, v in sortd:
        if query.lower() in author_rankings[author_id]:
            label = author_rankings[author_id][query.lower()]
            labels.append(label)
        else:
            labels.append(0)
            
    dcg = sum([l/math.log((i+1)+1) for i, l in enumerate(labels)])
    
    return dcg

In [74]:
actual_dcg(one_result, "quantum information science", 10)

12.636521510487256

In [64]:
def normalized_discounted_cumulative_gain(query, result, n=10):
    query = query.lower()
    idcg = ideal_dcg(query, n)
    dcg = actual_dcg(result, query, n)
    return dcg/idcg

In [85]:
normalized_discounted_cumulative_gain("quantum information science", one_result, 10)

0.6425923382470585

In [86]:
def average_normalized_discounted_cumulative_gain(queries, results, n=10):
    scores = []
    for q, r in zip(queries, results):
        score = normalized_discounted_cumulative_gain(q, r, n)
        scores.append(score)
    return np.mean(scores)