## Pip installs, gsutil data getters and imports

In [None]:
# Libs
! pip install sentence_transformers;
!apt install libomp-dev;
!pip3 install --upgrade faiss-gpu;

In [128]:
import os
import time
import math
import random
import joblib
import numpy as np
import json
import pandas as pd
from sklearn.preprocessing import normalize
import faiss
from pprint import pprint
import scipy
from collections import Counter
import ast
from more_itertools import unique_everseen
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
# res = faiss.StandardGpuResources()  # use a single GPU

def dist2sim(d):
    return 1 - d / 2

## Functions for index and data loading

In [9]:
def load_relevant_index(type="separate_sbert"):
    index = None
    if type == "separate_sbert":
        index = faiss.read_index("Mapped_indeces/separate_embeddings_faiss.index")
    elif type == "merged_sbert":
        index = faiss.read_index("Mapped_indeces/merged_embeddings_faiss.index")
    elif type == "retro_merged_sbert":
        index = faiss.read_index("Mapped_indeces/retro_merged_embeddings_faiss.index")
    elif type == "retro_separate_sbert":
        index = faiss.read_index("Mapped_indeces/retro_separate_embeddings_faiss.index")
    elif type == "tfidf_svd":
        index = faiss.read_index("Mapped_indeces/tfidf_embeddings_faiss.index")
    elif type == "pooled_bert":
        index = faiss.read_index("Mapped_indeces/mean_bert_faiss.index")
    elif type == "pooled_glove":
        index = faiss.read_index("Mapped_indeces/glove_faiss.index")
    return index

In [10]:
def load_data_and_authors(data_path="papers.csv, 
                          authors_path="authors.csv"):
    data = pd.read_csv(data_path)
    authors = pd.read_csv(authors_path)
    return data, authors

data_and_authors = load_data_and_authors()
data = data_and_authors[0]
authors = data_and_authors[1]

In [11]:
def load_tidf_classifier():
    return joblib.load("Mapped_indeces/tfidf_svd_transformer.joblib")

In [3]:
# Was only needed to populate faiss with tf-idf, now just provided as loaded index
# tfidf_clf = load_tidf_classifier()

## Functions for field retrieval by paper or author id

In [12]:
def retrieve_author_by_id(author_id):
    return authors[authors.id == int(author_id)]

def get_abstract_by_id(id_):
    return data[data.id == id_].abstract.values[0]

def get_fos_by_id(id_):
    return data[data.id == id_].fos.values[0]

def get_title_by_id(id_):
    return data[data.id == id_].title.values[0]

def get_authors_by_id(id_):
    try:
        return data[data.id == id_].authors.values[0]
    except:
        print(id_)
        return [{"id": -999999}]

def get_first_author_by_id(id_):
    authors = get_authors_by_id(id_)
    return authors[0]

In [13]:
def retrieve_authorname_by_authorid(author_id):
    return authors[authors.id == int(author_id)].name.values[0]

def retrieve_pub_count_by_id(author_id):
    return authors[authors.id == int(author_id)].n_pubs.values[0]

def retrieve_cit_count_by_id(author_id):
    return authors[authors.id == int(author_id)].n_citation.values[0]

In [14]:
def get_information_by_id(id_, query, tfidf=False):
    pprint(f"Title: {get_title_by_id(id_)}")
    print("===")
    pprint(f"Abstract: {get_abstract_by_id(id_)}")
    print("===")
    pprint(f"Tags: {get_fos_by_id(id_)}")
    print("===")
    authors = get_authors_by_id(id_)
    pprint(f"Authors: {authors}")
    first_author = authors[0]
    print("===")
    pprint(f"First author {first_author['name']} relevant: {check_if_author_relevant(int(first_author['id']), query)}")
    print("===")
    pprint(f"Approximately relevant: {check_if_author_relevant_approximate(int(first_author['id']), query, tfidf=tfidf)}")

In [15]:
def get_information_by_author_id(aid, query, tfidf=False):
    pprint(f"Name: {retrieve_authorname_by_authorid(aid)}")
    print("===")
    pprint(f"Number of publications: {retrieve_pub_count_by_id(aid)}")
    print("===")
    pprint(f"Number of citations: {retrieve_cit_count_by_id(aid)}")
    print("===")
    pprint(f"Exactly relevant: {check_if_author_relevant(int(aid), query)}")
    print("===")
    pprint(f"Approximately relevant: {check_if_author_relevant_approximate(int(aid), query, tfidf=tfidf)}")

## Functions for getting final author rankings

In [16]:
def get_author_ranking_exact(query, index, k=10, tfidf=False):
    query = query.lower()
    results = retrieve_results(query, index, k, tfidf=tfidf)
    candidate_papers = results[0]

    # We remove duplicate authors for now, while preserving order (their highest position)
    authors = list(unique_everseen([get_first_author_by_id(str(rid))["id"] for rid in candidate_papers]))
    relevancies = [check_if_author_relevant(int(a), query) for a in authors]

    ranking = {}

    for rank,(author, relevancy) in enumerate(zip(authors, relevancies)):
        if author not in ranking.keys():
            ranking[author] = {"relevancy": relevancy, "rank": rank}
        else:
            continue

    return ranking

def get_author_ranking_approximate(query, index, k=10, similarity_threshold=0.7, tfidf=False):
    query = query.lower()
    results = retrieve_results(query, index, k, tfidf=tfidf)
    
    candidate_papers = results[0]
    # We remove duplicate authors for now, while preserving order (their highest position)
    authors = list(unique_everseen([get_first_author_by_id(str(rid))["id"] for rid in candidate_papers]))
    relevancies = [check_if_author_relevant_approximate(int(a), query, similarity_threshold, tfidf=tfidf) for a in authors]

    ranking = {}

    for rank,(author, relevancy) in enumerate(zip(authors, relevancies)):
        if author not in ranking.keys():
            ranking[author] = {"relevancy": relevancy, "rank": rank}
        else:
            continue

    return ranking

In [17]:
def get_author_ranking_average_exact(query, index, k=10, tfidf=False):
    query = query.lower()
    results = retrieve_results_average(query, index, k, tfidf=tfidf)
    candidate_authors = list(unique_everseen(results[0]))

    # We remove duplicate authors for now, while preserving order (their highest position)
    # authors = list(unique_everseen([get_first_author_by_id(str(rid))["id"] for rid in candidate_papers]))
    relevancies = [check_if_author_relevant(int(a), query) for a in candidate_authors]

    ranking = {}

    for rank,(author, relevancy) in enumerate(zip(candidate_authors, relevancies)):
        if author not in ranking.keys():
            ranking[author] = {"relevancy": relevancy, "rank": rank}
        else:
            continue

    return ranking

def get_author_ranking_average_approximate(query, index, k=10, similarity_threshold=0.7, tfidf=False):
    query = query.lower()
    results = retrieve_results_average(query, index, k, tfidf=tfidf)
    
    candidate_authors = list(unique_everseen(results[0]))
    # We remove duplicate authors for now, while preserving order (their highest position)
    # authors = list(unique_everseen([get_first_author_by_id(str(rid))["id"] for rid in candidate_papers]))
    relevancies = [check_if_author_relevant_approximate(int(a), query, similarity_threshold, tfidf=tfidf) for a in candidate_authors]

    ranking = {}

    for rank,(author, relevancy) in enumerate(zip(candidate_authors, relevancies)):
        if author not in ranking.keys():
            ranking[author] = {"relevancy": relevancy, "rank": rank}
        else:
            continue

    return ranking

In [18]:
def prune_results_for_authors_wo_tags(results, query, how_many=10):
    ids = results[0]
    distances = results[1]

    relevant_ids = []
    relevant_distances = []
    # For now, I check if the first author is not in the set, I throw the paper away, because I now
    # only look at first author for evaluation. But later if I have another strategy for retrieving author per paper
    # we can change this logic back to "all authors not in the set".
    for rid, rd in zip(ids, distances):
        authors = [a["id"] for a in get_authors_by_id(str(rid))]
        relevancy = [check_if_author_relevant(int(a), query) for a in authors]
        # if relevancy != ['Not in the dataset or no tags present!']*len(relevancy):
        #     relevant_ids.append(rid)
        #     relevant_distances.append(rd)
        if relevancy[0] != 'Not in the dataset or no tags present!':
            relevant_ids.append(rid)
            relevant_distances.append(rd)

    return relevant_ids[:how_many], relevant_distances[:how_many]
        

In [19]:
def prune_results_for_authors_wo_tags_average(results, query, how_many=10):
    ids = results[0]
    distances = results[1]

    relevant_ids = []
    relevant_distances = []
    for aid, ad in zip(ids, distances):
        relevancy = check_if_author_relevant(int(aid), query)

        if relevancy != 'Not in the dataset or no tags present!':
            relevant_ids.append(aid)
            relevant_distances.append(ad)

    return relevant_ids[:how_many], relevant_distances[:how_many]

## Functions for the Faiss index search and utilities for the paper retrieval

In [20]:
def get_most_similar_ids(query, index, k=10, tfidf_classifier=None):
    # First, embed the query, normalize the vector and convert to float32

    if tfidf_classifier:
        query_emb = tfidf_classifier.transform([query])[0]
        normalized_query = np.float32([query_emb])[0]
    else:
        query_emb = embedder.encode([query])[0]
        normalized_query = np.float32(normalize([query_emb])[0])

    assert type(normalized_query[0]).__name__ == 'float32'

    #Next, run the index search
    s = time.time()
    dists, idxs = index.search(np.array([normalized_query]), k)
    # print("Search execution time:")
    # print((time.time() - s), "s.")
    # print("IDS, sorted by similarity:")
    # print(idxs[0])
    # print('Similarity scores:')
    # print(dist2sim(dists[0]))
    return idxs[0], dist2sim(dists[0])

In [21]:
def retrieve_results(query, index, k=10, verbose=False, tfidf=False):
    initial_retrieval = k*5
    s = time.time()
    if tfidf:
        most_similar_raw = get_most_similar_ids(query, index, initial_retrieval, tfidf_clf)
    else:
        most_similar_raw = get_most_similar_ids(query, index, initial_retrieval)
    s1 = time.time()
    pruned = prune_results_for_authors_wo_tags(most_similar_raw, query, k)
    s2 = time.time()
    if verbose:
        print(f"Full search execution time: {time.time() - s} seconds")
        print(f"from which {s1-s} s. in the search and {s2 - s1} s. in the pruning.")
        print("===")
        print("Pruned IDS, sorted by similarity:")
        print(pruned[0])
        print('Similarity scores:')
        print(pruned[1])
    return pruned

In [22]:
def retrieve_results_average(query, index, k=10, verbose=False, tfidf=False):
    initial_retrieval = k*5
    s = time.time()
    if tfidf:
        most_similar_raw = get_most_similar_ids(query, index, initial_retrieval, tfidf_clf)
    else:
        most_similar_raw = get_most_similar_ids(query, index, initial_retrieval)
    s1 = time.time()
    pruned = prune_results_for_authors_wo_tags_average(most_similar_raw, query, k)
    s2 = time.time()
    if verbose:
        print(f"Full search execution time: {time.time() - s} seconds")
        print(f"from which {s1-s} s. in the search and {s2 - s1} s. in the pruning.")
        print("===")
        print("Pruned author IDS, sorted by similarity:")
        print(pruned[0])
        print('Similarity scores:')
        print(pruned[1])
    return pruned

In [23]:
def calculate_distances_from_query_to_fos(query, fos_tags, tfidf_classifier=None):

    if tfidf_classifier:
        fos_tag_embeddings = tfidf_classifier.transform(fos_tags)
        query_emb = tfidf_classifier.transform([query])[0]
    else:
        fos_tag_embeddings = embedder.encode(fos_tags)
        query_emb = embedder.encode([query])[0]

    distances = [ 1- scipy.spatial.distance.cdist([query_emb], [fos_tag_embedding], 'cosine')[0][0] for fos_tag_embedding in fos_tag_embeddings]

    return [(ft, d) for ft, d in zip(fos_tags, distances)]

In [24]:
def retrieve_author_tags(author_id):
    try:
        return ast.literal_eval(authors[authors.id == author_id].tags.values[0])
    except:
        return {}

## Functions for checking relevancy of a certain author with regard to query

In [25]:
def check_if_author_relevant(author_id, query):
    query = query.lower()
    tags = [t['t'].lower() for t in retrieve_author_tags(author_id)]
    if tags:
        if query in tags:
            return True
        else:
            return False
    else:
        return "Not in the dataset or no tags present!"

In [26]:
def check_if_author_relevant_approximate(author_id, query, similarity_threshold=0.7, tfidf=False):
    query = query.lower()
    tags = [t['t'].lower() for t in retrieve_author_tags(author_id)]
    if tfidf:
        distances = calculate_distances_from_query_to_fos(query, tags, tfidf_clf)
    else:
        distances = calculate_distances_from_query_to_fos(query, tags)
    similar = [d for d in distances if d[1] > similarity_threshold]
    # print("Approx. similar:", similar)
    if similar:
        return True
    else:
        return False

# Actual retrieval examples

In [27]:
queries = ['cluster analysis', 'Image segmentation', 'Parallel algorithm', 'Monte Carlo method',
           'Convex optimization', 'Dimensionality reduction', 'Facial recognition system', 
           'k-nearest neighbors algorithm', 'Hierarchical clustering', 'Automatic summarization',
           'Dynamic programming', 'Genetic algorithm', 'Human-computer interaction', 'Categorial grammar', 
           'Semantic Web', 'fuzzy logic', 'image restoration', 'generative model', 'search algorithm',
           'sample size determination', 'anomaly detection', 'sentiment analysis', 'semantic similarity',
           'world wide web', 'gibbs sampling', 'user interface', 'belief propagation', 'interpolation', 
           'wavelet transform', 'transfer of learning', 'topic model', 'clustering high-dimensional data', 
           'game theory', 'biometrics', 'constraint satisfaction', 'combinatorial optimization', 'speech processing',
           'multi-agent system', 'mean field theory', 'social network', 'lattice model', 'automatic image annotation',
           'computational geometry', 'Evolutionary algorithm', 'web search query', 'eye tracking', 'query optimization',
           'logic programming', 'Hyperspectral imaging', 'Bayesian statistics', 'kernel density estimation',
           'learning to rank', 'relational database', 'activity recognition', 'wearable computer', 'big data', 
           'ensemble learning', 'wordnet', 'medical imaging', 'deconvolution', 'Latent Dirichlet allocation', 
           'Euclidian distance', 'web service', 'multi-task learning', 'Linear separability', 'OWL-S',
           'Wireless sensor network', 'Semantic role labeling', 'Continuous-time Markov chain', 
           'Open Knowledge Base Connectivity', 'Propagation of uncertainty', 'Fast Fourier transform', 
           'Security token', 'Novelty detection', 'semantic grid', 'Knowledge extraction', 
           'Computational biology', 'Web 2.0', 'Network theory', 'Video denoising', 'Quantum information science',
           'Color quantization', 'social web', 'entity linking', 'information privacy', 'random forest', 
           'cloud computing', 'Knapsack problem', 'Linear algebra', 'batch processing', 'rule induction', 
           'Uncertainty quantification', 'Computer architecture', 'Best-first search', 'Gaussian random field',
           'Support vector machine', 'ontology language', 'machine translation', 'middleware', 'Newton\'s method']

In [28]:
index = load_relevant_index("separate_sbert")

In [29]:
query = "world wide web"

In [30]:
retrieve_results(query.lower(), index, k=10, verbose=True, tfidf=False);
# get_most_similar_ids(query, index)

Full search execution time: 1.5033385753631592 seconds
from which 0.8180646896362305 s. in the search and 0.6852731704711914 s. in the pruning.
===
Pruned IDS, sorted by similarity:
[2154085356, 1601547964, 2395256202, 1568893392, 2768317741, 2059713800, 2147164982, 2138350977, 2119012894, 2157327941]
Similarity scores:
[0.8862335, 0.75639015, 0.7390435, 0.7349443, 0.7105715, 0.6853581, 0.6767964, 0.6290454, 0.6205123, 0.6062963]


In [143]:
get_information_by_id("1594924988", query, tfidf=False)

'Title: Data Clustering: Theory, Algorithms, and Applications'
===
('Abstract: Preface Part I. Clustering, Data and Similarity Measures: 1. Data '
 'clustering 2. DataTypes 3. Scale conversion 4. Data standardization and '
 'transformation 5. Data visualization 6. Similarity and dissimilarity '
 'measures Part II. Clustering Algorithms: 7. Hierarchical clustering '
 'techniques 8. Fuzzy clustering algorithms 9. Center Based Clustering '
 'Algorithms 10. Search based clustering algorithms 11. Graph based clustering '
 'algorithms 12. Grid based clustering algorithms 13. Density based clustering '
 'algorithms 14. Model based clustering algorithms 15. Subspace clustering 16. '
 'Miscellaneous algorithms 17. Evaluation of clustering algorithms Part III. '
 'Applications of Clustering: 18. Clustering gene expression data Part IV. '
 'Matlab and C++ for Clustering: 19. Data clustering in Matlab 20. Clustering '
 'in C/C++ A. Some clustering algorithms B. Thekd-tree data structure C. '
 'Mat

In [67]:
get_information_by_author_id("2593877498", "cluster analysis")

'Name: Aristides Gionis'
===
'Number of publications: 35'
===
'Number of citations: 4256'
===
'Exactly relevant: True'
===
'Approximately relevant: True'


In [None]:
get_author_ranking_exact(query, index, tfidf=False)

In [66]:
retrieve_author_tags(2593877498)

[{'t': 'Nearest neighbor search'},
 {'t': 'Random testing'},
 {'t': 'Computational complexity theory'},
 {'t': 'Scalability'},
 {'t': 'Markov process'},
 {'t': 'Correlation clustering'},
 {'t': 'Best-effort delivery'},
 {'t': 'Algorithm'},
 {'t': 'Metasearch engine'},
 {'t': 'Sequence'},
 {'t': 'Bioinformatics'},
 {'t': 'Matrix decomposition'},
 {'t': 'Bayesian information criterion'},
 {'t': 'Data collection'},
 {'t': 'Approximation algorithm'},
 {'t': 'Time series'},
 {'t': 'DNA'},
 {'t': 'Computational biology'},
 {'t': 'Ranking'},
 {'t': 'External Data Representation'},
 {'t': 'Randomized algorithm'},
 {'t': 'Boolean algebra'},
 {'t': 'Data processing'},
 {'t': 'Regular expression'},
 {'t': 'Environmental science'},
 {'t': 'Wireless sensor network'},
 {'t': 'Collaborative filtering'},
 {'t': 'Data structure'},
 {'t': 'Data analysis'},
 {'t': 'Minimum description length'},
 {'t': 'Database'},
 {'t': 'Network topology'},
 {'t': 'Resampling'},
 {'t': 'Order theory'},
 {'t': 'Cluster a

# Author re-ranking functions

In [327]:
i, d = get_most_similar_ids(query.lower(), index, 100)

In [328]:
len([x for x in d if x > 0.6])

3

In [329]:
d

array([0.6774728 , 0.6267048 , 0.6201817 , 0.56027675, 0.5555264 ,
       0.55523175, 0.54782987, 0.52863693, 0.52390087, 0.5218121 ,
       0.51709706, 0.5156915 , 0.51461625, 0.5075133 , 0.49405396,
       0.49302465, 0.49144435, 0.49137604, 0.49063826, 0.48952   ,
       0.48618758, 0.48452473, 0.48440003, 0.47892427, 0.47764754,
       0.47451985, 0.47381836, 0.46670514, 0.46172583, 0.46117938,
       0.4595275 , 0.45459133, 0.45347226, 0.45202821, 0.4520086 ,
       0.4491104 , 0.44467467, 0.4441229 , 0.4357288 , 0.43286896,
       0.43240428, 0.4315712 , 0.43128896, 0.42991185, 0.42907357,
       0.42684197, 0.4264232 , 0.4248494 , 0.42293078, 0.4221788 ,
       0.42216408, 0.4211011 , 0.41978258, 0.41809082, 0.41717345,
       0.41616   , 0.41365236, 0.41316593, 0.41274178, 0.41271734,
       0.41213948, 0.4115572 , 0.4114157 , 0.41096544, 0.4065985 ,
       0.40382224, 0.40341467, 0.40215343, 0.4009413 , 0.39993775,
       0.3994406 , 0.39857626, 0.39777416, 0.3973974 , 0.39488

In [39]:
from collections import defaultdict
import math

In [173]:
retrieve_pub_count_by_id(2589367414)

2

In [36]:
AVERAGE_N_PUBS = int(authors.n_pubs.mean())

In [37]:
AVERAGE_N_PUBS

58

In [398]:
1.60771036 * math.log(2+100*(58/500), 2)

6.053889222667731

In [None]:
0.25 9.45   
0.252 5.49
5.85  15.94
11.02 21
16    26

In [216]:
sum([math.exp(score) for score in [11.079944109239786]])

64857.25838543333

In [218]:
math.exp(5)

148.4131591025766

In [51]:
def create_score_author_dict(query, retrieved_paper_ids, retrieved_distances, strategy="uniform", normalized=False, average_pub_count=58, 
                            normalization_alpha=1, extra_normalization_term=10):
    # authors = [[item["id"] for item in get_authors_by_id(str(i))] for i in retrieved_paper_ids]
    # return authors
    # scores_per_paper = [(score / len(list(authors_))) for score, authors_ in zip(retrieved_distances, authors)]
    # return scores_per_paper

    def expCombSUM(list_of_scores):
        return sum([math.exp(score) for score in list_of_scores])
    
    def normalize_score(score, l_pro, average_l=average_pub_count, alpha=normalization_alpha):
        normalized_score = score * math.log(1 + alpha * (average_l / (l_pro+extra_normalization_term)), 2)
        return normalized_score

    scores_per_author = defaultdict(list)
    for pi, score in zip(retrieved_paper_ids, retrieved_distances):
        # Prune only for author that exist in our data.
        authors = [item["id"] for item in get_authors_by_id(str(pi)) if check_if_author_relevant(int(item["id"]), query) != 'Not in the dataset or no tags present!']
        if authors:
            if strategy == "uniform":
                score_per_author = score / len(authors)
                for author in authors:
                    if normalized:
                        pub_count = retrieve_pub_count_by_id(int(author))
                        normalized_score = normalize_score(score_per_author, pub_count)
                        scores_per_author[author].append(normalized_score)
                    else:
                        scores_per_author[author].append(score_per_author)
            elif strategy == "binary":
                score_per_author = score
                for author in authors:
                    if normalized:
                        pub_count = retrieve_pub_count_by_id(int(author))
                        normalized_score = normalize_score(score_per_author, pub_count)
                        scores_per_author[author].append(normalized_score)
                    else:
                        scores_per_author[author].append(score_per_author)
            elif strategy == "descending":
                decay_factor = 1
                for author in authors:
                    if normalized:
                        score_d = score*decay_factor
                        pub_count = retrieve_pub_count_by_id(int(author))
                        normalized_score = normalize_score(score_d, pub_count)
                        scores_per_author[author].append(normalized_score)
                        decay_factor -= 0.2
                    else:
                        scores_per_author[author].append(score*decay_factor)
                        decay_factor -= 0.2
            elif strategy == "parabolic":
                #  TODO: here we did not yet write the normalization code because we do not run it for this config.
                decay_factor = 0.8
                scores_per_author[authors[0]].append(score)
                scores_per_author[authors[-1]].append(score)
                for author in authors[1:-1]:
                    scores_per_author[author].append(score*decay_factor)
                    decay_factor -= 0.2
            # scores = {author : score_per_author for author in authors}
            # scores_per_paper[pi] = scores
        else:
            continue
        
    authorship_scores = {k : expCombSUM(v) for k, v in scores_per_author.items()}

    return authorship_scores

In [423]:
aa = create_score_author_dict(query, i, d, "binary", normalized=True, normalization_alpha=1)

In [401]:
query

'world wide web'

In [35]:
def produce_authors_ranking(authorship_scores):
    sortd = [(k, v) for k, v in sorted(authorship_scores.items(), key=lambda item: item[1], reverse=True)]
    return sortd

In [403]:
produce_authors_ranking(aa)[:10]

[('1243699091', 160.96043931062977),
 ('2687102544', 87.25080609385644),
 ('2308609296', 80.47302732308727),
 ('2700131745', 72.65723702922999),
 ('836368383', 68.42711735563111),
 ('1829258144', 62.595710419009194),
 ('2345687710', 58.51570204196498),
 ('2114878113', 54.54940569852606),
 ('2141172858', 41.46401206334932),
 ('2043954528', 39.19098392418984)]

In [None]:
get_information_by_author_id("2151264347", query)

In [None]:
query

In [54]:
def get_author_ranking_exact_v2(query, index, k=10, tfidf=False, strategy="uniform", 
                                normalized=False, norm_alpha=100, extra_term=10):
    
    if tfidf:
        i, d = get_most_similar_ids(query.lower(), index, 100, tfidf_clf)
    else:
        i, d = get_most_similar_ids(query.lower(), index, 100)

    author_score_dict = create_score_author_dict(query, i, d, strategy, 
                                                 normalized=normalized, normalization_alpha=norm_alpha, extra_normalization_term=extra_term)

    top_n = produce_authors_ranking(author_score_dict)[:k]

    relevancies = [check_if_author_relevant(int(aid), query) for aid, _ in top_n]

    ranking = {}

    for rank, (author, relevancy) in enumerate(zip([a[0] for a in top_n], relevancies)):
        if author not in ranking.keys():
            ranking[author] = {"relevancy": relevancy, "rank": rank}
        else:
            continue

    return ranking

def get_author_ranking_approximate_v2(query, index, k=10, similarity_threshold=0.7, tfidf=False, strategy="uniform", 
                                      normalized=False, norm_alpha=100, extra_term=10):
    
    if tfidf:
        i, d = get_most_similar_ids(query.lower(), index, 100, tfidf_clf)
    else:
        i, d = get_most_similar_ids(query.lower(), index, 100)

    author_score_dict = create_score_author_dict(query, i, d, strategy, 
                                                 normalized=normalized, normalization_alpha=norm_alpha, extra_normalization_term=extra_term)

    top_n = produce_authors_ranking(author_score_dict)[:k]
    
    relevancies = [check_if_author_relevant_approximate(int(aid), query, similarity_threshold, tfidf=tfidf) for aid, _ in top_n]

    ranking = {}

    for rank, (author, relevancy) in enumerate(zip([a[0] for a in top_n], relevancies)):
        if author not in ranking.keys():
            ranking[author] = {"relevancy": relevancy, "rank": rank}
        else:
            continue

    return ranking

In [100]:
norm_semantic_role_labeling = get_author_ranking_exact_v2("gibbs sampling", index, strategy="binary", tfidf=False, normalized=True, extra_term=0, norm_alpha=1000)

In [101]:
not_norm_semantic_role_labeling = get_author_ranking_exact_v2("gibbs sampling", index, strategy="binary", tfidf=False, normalized=False, extra_term=100, norm_alpha=1000)

In [108]:
norm_semantic_role_labeling

{'2675564889': {'relevancy': True, 'rank': 0},
 '2568602334': {'relevancy': True, 'rank': 1},
 '2671169580': {'relevancy': False, 'rank': 2},
 '2132896614': {'relevancy': True, 'rank': 3},
 '2304614720': {'relevancy': True, 'rank': 4},
 '2673417165': {'relevancy': True, 'rank': 5},
 '2621515602': {'relevancy': False, 'rank': 6},
 '2106178016': {'relevancy': False, 'rank': 7},
 '2564840304': {'relevancy': False, 'rank': 8},
 '2236759972': {'relevancy': False, 'rank': 9}}

In [109]:
not_norm_semantic_role_labeling

{'183722240': {'relevancy': True, 'rank': 0},
 '2149454608': {'relevancy': True, 'rank': 1},
 '2347934206': {'relevancy': True, 'rank': 2},
 '2056153991': {'relevancy': True, 'rank': 3},
 '2435751034': {'relevancy': True, 'rank': 4},
 '2123777986': {'relevancy': False, 'rank': 5},
 '1964890520': {'relevancy': False, 'rank': 6},
 '2635064073': {'relevancy': True, 'rank': 7},
 '2131550435': {'relevancy': True, 'rank': 8},
 '2304614720': {'relevancy': True, 'rank': 9}}

In [102]:
def calculate_average_npubs_per_ranking(ranking):
    n_pubs = [retrieve_pub_count_by_id(author) for author in ranking.keys()]
    return np.mean(n_pubs)

In [103]:
calculate_average_npubs_per_ranking(norm_semantic_role_labeling)

7.1

In [104]:
calculate_average_npubs_per_ranking(not_norm_semantic_role_labeling)

205.9

In [105]:
def calculate_average_ncitation_per_ranking(ranking):
    n_cit = [retrieve_cit_count_by_id(author) for author in ranking.keys()]
    return np.mean(n_cit)

In [106]:
calculate_average_ncitation_per_ranking(norm_semantic_role_labeling)

268.8

In [107]:
calculate_average_ncitation_per_ranking(not_norm_semantic_role_labeling)

15278.9

In [None]:
# We have to do pruning here too, and remove authors that are not in the dataset for evaluation.

In [None]:
# ai = authors[authors.id == 2310124346].index[0]
# term = "query optimization".capitalize()
# authors.loc[ai, 'tags'] = authors.loc[ai, 'tags'].replace("]", f", {{'t': '{term}'}}]")

In [None]:
# authors.loc[ai, 'tags']

In [None]:
# INTUITION:
# Given a paper that has 4 authors and we choose the uniform strategy where each author gets 1/|authors| score of the paper
# we then take author X, take all the scores achieved by the uniform strategy above, and calculate the expCombSUM over them.
# This is then the end score for that author. 

In [None]:
# Because of the way the index works with the depth of the search and the memory limitations, we retrieve all the papers that have
# score higher than a threshold (let's say higher than 0.5) similarity to the query.

In [None]:
# Because my set is not THAT big, let's take top 100 papers instead of top 1000 or threshold based.

# Evaluation functions

In [93]:
data.columns

Index(['id', 'title', 'authors', 'venue', 'year', 'n_citation', 'page_start',
       'page_end', 'doc_type', 'publisher', 'volume', 'issue', 'fos', 'doi',
       'references', 'indexed_abstract', 'abstract',
       'cleaned_abstract_sentences', 'cleaned_title', 'title_embedding'],
      dtype='object')

In [41]:
# Get the exact topic query evaluation for the 100 queries.
exact = [get_author_ranking_exact_v2(query, index, tfidf=False, strategy="binary", normalized=True, norm_alpha=1) for query in queries]

# Get the approximate topic query evaluation for the 100 queries.
approximate = [get_author_ranking_approximate_v2(query, index, tfidf=False, strategy="binary", normalized=True, norm_alpha=1) for query in queries]

In [47]:
# Get the exact topic query evaluation for the 100 queries.
exact_uniform = [get_author_ranking_exact_v2(query, index, tfidf=False, strategy="uniform", normalized=True, norm_alpha=1) for query in queries]

# Get the approximate topic query evaluation for the 100 queries.
approximate_uniform = [get_author_ranking_approximate_v2(query, index, tfidf=False, strategy="uniform", normalized=True, norm_alpha=1) for query in queries]

In [48]:
print("Exact binary MRR@10:", mean_reciprocal_rank(exact))
print("Approximate binary MRR@10:", mean_reciprocal_rank(approximate))
print("Exact binary MAP@10:", mean_average_precision(exact))
print("Approximate binary MAP@10:", mean_average_precision(approximate))
print("Exact binary MP@10:", mean_precision_at_n(exact, 10))
print("Approximate binary MP@10:", mean_precision_at_n(approximate, 10))
print("Exact binary MP@5:", mean_precision_at_n(exact, 5))
print("Approximate binary MP@5:", mean_precision_at_n(approximate, 5))
print("---")
print("Exact uniform MRR@10:", mean_reciprocal_rank(exact_uniform))
print("Approximate uniform MRR@10:", mean_reciprocal_rank(approximate_uniform))
print("Exact uniform MAP@10:", mean_average_precision(exact_uniform))
print("Approximate uniform MAP@10:", mean_average_precision(approximate_uniform))
print("Exact uniform MP@10:", mean_precision_at_n(exact_uniform, 10))
print("Approximate uniform MP@10:", mean_precision_at_n(approximate_uniform, 10))
print("Exact uniform MP@5:", mean_precision_at_n(exact_uniform, 5))
print("Approximate uniform MP@5:", mean_precision_at_n(approximate_uniform, 5))

Exact binary MRR@10: 0.619
Approximate binary MRR@10: 0.678
Exact binary MAP@10: 0.174
Approximate binary MAP@10: 0.284
Exact binary MP@10: 0.282
Approximate binary MP@10: 0.42
Exact binary MP@5: 0.272
Approximate binary MP@5: 0.398
---
Exact uniform MRR@10: 0.572
Approximate uniform MRR@10: 0.654
Exact uniform MAP@10: 0.207
Approximate uniform MAP@10: 0.32
Exact uniform MP@10: 0.336
Approximate uniform MP@10: 0.471
Exact uniform MP@5: 0.292
Approximate uniform MP@5: 0.422


In [49]:
# Get the exact topic query evaluation for the 100 queries.
exact_bigger_alpha = [get_author_ranking_exact_v2(query, index, tfidf=False, strategy="binary", normalized=True, norm_alpha=1000) for query in queries]

# Get the approximate topic query evaluation for the 100 queries.
approximate_bigger_alpha = [get_author_ranking_approximate_v2(query, index, tfidf=False, strategy="binary", normalized=True, norm_alpha=1000) for query in queries]

In [50]:
print("Exact binary MRR@10:", mean_reciprocal_rank(exact_bigger_alpha))
print("Approximate binary MRR@10:", mean_reciprocal_rank(approximate_bigger_alpha))
print("Exact binary MAP@10:", mean_average_precision(exact_bigger_alpha))
print("Approximate binary MAP@10:", mean_average_precision(approximate_bigger_alpha))
print("Exact binary MP@10:", mean_precision_at_n(exact_bigger_alpha, 10))
print("Approximate binary MP@10:", mean_precision_at_n(approximate_bigger_alpha, 10))
print("Exact binary MP@5:", mean_precision_at_n(exact_bigger_alpha, 5))
print("Approximate binary MP@5:", mean_precision_at_n(approximate_bigger_alpha, 5))

Exact binary MRR@10: 0.694
Approximate binary MRR@10: 0.737
Exact binary MAP@10: 0.218
Approximate binary MAP@10: 0.331
Exact binary MP@10: 0.318
Approximate binary MP@10: 0.452
Exact binary MP@5: 0.324
Approximate binary MP@5: 0.456


In [58]:
# Get the exact topic query evaluation for the 100 queries.
exact_bigger_alpha_term = [get_author_ranking_exact_v2(query, index, tfidf=False, strategy="binary", 
                                                       normalized=True, norm_alpha=1000, extra_term=50) for query in queries]

# Get the approximate topic query evaluation for the 100 queries.
approximate_bigger_alpha_term = [get_author_ranking_approximate_v2(query, index, tfidf=False, strategy="binary", 
                                                                   normalized=True, norm_alpha=1000, extra_term=50) for query in queries]

In [59]:
print("Exact binary MRR@10:", mean_reciprocal_rank(exact_bigger_alpha_term))
print("Approximate binary MRR@10:", mean_reciprocal_rank(approximate_bigger_alpha_term))
print("Exact binary MAP@10:", mean_average_precision(exact_bigger_alpha_term))
print("Approximate binary MAP@10:", mean_average_precision(approximate_bigger_alpha_term))
print("Exact binary MP@10:", mean_precision_at_n(exact_bigger_alpha_term, 10))
print("Approximate binary MP@10:", mean_precision_at_n(approximate_bigger_alpha_term, 10))
print("Exact binary MP@5:", mean_precision_at_n(exact_bigger_alpha_term, 5))
print("Approximate binary MP@5:", mean_precision_at_n(approximate_bigger_alpha_term, 5))

Exact binary MRR@10: 0.769
Approximate binary MRR@10: 0.81
Exact binary MAP@10: 0.362
Approximate binary MAP@10: 0.49
Exact binary MP@10: 0.455
Approximate binary MP@10: 0.589
Exact binary MP@5: 0.466
Approximate binary MP@5: 0.602


In [60]:
# Get the exact topic query evaluation for the 100 queries.
exact_bigger_alpha_term = [get_author_ranking_exact_v2(query, index, tfidf=False, strategy="binary", 
                                                       normalized=True, norm_alpha=1000, extra_term=100) for query in queries]

# Get the approximate topic query evaluation for the 100 queries.
approximate_bigger_alpha_term = [get_author_ranking_approximate_v2(query, index, tfidf=False, strategy="binary", 
                                                                   normalized=True, norm_alpha=1000, extra_term=100) for query in queries]

In [61]:
print("Exact binary MRR@10:", mean_reciprocal_rank(exact_bigger_alpha_term))
print("Approximate binary MRR@10:", mean_reciprocal_rank(approximate_bigger_alpha_term))
print("Exact binary MAP@10:", mean_average_precision(exact_bigger_alpha_term))
print("Approximate binary MAP@10:", mean_average_precision(approximate_bigger_alpha_term))
print("Exact binary MP@10:", mean_precision_at_n(exact_bigger_alpha_term, 10))
print("Approximate binary MP@10:", mean_precision_at_n(approximate_bigger_alpha_term, 10))
print("Exact binary MP@5:", mean_precision_at_n(exact_bigger_alpha_term, 5))
print("Approximate binary MP@5:", mean_precision_at_n(approximate_bigger_alpha_term, 5))

Exact binary MRR@10: 0.813
Approximate binary MRR@10: 0.853
Exact binary MAP@10: 0.404
Approximate binary MAP@10: 0.548
Exact binary MP@10: 0.491
Approximate binary MP@10: 0.638
Exact binary MP@5: 0.52
Approximate binary MP@5: 0.658


In [62]:
# Get the exact topic query evaluation for the 100 queries.
exact_bigger_alpha_term = [get_author_ranking_exact_v2(query, index, tfidf=False, strategy="binary", 
                                                       normalized=True, norm_alpha=100, extra_term=100) for query in queries]

# Get the approximate topic query evaluation for the 100 queries.
approximate_bigger_alpha_term = [get_author_ranking_approximate_v2(query, index, tfidf=False, strategy="binary", 
                                                                   normalized=True, norm_alpha=100, extra_term=100) for query in queries]

In [63]:
print("Exact binary MRR@10:", mean_reciprocal_rank(exact_bigger_alpha_term))
print("Approximate binary MRR@10:", mean_reciprocal_rank(approximate_bigger_alpha_term))
print("Exact binary MAP@10:", mean_average_precision(exact_bigger_alpha_term))
print("Approximate binary MAP@10:", mean_average_precision(approximate_bigger_alpha_term))
print("Exact binary MP@10:", mean_precision_at_n(exact_bigger_alpha_term, 10))
print("Approximate binary MP@10:", mean_precision_at_n(approximate_bigger_alpha_term, 10))
print("Exact binary MP@5:", mean_precision_at_n(exact_bigger_alpha_term, 5))
print("Approximate binary MP@5:", mean_precision_at_n(approximate_bigger_alpha_term, 5))

Exact binary MRR@10: 0.787
Approximate binary MRR@10: 0.838
Exact binary MAP@10: 0.403
Approximate binary MAP@10: 0.542
Exact binary MP@10: 0.492
Approximate binary MP@10: 0.634
Exact binary MP@5: 0.512
Approximate binary MP@5: 0.65


In [43]:
def mean_reciprocal_rank(results):
    partial_ranks = []
    
    for result in results:
        sortd = sorted(result.items(), key=lambda item: item[1]['rank'])

        for s in sortd:
            if s[1]['relevancy'] == True:
                # We had to do rank from 1 on instead of 0 on because of the 1 / rank formula.
                partial_ranks.append(1 / (s[1]['rank']+1))
                break
    
    mrr = np.around(np.mean(partial_ranks), decimals=3)
    
    return mrr

In [70]:
mean_reciprocal_rank(approximate_parabolic)

0.878

In [44]:
def mean_average_precision(results):
    
    average_precision_scores = []
    
    for result in results:
        sortd = sorted(result.items(), key=lambda item: item[1]['rank'])
        
        average_precison_partials_list = []
        current_sublist_size = 0
        relevant_found = 0
        
        for s in sortd:
            if s[1]['relevancy'] == True:
                current_sublist_size += 1
                relevant_found += 1
                average_precision_partial = relevant_found / current_sublist_size
                average_precison_partials_list.append(average_precision_partial)
            else:
                current_sublist_size += 1

        average_precision = np.sum(average_precison_partials_list) / len(sortd)
        average_precision_scores.append(average_precision)
    
    mapr = np.around(np.mean(average_precision_scores), decimals=3)
    
    return mapr

In [73]:
mean_average_precision(exact_parabolic)

0.486

In [45]:
def mean_precision_at_n(results, n=5):
    
    average_precision_scores = []
    
    for result in results:
        
        sortd = sorted(result.items(), key=lambda item: item[1]['rank'])
        
        correct = 0
        
        for s in sortd[:n]:
            if s[1]['relevancy'] == True:
                correct += 1
        
        average_precision_scores.append(correct / n)
    
    mpan = np.around(np.mean(average_precision_scores), decimals=3)
    
    return mpan

In [77]:
mean_precision_at_n(exact_parabolic, 10)

0.565

In [None]:
import pickle
def save_exact_to_pickle(name):
    path = f'exact_evaluation_{name}.pickle'
    print("exact path:", path)
    with open(path, 'wb') as handle:
        pickle.dump(exact, handle, protocol=pickle.HIGHEST_PROTOCOL)

def save_approximate_to_pickle(name):
    path = f'approximate_evaluation_07thresh_{name}.pickle'
    print("Approximate path:", path)
    with open(path, 'wb') as handle:
        pickle.dump(approximate, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
current = "average_separate_sbert"
save_exact_to_pickle(current)
save_approximate_to_pickle(current)

exact path: exact_evaluation_average_separate_sbert.pickle
Approximate path: approximate_evaluation_07thresh_average_separate_sbert.pickle


In [None]:
exact[0]

{1686244749: {'rank': 2, 'relevancy': True},
 2087992123: {'rank': 6, 'relevancy': True},
 2120281063: {'rank': 1, 'relevancy': True},
 2122589528: {'rank': 7, 'relevancy': False},
 2136847405: {'rank': 0, 'relevancy': True},
 2142158874: {'rank': 4, 'relevancy': True},
 2169261213: {'rank': 3, 'relevancy': True},
 2255627707: {'rank': 8, 'relevancy': True},
 2310891340: {'rank': 9, 'relevancy': False},
 2473457260: {'rank': 5, 'relevancy': True}}

In [None]:
exact[73]

{1670727492: {'rank': 7, 'relevancy': False},
 1862709724: {'rank': 0, 'relevancy': False},
 1983671951: {'rank': 1, 'relevancy': False},
 2057144527: {'rank': 4, 'relevancy': False},
 2170785820: {'rank': 6, 'relevancy': False},
 2250996746: {'rank': 9, 'relevancy': False},
 2253241183: {'rank': 2, 'relevancy': False},
 2461703321: {'rank': 3, 'relevancy': False},
 2574610794: {'rank': 5, 'relevancy': False},
 2577966764: {'rank': 8, 'relevancy': False}}

In [None]:
queries[73]

'Novelty detection'

In [None]:
# ai = authors[authors.id == 2325036826].index[0]
# term = query.capitalize()
# authors.loc[ai, 'tags'] = authors.loc[ai, 'tags'].replace("]", f", {{'t': '{term}'}}]")

In [None]:
exact[67]

{'1151530292': {'rank': 3, 'relevancy': False},
 '2004366870': {'rank': 5, 'relevancy': True},
 '2130711152': {'rank': 4, 'relevancy': True},
 '2135488253': {'rank': 0, 'relevancy': True},
 '2616859289': {'rank': 2, 'relevancy': True},
 '374805745': {'rank': 7, 'relevancy': False},
 '56197593': {'rank': 6, 'relevancy': True},
 '798588419': {'rank': 1, 'relevancy': False}}

## Code for the non-binary evaluation

In [137]:
authors[authors.n_pubs > 100] 

Unnamed: 0,id,n_citation,n_pubs,name,pubs,tags
3,100167829,1184,117,Fabio Massimo Zanzotto,"[{'i': '371408518', 'r': 2}, {'i': '1487779154...","[{'t': 'Semantic data model'}, {'t': 'Cancer'}..."
4,100252078,1042,131,Stefano Serra-Capizzano,"[{'i': '2132492255', 'r': 1}, {'i': '252184247...","[{'t': 'Spectral power distribution'}, {'t': '..."
19,100748767,1689,149,Christian Goerick,"[{'i': '44505', 'r': 3}, {'i': '2132496341', '...","[{'t': 'Operating system'}, {'t': 'Homeostasis..."
23,100988613,2312,248,Eiichiro Sumita,"[{'i': '2132594454', 'r': 1}, {'i': '199925832...","[{'t': 'Data quality'}, {'t': 'Principle of ma..."
28,1013389924,5522,228,Pavol Hell,"[{'i': '2410734113', 'r': 0}, {'i': '199932317...","[{'t': 'Hamiltonian path'}, {'t': 'Binary tree..."
...,...,...,...,...,...,...
67751,286732139,6104,164,Gadiel Seroussi,"[{'i': '2229475340', 'r': 5}, {'i': '148955953...","[{'t': 'Geometric distribution'}, {'t': 'Codin..."
67761,286913512,8146,162,Javier R. Movellan,"[{'i': '2133107415', 'r': 2}, {'i': '8203773',...","[{'t': 'Vision'}, {'t': 'Probability density f..."
67763,286952618,3978,384,Cecilia Laschi,"[{'i': '2320062248', 'r': 2}, {'i': '213270181...","[{'t': 'Artificial muscle'}, {'t': 'Electroenc..."
67782,287405489,9166,670,Peter Willett,"[{'i': '1487494573', 'r': 1}, {'i': '213249600...","[{'t': 'Uncertainty principle'}, {'t': 'Linear..."


In [120]:
def get_pubs_by_author_id(author_id):
    pubs = ast.literal_eval(authors[authors.id == author_id].pubs.values[0])
    return [p["i"] for p in pubs]

In [129]:
def get_all_tag_frequencies(author_id):
    pubs = get_pubs_by_author_id(author_id)
    all_fos = []
    for p in pubs:
        try:
            fos = [f["name"] for f in get_fos_by_id(p)]
            all_fos.extend(fos)
        except:
            continue
    return Counter(all_fos).most_common()

In [147]:
get_all_tag_frequencies(2347934206)

[('Mathematics', 13),
 ('Statistics', 12),
 ('Econometrics', 10),
 ('Feature selection', 7),
 ('Model selection', 6),
 ('Artificial intelligence', 5),
 ('Bayesian linear regression', 4),
 ('Pattern recognition', 4),
 ('Linear regression', 4),
 ('Markov chain Monte Carlo', 4),
 ('g-prior', 3),
 ("Bayes' theorem", 3),
 ('Nonparametric regression', 3),
 ('Posterior probability', 3),
 ('Boosting (machine learning)', 3),
 ('Ensemble learning', 3),
 ('Machine learning', 3),
 ('Statistical model', 3),
 ('Prior probability', 3),
 ('Computer science', 3),
 ('Hyperparameter', 2),
 ('Marginal likelihood', 2),
 ('Bayes factor', 2),
 ('Bayesian hierarchical modeling', 2),
 ('Mixture model', 2),
 ('Gibbs sampling', 2),
 ('Bayesian probability', 2),
 ('Bayes estimator', 2),
 ('Minimax', 2),
 ('Monte Carlo method', 2),
 ("Bayes' rule", 1),
 ('Shrinkage estimator', 1),
 ('Naive Bayes classifier', 1),
 ('Bayes error rate', 1),
 ('Latent variable', 1),
 ('Multinomial distribution', 1),
 ('Gradient boosti

In [143]:
not_norm_semantic_role_labeling

{'183722240': {'relevancy': True, 'rank': 0},
 '2149454608': {'relevancy': True, 'rank': 1},
 '2347934206': {'relevancy': True, 'rank': 2},
 '2056153991': {'relevancy': True, 'rank': 3},
 '2435751034': {'relevancy': True, 'rank': 4},
 '2123777986': {'relevancy': False, 'rank': 5},
 '1964890520': {'relevancy': False, 'rank': 6},
 '2635064073': {'relevancy': True, 'rank': 7},
 '2131550435': {'relevancy': True, 'rank': 8},
 '2304614720': {'relevancy': True, 'rank': 9}}

In [148]:
data.sample(5)

Unnamed: 0,id,title,authors,venue,year,n_citation,page_start,page_end,doc_type,publisher,volume,issue,fos,doi,references,indexed_abstract,abstract,cleaned_abstract_sentences,cleaned_title,title_embedding
13738,2293576742,Hierarchical Penalization,"[{'name': 'Marie Szafranski', 'id': '208893902...",{'raw': 'neural information processing systems...,2007,20,1457,1464,Conference,,,,"[{'name': 'Mathematical optimization', 'w': 0....",,"[1984568490, 2063978378, 2084812512, 211336235...","{'IndexLength': 95, 'InvertedIndex': {'Hierarc...",Hierarchical penalization is a generic framewo...,[hierarchical penalization is generic framewor...,hierarchical penalization,"[0.33533376, 0.87247276, 0.76960266, -0.684895..."
49743,2109001758,Rotation invariant pattern recognition using r...,"[{'name': 'Guangyi Chen', 'id': '2097743728', ...","{'raw': 'Pattern Recognition', 'id': '414566'}",2005,45,2314,2322,Journal,Elsevier Science Inc.,38.0,12.0,"[{'name': 'Gravitational singularity', 'w': 0....",10.1016/j.patcog.2005.02.008,"[58440828, 1523039621, 1535724155, 1912117388,...","{'IndexLength': 146, 'InvertedIndex': {'In': [...","In this paper, we propose a rotation-invariant...",[in this paper propose rotation invariant desc...,rotation invariant pattern recognition using r...,"[0.46960822, -0.26783612, 0.07540989, 1.084667..."
80387,2056282082,Wavelet-based fingerprint image retrieval,"[{'name': 'Javier A. Montoya Zegarra', 'id': '...",{'raw': 'Journal of Computational and Applied ...,2009,26,294,307,Journal,Elsevier Science Publishers B. V.,227.0,2.0,"[{'name': 'Feature vector', 'w': 0.5480565}, {...",10.1016/j.cam.2008.03.017,"[5057334, 23094940, 1497286775, 1501535725, 15...","{'IndexLength': 193, 'InvertedIndex': {'This':...",This paper presents a novel approach for perso...,[this paper presents novel approach for person...,wavelet based fingerprint image retrieval,"[0.664778, -0.8043635, 0.706223, -0.1874655, -..."
101477,1913131897,A monotonic and continuous two-dimensional war...,"[{'name': 'Seiichi Uchida', 'id': '2119334787'...",{'raw': 'international conference on pattern r...,1998,53,521,524,Conference,IEEE Computer Society,1.0,,"[{'name': 'Monotonic function', 'w': 0.4788851...",10.1109/ICPR.1998.711195,"[1965509304, 2006952799, 2144789800, 215413730...","{'IndexLength': 67, 'InvertedIndex': {'A': [0]...",A two-dimensional warping algorithm is present...,[two dimensional warping algorithm is presente...,monotonic and continuous two dimensional warpi...,"[0.08126288, 0.42223737, -0.20103915, 0.409545..."
95922,2064167155,Model comparison of nonlinear structural equat...,"[{'name': 'Sik-Yum Lee', 'id': '2141248418', '...","{'raw': 'Psychometrika', 'id': '186480540'}",2003,51,27,47,Journal,Springer,68.0,1.0,"[{'name': 'Latent variable', 'w': 0.5037348}, ...",10.1007/BF02296651,"[133598609, 147337607, 1494853941, 1747608905,...","{'IndexLength': 127, 'InvertedIndex': {'Recent...","Recently, it has been recognized that the comm...",[recently has been recognized that the commonl...,model comparison of nonlinear structural equat...,"[0.5628841, -0.13408846, -0.05693955, 1.312504..."


In [1]:
example = [('World Wide Web', 1),
 ('World Wide Web', 1),
 ('World Wide Web', 1),
 ('Social web', 7),
 ('World Wide Web', 7),
 ('World Wide Web', 9),
 ('Social web', 9),
 ('World Wide Web', 566),
 ('World Wide Web', 120),
 ('World Wide Web', 1),
 ('Semantic Web', 1),
 ('Semantic Web', 1),
 ('Semantic Web', 354),
 ('World Wide Web', 354),
 ('World Wide Web', 1),
 ('World Wide Web', 62),
 ('World Wide Web', 27),
 ('Ontology language', 27),
 ('World Wide Web', 46),
 ('Semantic Web', 46),
 ('OWL-S', 2),
 ('Semantic Web', 2),
 ('World Wide Web', 2),
 ('World Wide Web', 3),
 ('Semantic Web', 3),
 ('World Wide Web', 5),
 ('World Wide Web', 57),
 ('Semantic Web', 57),
 ('World Wide Web', 3),
 ('OWL-S', 3),
 ('World Wide Web', 3),
 ('Semantic Web', 3),
 ('World Wide Web', 6),
 ('Semantic grid', 6),
 ('Semantic Web', 6),
 ('World Wide Web', 3),
 ('Semantic Web', 23),
 ('Semantic grid', 23),
 ('World Wide Web', 37),
 ('World Wide Web', 47),
 ('Semantic Web', 47)]

In [4]:
def aggregate_tag_score(one_author):
    tags_with_score = {}
    for t in one_author:
        tag = t[0]
        cit = t[1]
        if tag in tags_with_score:
            tags_with_score[tag] += cit
        else:
            tags_with_score[tag] = cit
    return tags_with_score

In [5]:
aggregate_tag_score(example)

{'OWL-S': 5,
 'Ontology language': 27,
 'Semantic Web': 543,
 'Semantic grid': 29,
 'Social web': 16,
 'World Wide Web': 1362}

In [6]:
dist1 = {'Semantic Web': 6833,
 'World Wide Web': 17360,
 'Semantic grid': 64,
 'Logic programming': 44,
 'Constraint satisfaction': 44,
 'User interface': 17,
 'Web service': 1424,
 'Social web': 884,
 'Web 2.0': 2069,
 'Information privacy': 89,
 'Social network': 84,
 'Computer architecture': 1,
 'Middleware': 676,
 'Multi-agent system': 1263,
 'Knowledge extraction': 4,
 'Semantic similarity': 1,
 'Ontology language': 27,
 'OWL-S': 5}

In [7]:
dist2 = {'Dimensionality reduction': 2,
 'Cluster analysis': 74,
 'Hierarchical clustering': 24,
 'Speech processing': 5,
 'Image segmentation': 49,
 'Medical imaging': 97,
 'Wavelet transform': 7}

In [8]:
dist3 = {'Image segmentation': 75, 'Random forest': 14}
dist4 = {'Topic model': 4,
 'Genetic algorithm': 1,
 'Cluster analysis': 3,
 'Multi-task learning': 7,
 'World Wide Web': 1}
dist5 = {'Rule induction': 1078,
 'User interface': 7,
 'Game theory': 1,
 'World Wide Web': 1,
 'Web 2.0': 1}
dist6 = {'Game theory': 1480, 'Eye tracking': 194}
dist7 = {'Wearable computer': 22,
 'Eye tracking': 38,
 'Wireless sensor network': 1,
 'Support vector machine': 2,
 'Dynamic programming': 76}
dist8 = {'User interface': 10}
dist9 = {'Image segmentation': 130, 'Random forest': 8}
dist10 = {'Computational geometry': 16, 'Generative model': 37}

In [9]:
dists = {"0": dist1, "1": dist2, "2": dist3, "3": dist4, "5": dist6, "6": dist7, "7": dist8, "8": dist9, "9": dist10}

In [13]:
dists["0"]

{'Computer architecture': 1,
 'Constraint satisfaction': 44,
 'Information privacy': 89,
 'Knowledge extraction': 4,
 'Logic programming': 44,
 'Middleware': 676,
 'Multi-agent system': 1263,
 'OWL-S': 5,
 'Ontology language': 27,
 'Semantic Web': 6833,
 'Semantic grid': 64,
 'Semantic similarity': 1,
 'Social network': 84,
 'Social web': 884,
 'User interface': 17,
 'Web 2.0': 2069,
 'Web service': 1424,
 'World Wide Web': 17360}

In [16]:
from collections import defaultdict

In [129]:
def build_top_distribution_per_topic(all_dists, top=10):
    topic_to_expert = defaultdict(list)
    for k, v in all_dists.items():
        for topic, weight in v.items():
            topic_to_expert[topic].append({"author": k,
                                      "weight": weight})
    sorted_topic_to_expert = {}
    for k, v in topic_to_expert.items():
        sorted_topic_to_expert[k] = sorted(v, key=lambda item: item["weight"], reverse=True)[:top]
    return sorted_topic_to_expert


def convert_weights_to_labels(top_distribution, label_three=0.01, label_two=0.10):
    new_dict = {}
    for k, v in top_distribution.items():
        threes = v[:int(len(v) * label_three)]
        for t in threes:
            t["label"] = 3
        twos = v[int(len(v) * label_three) : int(len(v) * label_two)]
        for t in twos:
            t["label"] = 2
        ones = v[int(len(v) * label_two):]
        for t in ones:
            t["label"] = 1
        new_dict[k] = threes+twos+ones
        
    return new_dict
# How do we now convert the weight to a label... maybe procentually
# Maybe top 1% is relevance 3, top 10% is relevance 2, rest is relevance 1, if author is not relevant at all then 0.

In [130]:
top_dist = build_top_distribution_per_topic(dists)

In [131]:
ones = convert_weights_to_labels(top_dist)

In [132]:
ones

{'Cluster analysis': [{'author': '1', 'label': 1, 'weight': 74},
  {'author': '3', 'label': 1, 'weight': 3}],
 'Computational geometry': [{'author': '9', 'label': 1, 'weight': 16}],
 'Computer architecture': [{'author': '0', 'label': 1, 'weight': 1}],
 'Constraint satisfaction': [{'author': '0', 'label': 1, 'weight': 44}],
 'Dimensionality reduction': [{'author': '1', 'label': 1, 'weight': 2}],
 'Dynamic programming': [{'author': '6', 'label': 1, 'weight': 76}],
 'Eye tracking': [{'author': '5', 'label': 1, 'weight': 194},
  {'author': '6', 'label': 1, 'weight': 38}],
 'Game theory': [{'author': '5', 'label': 1, 'weight': 1480}],
 'Generative model': [{'author': '9', 'label': 1, 'weight': 37}],
 'Genetic algorithm': [{'author': '3', 'label': 1, 'weight': 1}],
 'Hierarchical clustering': [{'author': '1', 'label': 1, 'weight': 24}],
 'Image segmentation': [{'author': '8', 'label': 1, 'weight': 130},
  {'author': '2', 'label': 1, 'weight': 75},
  {'author': '1', 'label': 1, 'weight': 49}]