In [32]:
import pandas as pd
import math
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
citation_prediction = pd.read_json('/scratch/sun.jiam/scirepeval_data/test/scidocs/view_cite_read/cite/test_qrel.jsonl', lines=True)

In [3]:
citation_meta = pd.read_json('/scratch/sun.jiam/scirepeval_data/test/scidocs/view_cite_read/meta.jsonl', lines=True)

In [4]:
citation_meta.shape

(142009, 8)

In [5]:
citation_prediction.shape

(29928, 3)

In [6]:
def get_query_candidates(df):
    positive_candidates = {}
    all_candidates = {}
    for query, query_df in df.groupby('query_id'):
        positive_candidates[query] = query_df[query_df['score'] == 1]['cand_id'].tolist()
        all_candidates[query] = query_df['cand_id'].tolist()
    return positive_candidates, all_candidates

In [7]:
# Key: query_id Value: positive_candidates, all_candidates 
positive_candidates, all_candidates = get_query_candidates(citation_prediction)

In [8]:
test_queries_id = citation_prediction['query_id'].unique().tolist()

In [9]:
kwc_dir = '/work/k.church/sun.jiam/scirepeval_data/test/scidocs/view_cite_read/cite/'

In [10]:
specter_embeddings_kwc = np.load(kwc_dir+'ids2.out.corpusId.specter.npy')

In [11]:
proposed_embeddings_kwc = np.load(kwc_dir+'ids2.out.corpusId.proposed.npy')

In [12]:
kwc_ids = np.load(kwc_dir+'ids2.out.ss.npy')

In [13]:
# Get specter embeddings
specter_embeddings_dict = {}
count = 0
for query_id, candidates in all_candidates.items():
    embeddings_dict = {}
    for candidate in candidates:
        embed_index = np.where(kwc_ids == candidate)
        if not np.size(embed_index) == 0: 
            embedding = specter_embeddings_kwc[embed_index[0][0]]
            embeddings_dict[candidate] = embedding
        else:
            count += 1
    specter_embeddings_dict[query_id] = embeddings_dict
print(count)

97


In [14]:
temp_len = 0 
for k, v in specter_embeddings_dict.items():
    temp_len += len(v)
temp_len

29831

In [15]:
# Get specter embeddings
query_specter_embeddings_dict = {}
count = 0
for query_id, candidates in all_candidates.items():
    embed_index = np.where(kwc_ids == query_id)
    if not np.size(embed_index) == 0: 
        embedding = specter_embeddings_kwc[embed_index[0][0]]
        query_specter_embeddings_dict[query_id] = embedding
    else:
        count += 1
        print(query_id)
print(count)

0bf046038a555bc848030a28530f9836e5611b96
64334ac9dfb59d68380784e3b1ad197511850921
e414ba960ee2a385b6800f2086209c711cc3b48b
3


In [16]:
# queries: key: query_id, value: embeddings
# candidates: key: query_id, value: dict(paper_id: embeddins)
def rank_embeddings(queries, candidates, test_queries_id):
    rankings = {} # store the rankings for each query
    for qid in test_queries_id:
        if qid in queries:
            qemb = queries[qid]
        else:
            continue
        rankings[qid] = {}
        for pid, pemb in candidates[qid].items():
            qemb = np.array(qemb)
            pemb = np.array(pemb)
            dist = np.linalg.norm(qemb - pemb) # calculate L2 distance
            rankings[qid][pid] = dist
        # sort the papers by distance (in ascending order) and return the sorted list
        rankings[qid] = sorted(rankings[qid].items(), key=lambda x: x[1])
    result = []
    for k, v in rankings.items():
        temp = []
        for doc_id, score in v:
            temp.append(doc_id)
        result.append(temp)
    return result

In [17]:
def calculate_map(relevant_docs, retrieval_results):
    """
    Calculates the mean average precision (MAP) given a list of relevant documents
    and a list of retrieval results.
    
    Inputs:
    relevant_docs: A list of lists, where each inner list contains the relevant
                          document IDs for a query.
    retrieval_results: A list of lists, where each inner list contains the
                              document IDs retrieved by a model for a query.
    
    return: The mean average precision (MAP).
    """
    average_precision = 0
    num_queries = len(relevant_docs)
    
    for i in range(num_queries):
        query_rel_docs = relevant_docs[i]
        query_ret_results = retrieval_results[i]
        
        precision_at_k = []
        num_rel_docs = 0
        
        for k in range(len(query_ret_results)):
            if query_ret_results[k] in query_rel_docs:
                num_rel_docs += 1
                precision_at_k.append(num_rel_docs / (k + 1))
        
        if len(precision_at_k) > 0:
            average_precision += sum(precision_at_k) / len(precision_at_k)
    
    mean_average_precision = average_precision / num_queries
    return mean_average_precision

In [18]:
ranking_results = rank_embeddings(query_specter_embeddings_dict, specter_embeddings_dict, test_queries_id)

In [19]:
relevant_docs = []
excluded_queries = ['0bf046038a555bc848030a28530f9836e5611b96',
                   '64334ac9dfb59d68380784e3b1ad197511850921',
                   'e414ba960ee2a385b6800f2086209c711cc3b48b']

for k in test_queries_id: 
    if k in excluded_queries:
        continue
    temp = positive_candidates[k]
    relevant_docs.append(temp)

In [20]:
score = calculate_map(relevant_docs, ranking_results)
score

0.8840620933090274

### Rank With ProNE

In [21]:
# Get proposed embeddings
count = 0
proposed_embeddings_dict = {}
for query_id, candidates in all_candidates.items():
    embeddings_dict = {}
    for candidate in candidates:
        embed_index = np.where(kwc_ids == candidate)
        if not np.size(embed_index) == 0: 
            embedding = proposed_embeddings_kwc[embed_index[0][0]]
            embeddings_dict[candidate] = embedding
        else:
            count += 1
    proposed_embeddings_dict[query_id] = embeddings_dict
print(count)

97


In [22]:
# Get proposed embeddings for queries
count = 0
query_proposed_embeddings_dict = {}
for query_id, candidates in all_candidates.items():
    embed_index = np.where(kwc_ids == query_id)
    if not np.size(embed_index) == 0: 
        embedding = proposed_embeddings_kwc[embed_index[0][0]]
        query_proposed_embeddings_dict[query_id] = embedding
    else:
        count += 1
        print(query_id)
print(count)

0bf046038a555bc848030a28530f9836e5611b96
64334ac9dfb59d68380784e3b1ad197511850921
e414ba960ee2a385b6800f2086209c711cc3b48b
3


In [23]:
ranking_results = rank_embeddings(query_proposed_embeddings_dict, proposed_embeddings_dict, test_queries_id)

In [24]:
relevant_docs = []
excluded_queries = ['0bf046038a555bc848030a28530f9836e5611b96',
                   '64334ac9dfb59d68380784e3b1ad197511850921',
                   'e414ba960ee2a385b6800f2086209c711cc3b48b']

for k in test_queries_id: 
    if k in excluded_queries:
        continue
    temp = positive_candidates[k]
    relevant_docs.append(temp)

In [25]:
score = calculate_map(relevant_docs, ranking_results)
score

0.9051770683156085

### Combined metric

In [26]:
# queries: key: query_id, value: embeddings
# candidates: key: query_id, value: dict(paper_id: embeddins)
def rank_combined_embeddings(queries, candidates, candidates_two, test_queries_id):
    rankings = {} # store the rankings for each query
    for qid in test_queries_id:
        if qid in queries:
            qemb = queries[qid]
        else:
            continue
        rankings[qid] = {}
        for pid, pemb in candidates[qid].items():
            qemb = np.array(qemb)
            pemb = np.array(pemb)
            dist = np.linalg.norm(qemb - pemb) # calculate L2 distance
            rankings[qid][pid] = dist
            
        for pid, pemb in candidates_two[qid].items():
            qemb = np.array(qemb)
            pemb = np.array(pemb)
            dist2 = np.linalg.norm(qemb - pemb) # calculate L2 distance
            rankings[qid][pid] = (rankings[qid][pid], dist2)
        for alpha in range(1, 10):
            alpha /= 10 
            total_score = alpha*harmonic_centralities[i][doc_id]+(1-alpha)*SPECTER_score
            # sort the papers by distance (in ascending order) and return the sorted list
            rankings[qid] = sorted(rankings[qid].items(), key=lambda x: x[1])
    result = []
    for k, v in rankings.items():
        temp = []
        for doc_id, score in v:
            temp.append(doc_id)
        result.append(temp)
    return result

In [36]:
# queries: key: query_id, value: embeddings
# candidates: key: query_id, value: dict(paper_id: embeddins)
def get_l2_scores(query_emb, candidates_dict):
    result = {}
    for pid, pemb in candidates_dict.items():
        query_emb = np.array(query_emb)
        pemb = np.array(pemb)
        #dist = np.linalg.norm(query_emb  - pemb)
        dist = cosine_similarity([query_emb], [pemb])
        result[pid] = dist
    return result

In [28]:
def rank_combined(SPECTER_scores_dict, proposed_scores_dict):
    results = []
    for alpha in range(1, 10):
        alpha /= 10
        epoch_result = []
        for i in range(len(SPECTER_scores_dict)):
            doc_id_score_dict = SPECTER_scores_dict[i]
            id_score = [('', 0) for _ in range (len(doc_id_score_dict))]
            for j, (doc_id, SPECTER_score) in enumerate(doc_id_score_dict.items()):
                #print(j)
                #print((proposed_scores_dict[j]))
                
                total_score = alpha*proposed_scores_dict[i][doc_id]+(1-alpha)*SPECTER_score
                # else:
                #     total_score = (1-alpha)*SPECTER_score
                id_score[j] = (doc_id, total_score)
            id_score = sorted(id_score, key=lambda x: x[1], reverse=True)
            epoch_result.append([id_score[i][0] for i in range(5)]) 
        results.append(epoch_result)
    return results

In [29]:
def combined_final_result(test_queries_id, query_embeddings_dict, positive_candidates, specter_embeddings_dict, 
                          proposed_embeddings_dict, proposed_query_embeddings_dict):
    retrieval_results_SPECTER = []
    retrieval_results_proposed = []
    relevant_docs = []
    scores = []
    for query_id in test_queries_id:
        if query_id in query_embeddings_dict:
            retrieved = get_l2_scores(query_embeddings_dict[query_id], specter_embeddings_dict[query_id])
            retrieval_results_SPECTER.append(retrieved)
            retrieved_proposed = get_l2_scores(proposed_query_embeddings_dict[query_id], proposed_embeddings_dict[query_id])
            retrieval_results_proposed.append(retrieved_proposed)
            relevant_docs.append(positive_candidates[query_id])
    retrieval_results = rank_combined(retrieval_results_SPECTER, retrieval_results_proposed)
    for retrieval_result in retrieval_results:
        score = calculate_map(relevant_docs, retrieval_result)
        scores.append(score)
    return scores

In [37]:

combined_result = combined_final_result(test_queries_id, query_specter_embeddings_dict, positive_candidates, specter_embeddings_dict, proposed_embeddings_dict,
                                       query_proposed_embeddings_dict)

print("Re-ranking model MAP score is: ", combined_result)

Re-ranking model MAP score is:  [0.9474297893681066, 0.9513847096846108, 0.9540092499721404, 0.9573498272595583, 0.9571366878413037, 0.9563551766410362, 0.9569165830825831, 0.9550498718377377, 0.9534311267134766]
