In [212]:
import requests
import networkx as nx
import pandas as pd
import math
from sklearn.metrics.pairwise import cosine_similarity

In [121]:
headers = {'x-api-key': '5UgDqL5l2j4y9Di50fuPk7o0GF3ybgN83FTeEsLC'}

In [21]:
citation_prediction = pd.read_json('test_qrel.jsonl', lines=True)

In [108]:
citation_meta = pd.read_json('meta.jsonl', lines=True)

In [22]:
def get_query_candidates(df):
    positive_candidates = {}
    all_candidates = {}
    for query, query_df in df.groupby('query_id'):
        positive_candidates[query] = query_df[query_df['score'] == 1]['cand_id'].tolist()
        all_candidates[query] = query_df['cand_id'].tolist()
    return positive_candidates, all_candidates

In [23]:
# Key: query_id Value: positive_candidates, all_candidates 
positive_candidates, all_candidates = get_query_candidates(citation_prediction)

In [None]:
# Download specter embeddings from semantic scholar
import time
specter_embeddings = {}
i = 0
for query_id, candidates in all_candidates.items():
    start = time.time()
    embeddings_list = []
    for candidate in candidates:
        url = 'https://api.semanticscholar.org/graph/v1/paper/{}?fields=embedding'.format(candidate)
        response = requests.get(url, headers=headers)
        if response.text == '{"message":"Too Many Requests"}':
            print("Too Many Requests.")
        embeddings_list.append(response.text)
    specter_embeddings[query_id] = embeddings_list
    end = time.time()
    print(i, ' ', query_id, ' done in ', end - start)
    i += 1

In [125]:
len(specter_embeddings)

1000

In [128]:
import json 

temp = json.dumps(specter_embeddings)

# open file for writing, "w" 
f = open("specter_embeddings_updated.json","w")

# write json object to file
f.write(temp)

# close file
f.close()

In [129]:
# Download specter embeddings for query papers from semantic scholar
specter_embeddings_query = {}
for query_id, candidates in all_candidates.items():
        url = 'https://api.semanticscholar.org/graph/v1/paper/{}?fields=embedding'.format(query_id)
        response = requests.get(url, headers=headers)
        specter_embeddings_query[query_id] = response.text

In [165]:
query_embeddings_dict = {}
for k, v in specter_embeddings_query.items():
    temp_dict = json.loads(v)
    if 'embedding' not in temp_dict:
        print(k)
    else:
        query_embeddings_dict[k] = json.loads(v)['embedding']['vector']

0bf046038a555bc848030a28530f9836e5611b96
64334ac9dfb59d68380784e3b1ad197511850921
e414ba960ee2a385b6800f2086209c711cc3b48b


In [None]:
specter_embeddings_dict = {}
count = 0
for k, v in specter_embeddings.items():
    temp_s = {}
    doc_id_recorder = all_candidates[k]
    for i in range(len(v)):
        temp_dict = json.loads(v[i])
        if 'embedding' not in temp_dict:
            print(v[i])
            count += 1
        else:
            temp_s [doc_id_recorder[i]] = json.loads(v[i])['embedding']['vector']
    specter_embeddings_dict[k] = temp_s  
print(count)

In [132]:
temp_query = json.dumps(specter_embeddings_query)

# open file for writing, "w" 
f = open("specter_embeddings_query_updated.json","w")

# write json object to file
f.write(temp_query)

# close file
f.close()

In [155]:
test_queries_id = citation_prediction['query_id'].unique().tolist()

In [156]:
# queries: key: query_id, value: embeddings
# candidates: key: query_id, value: dict(paper_id: embeddins)
def rank_SPECTER(queries, candidates, test_queries_id):
    rankings = {} # store the rankings for each query
    for qid in test_queries_id:
        if qid in queries:
            qemb = queries[qid]
        else:
            continue
        rankings[qid] = {}
        for pid, pemb in candidates[qid].items():
            qemb = np.array(qemb)
            pemb = np.array(pemb)
            dist = np.linalg.norm(qemb - pemb) # calculate L2 distance
            rankings[qid][pid] = dist
        # sort the papers by distance (in ascending order) and return the sorted list
        rankings[qid] = sorted(rankings[qid].items(), key=lambda x: x[1])
    result = []
    for k, v in rankings.items():
        temp = []
        for doc_id, score in v:
            temp.append(doc_id)
        result.append(temp)
    return result

In [214]:
def rank_SPECTER_cosine(queries, candidates, test_queries_id):
    rankings = {} # store the rankings for each query
    for qid in test_queries_id:
        if qid in queries:
            qemb = queries[qid]
        else:
            continue
        rankings[qid] = {}
        for pid, pemb in candidates[qid].items():
            qemb = np.array(qemb)
            pemb = np.array(pemb)
            cosine_similarity_value = cosine_similarity([qemb], [pemb])
            rankings[qid][pid] = cosine_similarity_value
        # sort the papers by distance (in ascending order) and return the sorted list
        rankings[qid] = sorted(rankings[qid].items(), key=lambda x: x[1], reverse = True)
    result = []
    for k, v in rankings.items():
        temp = []
        for doc_id, score in v:
            temp.append(doc_id)
        result.append(temp)
    return result

In [82]:
def calculate_map(relevant_docs, retrieval_results):
    """
    Calculates the mean average precision (MAP) given a list of relevant documents
    and a list of retrieval results.
    
    Inputs:
    relevant_docs: A list of lists, where each inner list contains the relevant
                          document IDs for a query.
    retrieval_results: A list of lists, where each inner list contains the
                              document IDs retrieved by a model for a query.
    
    return: The mean average precision (MAP).
    """
    average_precision = 0
    num_queries = len(relevant_docs)
    
    for i in range(num_queries):
        query_rel_docs = relevant_docs[i]
        query_ret_results = retrieval_results[i]
        
        precision_at_k = []
        num_rel_docs = 0
        
        for k in range(len(query_ret_results)):
            if query_ret_results[k] in query_rel_docs:
                num_rel_docs += 1
                precision_at_k.append(num_rel_docs / (k + 1))
        
        if len(precision_at_k) > 0:
            average_precision += sum(precision_at_k) / len(precision_at_k)
    
    mean_average_precision = average_precision / num_queries
    return mean_average_precision

In [215]:
ranking_results = rank_SPECTER_cosine(query_embeddings_dict, specter_embeddings_dict, test_queries_id)

In [170]:
relevant_docs = []
excluded_queries = ['0bf046038a555bc848030a28530f9836e5611b96',
                   '64334ac9dfb59d68380784e3b1ad197511850921',
                   'e414ba960ee2a385b6800f2086209c711cc3b48b']

for k in test_queries_id: 
    if k in excluded_queries:
        continue
    temp = positive_candidates[k]
    relevant_docs.append(temp)

In [216]:
score = calculate_map(relevant_docs, ranking_results)

In [217]:
score

0.8705778109153777

### Rerank with harmonic

In [116]:
def rerank_harmonic_old(result, harmonic_centralities):
    for i in range(len(result)):
        papers = result[i]
        if harmonic_centralities[i]:
            id_score = [('', 0) for _ in range (30)]
            for i in range(len(papers)):
                doc_id = papers[i]
                score = 0
#                 print(doc_id)
                if doc_id in harmonic_centralities[i]:
                    score = harmonic_centralities[i][doc_id]
                id_score[i] = (doc_id, score)
            id_score = sorted(id_score, key=lambda x: x[1])
            result[i] = [id_score[i][0] for i in range(5)] 
        else:
            result[i] = papers[:5]
    return result

In [97]:
def calculate_harmonic_centralities_old(all_candidates, candidates_edges_dict):
    result = []
    for query_id, candidates in all_candidates.items():
        nodes = candidates
        edges = candidates_edges_dict[query_id]
        
        # Create a graph from the list of edges
        G = nx.Graph(edges)
    
        # Calculate harmonic centralities for each node in the input node list
        harmonic_centralities = nx.harmonic_centrality(G, nodes)
        result.append(harmonic_centralities)
        
    return result

In [117]:
new_result = rerank_harmonic(result, harmonic_centralities)

In [120]:
calculate_map(relevant_docs, new_result)

0.28416666666666673

In [175]:
file_name = 'harmonic_centralities_no_exclusion_total.json'
with open(file_name) as f: 
    harmonic_centralities_no_exclusion = json.load(f)

In [178]:
transformed_harmonic_centralities = []
for score_dict in harmonic_centralities_no_exclusion:
    temp_harmonic = {}
    for key, value in score_dict.items():
        temp_harmonic[key] = math.log(value + 3 * 10 ** -2)
    transformed_harmonic_centralities.append(temp_harmonic)

In [197]:
transformed_harmonic_centralities_shortened = []
for i in range(1000):
    query_id = test_queries_id[i]
    if query_id not in query_embeddings_dict:
        continue
    transformed_harmonic_centralities_shortened.append(transformed_harmonic_centralities[i])

In [220]:
# queries: key: query_id, value: embeddings
# candidates: key: query_id, value: dict(paper_id: embeddins)
def search_SPECTER_rerank(query_emb, candidates_dict):
    result = {}
    for pid, pemb in candidates_dict.items():
        query_emb = np.array(query_emb)
        pemb = np.array(pemb)
        cosine_similarity_value = cosine_similarity([query_emb], [pemb])
        result[pid] = cosine_similarity_value
    return result

In [208]:
def rerank_harmonic(SPECTER_retrieval_results, harmonic_centralities):
    results = []
    for alpha in range(1, 10):
        alpha /= 10
        epoch_result = []
        for i in range(len(SPECTER_retrieval_results)):
            doc_id_score_dict = SPECTER_retrieval_results[i]
            id_score = [('', 0) for _ in range (len(doc_id_score_dict))]
            for j, (doc_id, SPECTER_score) in enumerate(doc_id_score_dict.items()):
                if doc_id in harmonic_centralities[i]:
                    total_score = alpha*harmonic_centralities[i][doc_id]+(1-alpha)*SPECTER_score
                else: 
                    total_score = (1-alpha) * SPECTER_score
                id_score[j] = (doc_id, total_score)
            id_score = sorted(id_score, key=lambda x: x[1], reverse=True)
            epoch_result.append([id_score[i][0] for i in range(5)]) 
        results.append(epoch_result)
    return results

In [205]:
def rerank_final_result(test_queries_id, query_embeddings_dict, positive_candidates, all_candidates, transformed_harmonic_centralities):
    retrieval_results = []
    relevant_docs = []
    scores = []
    for query_id in test_queries_id:
        if query_id in query_embeddings_dict:
            retrieved = search_SPECTER_rerank(query_embeddings_dict[query_id], all_candidates[query_id])
            retrieval_results.append(retrieved)
            relevant_docs.append(positive_candidates[query_id])
    print(len(retrieval_results))
    retrieval_results = rerank_harmonic(retrieval_results, transformed_harmonic_centralities)
    for retrieval_result in retrieval_results:
        score = calculate_map(relevant_docs, retrieval_result)
        scores.append(score)
    return scores

In [221]:
start = time.time()
rerank_result = rerank_final_result(test_queries_id, query_embeddings_dict, positive_candidates, specter_embeddings_dict, transformed_harmonic_centralities_shortened)
end = time.time()
print("Total time used is ", end - start)
print("Re-ranking model MAP score is: ", rerank_result)

997
Total time used is  7.628233909606934
Re-ranking model MAP score is:  [0.98104034325198, 0.9680931683940727, 0.9427309706898496, 0.9229870166053741, 0.9035773988632594, 0.890655299231029, 0.88037167056726, 0.8711119469519694, 0.8638345592332576]


### Rerank with indegree

In [222]:
with open('indegree_dict.json') as f:
    indegree_dict = json.load(f)

In [229]:
transformed_indegree_dict = {}
for k, v in indegree_dict.items():
    transformed_indegree_dict[k] = {}
    for key, value in v.items():
        transformed_indegree_dict[k][key] = math.log(value + 3 * 10 ** -2)

In [225]:
def rerank_indegree(SPECTER_retrieval_results, indegree_dict):
    results = []
    for alpha in range(1, 10):
        alpha /= 10
        epoch_result = []
        for i in range(len(SPECTER_retrieval_results)):
            query_id = test_queries_id[i]
            doc_id_score_dict = SPECTER_retrieval_results[i]
            id_score = [('', 0) for _ in range (len(doc_id_score_dict))]
            for j, (doc_id, SPECTER_score) in enumerate(doc_id_score_dict.items()):
                if doc_id in indegree_dict[query_id]:
                    total_score = alpha*indegree_dict[query_id][doc_id]+(1-alpha)*SPECTER_score
                else: 
                    total_score = (1-alpha) * SPECTER_score
                id_score[j] = (doc_id, total_score)
            id_score = sorted(id_score, key=lambda x: x[1], reverse=True)
            epoch_result.append([id_score[i][0] for i in range(5)]) 
        results.append(epoch_result)
    return results

In [233]:
def rerank_final_result_indegree(test_queries_id, query_embeddings_dict, positive_candidates, all_candidates, transformed_indegree_dict):
    retrieval_results = []
    relevant_docs = []
    scores = []
    for query_id in test_queries_id:
        if query_id in query_embeddings_dict:
            retrieved = search_SPECTER_rerank(query_embeddings_dict[query_id], all_candidates[query_id])
            retrieval_results.append(retrieved)
            relevant_docs.append(positive_candidates[query_id])
    retrieval_results = rerank_indegree(retrieval_results, transformed_indegree_dict)
    for retrieval_result in retrieval_results:
        score = calculate_map(relevant_docs, retrieval_result)
        scores.append(score)
    return scores

In [234]:
start = time.time()
rerank_result = rerank_final_result_indegree(test_queries_id, query_embeddings_dict, positive_candidates, specter_embeddings_dict, transformed_indegree_dict)
end = time.time()
print("Total time used is ", end - start)
print("Re-ranking model MAP score is: ", rerank_result)

Total time used is  8.298100233078003
Re-ranking model MAP score is:  [0.941702886437092, 0.9369720271926922, 0.9338863813663241, 0.93152791708459, 0.9302184330770118, 0.9293811991530175, 0.9286944165830856, 0.9284548088710606, 0.928045246851669]
